In [None]:
import pandas as pd
import numpy as np
import os
import gc

In [None]:
def read_articles():
    """Load the competition's ``articles.csv`` file.

    Returns:
        pandas.DataFrame: the file as parsed by ``pd.read_csv`` with its
        default options.
    """
    csv_path = "/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv"
    return pd.read_csv(csv_path)
def read_customers():
    """Load the competition's ``customers.csv`` file.

    Returns:
        pandas.DataFrame: the file as parsed by ``pd.read_csv`` with its
        default options.
    """
    csv_path = "/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv"
    return pd.read_csv(csv_path)
def read_transactions():
    """Load the competition's ``transactions_train.csv`` file.

    The ``t_dat`` column is parsed as dates while reading.

    Returns:
        pandas.DataFrame: the parsed transactions table.
    """
    csv_path = "/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv"
    return pd.read_csv(csv_path, parse_dates=["t_dat"])

## Downcasting numeric columns from wider to narrower data types

In [None]:
def reduce_dtype(df):
    """Downcast the ``int64`` and ``float64`` columns of ``df`` to smaller dtypes.

    Integer columns become ``int8`` or ``int16`` when *both* their minimum and
    maximum fit in that type, and are left as ``int64`` otherwise. Float
    columns become ``float16`` when their values lie within the finite
    ``float16`` range, and are left as ``float64`` otherwise. Columns of any
    other dtype are not touched.

    Note that ``float16`` keeps far fewer significant digits than ``float64``,
    so the float downcast is lossy even when the range fits.

    Args:
        df (pandas.DataFrame): frame to shrink. It is modified in place.

    Returns:
        pandas.DataFrame: the same ``df`` object, for call chaining.
    """
    int8_limits = np.iinfo(np.int8)
    int16_limits = np.iinfo(np.int16)
    float16_max = np.finfo(np.float16).max

    # Walk a snapshot of the dtypes rather than calling select_dtypes(), which
    # would build a sub-frame of the selected columns just to get their names.
    for col, dtype in df.dtypes.items():
        if dtype == np.int64:
            low, high = df[col].min(), df[col].max()
            # Both ends of the range are checked: testing only the maximum
            # lets large negative values wrap around silently. The conditions
            # are phrased as "does not fit" so that an empty column (whose
            # min/max are NaN) still ends up in the smallest type.
            if high > int16_limits.max or low < int16_limits.min:
                continue  # needs more than 16 bits: keep int64
            if high > int8_limits.max or low < int8_limits.min:
                df[col] = df[col].astype("int16")
            else:
                df[col] = df[col].astype("int8")
        elif dtype == np.float64:
            low, high = df[col].min(), df[col].max()
            # Values beyond +/- float16 max would turn into +/- inf.
            if high > float16_max or low < -float16_max:
                continue  # out of float16 range: keep float64
            df[col] = df[col].astype("float16")

    return df

| Dataset | Memory Utilization |
| --- | --- |
| Articles |  20+ MB |
| Customer |  73+ MB |
| Transactions |  1.2+ GB |

In [None]:
%%time
articles_df = read_articles()
articles_df = reduce_dtype(articles_df)
print(articles_df.info(memory_usage=True))
del articles_df

In [None]:
# Load the customers table, downcast its numeric columns and report the memory use.
customers_df = reduce_dtype(read_customers())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
customers_df.info()
del customers_df

In [None]:
# Load the transactions table, downcast its numeric columns and report the memory use.
transactions_df = reduce_dtype(read_transactions())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
transactions_df.info()
del transactions_df

## After changing the data type

| Dataset | Memory Utilization |
| --- | --- |
| Articles |  14.8+ MB |
| Customer |  49.7+ MB |
| Transactions |  818+ MB |

## Removing extra columns based on intuition

In [None]:
def missing_value(data):
    """Summarise the columns of ``data`` that contain missing values.

    Args:
        data (pandas.DataFrame): frame to inspect.

    Returns:
        pandas.DataFrame | str: a frame indexed by column name with the
        columns ``"Percentage Missing"`` and ``"Missing Count"``, ordered from
        most to fewest missing values and limited to columns that have at
        least one missing value; or the string ``"No missing Value"`` when no
        column has any.
    """
    # Build the null mask once; both output columns are derived from the same
    # sorted counts, so they always share one row order.
    null_counts = data.isnull().sum().sort_values(ascending=False)
    null_counts = null_counts[null_counts > 0]
    if null_counts.empty:
        return "No missing Value"
    return pd.DataFrame({
        "Percentage Missing": null_counts / len(data) * 100,
        "Missing Count": null_counts,
    })

def unique_data(data):
    """Report non-null counts and distinct-value counts for every column.

    Args:
        data (pandas.DataFrame): frame to inspect.

    Returns:
        pandas.DataFrame: one row per column of ``data`` with the columns
        ``"Total"`` (``data.count()``) and ``"Unique Values"``
        (``data.nunique()``), ordered by ascending ``"Unique Values"``.
    """
    non_null = data.count()
    distinct = data.nunique().sort_values()
    summary = pd.concat({"Total": non_null, "Unique Values": distinct}, axis=1)
    return summary.sort_values(by="Unique Values")

In [None]:
df = read_articles()
# Show the missing-value summary first, then the per-column cardinality summary.
print(missing_value(df), unique_data(df), sep="\n")

In [None]:

# Drop the descriptive *_name columns that this notebook treats as unnecessary,
# then downcast what is left and report the resulting memory use.
df = df.drop(columns=[
    "product_type_name", "product_group_name", "graphical_appearance_name",
    "colour_group_name", "perceived_colour_value_name",
    "perceived_colour_master_name", "department_name", "index_name",
    "index_group_name", "section_name", "garment_group_name",
])
df = reduce_dtype(df)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()
del df

In [None]:
df = read_customers()
# Show the missing-value summary first, then the per-column cardinality summary.
print(missing_value(df), unique_data(df), sep="\n")

##### This shows that the Active and FN columns each contain only a single distinct value, so they are of no use
##### Also converting age (float64) into age (int8)

In [None]:
df = df.drop(columns=["FN", "Active"])
# Test for missing values with pd.notna(): NaN is truthy, so a plain `if x`
# lets it through to int(), which raises "cannot convert float NaN to integer"
# (and `if x` would also turn a valid age of 0 into a missing value).
# Missing ages are mapped to None; a plain NumPy integer column cannot hold
# missing values, so a column that still has gaps will not come out as a
# plain int dtype here.
df["age"] = df["age"].apply(lambda x: int(x) if pd.notna(x) else None)

In [None]:
print(r"So, I will be using Nullable Integer 😁")
# "Int8" (capital I) is pandas' nullable integer extension dtype: unlike the
# NumPy "int8" dtype it can represent missing values.
df["age"] = df["age"].astype("Int8")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()
del df

In [None]:
# Transactions: no columns are dropped here, only the numeric dtypes are reduced.
df = reduce_dtype(read_transactions())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()
del df

## After removing extra columns

| Dataset | Memory Utilization |
| --- | --- |
| Articles |  5.9+ MB |
| Customer |  44.7+ MB |
| Transactions |  818+ MB |

### Converting the CSV data into Parquet

In [None]:
# Read the articles, drop the *_name columns, downcast the numeric columns and
# write the result to a Parquet file in the working directory.
df = reduce_dtype(
    read_articles().drop(
        columns=[
            "product_type_name", "product_group_name", "graphical_appearance_name",
            "colour_group_name", "perceived_colour_value_name",
            "perceived_colour_master_name", "department_name", "index_name",
            "index_group_name", "section_name", "garment_group_name",
        ]
    )
)
df.to_parquet("articles.parquet")

In [None]:
%%time
articles = pd.read_parquet("./articles.parquet")
print(articles.info())
del articles

In [None]:
# Read the customers, drop FN/Active, store age as nullable Int8, downcast the
# remaining numeric columns and write the result to Parquet.
cust_df = read_customers().drop(columns=["FN", "Active"])
cust_df = reduce_dtype(cust_df.astype({"age": "Int8"}))
cust_df.to_parquet("customer.parquet")
del cust_df

In [None]:
%%time
cust_df = pd.read_parquet("./customer.parquet")
print(cust_df.info())
del cust_df

In [None]:
trans_df = read_transactions()
# float16 is not supported in parquet, so every column that reduce_dtype()
# shrinks to float16 is written as float32 instead. The float32 values are
# taken from the original float64 data *before* reducing: widening a column
# that was already rounded to float16 would store float32-sized values that
# only carry float16 precision.
float32_values = trans_df.select_dtypes(include="float64").astype("float32")
trans_df = reduce_dtype(trans_df)
half_floats = list(trans_df.select_dtypes(include="float16").columns)
for col in half_floats:
    if col in float32_values:
        trans_df[col] = float32_values[col]
    else:
        # Column was already float16 before reduce_dtype(); just widen it.
        trans_df[col] = trans_df[col].astype("float32")
del float32_values
trans_df.to_parquet("transactions.parquet")
del trans_df

In [None]:
%%time
trans = pd.read_parquet("./transactions.parquet")
print(trans.info())
del trans

### I will update the notebook with new techniques; if you like it, please upvote 🙏🙏