In [200]:
import numpy as np
import pandas as pd
# import matplotlib as plt
# import seaborn as sns
# from transformers import pipeline
from tqdm.notebook import tqdm
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split

In [201]:
# Load the line-delimited JSON export and make every column name a valid
# identifier by replacing spaces with underscores.
df = pd.read_json('data/modcloth_final_data.json', lines=True)
df.columns = df.columns.str.replace(" ", "_", regex=False)
df.head(5)

Unnamed: 0,item_id,waist,size,quality,cup_size,hips,bra_size,category,bust,height,user_name,length,fit,user_id,shoe_size,shoe_width,review_summary,review_text
0,123373,29.0,7,5.0,d,38.0,34.0,new,36.0,5ft 6in,Emily,just right,small,991571,,,,
1,123373,31.0,13,3.0,b,30.0,36.0,new,,5ft 2in,sydneybraden2001,just right,small,587883,,,,
2,123373,30.0,7,2.0,b,,32.0,new,,5ft 7in,Ugggh,slightly long,small,395665,9.0,,,
3,123373,,21,5.0,dd/e,,,new,,,alexmeyer626,just right,fit,875643,,,,
4,123373,,18,5.0,b,,36.0,new,,5ft 2in,dberrones1,slightly long,small,944840,,,,


In [202]:
# Share of missing values per column, as a percentage, sorted from most to least missing.
df.isnull().mean().sort_values(ascending=False) * 100

# 'bust' and 'shoe_width' are excluded from the features (they are dropped in a later cell).
# NOTE(review): the original plan was to also exclude 'shoe_size' and 'waist' later on,
# but no cell in this notebook drops them -- confirm whether that is still intended.
# raw.info()

waist             96.518903
bust              85.681846
shoe_width        77.525063
shoe_size         66.282160
hips              32.281677
review_text        8.122962
review_summary     8.122962
cup_size           7.555260
bra_size           7.268994
height             1.337118
quality            0.082136
length             0.042276
user_name          0.000000
fit                0.000000
user_id            0.000000
category           0.000000
size               0.000000
item_id            0.000000
dtype: float64

In [203]:
# raw.loc[raw['review_text'].str.len() > 1200]

# sentiment_analysis = pipeline("sentiment-analysis")
# def review_text_transform(x):
#     if not isinstance(x, str): return x
#     if len(x) >= 512 : return math.nan
#     label = sentiment_analysis(x)[0]['label']
#     if label == 'POSITIVE':
#         return 1
#     else:
#         return 0


# for i in tqdm(range(len(raw['review_text']))):
#     raw['review_text'][i] = review_text_transform(raw['review_text'][i])

# raw.to_csv('data/sentiment.csv')

In [204]:
# Remove the columns that are not used as features. 'bust' and 'shoe_width'
# are mostly missing (see the missing-value percentages above). Rebinding the
# result instead of using inplace=True keeps the statement chainable and
# avoids mutating a frame that an earlier cell already displayed.
df = df.drop(columns=['user_name', 'length', 'review_summary', 'review_text', 'bust', 'shoe_width'])

In [205]:
# List the distinct raw cup-size labels so the numeric mapping defined in the next cell can cover them.
df['cup_size'].unique()

array(['d', 'b', 'dd/e', 'c', 'ddd/f', 'dddd/g', 'i', 'a', nan, 'h', 'aa',
       'j', 'k'], dtype=object)

In [206]:
def transform_cup_size(x):
    """Translate a textual cup-size label into its numeric code.

    Parameters
    ----------
    x : object
        A cup-size label such as ``'b'`` or ``'dd/e'``. Anything that is not
        a string (for example a NaN marking a missing value) is passed through.

    Returns
    -------
    The numeric code for a known label (``'aa'`` -> 0.5, ``'a'`` -> 1, ...,
    ``'k'`` -> 11), ``None`` for a string that is not a known label, or ``x``
    itself when ``x`` is not a string.
    """
    codes = {
        'aa': 0.5,
        'a': 1,
        'b': 2,
        'c': 3,
        'd': 4,
        'dd/e': 5,
        'ddd/f': 6,
        'dddd/g': 7,
        'h': 8,
        'i': 9,
        'j': 10,
        'k': 11,
    }
    if isinstance(x, str):
        # Unknown labels fall through to None, exactly like an unmatched if-chain.
        return codes.get(x)
    return x

In [207]:
# Replace the textual cup-size labels with their numeric codes, element by element.
df['cup_size'] = df['cup_size'].map(transform_cup_size)
df

Unnamed: 0,item_id,waist,size,quality,cup_size,hips,bra_size,category,height,fit,user_id,shoe_size
0,123373,29.0,7,5.0,4.0,38.0,34.0,new,5ft 6in,small,991571,
1,123373,31.0,13,3.0,2.0,30.0,36.0,new,5ft 2in,small,587883,
2,123373,30.0,7,2.0,2.0,,32.0,new,5ft 7in,small,395665,9.0
3,123373,,21,5.0,5.0,,,new,,fit,875643,
4,123373,,18,5.0,2.0,,36.0,new,5ft 2in,small,944840,
...,...,...,...,...,...,...,...,...,...,...,...,...
82785,807722,,8,4.0,2.0,,36.0,outerwear,5ft 8in,fit,727820,8.5
82786,807722,,12,5.0,6.0,,34.0,outerwear,5ft 5in,small,197040,
82787,807722,,12,5.0,7.0,36.0,32.0,outerwear,5ft 4in,fit,102493,
82788,807722,,12,4.0,,,,outerwear,5ft 3in,fit,756491,


In [208]:
def transform_height(x):
    """Convert a height string such as ``'5ft 6in'`` to centimetres.

    The string is split on any run of whitespace, so leading, trailing or
    repeated spaces do not cause the inches to be dropped. Each component must
    end in ``ft`` or ``in`` (case-insensitive); a missing component counts as 0.

    Parameters
    ----------
    x : object
        Height text like ``'5ft 6in'`` or ``'5ft'``. Anything that is not a
        string (for example a NaN marking a missing value) is passed through.

    Returns
    -------
    float
        The height in centimetres, or ``x`` unchanged when it is not a string.

    Raises
    ------
    ValueError
        If the string is empty, a component does not end in ``ft``/``in``, or
        the numeric part of a component is not an integer.
    """
    if not isinstance(x, str):
        return x

    tokens = x.split()
    if not tokens:
        raise ValueError(f"empty height string: {x!r}")

    feet = 0
    inch = 0
    for token in tokens:
        # The last two characters carry the unit; everything before is the number.
        number, unit = token[:-2], token[-2:].lower()
        if unit == 'ft':
            feet = int(number)
        elif unit == 'in':
            inch = int(number)
        else:
            raise ValueError(f"unrecognised height component {token!r} in {x!r}")

    # 1 ft = 30.48 cm, 1 in = 2.54 cm
    cm = (30.48 * feet) + (2.54 * inch)
    return cm

# Convert every height string to centimetres; non-string entries are left as they are.
df['height'] = df['height'].map(transform_height)
df.head(5)

Unnamed: 0,item_id,waist,size,quality,cup_size,hips,bra_size,category,height,fit,user_id,shoe_size
0,123373,29.0,7,5.0,4.0,38.0,34.0,new,167.64,small,991571,
1,123373,31.0,13,3.0,2.0,30.0,36.0,new,157.48,small,587883,
2,123373,30.0,7,2.0,2.0,,32.0,new,170.18,small,395665,9.0
3,123373,,21,5.0,5.0,,,new,,fit,875643,
4,123373,,18,5.0,2.0,,36.0,new,157.48,small,944840,


In [209]:
# Distinct values of the 'fit' label.
# NOTE: this is not the last expression in the cell, so its value is not displayed.
df['fit'].unique()

def transform_fit(x):
    """Encode a fit label as an integer class.

    ``'small'`` -> 0, ``'fit'`` -> 1, ``'large'`` -> 2. Any other value yields
    ``None``.
    """
    # Labels are checked in the same order as the integer codes they map to.
    for code, label in enumerate(('small', 'fit', 'large')):
        if x == label:
            return code
    return None

# Replace the textual fit label with its integer class (small=0, fit=1, large=2).
df['fit'] = df['fit'].map(transform_fit)
df.head(5)

Unnamed: 0,item_id,waist,size,quality,cup_size,hips,bra_size,category,height,fit,user_id,shoe_size
0,123373,29.0,7,5.0,4.0,38.0,34.0,new,167.64,0,991571,
1,123373,31.0,13,3.0,2.0,30.0,36.0,new,157.48,0,587883,
2,123373,30.0,7,2.0,2.0,,32.0,new,170.18,0,395665,9.0
3,123373,,21,5.0,5.0,,,new,,1,875643,
4,123373,,18,5.0,2.0,,36.0,new,157.48,0,944840,


In [210]:

# One imputer per fill strategy; both treat NaN as the missing-value marker.
median_imputer, frequent_imputer = (
    SimpleImputer(missing_values=np.nan, strategy=strategy)
    for strategy in ('median', 'most_frequent')
)

In [211]:
# Columns filled with their median value.
numeric_columns = ['waist', 'size', 'quality', 'hips', 'bra_size', 'shoe_size']
# Columns filled with their most frequent value. They started out as text but
# have already been converted to numbers by the cells above.
string_columns = ['cup_size', 'height']

# NOTE(review): the imputers are fitted on the full dataset, i.e. before the
# train/validation/test split performed further down.
imputation_plan = (
    (median_imputer, numeric_columns),
    (frequent_imputer, string_columns),
)
for imputer, columns in imputation_plan:
    for col in columns:
        # The imputer expects a 2-D input, so feed it the column as an (n, 1) array.
        column_2d = np.asarray(df[col]).reshape(-1, 1)
        df[col] = imputer.fit_transform(column_2d)


df.head(5)

  mode = stats.mode(array)
  mode = stats.mode(array)


Unnamed: 0,item_id,waist,size,quality,cup_size,hips,bra_size,category,height,fit,user_id,shoe_size
0,123373,29.0,7.0,5.0,4.0,38.0,34.0,new,167.64,0,991571,8.0
1,123373,31.0,13.0,3.0,2.0,30.0,36.0,new,157.48,0,587883,8.0
2,123373,30.0,7.0,2.0,2.0,39.0,32.0,new,170.18,0,395665,9.0
3,123373,30.0,21.0,5.0,5.0,39.0,36.0,new,162.56,1,875643,8.0
4,123373,30.0,18.0,5.0,2.0,39.0,36.0,new,157.48,0,944840,8.0


In [212]:
# Standardise the continuous measurements with StandardScaler.
numeric_columns = ['waist', 'size', 'quality', 'hips', 'bra_size', 'height', 'shoe_size']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_columns])
df_records_numeric = pd.DataFrame(scaled_values, columns=numeric_columns)

# Replace every categorical value with an integer code produced by OrdinalEncoder.
categorical_columns = ['item_id', 'category', 'cup_size', 'user_id', 'fit']
ordinal_enc = OrdinalEncoder()
encoded_values = ordinal_enc.fit_transform(df[categorical_columns]).astype(np.int64)
df_records_categorical = pd.DataFrame(encoded_values, columns=categorical_columns)


In [213]:
# Preview the integer-coded categorical columns.
df_records_categorical.head()

Unnamed: 0,item_id,category,cup_size,user_id,fit
0,0,2,4,47557,0
1,0,2,2,28324,0
2,0,2,2,19037,0
3,0,2,5,42142,1
4,0,2,2,45350,0


In [214]:
# Join the scaled numeric features and the integer-coded categorical features
# column-wise; `df` is rebound to the resulting frame.
df = pd.concat([df_records_numeric, df_records_categorical], axis=1)
df.head(3)

Unnamed: 0,waist,size,quality,hips,bra_size,height,shoe_size,item_id,category,cup_size,user_id,fit
0,-1.027082,-0.684438,1.058979,-0.396924,-0.6357,0.306343,-0.063125,0,2,4,47557,0
1,0.936889,0.040909,-0.956397,-2.050813,0.008324,-1.1039,-0.063125,0,2,2,28324,0
2,-0.045096,-0.684438,-1.964085,-0.190188,-1.279724,0.658904,1.220767,0,2,2,19037,0


In [215]:
# Number of embeddings required for each categorical variable,
# i.e. the count of distinct codes in each encoded column.
embedding_sizes = {
    name: len(df_records_categorical[name].unique())
    for name in df_records_categorical.columns
}
for col, size in embedding_sizes.items():
    print(col, ':', size)

item_id : 1378
category : 7
cup_size : 12
user_id : 47958
fit : 3


In [216]:
# Hold out 20% of the rows, then split that hold-out in half, giving an
# approximately 80/10/10 train/validation/test partition. A fixed random_state
# is passed to both calls so the shuffling is repeatable.
df_train, df_val_test = train_test_split(df, random_state=10, test_size=0.2)
df_val, df_test = train_test_split(df_val_test, random_state=10, test_size=0.5)

In [217]:
# Write each split to its own CSV (without the index column), all sharing one filename stem.
save_file = 'data/modcloth_final_data_processed'

splits = (('train', df_train), ('valid', df_val), ('test', df_test))
for split_name, split_frame in splits:
    split_frame.to_csv(f'{save_file}_{split_name}.csv', index=False)