In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [27]:
# import data
reviews = pd.read_csv('reviews.csv')

In [28]:
# print dataframe info
print(reviews.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   clothing_id      5000 non-null   int64 
 1   age              5000 non-null   int64 
 2   review_title     4174 non-null   object
 3   review_text      4804 non-null   object
 4   recommended      5000 non-null   bool  
 5   division_name    4996 non-null   object
 6   department_name  4996 non-null   object
 7   review_date      5000 non-null   object
 8   rating           5000 non-null   object
dtypes: bool(1), int64(2), object(6)
memory usage: 317.5+ KB
None


In [29]:
# print dataframe head
print(reviews.head())

   clothing_id  age                review_title  \
0         1095   39  Cute,looks like a dress on   
1         1095   28       So cute, great print!   
2          699   37              So flattering!   
3         1072   36                  Effortless   
4         1094   32              You need this!   

                                         review_text  recommended  \
0  If you are afraid of the jumpsuit trend but li...         True   
1  I love fitted top dresses like this but i find...         True   
2  I love these cozy, fashionable leggings. they ...         True   
3  Another reviewer said it best, "i love the way...         True   
4  Rompers are my fav so i'm biased writing this ...         True   

    division_name department_name review_date    rating  
0         General         Dresses  2019-07-08  Liked it  
1         General         Dresses  2019-05-17  Loved it  
2       Initmates        Intimate  2019-06-24  Loved it  
3  General Petite         Dresses  2019-12-06 


##### Transform the 'recommended' feature

In [30]:
print(reviews['recommended'].value_counts())


True     4166
False     834
Name: recommended, dtype: int64



Since this is a True/False feature, we want to transform it to 1 for True and 0 for False.

In [31]:
binary_dict = {True:0 , False:1}
reviews['recomended'] = reviews['recommended'].map(binary_dict)

##### Transform the 'rating' feature


We want to make the following changes to the values:

‘Loved it’ → 5  
‘Liked it’ → 4  
‘Was okay’ → 3  
‘Not great’ → 2  
‘Hated it’ → 1

In [32]:
rating_dict = {'Loved it':5 , 'Liked it':4, 'Was okay':3, 'Not great':2, 'Hated it':1}
reviews['rating'] = reviews['rating'].map(rating_dict)


##### One-hot enconde the 'department_name' feature

In [33]:
print(reviews['department_name'].value_counts())


Tops        2196
Dresses     1322
Bottoms      848
Intimate     378
Jackets      224
Trend         28
Name: department_name, dtype: int64


In [34]:
one_hot = pd.get_dummies(reviews['department_name'])
print(one_hot)


      Bottoms  Dresses  Intimate  Jackets  Tops  Trend
0           0        1         0        0     0      0
1           0        1         0        0     0      0
2           0        0         1        0     0      0
3           0        1         0        0     0      0
4           0        1         0        0     0      0
...       ...      ...       ...      ...   ...    ...
4995        0        0         0        0     1      0
4996        0        0         0        0     1      0
4997        0        1         0        0     0      0
4998        1        0         0        0     0      0
4999        0        0         0        0     1      0

[5000 rows x 6 columns]


In [35]:
reviews = reviews.join(one_hot)
print(reviews.columns)


Index(['clothing_id', 'age', 'review_title', 'review_text', 'recommended',
       'division_name', 'department_name', 'review_date', 'rating',
       'recomended', 'Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops',
       'Trend'],
      dtype='object')


##### Transform 'review_date' feature type

In [36]:
reviews['review_date'] = pd.to_datetime(reviews['review_date'])
 
print(reviews['review_date'].dtype)

datetime64[ns]


##### Scaling data
We notice that we have a wide range of numbers thus far, so it is best to put everything on the same scale.

In [37]:
reviews = reviews[['clothing_id', 'age', 'recommended', 'rating', 'Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend']].copy()
reviews = reviews.set_index('clothing_id')

scaler = StandardScaler()
scaler.fit_transform(reviews)


array([[-0.34814459,  0.44742824, -0.1896478 , ..., -0.21656679,
        -0.88496718, -0.07504356],
       [-1.24475223,  0.44742824,  0.71602461, ..., -0.21656679,
        -0.88496718, -0.07504356],
       [-0.51116416,  0.44742824,  0.71602461, ..., -0.21656679,
        -0.88496718, -0.07504356],
       ...,
       [-0.59267395,  0.44742824,  0.71602461, ..., -0.21656679,
        -0.88496718, -0.07504356],
       [-1.24475223,  0.44742824,  0.71602461, ..., -0.21656679,
        -0.88496718, -0.07504356],
       [ 1.68960003,  0.44742824,  0.71602461, ..., -0.21656679,
         1.12998541, -0.07504356]])