In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [3]:
reviews = pd.read_csv('reviews.csv')
print(reviews.head(5))

   clothing_id  age                review_title  \
0         1095   39  Cute,looks like a dress on   
1         1095   28       So cute, great print!   
2          699   37              So flattering!   
3         1072   36                  Effortless   
4         1094   32              You need this!   

                                         review_text  recommended  \
0  If you are afraid of the jumpsuit trend but li...         True   
1  I love fitted top dresses like this but i find...         True   
2  I love these cozy, fashionable leggings. they ...         True   
3  Another reviewer said it best, "i love the way...         True   
4  Rompers are my fav so i'm biased writing this ...         True   

    division_name department_name review_date    rating  
0         General         Dresses  2019-07-08  Liked it  
1         General         Dresses  2019-05-17  Loved it  
2       Initmates        Intimate  2019-06-24  Loved it  
3  General Petite         Dresses  2019-12-06 

In [5]:
print(reviews.columns)

Index(['clothing_id', 'age', 'review_title', 'review_text', 'recommended',
       'division_name', 'department_name', 'review_date', 'rating'],
      dtype='object')


In [6]:
print(reviews.info)

<bound method DataFrame.info of       clothing_id  age                                  review_title  \
0            1095   39                    Cute,looks like a dress on   
1            1095   28                         So cute, great print!   
2             699   37                                So flattering!   
3            1072   36                                    Effortless   
4            1094   32                                You need this!   
...           ...  ...                                           ...   
4995          918   38                                Unique sweater   
4996          950   33             The brown/gray version is cropped   
4997         1086   36                                           NaN   
4998         1033   28  If you have a big booty, get these jeans now   
4999          850   64                          Makes me feel pretty   

                                            review_text  recommended  \
0     If you are afraid of the 

# Data transformation
### Transforming recommended column into binary values.

In [10]:
print(reviews['recommended'].value_counts())

True     4166
False     834
Name: recommended, dtype: int64


In [12]:
binary_dict = {True:1, False:0}

In [14]:
reviews.recommended = reviews.recommended.map(binary_dict)

In [15]:
print(reviews.recommended.value_counts())

Series([], Name: recommended, dtype: int64)


## Transforming rating feature into several number categories.

In [16]:
print(reviews['rating'].value_counts())

Loved it     2798
Liked it     1141
Was okay      564
Not great     304
Hated it      193
Name: rating, dtype: int64


In [24]:
rating_dict = {'Loved it': 5, 'Liked it': 4, 'Was okay': 3, 'Not great': 2, 'Hated it': 1}

{'Loved it': 5, 'Liked it': 4, 'Was okay': 3, 'Not great': 2, 'Hated it': 1}

In [25]:
reviews.rating = reviews.rating.map(rating_dict)
print(reviews.rating.value_counts())

Series([], Name: rating, dtype: int64)


## Transforming Department feature into one-hot-encoding.

In [26]:
print(reviews['department_name'].value_counts())

Tops        2196
Dresses     1322
Bottoms      848
Intimate     378
Jackets      224
Trend         28
Name: department_name, dtype: int64


In [29]:
one_hot = pd.get_dummies(reviews['department_name'])
print(one_hot)

      Bottoms  Dresses  Intimate  Jackets  Tops  Trend
0           0        1         0        0     0      0
1           0        1         0        0     0      0
2           0        0         1        0     0      0
3           0        1         0        0     0      0
4           0        1         0        0     0      0
...       ...      ...       ...      ...   ...    ...
4995        0        0         0        0     1      0
4996        0        0         0        0     1      0
4997        0        1         0        0     0      0
4998        1        0         0        0     0      0
4999        0        0         0        0     1      0

[5000 rows x 6 columns]


### Joinning one-hot_encoding into reviews.

In [33]:
reviews = reviews.join(one_hot)

ValueError: columns overlap but no suffix specified: Index(['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'], dtype='object')

### Transforming review_date into date-time feature.

In [37]:
reviews.review_date = pd.to_datetime(reviews.review_date)
print(reviews.review_date)
print(reviews.review_date.dtype)

0      2019-07-08
1      2019-05-17
2      2019-06-24
3      2019-12-06
4      2019-10-04
          ...    
4995   2019-05-26
4996   2019-10-21
4997   2019-10-18
4998   2019-11-24
4999   2019-10-31
Name: review_date, Length: 5000, dtype: datetime64[ns]
datetime64[ns]


### Scaling data into one scale.

In [44]:
#getting numerical columns.
reviews = reviews[['clothing_id', 'age', 'recommended', 'rating', 'Bottoms','Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend']].copy()

#resetting the index
reviews = reviews.set_index(['clothing_id'])
reviews

KeyError: "['clothing_id'] not in index"

In [46]:
# Instantiating standard scaler
scaler = StandardScaler()
scaled_revs = scaler.fit_transform(reviews)
print(scaled_revs)

[[-0.34814459         nan         nan ... -0.21656679 -0.88496718
  -0.07504356]
 [-1.24475223         nan         nan ... -0.21656679 -0.88496718
  -0.07504356]
 [-0.51116416         nan         nan ... -0.21656679 -0.88496718
  -0.07504356]
 ...
 [-0.59267395         nan         nan ... -0.21656679 -0.88496718
  -0.07504356]
 [-1.24475223         nan         nan ... -0.21656679 -0.88496718
  -0.07504356]
 [ 1.68960003         nan         nan ... -0.21656679  1.12998541
  -0.07504356]]


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction ** 2 / new_sample_count
