# Encoding Categorical Clothing Review Data

In [79]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report

In [64]:
#import data
reviews = pd.read_csv("reviews.csv")

In [65]:
# Clean up nulls
reviews.dropna(inplace=True) 

In [66]:
# print column names
print(reviews.columns)

# print .info
reviews.info()

Index(['clothing_id', 'age', 'review_title', 'review_text', 'recommended',
       'division_name', 'department_name', 'review_date', 'rating'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4169 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   clothing_id      4169 non-null   int64 
 1   age              4169 non-null   int64 
 2   review_title     4169 non-null   object
 3   review_text      4169 non-null   object
 4   recommended      4169 non-null   bool  
 5   division_name    4169 non-null   object
 6   department_name  4169 non-null   object
 7   review_date      4169 non-null   object
 8   rating           4169 non-null   object
dtypes: bool(1), int64(2), object(6)
memory usage: 297.2+ KB


## Binary Encoding: Labels

In [67]:
# look at the counts of recommended
print(reviews["recommended"].value_counts())

True     3459
False     710
Name: recommended, dtype: int64


In [68]:
#create binary dictionary + transform column

binary_dict = {True: 1, False: 0}
reviews["recommended"] = reviews["recommended"].map(binary_dict)

print(reviews["recommended"].value_counts())

1    3459
0     710
Name: recommended, dtype: int64


## Ordinal Encoding: Ratings

In [69]:
#look at the counts of rating
print(reviews["rating"].value_counts())
 

Loved it     2296
Liked it      973
Was okay      484
Not great     261
Hated it      155
Name: rating, dtype: int64


In [70]:
# Create dictionary and transform column
ratings_dict = {"Hated it": 1, "Not Great": 2, "Was okay": 3, "Liked it": 4, "Loved it": 5}
reviews["rating"] = reviews["rating"].map(ratings_dict)

# print and check
print(reviews["rating"].value_counts())

5.0    2296
4.0     973
3.0     484
1.0     155
Name: rating, dtype: int64


## One Hot Encoding: Departments

In [71]:
#get the number of categories in department feature

print(reviews["department_name"].value_counts())

Tops        1825
Dresses     1112
Bottoms      711
Intimate     307
Jackets      189
Trend         25
Name: department_name, dtype: int64


In [72]:
# encode column with ohc => add to dataset
dept_encoded = pd.get_dummies(reviews["department_name"])
reviews = reviews.join(dept_encoded)

# check results
print(dept_encoded.head())

   Bottoms  Dresses  Intimate  Jackets  Tops  Trend
0        0        1         0        0     0      0
1        0        1         0        0     0      0
2        0        0         1        0     0      0
3        0        1         0        0     0      0
4        0        1         0        0     0      0


## Date-Time Encoding: Review Date

In [73]:
#print review_date data type 
print((reviews["review_date"].dtypes))

object


In [74]:
#transform review_date to date-time data
reviews["review_date"] = pd.to_datetime(reviews["review_date"])

## Select Features(x) and Lables(y)

In [75]:
x = reviews[[col for col in reviews.columns if reviews[col].dtype == "int" or reviews[col].dtype == "uint8" or reviews[col].dtype == "int64" or reviews[col].dtype == "float"]]
x = x.set_index(x["clothing_id"])
y = reviews["recommended"] 

## Scale Data

In [None]:
x = StandardScaler().fit_transform(x)

## Split the Data => Train  & Test sets

In [80]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.99, test_size=0.01) 

# END => Data is ready to go