In [109]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD

# Data Description
The dataset is obtained from Kaggle (https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews)

- **Clothing ID**: Integer Categorical variable that refers to the specific piece being reviewed.
- **Age**: Positive Integer variable of the reviewer's age.
- **Title**: String variable for the title of the review.
- **Review Text**: String variable for the review body. The company name is replaced by the word 'retailer'.
- **Rating**: Positive Ordinal Integer variable for the product score granted by the customer from 1 Worst, to 5 Best.
- **Recommended IND**: Binary variable stating whether the customer recommends the product, where 1 is recommended and 0 is not recommended.
- **Positive Feedback Count**: Positive Integer documenting the number of other customers who found this review positive.
- **Division Name**: Categorical name of the product high level division.
- **Department Name**: Categorical name of the product department name.
- **Class Name**: Categorical name of the product class name.

### To-do list
- load dataset
- EDA
- Perform cleaning

# Load the dataset

In [79]:
# Read the raw reviews CSV (expected in the working directory) into `df`
# and preview the first five rows.
csv_path = 'Womens Clothing E-Commerce Reviews.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


# EDA

In [80]:
# Display the column labels of the freshly loaded frame.
df.columns

Index(['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name'],
      dtype='object')

In [81]:
# (rows, columns) of the raw frame; the recorded output is 23486 rows and 11 columns.
df.shape

(23486, 11)

In [82]:
# Discard 'Unnamed: 0' and 'Clothing ID' (not considered useful for this
# analysis), then remove the spaces from the remaining column names,
# e.g. 'Review Text' -> 'ReviewText'.
df = (
    df
    .drop(columns=['Unnamed: 0', 'Clothing ID'])
    .rename(columns=lambda name: name.replace(' ', ''))
)

In [83]:
# Per-column dtypes and non-null counts.
# Title, ReviewText, DivisionName, DepartmentName and ClassName contain NaN.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 9 columns):
Age                      23486 non-null int64
Title                    19676 non-null object
ReviewText               22641 non-null object
Rating                   23486 non-null int64
RecommendedIND           23486 non-null int64
PositiveFeedbackCount    23486 non-null int64
DivisionName             23472 non-null object
DepartmentName           23472 non-null object
ClassName                23472 non-null object
dtypes: int64(4), object(5)
memory usage: 1.6+ MB


### How many NAs we have?
Columns Name  | Amount of NAs | % of dataset | 
------------- | ------------- | -------------|
Title  | 3810  | ~16% | 
ReviewText  | 845  | ~3.5% | 
DivisionName  | 14  | ~0.05% | 
DepartmentName  | 14  | ~0.05% | 
ClassName  | 14  | ~0.05% | 

- The NAs in **DivisionName**, **DepartmentName** and **ClassName** are in the same rows; since they make up only ~0.05% of the dataset, I will drop them.
- For **ReviewText**, since we are performing our NLP primarily on that column and we can't perform NLP without any text to analyze, I will drop them.
- For **Title**, since it is text and I am doing NLP, I will probably eventually need to perform NLP on that column and combine it with the ReviewText column to see if there are any meaningful unsupervised learning results. There are 3810 NAs (~16% of my whole dataset) and I don't really want to drop all of them. So my solution is to create a new column called CombinedText, which combines the Title and ReviewText columns and makes the Title act like the first sentence of the review.


### Clean the NAs

In [84]:
# Dropping NAs for 4 columns: keep only the rows in which the review body
# and all three product-category labels are present.
subset = ['ReviewText', 'DivisionName', 'DepartmentName', 'ClassName']
df = df[df[subset].notna().all(axis=1)]

print('Now length of df is: ', len(df))

Now length of df is:  22628


In [85]:
# Build one text field per review: the title (when present) followed by the body.
#
# Missing titles are replaced with an empty string so that the concatenation
# below never produces NaN (NaN + str is NaN in pandas).
# The result is assigned to a new Series rather than calling
# `df.Title.fillna('', inplace=True)`: an in-place fillna on a column selected
# from the frame is chained assignment, which may update a temporary object
# instead of `df` and does nothing under pandas Copy-on-Write.
title_filled = df['Title'].fillna('')

# create a new column named CombinedText with Title and ReviewText.
# Reviews without a title keep a single leading space before the body.
df = df.assign(CombinedText=title_filled + ' ' + df['ReviewText'])

# drop the Title column; its content now lives in CombinedText.
df = df.drop(columns='Title')

In [88]:
# Summary statistics (count, mean, std, min, quartiles, max) for the 4 numeric columns.
df.describe()

Unnamed: 0,Age,Rating,RecommendedIND,PositiveFeedbackCount
count,22628.0,22628.0,22628.0,22628.0
mean,43.28288,4.183092,0.818764,2.631784
std,12.328176,1.115911,0.385222,5.78752
min,18.0,1.0,0.0,0.0
25%,34.0,4.0,1.0,0.0
50%,41.0,5.0,1.0,1.0
75%,52.0,5.0,1.0,3.0
max,99.0,5.0,1.0,122.0


In [89]:
# There are no more NAs: every column reports the same non-null count as the row count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22628 entries, 0 to 23485
Data columns (total 9 columns):
Age                      22628 non-null int64
ReviewText               22628 non-null object
Rating                   22628 non-null int64
RecommendedIND           22628 non-null int64
PositiveFeedbackCount    22628 non-null int64
DivisionName             22628 non-null object
DepartmentName           22628 non-null object
ClassName                22628 non-null object
CombinedText             22628 non-null object
dtypes: int64(4), object(5)
memory usage: 1.7+ MB


### Export as Pickle

In [129]:
# Write the cleaned frame to 'cleaned_df.pkl' (path relative to the working directory).
# NOTE: pickle files should only ever be loaded back from a trusted source.
df.to_pickle('cleaned_df.pkl')