# Cleaning Product Reviews Dataset

In [2]:
import pandas as pd

## Inspect

In [7]:
# Load the review sample from a CSV file; the relative path is resolved against
# the notebook's current working directory.
df = pd.read_csv("flipkart_sample_reviews.csv")

In [8]:
# For every column: print its name, show how often each value occurs,
# then print a divider line so the per-column outputs are easy to tell apart.
divider = "_" * 150
for column_name in df.columns:
    print(column_name)
    display(df[column_name].value_counts())
    print(divider)

ProductName


MILTON Thermosteel Flip Lid 500 ml FlaskÂ Â (Pack of 1, Silver, Steel)    9278
Name: ProductName, dtype: int64

______________________________________________________________________________________________________________________________________________________
Price


â¹648    9278
Name: Price, dtype: int64

______________________________________________________________________________________________________________________________________________________
Rate


5    6103
4    2094
3     584
1     340
2     157
Name: Rate, dtype: int64

______________________________________________________________________________________________________________________________________________________
Review


Wonderful                                       526
Must buy!                                       370
Terrific                                        369
Best in the market!                             360
Perfect product!                                353
                                               ... 
This is the real thermos                          1
Go for it !!                                      1
happy with the product and flipcart service.      1
great buy                                         1
GOOD PRODUCT                                      1
Name: Review, Length: 71, dtype: int64

______________________________________________________________________________________________________________________________________________________
Summary


Good                                                                                                                                                                                                                                                                           1067
Nice                                                                                                                                                                                                                                                                            504
Good product                                                                                                                                                                                                                                                                    464
Nice product                                                                                                                                                                

______________________________________________________________________________________________________________________________________________________


In [9]:
# Show the dtype pandas inferred for each column when the CSV was read.
df.dtypes

ProductName    object
Price          object
Rate            int64
Review         object
Summary        object
dtype: object

In [10]:
# Count the missing values in each column.
df.isna().sum()

ProductName    0
Price          0
Rate           0
Review         0
Summary        0
dtype: int64

In [11]:
# Distribution of star ratings: number of reviews per distinct Rate value.
df["Rate"].value_counts()

5    6103
4    2094
3     584
1     340
2     157
Name: Rate, dtype: int64

## Cleaning
- Handle missing values
- Enforce proper data types
- Remove rows with column shift

In [18]:
# Drop every row that has a missing value in any column.
# Reassigning the result (instead of inplace=True) avoids mutating the frame
# behind earlier cells' backs and never operates in place on a filtered slice.
df = df.dropna()

In [19]:
def is_valid_rating(rate) -> bool:
    '''Determine whether a rating value can be converted with ``int()``.

    Args:
        rate: Raw rating value of any type.

    Returns:
        True if ``int(rate)`` succeeds; False if it raises ``TypeError``,
        ``ValueError`` or ``OverflowError``.
    '''
    try:
        int(rate)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or an unsupported type; ValueError: non-numeric text
        # or NaN; OverflowError: +/- infinity. Anything else (for example
        # KeyboardInterrupt) is deliberately left to propagate.
        return False
    return True
    
# Only keep rows where the rating is valid. The explicit .copy() makes the result
# an independent DataFrame, so assigning columns on it afterwards does not raise
# SettingWithCopyWarning when the filter actually removed rows.
df = df[df.Rate.apply(is_valid_rating)].copy()

Cast data types

In [20]:
# Cast all columns in one astype call. A column -> dtype mapping keeps the casts
# together and avoids attribute-style column assignment, which silently creates a
# plain attribute when the column name is misspelled.
df = df.astype({
    "ProductName": str,
    "Review": str,
    "Summary": str,
    "Rate": int,
})

Add sentiment column
- Review classified as 'negative' if rating is 1 or 2 stars
- Review classified as 'neutral' if rating is 3 stars
- Review classified as 'positive' if rating is 4 or 5 stars

In [21]:
# Build the rating masks first, then label every row: start from 'neutral' and
# overwrite the rows rated below 3 or above 3.
is_negative = df["Rate"] < 3
is_positive = df["Rate"] > 3
df["sentiment"] = "neutral"
df.loc[is_negative, "sentiment"] = "negative"
df.loc[is_positive, "sentiment"] = "positive"

Filter out products that have fewer than 30 reviews, since that is not enough data to work with for topic modeling analysis.

In [22]:
# Count reviews per product, pick the products with at least 30 reviews,
# and keep only the rows that belong to those products.
review_counts = df["ProductName"].value_counts()
has_enough_reviews = review_counts >= 30
products_to_keep = review_counts[has_enough_reviews].index.to_list()
keep_row = df["ProductName"].isin(products_to_keep)
df = df.loc[keep_row, :]

Save data to csv file

In [23]:
# Alternative output file name, currently disabled:
# df.to_csv("cleaned_product_reviews.csv", index=False)
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
df.to_csv("cleaned_sample.csv", index=False)