# Cleaning Product Reviews Dataset

In [1]:
import pandas as pd

## Inspect

In [2]:
# Load the raw review file into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer="flipkart_product.csv")

In [3]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,ProductName,Price,Rate,Review,Summary
0,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",5,Super!,Great cooler.. excellent air flow and for this...
1,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",5,Awesome,Best budget 2 fit cooler. Nice cooling
2,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",3,Fair,The quality is good but the power of air is de...
3,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",1,Useless product,Very bad product it's a only a fan
4,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",3,Fair,Ok ok product


In [4]:
# Show how often each distinct value occurs, one column at a time,
# with a ruler line printed after every column.
for column_name, column_values in df.items():
    print(column_name)
    display(column_values.value_counts())
    print("_" * 150)

ProductName


MILTON Thermosteel Flip Lid 500 ml FlaskÂ Â (Pack of 1, Silver, Steel)                                                     9278
cello Pack of 18 Opalware Cello Dazzle Lush Fiesta Opalware Dinner Set, 18 Pieces Dinner SetÂ Â (White, Microwave Safe)    8870
Home Sizzler 153 cm (5.02 ft) Polyester Room Darkening Window Curtain (Pack Of 2)Â Â (Floral, Brown)                       4350
CMerchants Multi Organiser BLue-4 Book Shelf Metal Open Book ShelfÂ Â (Finish Color - BLUE, DIY(Do-It-Yourself))           2399
Kadio Analog 20 cm X 20 cm Wall ClockÂ Â (Beige, With Glass, Standard)                                                     2380
                                                                                                                           ... 
38024PP25 Minimalists Analog Watch  - For Men                                                                                 9
LS2727 Avatar Day and Date Functioning Crocodile Strap Analog Analog Watch  - For Men                   

______________________________________________________________________________________________________________________________________________________
Price


â¹1,299                                                       10005
â¹648                                                          9278
â¹299                                                          6697
â¹339                                                          2542
â¹13,999                                                       2518
                                                               ...  
??8,499                                                          10
â¹404                                                             5
Nova Plus Amaze NI 10 1100 W Dry Iron?ÿ?ÿ(Grey & Turquoise)        1
Pigeon Favourite Electric Kettle?ÿ?ÿ(1.5 L, Silver, Black)         1
Bajaj DX 2 L/W Dry Iron                                            1
Name: Price, Length: 561, dtype: int64

______________________________________________________________________________________________________________________________________________________
Rate


5                                                              108694
4                                                               39653
1                                                               19607
3                                                               15681
2                                                                6234
Pigeon Favourite Electric Kettle?ÿ?ÿ(1.5 L, Silver, Black)          1
Bajaj DX 2 L/W Dry Iron                                             1
Nova Plus Amaze NI 10 1100 W Dry Iron?ÿ?ÿ(Grey & Turquoise)         1
s                                                                   1
Name: Rate, dtype: int64

______________________________________________________________________________________________________________________________________________________
Review


Nan                                                            20862
Wonderful                                                       8918
Awesome                                                         5506
Worth every penny                                               5464
Classy product                                                  5437
                                                               ...  
Bajaj DX 2 L/W Dry Iron                                            1
Nova Plus Amaze NI 10 1100 W Dry Iron?ÿ?ÿ(Grey & Turquoise)        1
Great product.....with this price range                            1
best juicer                                                        1
Well!!!                                                            1
Name: Review, Length: 1264, dtype: int64

______________________________________________________________________________________________________________________________________________________
Summary


Good                                                                           14175
Nice                                                                            7859
Good product                                                                    5081
Nice product                                                                    3637
Super                                                                           2643
                                                                               ...  
Very bad don't buy cheap quality                                                   1
Super â¤ï¸ð                                                                   1
Too bad the clock is not working  properly even one day also don't buy this        1
Fantastic wall clock really love itð                                            1
Thanks â¤ï¸                                                                      1
Name: Summary, Length: 98343, dtype: int64

______________________________________________________________________________________________________________________________________________________


In [5]:
# Show the dtype pandas assigned to each column.
df.dtypes

ProductName    object
Price          object
Rate           object
Review         object
Summary        object
dtype: object

In [6]:
# Count the missing entries in each column.
df.isnull().sum()

ProductName     0
Price           1
Rate            1
Review          4
Summary        14
dtype: int64

In [7]:
# Frequency of each distinct value found in the Rate column.
df["Rate"].value_counts()

5                                                              108694
4                                                               39653
1                                                               19607
3                                                               15681
2                                                                6234
Pigeon Favourite Electric Kettle?ÿ?ÿ(1.5 L, Silver, Black)          1
Bajaj DX 2 L/W Dry Iron                                             1
Nova Plus Amaze NI 10 1100 W Dry Iron?ÿ?ÿ(Grey & Turquoise)         1
s                                                                   1
Name: Rate, dtype: int64

## Cleaning
- Handle missing values
- Enforce proper data types
- Remove rows whose values are shifted into the wrong columns

In [5]:
# Drop every row that has at least one missing value. The result is rebound
# to `df` rather than using inplace=True, so the step avoids in-place
# mutation and can be chained with other DataFrame methods.
df = df.dropna()

In [6]:
def is_valid_rating(rate) -> bool:
    '''Determine whether a rating can be interpreted as an integer.

    Parameters
    ----------
    rate
        The raw value to check.

    Returns
    -------
    bool
        True if ``int(rate)`` succeeds, False if the conversion is rejected.
    '''
    try:
        int(rate)
    except (TypeError, ValueError, OverflowError):
        # TypeError: e.g. None; ValueError: e.g. non-numeric text or NaN;
        # OverflowError: infinity. Anything else (such as KeyboardInterrupt)
        # is deliberately left to propagate instead of being swallowed.
        return False
    return True
    
# Keep only rows whose rating parses as an integer. The trailing .copy()
# makes the result an independent frame, so column assignments made on `df`
# afterwards do not raise SettingWithCopyWarning.
df = df[df.Rate.apply(is_valid_rating)].copy()

Cast data types

In [7]:
# Cast all columns in a single astype call. It returns a new DataFrame, so no
# column is assigned onto a frame that came from a boolean filter (which can
# raise SettingWithCopyWarning).
df = df.astype({"ProductName": str, "Review": str, "Summary": str, "Rate": int})

Add sentiment column
- Review classified as 'negative' if rating is 1 or 2 stars
- Review classified as 'neutral' if rating is 3 stars
- Review classified as 'positive' if rating is 4 or 5 stars

In [8]:
# Begin with every review labelled 'neutral' (the label that 3-star rows keep).
# assign() returns a new DataFrame, so the .loc updates below write to an
# independent frame rather than to a filtered slice.
df = df.assign(sentiment='neutral')
df.loc[df.Rate < 3, 'sentiment'] = 'negative'  # below 3 stars
df.loc[df.Rate > 3, 'sentiment'] = 'positive'  # above 3 stars

Filter out products that have fewer than 30 reviews, since that is too little data for topic modeling analysis.

In [9]:
# Number of reviews each product has.
review_counts = df["ProductName"].value_counts()
# Names of the products with at least 30 reviews.
products_to_keep = review_counts.loc[review_counts.ge(30)].index.tolist()
# Restrict the frame to reviews of those products.
df = df.loc[df["ProductName"].isin(products_to_keep)]

Save data to csv file

In [10]:
# Write the cleaned frame to disk, leaving the row index out of the file.
df.to_csv(path_or_buf="cleaned_product_reviews.csv", index=False)