In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from string import punctuation

import pandas as pd

In [2]:
location = "datasets/women_clothing_review.csv"
df = pd.read_csv(location)
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [3]:
# check if any missing value:
df.count()

Unnamed: 0                 23486
Clothing ID                23486
Age                        23486
Title                      19676
Review Text                22641
Rating                     23486
Recommended IND            23486
Positive Feedback Count    23486
Division Name              23472
Department Name            23472
Class Name                 23472
dtype: int64

In [4]:
# there is missing value is Review column
# create new data frame without missing value:
new_df = df.dropna(subset = ['Review Text'])
new_df.count()

Unnamed: 0                 22641
Clothing ID                22641
Age                        22641
Title                      19675
Review Text                22641
Rating                     22641
Recommended IND            22641
Positive Feedback Count    22641
Division Name              22628
Department Name            22628
Class Name                 22628
dtype: int64

In [5]:
# create a function analyzing the "Review Text" column and calculate a sentiment value:
def reviewSentiment(review):
    
    # make text lowercase:
    review = review.lower()
    
    # tokenize the review:
    tknz_review = word_tokenize(review)
    
    # remove punctuation:
    for token in tknz_review:
        if token in punctuation:
            tknz_review.remove(token)
    
    # empty list to hold "cleaned" tokens:
    clean_tokens = []
    
    # remove filler words:
    for token in tknz_review:
        if token not in stopwords.words('english'):
            clean_tokens.append(token)
            
    # put sentence back together with remaining clean words:
    clean_review = " ".join(clean_tokens)

    # get the polarity scores dictionary:
    sid = SentimentIntensityAnalyzer()
    sid_rev = sid.polarity_scores(clean_review)
    
    # get sentiment polarity from the "compound" key in the sid_rev dictionary:
    r_comp = sid_rev['compound']
    
    # return the sentiment value:
    return r_comp

In [6]:
# create a new column to hold sentiment value from the function:
new_df['Review Sentiment'] = new_df['Review Text'].apply(reviewSentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
new_df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Review Sentiment
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,0.8991
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,0.971
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,0.9062
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,0.9464
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,0.9117


In [8]:
new_df['Review Sentiment'].min()

-0.9393

In [18]:
new_df.loc[new_df['Review Sentiment']==new_df['Review Sentiment'].min()]

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Review Sentiment
10204,10204,1076,37,I really don't have enough bad things to say...,"I'll start by saying, over the years, i get mo...",1,0,3,General,Dresses,Dresses,-0.9393


In [22]:
new_df['Review Text'][10204]

"I'll start by saying, over the years, i get more and more frustrated by the lowered quality of retailer's products. this dress represents all of my frustration. cheap, cheap cheap material. low thread count. elastic!!! around the end of the sleeves that rubs the cheap fabric against your arm. i mean, this dress is a total disaster - and looks nothing like the photograph shown."

In [23]:
new_df.loc[new_df['Review Sentiment']==new_df['Review Sentiment'].max()]

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Review Sentiment
20043,20043,872,83,Fantastic & fun,I saw this top online and fell in love with th...,5,1,0,General Petite,Tops,Knits,0.9952


In [24]:
new_df['Review Text'][20043]

'I saw this top online and fell in love with the overall casual look.\r\ni ordered all 3 colors, and they just arrived........love love love !!!\r\nthey are soft, comfortable and a have a very flattering casual fit.\r\nthe understated ruffled details add the special "something" that is so retailer.\r\nthey are sophisticated yet pretty.\r\nthe orange is more vibrant in person and is a beautiful shade.\r\nthe plum/purple is a rich, unique and pretty color.\r\nthe navy is a perfect "denim" color.\r\na fantastic basic'