# Notebook for calculating sentiment, readability and length

In [2]:
import pandas as pd
import numpy as np

In [4]:
import gzip
import csv
import string
import math
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from datetime import datetime
%matplotlib inline

In [5]:
#read csv

fields = ['Product_Id','Gender','Helpfulness','Review','Overall_Rating','Timestamp']
df_male = pd.read_csv('/media/backup/Data/Amazon/amazon_male.csv', sep='|',encoding='utf8',quoting=csv.QUOTE_NONE,usecols=fields)

In [6]:
not_punctuation = lambda w: not (len(w)==1 and (not w.isalpha()))
#get_word_count = lambda text: len(list(filter(not_punctuation, word_tokenize(text))))
get_sent_count = lambda text: len(sent_tokenize(text))

In [49]:
TOKENIZER = RegexpTokenizer('(?u)\W+|\$[\d\.]+|\S+')
SPECIAL_CHARS = ['.', ',', '!', '?']

def get_words(text=''):
    words = []
    words = TOKENIZER.tokenize(text)
    filtered_words = []
    for word in words:
        if word in SPECIAL_CHARS or word == " ":
            pass
        else:
            new_word = word.replace(",","").replace(".","")
            new_word = new_word.replace("!","").replace("?","")
            filtered_words.append(new_word)
    return filtered_words

In [8]:
nltk.download('cmudict')
prondict = cmudict.dict()

[nltk_data] Downloading package cmudict to /home/rachneet/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [9]:
def numsyllables(word):
    try:
        return [len(list(y for y in x if (y[-1]).isdigit())) for x in prondict[word.lower()]]
    except KeyError:
        return [0]

In [10]:
def text_statistics(text):
    word_count = len(get_words(text))
    sent_count = get_sent_count(text)
    #if more than one pronunciation, take the largest no. of syllables
    syllable_count = sum(map(lambda w: max(numsyllables(w)), word_tokenize(text)))
    
    analyzedVars = {
        'word_cnt': float(word_count),
        'sentence_cnt': float(sent_count),
        'syllable_cnt': float(syllable_count),
    }
    
    return analyzedVars['word_cnt'],analyzedVars['sentence_cnt'], analyzedVars['syllable_cnt']

In [11]:
#Flesch Kincaid measure of readability

#readability ease
flesch_formula = lambda word_count, sent_count, syllable_count : 206.835 - 1.015*word_count/sent_count - 84.6*syllable_count/word_count

def flesch(text):
    word_count, sent_count, syllable_count = text_statistics(text)
    #print(word_count,sent_count,syllable_count)
    score = 0.0
    if word_count > 0.0:
        score = round(flesch_formula(word_count, sent_count, syllable_count))
    return score

#grade level
fk_formula = lambda word_count, sent_count, syllable_count : 0.39 * word_count / sent_count + 11.8 * syllable_count / word_count - 15.59

def flesch_kincaid(text):
    word_count, sent_count, syllable_count = text_statistics(text)
    score = 0.0
    if word_count > 0.0:
        score = round(fk_formula(word_count, sent_count, syllable_count))
    return score

In [16]:
def length(text):
    word_count, sent_count, syllable_count = text_statistics(text)
    return word_count

In [12]:
df_male_sample = df_male.sample(1000)

In [13]:
df_male_sample.head()

Unnamed: 0,Product_Id,Gender,Helpfulness,Review,Overall_Rating,Timestamp
7844827,B000FFJ85I,male,44,...if you are expecting AFI to cut back to the...,3.0,1150243200
6898881,B001VNB56I,male,45,This is not a review of the album per se. Let...,4.0,1242432000
6163747,B003VNCROU,male,13,I thought Dragon would be a great way for my 5...,3.0,1289260800
4147729,B000B7TU5S,male,22,Space Trilogy (C. S. Lewis)C. S. Lewis's serie...,5.0,1191628800
2110180,B002N2KJ7W,male,23,I had knee surgery a few years ago and riding ...,5.0,1358985600


In [14]:
df_male_sample['Grade_level'] = df_male_sample['Review'].apply(lambda x: flesch_kincaid(x))

In [15]:
df_male_sample.head()

Unnamed: 0,Product_Id,Gender,Helpfulness,Review,Overall_Rating,Timestamp,Grade_level
7844827,B000FFJ85I,male,44,...if you are expecting AFI to cut back to the...,3.0,1150243200,11
6898881,B001VNB56I,male,45,This is not a review of the album per se. Let...,4.0,1242432000,8
6163747,B003VNCROU,male,13,I thought Dragon would be a great way for my 5...,3.0,1289260800,13
4147729,B000B7TU5S,male,22,Space Trilogy (C. S. Lewis)C. S. Lewis's serie...,5.0,1191628800,10
2110180,B002N2KJ7W,male,23,I had knee surgery a few years ago and riding ...,5.0,1358985600,5


In [17]:
df_male_sample['length'] = df_male_sample['Review'].apply(lambda x: length(x))

In [18]:
df_male_sample.head()

Unnamed: 0,Product_Id,Gender,Helpfulness,Review,Overall_Rating,Timestamp,Grade_level,length
7844827,B000FFJ85I,male,44,...if you are expecting AFI to cut back to the...,3.0,1150243200,11,161.0
6898881,B001VNB56I,male,45,This is not a review of the album per se. Let...,4.0,1242432000,8,165.0
6163747,B003VNCROU,male,13,I thought Dragon would be a great way for my 5...,3.0,1289260800,13,158.0
4147729,B000B7TU5S,male,22,Space Trilogy (C. S. Lewis)C. S. Lewis's serie...,5.0,1191628800,10,1079.0
2110180,B002N2KJ7W,male,23,I had knee surgery a few years ago and riding ...,5.0,1358985600,5,20.0


In [21]:
df_male_sample.mean()

Overall_Rating    4.082000e+00
Timestamp         1.318792e+09
Grade_level       7.232000e+00
length            9.690200e+01
dtype: float64

In [29]:
df_male_sample[df_male_sample['Product_Id']=='B002N2KJ7W']['Review']

2110180    I had knee surgery a few years ago and riding my bike was very painful. This item solved my problem.
Name: Review, dtype: object

In [27]:
pd.set_option('display.max_colwidth', -1)

In [30]:
from  nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

nltk.download('vader_lexicon')

sia = SIA()
#results = []

# def sentiment_analyser(rev):
#     pol_score = sia.polarity_scores(rev)
#     pol_score['Review'] = rev
#     results.append(pol_score)
#     return results


df_male_sample['Sentiment'] = df_male_sample['Review'].apply(lambda x : sia.polarity_scores(x))



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/rachneet/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [32]:
words = stopwords.words("english")
#remove punctuation for each word
#maketrans() method returns a translation table that maps each character in the 
#intab string into the character at the same position in the outtab string
table = str.maketrans('','', string.punctuation)

In [40]:
def sentiment(text):
    cleaned_text = " ".join([i.translate(table) for i in text.split() if i.isalpha() if i not in words]).lower()
    return sia.polarity_scores(cleaned_text)

In [46]:
df_male = df_male.dropna()

In [41]:
df_male_sample['Sentiment'] = df_male_sample['Review'].apply(lambda x: sentiment(x))

In [47]:
df_male.count()

Product_Id        16945315
Gender            16945315
Helpfulness       16945315
Review            16945315
Overall_Rating    16945315
Timestamp         16945315
dtype: int64

In [50]:
df_male['Grade_level'] = df_male['Review'].apply(lambda x: flesch_kincaid(x))

In [51]:
import gc

In [52]:
df_male['length'] = df_male['Review'].apply(lambda x: length(x))
df_male['Sentiment'] = df_male['Review'].apply(lambda x: sentiment(x))

In [53]:
df_male.to_csv('male_l_s_r.csv',sep='|')

In [55]:
df_male.count()

Product_Id        16945315
Gender            16945315
Helpfulness       16945315
Review            16945315
Overall_Rating    16945315
Timestamp         16945315
Grade_level       16945315
length            16945315
Sentiment         16945315
dtype: int64

In [56]:
del df_male
gc.collect()

14

In [57]:
df_female = pd.read_csv('/media/backup/Data/Amazon/amazon_female.csv', sep='|',encoding='utf8',quoting=csv.QUOTE_NONE,usecols=fields)
df_female = df_female.dropna()
print('loaded dataset to memory')
df_female['Grade_level'] = df_female['Review'].apply(lambda x: flesch_kincaid(x))
print('calculated grade-level')
df_female['length'] = df_female['Review'].apply(lambda x: length(x))
print('calculated length')
df_female['Sentiment'] = df_female['Review'].apply(lambda x: sentiment(x))
print('calculated sentiment')

loaded dataset to memory
calculated grade-level
calculated length
calculated sentiment


In [58]:
df_female.to_csv('female_l_s_r.csv',sep='|')

In [59]:
df_female.count()

Product_Id        15765700
Gender            15765700
Helpfulness       15765700
Review            15765700
Overall_Rating    15765700
Timestamp         15765700
Grade_level       15765700
length            15765700
Sentiment         15765700
dtype: int64