In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

# progress bar
from tqdm.notebook import tqdm

# nltk tools for vader
# lexicon is vaders bag of words
# need to run it only once
# nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/edwige/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## environment and df prep

In [2]:
# checking out a clean style we don't have to use it in final presentations
plt.style.use('ggplot')

In [3]:
# importing test csv from jons scraping. 
# Change this path in pipeline
df = pd.read_csv('scraper_gmaps/data/newest_gm_reviews.csv')

In [4]:
 # df.head(20)

In [5]:
# Keep only the review text ('caption') and its star rating.
df = df.loc[:, ['caption', 'rating']]

In [6]:
df

Unnamed: 0,caption,rating
0,,5.0
1,,5.0
2,Another review my daughter and friends love ...,4.0
3,,4.0
4,Can’t get your order right for nothing and the...,1.0
...,...,...
10745,,4.0
10746,Kinda pricey but great food n service,5.0
10747,Perfect. Great food.,5.0
10748,I love it! There is a wide variety of food sol...,5.0


In [7]:
# Discard rows whose review text is missing (rating-only reviews).
df = df[df['caption'].notna()]

In [8]:
df.head()

Unnamed: 0,caption,rating
2,Another review my daughter and friends love ...,4.0
4,Can’t get your order right for nothing and the...,1.0
5,We always buy when we leave early we have a co...,5.0
6,Great,4.0
7,Poor customer service by manager,1.0


In [9]:
df.shape

(6556, 2)

In [10]:
# Runtime on the full set is unknown, so experiment on the first 50 reviews.
vader_df = df.iloc[:50]

In [11]:
vader_df.shape

(50, 2)

In [12]:
# Reset to a fresh 0-based index (old labels dropped) so rows can be looked up
# by simple labels such as vader_df.caption[40].
vader_df = vader_df.reset_index(drop=True)

In [13]:
vader_df

Unnamed: 0,caption,rating
0,Another review my daughter and friends love ...,4.0
1,Can’t get your order right for nothing and the...,1.0
2,We always buy when we leave early we have a co...,5.0
3,Great,4.0
4,Poor customer service by manager,1.0
5,Employee was a bit moody but food bussing,4.0
6,I haven't had Dunkin Donuts in years. Being th...,1.0
7,Great coffee,5.0
8,"Disappointed, no blueberry donut. Mmmm mmmm, smh",3.0
9,These Guys are really professional. Well behav...,5.0


In [14]:
# reading random review
vader_df.caption[40]

"I'm here everyday and the employees are the friendliest I've come across."

## vader tests

In [15]:
# Interesting test cases for VADER, printed in this order:
# rows 46 and 12 are emoji-only reviews, rows 5 and 48 contain slang.
for row_label in (46, 12, 5, 48):
    print(vader_df.caption[row_label])

👎
😎👍🏻👍🏻
Employee was a bit moody but food bussing
We stopped in on the fly, first visit.. the burrata pepperoni slices were incredible and the Nutella pizza was perfect to share afterwards. It’s going to be our destination pizza place. 100% recommend!


In [16]:
sia = SentimentIntensityAnalyzer()

In [17]:
# sia test
sia.polarity_scores('My back is fucking killing me!')

{'neg': 0.499, 'neu': 0.501, 'pos': 0.0, 'compound': -0.7171}

VADER (`sia`) correctly classifies the text above as negative, with a compound score of -0.7171.

In [18]:
# sia test 2
sia.polarity_scores('The Boys spinnoff was better than the original.')

{'neg': 0.0, 'neu': 0.536, 'pos': 0.464, 'compound': 0.6369}

VADER (`sia`) correctly classifies the text above as positive, with a compound score of 0.6369.

In [19]:
# running sia on in sample examples 

In [20]:
print(vader_df.caption[46])
sia.polarity_scores(vader_df.caption[46])

👎


{'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}

Does not recognize emoji

In [21]:
print(vader_df.caption[12])
sia.polarity_scores(vader_df.caption[12])

😎👍🏻👍🏻


{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

Does not recognize emoji

In [22]:
print(vader_df.caption[5])
sia.polarity_scores(vader_df.caption[5])

Employee was a bit moody but food bussing


{'neg': 0.226, 'neu': 0.774, 'pos': 0.0, 'compound': -0.1901}

Misses the positive slang ("bussing") but picks up the negative word ("moody"), so the review is scored as negative overall.

In [23]:
print(vader_df.caption[48])
sia.polarity_scores(vader_df.caption[48])

We stopped in on the fly, first visit.. the burrata pepperoni slices were incredible and the Nutella pizza was perfect to share afterwards. It’s going to be our destination pizza place. 100% recommend!


{'neg': 0.048, 'neu': 0.732, 'pos': 0.22, 'compound': 0.7777}

Picks up the positive sentiment and seems to ignore the slang. It seems to work better with longer text, as it has more to go on.

## vader time

In [24]:
vader_df.shape

(50, 2)

In [27]:
# Score each review in the vader_df test sample with VADER, wrapping the loop
# in a tqdm progress bar (useful once this runs on the full set of reviews).
# Results are meant to be stored in the dictionary vader_results, keyed by row index.
# NOTE: the loop is wrapped in a triple-quoted string, so this cell only
# evaluates a string literal and does NOT populate vader_results.
'''vader_results = {}
for i, row in tqdm(vader_df.iterrows(), total=len(vader_df)):
    review = row['caption']
    index_id = i
    vader_results[index_id] = sia.polarity_scores(review)'''

  0%|          | 0/50 [00:00<?, ?it/s]

I cannot get the tqdm progress bar to run. The function works, but the progress bar just doesn't update. We can go back to using print statements.

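### If you try to use the camis instead of the index as the dictionary key, then the results will get overwritten (dictionary keys must be unique, so repeated camis values replace earlier scores).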

In [28]:
len(vader_results)

50

In [29]:
# Score every review, keyed by its row label in vader_df.
vader_results = {}
for row_label, text in vader_df['caption'].items():
    vader_results[row_label] = sia.polarity_scores(text)

In [31]:
# vader_results

In [37]:
# The dict is keyed by row label, so the raw frame has one COLUMN per review;
# transpose so each review is a row and each VADER score is a column.
vader_results_df = pd.DataFrame(vader_results).T

In [39]:
from nltk.sentiment import SentimentIntensityAnalyzer

def vader_sentiment_analysis(df, analyzer=None, text_col='reviews_sw'):
    '''
    Score every row of text in a dataframe with VADER.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column named by ``text_col`` (by default
        ``reviews_sw`` = block of text minus stop words). Any other
        columns, e.g. ``camis``, are ignored.
    analyzer : object, optional
        Anything exposing ``polarity_scores(text) -> dict``. When omitted a
        new ``SentimentIntensityAnalyzer`` is created.
    text_col : str, default 'reviews_sw'
        Name of the column holding the text to score.

    Returns
    -------
    pandas.DataFrame
        One row per input row, carrying the same index as ``df``, with one
        column per score returned by the analyzer (neg, neu, pos, compound
        for VADER). ``camis`` is NOT part of the output; join the result
        back to ``df`` on the index to attach it.
    '''
    score_columns = ['neg', 'neu', 'pos', 'compound']

    # Nothing to score: return an empty, correctly shaped frame instead of
    # failing on a result variable that was only ever bound inside the loop.
    if len(df) == 0:
        return pd.DataFrame(columns=score_columns, index=df.index, dtype=float)

    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    # Collect one score dict per row and build the frame once at the end.
    # A list (rather than a dict keyed by index label) keeps every row even
    # when index labels repeat.
    scores = [analyzer.polarity_scores(text) for text in df[text_col]]

    return pd.DataFrame(scores, index=df.index)
    
    

In [38]:
vader_results_df

Unnamed: 0,neg,neu,pos,compound
0,0.0,0.523,0.477,0.8074
1,0.0,1.0,0.0,0.0
2,0.079,0.921,0.0,-0.0516
3,0.0,0.0,1.0,0.6249
4,0.437,0.563,0.0,-0.4767
5,0.226,0.774,0.0,-0.1901
6,0.028,0.95,0.022,0.0772
7,0.0,0.196,0.804,0.6249
8,0.655,0.345,0.0,-0.765
9,0.0,0.54,0.46,0.7703


Now that I know what the results look like, I am going to put everything in a dataframe so that it is easier for others to follow.

In [33]:
# Ignore this cell: the answer is already produced above by converting the
# vader_results dictionary to a DataFrame.
# This row-by-row append version isn't working, so it is kept disabled inside
# a string literal for reference only.
# Author's guess: the DataFrame was initialised with the wrong shape.
# NOTE(review): the column names used here ('negative', 'neutral', 'positive')
# differ from the keys shown in the polarity_scores output ('neg', 'neu',
# 'pos') -- possibly part of the problem; confirm before reviving this code.
# TODO: debug or rewrite only if going from dictionary to DataFrame becomes a problem.

'''vader_results_df = pd.DataFrame(columns=['negative','neutral','positive','compound'])
for i, row in vader_df.iterrows():
    review = row['caption']
    scores = sia.polarity_scores(review)
    vader_results_df = vader_results_df.append(scores)'''

"vader_results_df = pd.DataFrame(columns=['negative','neutral','positive','compound'])\nfor i, row in vader_df.iterrows():\n    review = row['caption']\n    scores = sia.polarity_scores(review)\n    vader_results_df = vader_results_df.append(scores)"

## function and function test

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

def vader_sentiment_analysis(df, analyzer=None, text_col='reviews_sw'):
    '''
    Score every row of text in a dataframe with VADER.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column named by ``text_col`` (by default
        ``reviews_sw`` = block of text minus stop words). Any other
        columns, e.g. ``camis``, are ignored.
    analyzer : object, optional
        Anything exposing ``polarity_scores(text) -> dict``. When omitted a
        new ``SentimentIntensityAnalyzer`` is created.
    text_col : str, default 'reviews_sw'
        Name of the column holding the text to score.

    Returns
    -------
    pandas.DataFrame
        One row per input row, carrying the same index as ``df``, with one
        column per score returned by the analyzer (neg, neu, pos, compound
        for VADER). ``camis`` is NOT part of the output; join the result
        back to ``df`` on the index to attach it.
    '''
    score_columns = ['neg', 'neu', 'pos', 'compound']

    # Nothing to score: return an empty, correctly shaped frame instead of
    # failing on a result variable that was only ever bound inside the loop.
    if len(df) == 0:
        return pd.DataFrame(columns=score_columns, index=df.index, dtype=float)

    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    # Collect one score dict per row and build the frame once at the end.
    # A list (rather than a dict keyed by index label) keeps every row even
    # when index labels repeat.
    scores = [analyzer.polarity_scores(text) for text in df[text_col]]

    return pd.DataFrame(scores, index=df.index)
    

In [44]:
# Build the function's test input straight from the CSV so this cell does not
# depend on the notebook-level `df` left behind by earlier cells, and without
# renaming in place on a slice of another frame.
function_test_df = (
    pd.read_csv('scraper_gmaps/data/newest_gm_reviews.csv')
    .loc[:, ['caption', 'rating']]
    .dropna(subset=['caption'])                 # reviews without text cannot be scored
    .rename(columns={'caption': 'reviews_sw'})  # column name the function expects
    .head(100)
)

In [45]:
output_function_test_df = vader_sentiment_analysis(function_test_df)

In [46]:
output_function_test_df

Unnamed: 0,neg,neu,pos,compound
2,0.000,0.523,0.477,0.8074
4,0.000,1.000,0.000,0.0000
5,0.079,0.921,0.000,-0.0516
6,0.000,0.000,1.000,0.6249
7,0.437,0.563,0.000,-0.4767
...,...,...,...,...
174,0.000,0.566,0.434,0.7482
175,0.000,0.655,0.345,0.5789
177,0.136,0.864,0.000,-0.2960
178,0.000,0.815,0.185,0.7096
