# Capstone

## Noun-Phrase and PMI Analysis

#### Installing required libraries

In [1]:
!pip --quiet install nltk
import nltk
nltk.download("vader_lexicon")
nltk.download("stopwords")

#for candidate key phrase code
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download("wordnet")
nltk.download("brown")

from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import *
from nltk.tag import PerceptronTagger
from nltk.data import find
import pandas
import numpy as np
import math


import json
from collections import Counter

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/notebook/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/notebook/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/notebook/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/notebook/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /home/notebook/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package brown to /home/notebook/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.




#### Noun Phrase - Implementation 2
Source: https://gist.github.com/alexbowe/879414

In [2]:

#setting up tagger
#(from http://stackoverflow.com/a/35964709)
PICKLE = "averaged_perceptron_tagger.pickle"
AP_MODEL_LOC = 'file:'+str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
tagger = PerceptronTagger(load=False)
tagger.load(AP_MODEL_LOC)
pos_tag = tagger.tag

lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

In [3]:
# This grammar is described in the paper by S. N. Kim,
# T. Baldwin, and M.-Y. Kan.
# Evaluating n-gram based evaluation metrics for automatic
# keyphrase extraction.
# Technical report, University of Melbourne, Melbourne 2010.


from nltk.corpus import stopwords

stop = set(stopwords.words('english'))
stop.add('WEBSITE_FILLER')


def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter = lambda t: t.label()=='NP'):
        yield subtree.leaves()

def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword."""
    accepted = bool(2 < len(word) and word.lower() not in stop)    
    return accepted        

def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
    word = word.lower()
    word = stemmer.stem_word(word)
    word = lemmatizer.lemmatize(word)
    return word

def get_terms(tree):
    for leaf in leaves(tree):
        #can modify normalise to w.lower() if dont want to normalize word
        term = [ normalise(w) for w,t in leaf if acceptable_word(w) ]
        yield term
        
def get_nounPhrases(textInput, minWordLength = 1):
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()

    grammar = r"""

    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
              """

    chunker = nltk.RegexpParser(grammar)
    
    toks = nltk.word_tokenize(textInput)
    #print(toks)
    pos_tag = tagger.tag
    postoks = pos_tag(toks)

    tree = chunker.parse(postoks)
    terms = get_terms(tree)
   
    nounPhraseList = []
    for tid,term in enumerate(terms):
        templist = []
        for wid, word in enumerate(term):
            #print("TID: ",tid," WID: ",(wid+1), word)
            templist.append(word)
        
        s = " "
        nounPhraseList.append(s.join(templist))

    nounPhraseList = [word for word in nounPhraseList if len(word.split())>=minWordLength]
    return nounPhraseList
    
    


In [4]:
get_nounPhrases("Noun Phrase")

['noun phrase']

#### Read JSON Data Into Pandas DF

In [5]:
file_in1 = '/resources/data//reddit_data/RC_2015-04.json'
file_in2 = '/resources/data//reddit_data/RC_2015-05.json'
file_in3= '/resources/data//reddit_data/RC_2015-06.json'
file_in =[file_in1,file_in2,file_in3]

In [6]:
dataDf = pandas.DataFrame()
#following code courtesy of: https://www.reddit.com/r/MachineLearning/comments/33eglq/python_help_jsoncsv_pandas/
for file in file_in:
    with open(file, 'r') as f:
        data = f.readlines()

    # remove the trailing "\n" from each line
    data = map(lambda x: x.rstrip(), data)

    # each element of 'data' is an individual JSON object.
    # i want to convert it into an *array* of JSON objects
    # which, in and of itself, is one large JSON object
    # basically... add square brackets to the beginning
    # and end, and have all the individual business JSON objects
    # separated by a comma
    data_json_str = "[" + ','.join(data) + "]"
    # now, load it into pandas
    dataDf = dataDf.append(pandas.read_json(data_json_str),ignore_index=True)
    #print(dataDf.head())


In [7]:
#len(set(dataDf.index.values))

In [8]:
#dataDf.index.values

In [9]:
#to see all headings in data
#for column in dataDf.columns:
    #print(column)

In [10]:
#Printing sample of DF
#dataDf.head()

In [11]:
#How much Data? (Rows,Columns)
dataDf.shape

(3870, 22)

In [12]:
#do analysis on a copy of the orginal text
copyDf = dataDf.copy()
dataDf['bodyCopy'] =copyDf['body']

In [13]:
#dataDf['score']

#### Filtering

In [14]:
import re
URL_REGEX_1 = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
SHOPIFY_REGEX = r"[S|s]hopify"
HIRING_REGEX = r"[H|h]iring"
POPULARCOMMENT_REGEX = r"\**Most Popular Comments\**"
HISTORY_REGEX = r"[R|r]ecent [S|s]ubmission [H|h]istory "

In [15]:
#list to store ids of comments to keep 
commentKeep =[]

for cid,comment in enumerate(dataDf['bodyCopy']):
    #print(cid)
    #print(dataDf.loc[dataDf['body']==comment].index.values[0])
    if (len(re.findall(URL_REGEX_1,comment))>0):
        dataDf['bodyCopy'][cid] = re.sub(URL_REGEX_1,'WEBSITE_FILLER',comment)
        #print(dataDf['bodyCopy'][cid])
    
# 2nd loop to check if post meets requirements after subbing in URL FILLER    
for cid,comment in enumerate(dataDf['bodyCopy']):    
    #only keep comments with links if they mention shopify w/o it occuring in link    
    if (len(re.findall(SHOPIFY_REGEX,comment))>0):
            #remove any post with 'Hiring' or 'Recent Submission History'
            if(len(re.findall(HIRING_REGEX,comment))==0 and  len(re.findall(HISTORY_REGEX,comment))==0 and len(re.findall(POPULARCOMMENT_REGEX,comment))==0):
                commentKeep.append(cid)
                
        
print("****Done****")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


****Done****


In [16]:
#re.sub(URL_REGEX_1,'WEBSITE_FILLER','www.1shopi_f453643.com')

In [17]:
#how many comments left?
print("After Filtering:",len(commentKeep),"comments have been kept from:",dataDf.shape[0])

After Filtering: 1720 comments have been kept from: 3870


In [18]:
#assign subset of comments to keep to new df

commentKeepDf=dataDf.loc[commentKeep]

In [19]:
#Check Size
commentKeepDf.shape

(1720, 23)

In [20]:
#commentKeepDf.columns

#### Looking at noun-phrases per comment

In [21]:
#for count,comment in enumerate(commentKeepDf['bodyCopy']):
    #print("Comment: " + comment)
    #print("\n Noun-phrases/Candidate Key Phrases:")
    #print(extract_candidate_chunks(comment) ,"\n\n")
    #print("----------------------------------------------------")

#### Building different counters to analyze top k frequent words in whole collection

In [22]:
#List of all 'english' stop words; stop words are common words
#from nltk.corpus import stopwords
#stop = set(stopwords.words('english'))
#stop.add('WEBSITE_FILLER')

In [23]:
#from nltk.stem import *
#wnLemm = WordNetLemmatizer()
#sbStem = SnowballStemmer("english")
#wnLemm.lemmatize(word)

In [24]:
get_nounPhrases("nOUN pHRASE", minWordLength = 2)

['noun phrase']

In [25]:
#Count the frequency of words
from collections import Counter
#https://pymotw.com/2/collections/counter.html
# counter is a container that keeps track of how many times equivalent values are added
import re
#A regular expression (or RE) specifies a set of strings that matches it
#A formal language for specifying strings
counter = Counter()
for rnumb,comment in enumerate(commentKeepDf['bodyCopy']):
        #add the word in its lowercase form to the counter Container if it is not a stop word and is greather than 2 characters
        counter.update([(word.lower()) for word in get_nounPhrases(comment, minWordLength = 2) if word.lower() not in stop])
        #print(get_nounPhrases(comment))
        if (rnumb % 1000 == 1):
            print("processed %d reviews" % (rnumb))
        
print("****Done****")


processed 1 reviews
processed 1001 reviews
****Done****


In [26]:
#counter

In [27]:
k = 100
print("Using basic counter: ")
top_k_df =pandas.DataFrame([[key,value] for key,value in counter.items()],columns=['Term','Frequency'])
top_k_df.sort_values('Frequency', axis=0, ascending=False).head(k)


Using basic counter: 


Unnamed: 0,Term,Frequency
2542,shopifi store,68
5166,good luck,39
4397,onlin store,32
691,shopifi site,27
446,ecommerc platform,27
1926,shopifi payment,23
28,credit card,18
1290,ecommerc site,18
4885,use shopifi,18
2575,web develop,17


### MI and PMI Analysis



In [28]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas
import numpy as np
import math

from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import tokenize
from nltk.corpus import stopwords

In [29]:
! pip install --upgrade textblob

Requirement already up-to-date: textblob in ./common/.virtualenv/python3/lib/python3.4/site-packages
Requirement already up-to-date: nltk>=3.1 in /usr/local/lib/python3.4/dist-packages (from textblob)


In [30]:
from textblob import TextBlob


In [31]:
test = TextBlob("Shopify Capstone 2016 good")

In [32]:
test.sentiment.polarity

0.7

In [33]:
commentContent = commentKeepDf['bodyCopy'].as_matrix()

In [34]:
#commentKeepDf.shape

In [35]:
#Instantiation
sid = SentimentIntensityAnalyzer()

In [36]:
pdlist = []
##Assign Vader score to individual review using Vader compound score
##creating a list of the reviews along with their polarity score as assigned by Vader
##list containing a list where each element is [review,polarity]
for rownum, comment in enumerate(commentContent):
    ss = sid.polarity_scores(comment)
    textblobpol=TextBlob(comment).sentiment.polarity
    pdlist.append([comment,ss['compound'],textblobpol])
    if (rownum % 100 == 1):
            print("processed %d reviews" % (rownum+1))

            
print("****Done****")

processed 2 reviews
processed 102 reviews
processed 202 reviews
processed 302 reviews
processed 402 reviews
processed 502 reviews
processed 602 reviews
processed 702 reviews
processed 802 reviews
processed 902 reviews
processed 1002 reviews
processed 1102 reviews
processed 1202 reviews
processed 1302 reviews
processed 1402 reviews
processed 1502 reviews
processed 1602 reviews
processed 1702 reviews
****Done****


In [37]:
#pdlist

In [38]:
test = sid.polarity_scores("Shopify Capstone 2016 good")
test['compound']


0.4404

In [39]:
commentDf = pandas.DataFrame(pdlist)
commentDf.columns = ['commentCol','vader','textBlob']
commentDf.head()

Unnamed: 0,commentCol,vader,textBlob
0,At one point their shopify site had options to...,0.0,-0.151389
1,"It is ok, I've spoken to a couple executives t...",0.7269,0.22
2,Here are my honest thoughts:\n\n1) You are in ...,0.9991,0.200768
3,I don't spend that much time freelancing... ma...,0.9282,0.090774
4,"That is a great idea about the ""Sold"" versus ""...",0.8807,0.5


In [40]:
print("Number of Positive Comments based on Vader: ",(commentDf.loc[commentDf['vader']>0]['vader']).size)
print("\nNumber of Positive Comments based on textBlob: ",(commentDf.loc[commentDf['textBlob']>0]['textBlob']).size)


Number of Positive Comments based on Vader:  1339

Number of Positive Comments based on textBlob:  1315


In [41]:
#for (index,word,wordCount) in top_k_df.itertuples():
    #print(index,word,wordCount)

In [42]:
#top_k_df

In [43]:
#Find out if a particular review has the word from topk list
#note: needed to ad .lower else not matching all words

top_k = counter.most_common(k)

freqReview = []
#create a list, one entry for each review; the entry is binary list indicating whether this review as the term i or not in the top k
for i in range(len(commentDf)):
    #  counter.update([(word.lower()) for word in get_nounPhrases(comment, minWordLength = 2) if word.lower() not in stop])

    tempCounter = Counter([(word.lower()) for word in get_nounPhrases(commentDf['commentCol'][i], minWordLength = 2)])
    #extract_candidate_chunks(reviewDf['reviewCol'][i])
    #print(tempCounter)
    topkinComment = [1 if tempCounter[word] > 0 else 0 for (word,wordCount) in top_k]
    freqReview.append(topkinComment)

freqReviewDf = pandas.DataFrame(freqReview)
dfName = []
for c in top_k:
    dfName.append(c[0])
    #print(c)
freqReviewDf.columns = dfName
freqReviewDf.head()

Unnamed: 0,shopifi store,good luck,onlin store,ecommerc platform,shopifi site,shopifi payment,credit card,ecommerc site,use shopifi,big commerc,...,red flag,shopifi blog,mobil app,e-commerc store,self-servic modul,first page,account manag,link href=,free app,main page
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
finalcommentDf = commentDf.join(freqReviewDf)
#finalcommentDf.head()
finalcommentDf.shape

(1720, 103)

In [45]:

commentKeepDf_reindexed = commentKeepDf.reset_index(drop=True)
finaldf = commentKeepDf_reindexed.join(finalcommentDf)
#finaldf.head()

In [46]:
finaldf.shape

(1720, 126)

#### Frequency Analysis of top k=100 terms in Positive and Negative comments (using Vader)

In [47]:
#top k positive comments

#Count the frequency of words
from collections import Counter
#https://pymotw.com/2/collections/counter.html
# counter is a container that keeps track of how many times equivalent values are added
import re
#A regular expression (or RE) specifies a set of strings that matches it
#A formal language for specifying strings
counter = Counter()
for rnumb,comment in enumerate(commentDf.loc[commentDf['vader']>0]['commentCol']):

        
        #add the word in its lowercase form to the counter Container if it is not a stop word and is greather than 2 characters
        counter.update([(word.lower()) for word in get_nounPhrases(comment, minWordLength = 2) if word.lower() not in stop])
        if (rnumb % 100 == 1):
            print("processed %d reviews" % (rnumb))
        

topkPos_v = counter.most_common(k)


print("\n \nTop k (100) from comments with positive vader scores: \n")
topkPos_v_df =pandas.DataFrame([[key,value] for key,value in counter.items()],columns=['Term','Frequency'])
topkPos_v_df.sort_values('Frequency', axis=0, ascending=False).head(k)


processed 1 reviews
processed 101 reviews
processed 201 reviews
processed 301 reviews
processed 401 reviews
processed 501 reviews
processed 601 reviews
processed 701 reviews
processed 801 reviews
processed 901 reviews
processed 1001 reviews
processed 1101 reviews
processed 1201 reviews
processed 1301 reviews

 
Top k (100) from comments with positive vader scores: 



Unnamed: 0,Term,Frequency
2219,shopifi store,54
4557,good luck,39
3867,onlin store,27
392,ecommerc platform,24
613,shopifi site,24
3109,free ship,16
411,big commerc,15
1683,shopifi payment,15
24,credit card,15
1490,blank sleev,15


In [48]:
#top k negative comments

#Count the frequency of words
from collections import Counter
#https://pymotw.com/2/collections/counter.html
# counter is a container that keeps track of how many times equivalent values are added
import re
#A regular expression (or RE) specifies a set of strings that matches it
#A formal language for specifying strings
counter = Counter()
for rnumb,comment in enumerate(commentDf.loc[commentDf['vader']<0]['commentCol']):

        
        #add the word in its lowercase form to the counter Container if it is not a stop word and is greather than 2 characters
        counter.update([(word.lower()) for word in get_nounPhrases(comment, minWordLength = 2) if word.lower() not in stop])
        if (rnumb % 100 == 1):
            print("processed %d reviews" % (rnumb))
        

topkNeg_v = counter.most_common(k)
print("\n \nTop k (100) from comments with negative vader scores: \n")
topkNeg_v_df =pandas.DataFrame([[key,value] for key,value in counter.items()],columns=['Term','Frequency'])
topkNeg_v_df.sort_values('Frequency', axis=0, ascending=False).head(k)

processed 1 reviews
processed 101 reviews

 
Top k (100) from comments with negative vader scores: 



Unnamed: 0,Term,Frequency
101,shopifi store,8
308,shopifi payment,6
628,custom field,5
708,link href=,5
644,use shopifi,4
341,product line,3
125,ecommerc site,3
706,shopifi page,3
778,payment processor,3
640,merchant fee,3


#### Frequency Analysis of top k=100 terms in Positive and Negative comments (using textBlob)

In [49]:

    #top k positive comments

#Count the frequency of words
from collections import Counter
#https://pymotw.com/2/collections/counter.html
# counter is a container that keeps track of how many times equivalent values are added
import re
#A regular expression (or RE) specifies a set of strings that matches it
#A formal language for specifying strings
counter = Counter()
for rnumb,comment in enumerate(commentDf.loc[commentDf['textBlob']>0]['commentCol']):

        
        #add the word in its lowercase form to the counter Container if it is not a stop word and is greather than 2 characters
        counter.update([(word.lower()) for word in get_nounPhrases(comment, minWordLength = 2) if word.lower() not in stop])
        if (rnumb % 100 == 1):
            print("processed %d reviews" % (rnumb))
        

topkPos_tB = counter.most_common(k)
print("\n \nTop k (100) from comments with positive textBlob scores: \n")
topkPos_tB_df =pandas.DataFrame([[key,value] for key,value in counter.items()],columns=['Term','Frequency'])
topkPos_tB_df.sort_values('Frequency', axis=0, ascending=False).head(k)


processed 1 reviews
processed 101 reviews
processed 201 reviews
processed 301 reviews
processed 401 reviews
processed 501 reviews
processed 601 reviews
processed 701 reviews
processed 801 reviews
processed 901 reviews
processed 1001 reviews
processed 1101 reviews
processed 1201 reviews
processed 1301 reviews

 
Top k (100) from comments with positive textBlob scores: 



Unnamed: 0,Term,Frequency
2297,shopifi store,53
4682,good luck,39
3984,onlin store,23
415,ecommerc platform,20
634,shopifi site,20
1750,shopifi payment,18
104,land page,16
2329,web develop,16
3215,free ship,16
5315,ecommerc site,16


In [50]:
#top k negative comments

#Count the frequency of words
from collections import Counter
#https://pymotw.com/2/collections/counter.html
# counter is a container that keeps track of how many times equivalent values are added
import re
#A regular expression (or RE) specifies a set of strings that matches it
#A formal language for specifying strings
counter = Counter()
for rnumb,comment in enumerate(commentDf.loc[commentDf['textBlob']<0]['commentCol']):

        
        #add the word in its lowercase form to the counter Container if it is not a stop word and is greather than 2 characters
        counter.update([(word.lower()) for word in get_nounPhrases(comment, minWordLength = 2) if word.lower() not in stop])
        if (rnumb % 100 == 1):
            print("processed %d reviews" % (rnumb))
        

topkNeg_tB = counter.most_common(k)
print("\n \nTop k (100) from comments with negative textBlob scores: \n")
topkNeg_tB_df =pandas.DataFrame([[key,value] for key,value in counter.items()],columns=['Term','Frequency'])
topkNeg_tB_df.sort_values('Frequency', axis=0, ascending=False).head(k)


processed 1 reviews
processed 101 reviews

 
Top k (100) from comments with negative textBlob scores: 



Unnamed: 0,Term,Frequency
130,blank sleev,15
150,shopifi store,9
401,onlin store,5
86,big commerc,4
48,ecommerc platform,4
451,shopifi payment,4
373,shopifi site,4
294,prepaid card,3
82,person inform,3
318,creep case,3


### Mutual Information

#### Based on Vader

In [51]:
#creating a binary list indicating whether a review is positive (1) or negative (0)
gtScore = []
for i in range(len(finaldf)):
    if finaldf['vader'][i]>0:
        gtScore.append(1)
    else:
        gtScore.append(0)

In [52]:
#Calculate mutual information score using scikit lean package
import sklearn
import sklearn.metrics as metrics
miScore = []
for word in top_k:
    #two inputs for MI, both are binary, first indicate the class (positive or negative), second indicates if term is present in reveiw (0 = no , 1 = yes)
    #need to know which reviews are p or n and which actually contain the term to get MI
    miScore.append([word[0]]+[metrics.mutual_info_score(gtScore,finaldf[word[0]].as_matrix())])
    #question these are the MI scores for positive reviews, how do we get them for negative reviews
miScoredf = pandas.DataFrame(miScore).sort_values(1,ascending=0)
miScoredf.columns = ['Word','MI Score']
print("MI Scores based on Vader Sentiment: \n")
miScoredf.head(50)

MI Scores based on Vader Sentiment: 



Unnamed: 0,Word,MI Score
1,good luck,0.005603
12,free ship,0.001607
32,whole thing,0.001461
25,littl bit,0.001461
28,bank account,0.001168
53,host solut,0.001168
38,much time,0.001168
33,free theme,0.001168
24,front page,0.001168
3,ecommerc platform,0.001161


#### Based on textBlob

In [53]:
#creating a binary list indicating whether a review is positive (1) or negative (0)
gtScore = []
for i in range(len(finaldf)):
    if finaldf['textBlob'][i]>0:
        gtScore.append(1)
    else:
        gtScore.append(0)
#Calculate mutual information score using scikit lean package
import sklearn
import sklearn.metrics as metrics
miScore = []
for word in top_k:
    #two inputs for MI, both are binary, first indicate the class (positive or negative), second indicates if term is present in reveiw (0 = no , 1 = yes)
    #need to know which reviews are p or n and which actually contain the term to get MI
    miScore.append([word[0]]+[metrics.mutual_info_score(gtScore,finaldf[word[0]].as_matrix())])
    #question these are the MI scores for positive reviews, how do we get them for negative reviews
miScoredf = pandas.DataFrame(miScore).sort_values(1,ascending=0)
miScoredf.columns = ['Word','MI Score']
print("MI Scores based on textBlob Sentiment: \n")

miScoredf.head(50)

MI Scores based on textBlob Sentiment: 



Unnamed: 0,Word,MI Score
1,good luck,0.006008
17,monthli fee,0.002038
18,small busi,0.001881
12,free ship,0.001723
11,land page,0.001723
25,littl bit,0.001566
32,whole thing,0.001566
28,bank account,0.001252
24,front page,0.001252
44,web design,0.001252


In [54]:
commentKeepDf.shape

(1720, 23)

In [55]:
commentDf.shape

(1720, 3)

In [56]:
#commentKeepDf.columns
#commentDf.columns
#finalcommentDf = commentKeepDf.copy()
#finalcommentDf.rename(columns={'bodyCopy':'commentCol'}, inplace=True)
#finalcommentDf.head()
#finalcommentDf.index.values
#finalcommentDf = finalcommentDf.reset_index(drop=True)
#finalcommentDf.index.values
#finalcommentDf.head()
#finalcommentDf.join(commentDf['vader']).head(n=2)

In [1]:
git init

SyntaxError: invalid syntax (<ipython-input-1-2a402a90b83b>, line 1)