# Kaggle 
# Sentiment Analysis

### I will first look into a basic prediction model for review type (good or bad)
### Then I will look into developing a ***Subject Finder***
    - This will hopefully give a classification subject for each review (movie,restaurant,item)

In [73]:
import string
import numpy as np
import pandas as pd
from gensim import corpora
from nltk.corpus import stopwords,genesis
from nltk import wordpunct_tokenize,FreqDist
from sklearn.model_selection import train_test_split
from nltk.collocations import TrigramAssocMeasures,TrigramCollocationFinder

In [33]:
amazon = pd.read_fwf('amazon_cells_labelled.txt',header=None)
amazon.columns = ['Review']
print(len(amazon))
amazon ['subject'] = 'item'
amazon.head()

1000


Unnamed: 0,Review,subject
0,So there is no way for me to plug it in here i...,item
1,"Good case, Excellent value.\t1",item
2,Great for the jawbone.\t1,item
3,Tied to charger for conversations lasting more...,item
4,The mic is great.\t1,item


In [34]:
imdb = pd.read_fwf('imdb_labelled.txt',header=None)
imdb.columns = ['Review','0','1','2','3']
imdb = imdb.drop(['0','1','2','3'],axis=1)
imdb ['subject'] = 'movie'
print(len(imdb))
imdb.head()

1000


Unnamed: 0,Review,subject
0,"A very, very, very slow-moving, aimless movie ...",movie
1,Not sure who was more lost - the flat characte...,movie
2,Attempting artiness with black & white and cle...,movie
3,Very little music or anything to speak of. \t0,movie
4,The best scene in the movie was when Gerardo i...,movie


In [35]:
yelp = pd.read_fwf('yelp_labelled.txt',header=None)
yelp.columns = ['Review','1','2']
yelp = yelp.drop(['1','2'],axis=1)
yelp ['subject'] = 'restaurant'
print(len(yelp))
yelp.head()

1000


Unnamed: 0,Review,subject
0,Wow... Loved this place.\t1,restaurant
1,Crust is not good.\t0,restaurant
2,Not tasty and the texture was just nasty.\t0,restaurant
3,Stopped by during the late May bank holiday of...,restaurant
4,The selection on the menu was great and so wer...,restaurant


In [36]:
frames = [amazon, yelp, imdb]
df = pd.concat(frames,ignore_index=True)
df = df[:-1]
len(df)

2999

In [37]:
df.Review[-1:]

2998    Exceptionally bad!  \t0
Name: Review, dtype: object

In [43]:
def create_word_features(review):
    lower_review = review.lower()
    words = wordpunct_tokenize(lower_review)
    useful_words = [w for w in words 
                    if w not in string.punctuation 
                    and w not in stopwords.words('english')]
    useful_words = useful_words[:-1]
    return useful_words

def create_all_words(review):
    lower_review = review.lower()
    words = wordpunct_tokenize(lower_review)
    all_words = [w for w in words 
                    if w not in string.punctuation]
    all_words = all_words[:-1]
    return all_words

def move_sentiment(number):
    words = wordpunct_tokenize(number)
    return words[-1]
df ['sentiment'] = df.Review.apply(move_sentiment)
df ['obj_words'] = df.Review.apply(create_word_features)
df ['all_words'] = df.Review.apply(create_all_words)

In [44]:
df = df.loc[df.sentiment.isin(['1','0'])]
df.sentiment = df.sentiment.astype(int)

##### Need to find correlation between group of words and sentiment

In [98]:
df ['obj_word_count'] = df.obj_words.apply(lambda x: len(x))
df ['total_word_count'] = df.all_words.apply(lambda x: len(x))
df.head()

Unnamed: 0,Review,subject,sentiment,obj_words,all_words,obj_word_count,total_word_count
0,So there is no way for me to plug it in here i...,item,0,"[way, plug, us, unless, go, converter]","[so, there, is, no, way, for, me, to, plug, it...",6,21
1,"Good case, Excellent value.\t1",item,1,"[good, case, excellent, value]","[good, case, excellent, value]",4,4
2,Great for the jawbone.\t1,item,1,"[great, jawbone]","[great, for, the, jawbone]",2,4
3,Tied to charger for conversations lasting more...,item,0,"[tied, charger, conversations, lasting, 45, mi...","[tied, to, charger, for, conversations, lastin...",9,13
4,The mic is great.\t1,item,1,"[mic, great]","[the, mic, is, great]",2,4


In [105]:
import nltk, string, numpy

stemmer = nltk.stem.porter.PorterStemmer()
def StemTokens(tokens):
    return [stemmer.stem(token) for token in tokens]
lemmer = nltk.stem.WordNetLemmatizer()
def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

df['lem_words'] = df.obj_words.apply(LemTokens)
df['stem_words'] = df.obj_words.apply(StemTokens)

In [106]:
df.head()

Unnamed: 0,Review,subject,sentiment,obj_words,all_words,obj_word_count,total_word_count,lem_words,stem_words
0,So there is no way for me to plug it in here i...,item,0,"[way, plug, us, unless, go, converter]","[so, there, is, no, way, for, me, to, plug, it...",6,21,"[way, plug, u, unless, go, converter]","[way, plug, us, unless, go, convert]"
1,"Good case, Excellent value.\t1",item,1,"[good, case, excellent, value]","[good, case, excellent, value]",4,4,"[good, case, excellent, value]","[good, case, excel, valu]"
2,Great for the jawbone.\t1,item,1,"[great, jawbone]","[great, for, the, jawbone]",2,4,"[great, jawbone]","[great, jawbon]"
3,Tied to charger for conversations lasting more...,item,0,"[tied, charger, conversations, lasting, 45, mi...","[tied, to, charger, for, conversations, lastin...",9,13,"[tied, charger, conversation, lasting, 45, min...","[tie, charger, convers, last, 45, minut, major..."
4,The mic is great.\t1,item,1,"[mic, great]","[the, mic, is, great]",2,4,"[mic, great]","[mic, great]"


In [107]:
good_reviews = df.loc[df.sentiment == 1]
bad_reviews = df.loc[df.sentiment == 0]

In [113]:
good_word_bank = []
for row in good_reviews.stem_words:
    for word in row:
        good_word_bank.append(word)

bad_word_bank = []
for row in bad_reviews.stem_words:
    for word in row:
        bad_word_bank.append(word)
good_freq = FreqDist(good_word_bank)
bad_freq = FreqDist(bad_word_bank)
good_freq.most_common(10)

[('great', 197),
 ('good', 172),
 ('film', 114),
 ('phone', 93),
 ('movi', 93),
 ('work', 83),
 ('love', 82),
 ('one', 71),
 ('like', 68),
 ('well', 61)]

In [114]:
df.obj_words.head()

0               [way, plug, us, unless, go, converter]
1                       [good, case, excellent, value]
2                                     [great, jawbone]
3    [tied, charger, conversations, lasting, 45, mi...
4                                         [mic, great]
Name: obj_words, dtype: object

In [117]:
def create_word_features (words):
    my_dict = dict([(word, True) for word in words])
    return my_dict

df['word_list'] = df.obj_words.apply(create_word_features)

df.head()

Unnamed: 0,Review,subject,sentiment,obj_words,all_words,obj_word_count,total_word_count,lem_words,stem_words,word_list
0,So there is no way for me to plug it in here i...,item,0,"[way, plug, us, unless, go, converter]","[so, there, is, no, way, for, me, to, plug, it...",6,21,"[way, plug, u, unless, go, converter]","[way, plug, us, unless, go, convert]","{'way': True, 'plug': True, 'us': True, 'unles..."
1,"Good case, Excellent value.\t1",item,1,"[good, case, excellent, value]","[good, case, excellent, value]",4,4,"[good, case, excellent, value]","[good, case, excel, valu]","{'good': True, 'case': True, 'excellent': True..."
2,Great for the jawbone.\t1,item,1,"[great, jawbone]","[great, for, the, jawbone]",2,4,"[great, jawbone]","[great, jawbon]","{'great': True, 'jawbone': True}"
3,Tied to charger for conversations lasting more...,item,0,"[tied, charger, conversations, lasting, 45, mi...","[tied, to, charger, for, conversations, lastin...",9,13,"[tied, charger, conversation, lasting, 45, min...","[tie, charger, convers, last, 45, minut, major...","{'tied': True, 'charger': True, 'conversations..."
4,The mic is great.\t1,item,1,"[mic, great]","[the, mic, is, great]",2,4,"[mic, great]","[mic, great]","{'mic': True, 'great': True}"


In [176]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.Review,df.sentiment,test_size = .33)

In [183]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

x_test_counts = count_vect.transform(X_test)
x_test_tfidf = tfidf_transformer.transform(x_test_counts)

In [186]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf,y_train)
clf.score(x_test_tfidf,y_test)

0.80888429752066116

### Pipelines

In [224]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score,ShuffleSplit

text_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf',TfidfTransformer()),
                    ('clf', MultinomialNB()),])
cv = ShuffleSplit(n_splits=3,test_size = .3)
cross_val_score(text_clf,X_train,y_train,cv=cv)

array([ 0.80475382,  0.79626486,  0.77079796])

In [226]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf',TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge',
                                          alpha=1e-3,random_state=42))])
cross_val_score(text_clf,X_train,y_train,cv=cv)

array([ 0.80135823,  0.81833616,  0.79966044])

In [227]:
from sklearn.neighbors import KNeighborsClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf',TfidfTransformer()),
                    ('clf', KNeighborsClassifier())])
cross_val_score(text_clf,X_train,y_train,cv=cv)

array([ 0.72495756,  0.77589134,  0.77928693])

In [228]:
from sklearn.linear_model import LogisticRegression
text_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf',TfidfTransformer()),
                    ('clf', LogisticRegression())])
cross_val_score(text_clf,X_train,y_train,cv=cv)

array([ 0.80814941,  0.82342954,  0.81154499])

In [229]:
from sklearn.tree import DecisionTreeClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf',TfidfTransformer()),
                    ('clf', DecisionTreeClassifier())])
cross_val_score(text_clf,X_train,y_train,cv=cv)

array([ 0.70628183,  0.73514431,  0.69439728])

In [230]:
from sklearn.svm import SVC
text_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf',TfidfTransformer()),
                    ('clf', SVC())])
cross_val_score(text_clf,X_train,y_train,cv=cv)

array([ 0.47198642,  0.69269949,  0.67402377])