# Kaggle 
# Sentiment Analysis

### I will first look into a basic prediction model for review type (good or bad)
### Then I will look into developing a ***Subject Finder***
    - This will hopefully give a classification subject for each review (movie,restaurant,item)

In [117]:
import string
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk import wordpunct_tokenize,FreqDist

In [118]:
amazon = pd.read_fwf('amazon_cells_labelled.txt',header=None)
amazon.columns = ['Review']
print(len(amazon))
amazon ['subject'] = 'item'
amazon.head()

1000


Unnamed: 0,Review,subject
0,So there is no way for me to plug it in here i...,item
1,"Good case, Excellent value.\t1",item
2,Great for the jawbone.\t1,item
3,Tied to charger for conversations lasting more...,item
4,The mic is great.\t1,item


In [119]:
imdb = pd.read_fwf('imdb_labelled.txt',header=None)
imdb.columns = ['Review','0','1','2']
imdb = imdb.drop(['0','1','2'],axis=1)
imdb ['subject'] = 'movie'
print(len(imdb))
imdb.head()

1000


Unnamed: 0,Review,subject
0,"A very, very, very slow-moving, aimless movie ...",movie
1,Not sure who was more lost - the flat characte...,movie
2,Attempting artiness with black & white and cle...,movie
3,Very little music or anything to speak of. \t0,movie
4,The best scene in the movie was when Gerardo i...,movie


In [120]:
yelp = pd.read_fwf('yelp_labelled.txt',header=None)
yelp.columns = ['Review','1','2']
yelp = yelp.drop(['1','2'],axis=1)
yelp ['subject'] = 'restaurant'
print(len(yelp))
yelp.head()

1000


Unnamed: 0,Review,subject
0,Wow... Loved this place.\t1,restaurant
1,Crust is not good.\t0,restaurant
2,Not tasty and the texture was just nasty.\t0,restaurant
3,Stopped by during the late May bank holiday of...,restaurant
4,The selection on the menu was great and so wer...,restaurant


In [121]:
frames = [amazon, yelp, imdb]
df = pd.concat(frames,ignore_index=True)
df = df[:-1]
len(df)

2999

In [125]:
df

Unnamed: 0,Review,subject,sentiment,words
0,So there is no way for me to plug it in here i...,item,0,"[way, plug, us, unless, go, converter]"
1,"Good case, Excellent value.\t1",item,1,"[good, case, excellent, value]"
2,Great for the jawbone.\t1,item,1,"[great, jawbone]"
3,Tied to charger for conversations lasting more...,item,0,"[tied, charger, conversations, lasting, 45, mi..."
4,The mic is great.\t1,item,1,"[mic, great]"
5,I have to jiggle the plug to get it to line up...,item,0,"[jiggle, plug, get, line, right, get, decent, ..."
6,If you have several dozen or several hundred c...,item,0,"[several, dozen, several, hundred, contacts, i..."
7,If you are Razr owner...you must have this!\t1,item,1,"[razr, owner, ..., must]"
8,"Needless to say, I wasted my money.\t0",item,0,"[needless, say, wasted, money]"
9,What a waste of money and time!.\t0,item,0,"[waste, money, time, !.]"


In [122]:
df.Review[-1:]

2998    Exceptionally bad!  \t0
Name: Review, dtype: object

In [123]:
def create_word_features(review):
    lower_review = review.lower()
    words = wordpunct_tokenize(lower_review)
    useful_words = [w for w in words 
                    if w not in string.punctuation 
                    and w not in stopwords.words('english')]
    useful_words = useful_words[:-1]
    return useful_words

def move_sentiment(number):
    words = wordpunct_tokenize(number)
    return words[-1]
df ['sentiment'] = df.Review.apply(move_sentiment)
df ['words'] = df.Review.apply(create_word_features)

In [124]:
df = df.loc[df.sentiment.isin(['1','0'])]
df.sentiment = df.sentiment.astype(int)

In [92]:
good_reviews = df.loc[df.sentiment == 1]
bad_reviews = df.loc[df.sentiment == 0]
good_reviews.head()

Unnamed: 0,Review,sentiment,words
1,"Good case, Excellent value.\t1",1,"[good, case, excellent, value]"
2,Great for the jawbone.\t1,1,"[great, jawbone]"
4,The mic is great.\t1,1,"[mic, great]"
7,If you are Razr owner...you must have this!\t1,1,"[razr, owner, ..., must]"
10,And the sound quality is great.\t1,1,"[sound, quality, great]"


In [112]:
good_word_bank = []
for row in good_reviews.words:
    for word in row:
        good_word_bank.append(word)

bad_word_bank = []
for row in bad_reviews.words:
    for word in row:
        bad_word_bank.append(word)
print(len(good_word_bank))
len(bad_word_bank)

8859


8592

In [110]:
good_freq = FreqDist(good_word_bank)
print('Most Common words:')
good_freq.most_common(10)

Most Common words:


[('great', 197),
 ('good', 172),
 ('film', 99),
 ('phone', 88),
 ('movie', 78),
 ('one', 72),
 ('well', 61),
 ('best', 60),
 ('food', 58),
 ('really', 56)]

In [114]:
bad_freq = FreqDist(bad_word_bank)
print('Most Common words:')
bad_freq.most_common(50)

Most Common words:


[('bad', 95),
 ('movie', 91),
 ('phone', 80),
 ('one', 67),
 ('like', 63),
 ('time', 62),
 ('food', 61),
 ('would', 56),
 ('place', 55),
 ('service', 53),
 ('film', 53),
 ('good', 50),
 ('back', 48),
 ('really', 47),
 ('ever', 46),
 ('even', 45),
 ('work', 38),
 ('worst', 38),
 ('never', 37),
 ('...', 36),
 ('go', 35),
 ('could', 35),
 ('much', 33),
 ('get', 31),
 ('waste', 31),
 ('terrible', 30),
 ('disappointed', 29),
 ('also', 29),
 ('first', 28),
 ('quality', 28),
 ('way', 27),
 ('minutes', 27),
 ('money', 27),
 ('poor', 26),
 ('make', 26),
 ('battery', 24),
 ('use', 24),
 ('2', 24),
 ('acting', 24),
 ('better', 23),
 ('got', 23),
 ('plot', 23),
 ('product', 22),
 ('enough', 22),
 ('thing', 22),
 ('piece', 22),
 ('going', 22),
 ('think', 21),
 ('well', 21),
 ('made', 21)]

##### Need to find correlation between group of words and sentiment