In [2]:
import spacy
import  pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns

In [50]:
data_yelp = pd.read_table('yelp_labelled.txt')
data_amazon = pd.read_table('amazon_cells_labelled.txt')
data_imdb = pd.read_table('imdb_labelled.txt')

# Joining the tables
combined_col= [data_amazon,data_imdb,data_yelp]

# To observe how the data in each individual dataset is structured
print(data_amazon.columns)

Index(['So there is no way for me to plug it in here in the US unless I go by a converter.', '0'], dtype='object')


In [51]:
# In order to add headers for columns in each dataset

for colname in combined_col:
    colname.columns = ["Review","Label"]
for colname in combined_col:
    print(colname.columns)

Index(['Review', 'Label'], dtype='object')
Index(['Review', 'Label'], dtype='object')
Index(['Review', 'Label'], dtype='object')


In [52]:
# In order to recognize which dataset belonged to which company, a 'Company' column is added as a key

company = [ "Amazon", "imdb", "yelp"]

comb_data = pd.concat(combined_col,keys = company)

In [53]:
# Exploring the  structure of  the new  data  frame

print(comb_data.shape)

comb_data.head()

(2745, 2)


Unnamed: 0,Unnamed: 1,Review,Label
Amazon,0,"Good case, Excellent value.",1
Amazon,1,Great for the jawbone.,1
Amazon,2,Tied to charger for conversations lasting more...,0
Amazon,3,The mic is great.,1
Amazon,4,I have to jiggle the plug to get it to line up...,0


In [54]:
comb_data.to_csv("Sentiment_Analysis_Dataset")

print(comb_data.columns)

print(comb_data.isnull().sum())

Index(['Review', 'Label'], dtype='object')
Review    0
Label     0
dtype: int64


In [55]:
import spacy
import en_core_web_sm
from  spacy.lang.en.stop_words import STOP_WORDS
nlp = en_core_web_sm.load()
#nlp = spacy.load('en_core_web_sm')

# To build a list of stop words for filtering
stopwords = list(STOP_WORDS)
print(stopwords)

['there', 'regarding', 'noone', 'towards', 'where', 'both', 'between', 'who', "'d", '’ll', 'unless', 'fifteen', 'when', 'least', 'thereafter', 'whereas', 'used', 'give', 'somehow', 'above', 'that', 'which', 'made', 'until', 'must', 'no', 'done', 'more', 'twelve', 'formerly', 'ever', 'indeed', 'always', 'during', '’re', 'but', 'why', 'cannot', 'or', 'those', 'the', 'several', "'re", 'eight', 'has', 'thereupon', 'n‘t', 'through', 'yet', 'could', 'am', 'either', 'whole', 'not', 'own', 'nine', 'seem', 'per', 'often', 'nobody', 'also', 'myself', 'while', 'forty', 'out', 'over', 'eleven', 'among', 'third', '’m', 'thus', 'itself', 'what', 'anyhow', '‘ll', 'otherwise', 'already', 'becomes', 'yourself', 'too', 'yours', '’s', 'therefore', 'else', 'since', 'once', 'together', 'an', 'most', 'on', 'move', 'these', 'only', 'almost', 'he', 'anywhere', 'empty', 'about', 'due', 'whence', 'ourselves', 'say', 'will', 'whereupon', 'ca', 'toward', 'himself', 'become', 'same', 'been', 'whether', 'anyone', '

In [56]:
import string
punctuations = string.punctuation
# Creating a Spacy Parser
from spacy.lang.en import English
parser = English()

In [57]:
def my_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    return mytokens

In [58]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [59]:
#Custom transformer using spaCy 
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text 
def clean_text(text):     
    return text.strip().lower()

In [60]:
# Vectorization
vectorizer = CountVectorizer(tokenizer=my_tokenizer,ngram_range=(1,1)) 
classifier = LinearSVC()

In [61]:
tfvectorizer = TfidfVectorizer(tokenizer=my_tokenizer)

In [62]:
# Splitting Data Set
from sklearn.model_selection import train_test_split

In [63]:
# Features and Labels
X = comb_data['Review']
ylabels = comb_data['Label']

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=42)

In [64]:
# Create the  pipeline to clean, tokenize, vectorize, and classify using"Count Vectorizor"
pipe_countvect = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])
# Fit our data
pipe_countvect.fit(X_train,y_train)
# Predicting with a test dataset
sample_prediction = pipe_countvect.predict(X_test)

# Prediction Results
# 1 = Positive review
# 0 = Negative review
for (sample,pred) in zip(X_test,sample_prediction):
    print(sample,"Prediction=>",pred)
    
# Accuracy
print("Accuracy: ",pipe_countvect.score(X_test,y_test))
print("Accuracy: ",pipe_countvect.score(X_test,sample_prediction))
# Accuracy
print("Accuracy: ",pipe_countvect.score(X_train,y_train))

Disappointment.. I hate anything that goes in my ear. Prediction=> 0
It is a true classic.   Prediction=> 1
Great product. Prediction=> 1
This is a great restaurant at the Mandalay Bay. Prediction=> 1
It finds my cell phone right away when I enter the car. Prediction=> 0
It is simple to use and I like it. Prediction=> 1
Of all the dishes, the salmon was the best, but all were great. Prediction=> 1
I love the Pho and the spring rolls oh so yummy you have to try. Prediction=> 1
Their Research and Development division obviously knows what they're doing. Prediction=> 1
Still it's quite interesting and entertaining to follow.   Prediction=> 1
Very poor service. Prediction=> 0
Oh yeah, and the storyline was pathetic too.   Prediction=> 0
Strike 2, who wants to be rushed. Prediction=> 0
Every element of this story was so over the top, excessively phony and contrived that it was painful to sit through.   Prediction=> 0
The battery works great! Prediction=> 1
I am so tired of clichés that is ju

In [65]:
# Another random review
pipe_countvect.predict(["This was a great movie"])

array([1], dtype=int64)

In [66]:
example = ["I do enjoy my job",
 "What a poor product!,I will have to get a new one",
 "I feel amazing!"]

In [67]:
pipe_countvect.predict(example)

array([1, 0, 1], dtype=int64)

In [88]:
reviews = pd.read_csv('../csv/reviews_clean.csv').dropna()

In [89]:
reviews.isna().value_counts()

link   review
False  False     509758
dtype: int64

In [93]:
sentiment = pipe_countvect.predict(reviews['review'])

In [94]:
reviews['sentiment'] = sentiment

In [95]:
reviews.to_csv('../csv/reviews_clean.csv', index=False)