

# Import Datasets



In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [61]:
#import yelp document
yelp=pd.read_csv('/content/25152808-yelp-labelled.txt',sep='\t',header=None)

In [62]:
#data exploration
yelp.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [63]:
#Assign the column names
column_name=['Review','Sentiment']
yelp.columns=column_name

In [64]:
yelp.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [65]:
yelp.shape

(1000, 2)

In [66]:
#import Amazon Dataset
Amazon=pd.read_csv('/content/25152800-amazon-cells-labelled.txt',sep='\t',header=None)
Amazon.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [67]:
Amazon.columns=column_name

In [68]:
Amazon.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [69]:
Amazon.shape

(1000, 2)

In [70]:
#import imdb dataset
imdb=pd.read_csv('/content/25152804-imdb-labelled.txt',sep='\t',header=None)
imdb.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [71]:
imdb.columns=column_name

In [72]:
imdb.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [73]:
imdb.shape

(748, 2)

In [74]:
#dataset append into one file
data=yelp.append([Amazon,imdb],ignore_index=True)

In [75]:
data.shape

(2748, 2)

In [76]:
data.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [77]:
#check the sentiment distribution
data['Sentiment'].value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [78]:
#check the missing value
data.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [79]:
#set the variables
x=data['Review']
y=data['Sentiment']

# Data Cleaning


1.   Remove the stopping words
2.   Lemmization Application



# Create a cleaning data function

In [80]:
#get the stored punctuation list
import string
punct=string.punctuation
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [81]:
#get a list of stop words
from spacy.lang.en.stop_words import STOP_WORDS
stopwords=list(STOP_WORDS)

In [92]:
import spacy
nlp=spacy.load('en_core_web_sm')

#creating a function for data cleaning
def text_data_cleaning(sentence): #data cleaning for one sentence
  doc=nlp(sentence) #load our spacy model

  tokens=[] #list of tokens
  for token in doc:                #Lemmatization is the grouping together of different forms of the same word
    if token.lemma_ !="-PRON-":  #if the token is proper noun, we directly take lower form,because there is no lemma; If it isn't a noun, it will convert into a lower form
      temp=token.lemma_.lower().strip() 
    else:  
      temp=token.lower_
    tokens.append(temp)

  cleaned_tokens=[]
  for token in tokens:
    if token not in stopwords and token not in punct:  #remove the stop words and punctuation in token
      cleaned_tokens.append(token)
  return cleaned_tokens

In [93]:
text_data_cleaning("Hello all, It's a beautiful day outside there!")

['hello', 'beautiful', 'day', 'outside']

# Model Building

In [94]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [95]:
tfidf=TfidfVectorizer(tokenizer=text_data_cleaning) #feature encoding technique

In [96]:
classifer=LinearSVC()

In [97]:
#model training
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)
X_train.shape ,X_test.shape

((2198,), (550,))

In [98]:
X_train.head()

2572    An Italian reviewer called this "a small, grea...
526                          And it was way to expensive.
1509    As an earlier review noted, plug in this charg...
144     Nice blanket of moz over top but i feel like t...
2483    The film gives meaning to the phrase, "Never i...
Name: Review, dtype: object

In [99]:
#model fitting,create a pipeline object
clf=Pipeline([('tfidf',tfidf),('clf',classifer)])

In [100]:
clf.fit(X_train,y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x7f2f75199dd0>)),
                ('clf', LinearSVC())])

In [101]:
#predictions
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [103]:
y_pred=clf.predict(X_test)

In [104]:
#confusion matrix
confusion_matrix(y_test,y_pred)

array([[200,  79],
       [ 51, 220]])

In [105]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.72      0.75       279
           1       0.74      0.81      0.77       271

    accuracy                           0.76       550
   macro avg       0.77      0.76      0.76       550
weighted avg       0.77      0.76      0.76       550



In [106]:
accuracy_score(y_test,y_pred)

0.7636363636363637

In [107]:
clf.predict(['Wow, I am learning Natural Language Processing in fun fashion!']) #sentiment is positive

array([1])