### 1) Importing required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data_yelp = pd.read_csv('yelp_labelled.txt',sep='\t',header=None)
data_yelp.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
columns = ['Review','Sentiment']
data_yelp.columns = columns

In [4]:
data_yelp.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
data_yelp.shape

(1000, 2)

In [6]:
#Amazon dataset
data_amazon = pd.read_csv('amazon_cells_labelled.txt',sep = '\t',header = None)
data_amazon.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [7]:
data_amazon.columns = columns

In [8]:
data_amazon.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [9]:
data_amazon.shape

(1000, 2)

In [10]:
#IMDB Dataset
data_imdb = pd.read_csv('imdb_labelled.txt',sep='\t',header=None)
data_imdb.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [11]:
data_imdb.columns = columns
data_imdb.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [12]:
data_imdb.shape

(748, 2)

In [13]:
# combining all datsets
data = pd.concat([data_yelp,data_amazon,data_imdb],axis=0,ignore_index=True)

In [14]:
data.shape

(2748, 2)

In [15]:
data.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [16]:
# checking the distribution of sentiments
data['Sentiment'].value_counts() #data is almost balanced

Sentiment
1    1386
0    1362
Name: count, dtype: int64

In [17]:
# checking null values
data.isnull().sum() #there are no null values

Review       0
Sentiment    0
dtype: int64

In [18]:
# setting input output
X = data['Review']
y = data['Sentiment']

## Data cleaning with spacy

In [19]:
import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS

In [20]:
stopwords = list(STOP_WORDS)
punct = string.punctuation

In [21]:
nlp = spacy.load('en_core_web_sm')

In [22]:
def text_cleaning(text):
    doc = nlp(text)
    
    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
        
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [23]:
text_cleaning("Hello all!! It's a beautiful day ")

['hello', 'beautiful', 'day']

### Vectorization (TF-IDF)

In [24]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [25]:
tfidf = TfidfVectorizer(tokenizer= text_cleaning)

In [26]:
classifier = LinearSVC()

## Split the Dataset

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)

## Create a Pipleline and fit the training data

In [29]:
clf = Pipeline([('tfidf',tfidf),('classifier',classifier)])

In [30]:
clf.fit(X_train,y_train)



## Predict the test results

In [31]:
pred = clf.predict(X_test)

In [32]:
from sklearn .metrics import classification_report,confusion_matrix,accuracy_score

In [33]:
print("The accuracy score is:",accuracy_score(y_test,pred))
print('\n',"Classification report:", '\n',classification_report(y_test,pred),'\n')
print("Confusion Matrix:",'\n', confusion_matrix(y_test,pred))

The accuracy score is: 0.7763636363636364

 Classification report: 
               precision    recall  f1-score   support

           0       0.77      0.81      0.79       285
           1       0.78      0.74      0.76       265

    accuracy                           0.78       550
   macro avg       0.78      0.78      0.78       550
weighted avg       0.78      0.78      0.78       550
 

Confusion Matrix: 
 [[230  55]
 [ 68 197]]


In [34]:
clf.predict(['This is an interesting project']) # 1 indicates positive reciew

array([1], dtype=int64)

In [35]:
clf.predict(['The project is bad']) #0 indicates negative review

array([0], dtype=int64)