In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load spaCy's small English pipeline once; text_data_cleaning() runs every
# piece of text through it to obtain tokens and their lemmas.
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
# The file is tab-separated and has no header row, so pandas labels the two
# columns 0 and 1 until they are renamed.
data = pd.read_csv(
    'https://raw.githubusercontent.com/laxmimerit/NLP-Tutorial-8---Sentiment-Classification-using-SpaCy-for-IMDB-and-Amazon-Review-Dataset/master/datasets/imdb_labelled.txt',
    sep='\t',
    header=None,
)
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [None]:
# The raw file was read with header=None, so replace the default 0/1 column
# labels with descriptive names.
columan_name = ['Review', 'Sentiment']
data.columns = columan_name

In [None]:
# Confirm the columns are now labelled Review / Sentiment.
data.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [None]:
data.shape
# 748 rows (reviews), 2 columns (Review, Sentiment)

(748, 2)

In [None]:
data['Sentiment'].value_counts()

# 386 positive reviews (label 1)
# 362 negative reviews (label 0)

1    386
0    362
Name: Sentiment, dtype: int64

In [None]:
# count missing values per column
data.isnull().sum()

# both counts are 0: no null values in the data

Review       0
Sentiment    0
dtype: int64

In [None]:
# Features (raw review text) and target (0/1 sentiment label).
x, y = data['Review'], data['Sentiment']

In [None]:
import string

In [None]:
# string.punctuation is one str of ASCII punctuation characters;
# text_data_cleaning() uses it to filter out punctuation tokens.
punct = string.punctuation

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
stopwords = list(STOP_WORDS) # spaCy's English stop words as a list; text_data_cleaning() drops any token found here

In [None]:
def text_data_cleaning(sentence):
  """Tokenise text into lower-cased lemmas without stop words or punctuation.

  Args:
    sentence: Raw text to clean.

  Returns:
    list of str: one normalised token per kept spaCy token, in document order.
  """
  doc = nlp(sentence)

  cleaned_tokens = []
  for token in doc:
    # "-PRON-" is a placeholder lemma (emitted for pronouns by spaCy v2 --
    # NOTE(review): confirm for the installed version). Fall back to the
    # lower-cased surface form so the placeholder never becomes a feature.
    if token.lemma_ != "-PRON-":
      temp = token.lemma_.lower().strip()
    else:
      temp = token.lower_

    if temp in stopwords:
      continue
    # `punct` is a single string, so `temp in punct` would be a substring test
    # that lets runs such as "..." or "--" through. Checking every character
    # drops those too; all() is True for "", so blank tokens are still removed.
    if all(char in punct for char in temp):
      continue
    cleaned_tokens.append(temp)
  return cleaned_tokens

In [None]:
text_data_cleaning("Hello all, It's a beautiful day outside there!")
# stop words and punctuation removed; the remaining tokens are lower-cased

['hello', 'beautiful', 'day', 'outside']

In [None]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [None]:
# Tokenization is delegated to text_data_cleaning. token_pattern is ignored
# whenever a custom tokenizer is supplied, so it is set to None explicitly to
# avoid scikit-learn's "token_pattern will not be used" warning.
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning, token_pattern=None)

In [None]:
# LinearSVC's solver draws on a random number generator; seed it (same seed as
# the train/test split) so repeated runs of the notebook give the same model.
classifier = LinearSVC(random_state=0)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for evaluation; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [None]:
x_train.shape, x_test.shape
# 598 samples in training dataset and 150 in test dataset

((598,), (150,))

In [None]:
# Peek at a few training reviews; the non-sequential index shows the split shuffled the rows.
x_train.head()

97                            I hate movies like that.  
516    The attractive set used throughout most of the...
156    The writers were "smack on" and I think the be...
395    But "Tiny Toons" kept the 90's vibe and delive...
732             She is as lovely as usual, this cutie!  
Name: Review, dtype: object

In [None]:
clf = Pipeline([('tfidf',tfidf), ('clf',classifier)])
# the pipeline takes raw text: it first does TF-IDF vectorization, then passes the result to the classifier

In [None]:
# Fit the vectorizer and the classifier on the training reviews only.
clf.fit(x_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x7f95e771ddd0>)),
                ('clf', LinearSVC())])

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Predict sentiment labels for the held-out reviews.
y_pred = clf.predict(x_test)

In [None]:
# confusion_matrix: rows are true labels, columns are predicted labels (0 first, then 1)
confusion_matrix(y_test, y_pred)

array([[58, 26],
       [ 6, 60]])

In [None]:
# classification_report
print(classification_report(y_test, y_pred))
# we are getting about 79% accuracy

              precision    recall  f1-score   support

           0       0.91      0.69      0.78        84
           1       0.70      0.91      0.79        66

    accuracy                           0.79       150
   macro avg       0.80      0.80      0.79       150
weighted avg       0.81      0.79      0.79       150



In [None]:
accuracy_score(y_test, y_pred)
# about 79% accuracy

0.7866666666666666

In [None]:
clf.predict(["Wow, intersesting movie!"])
# output is 1, that means review is positive
# NOTE(review): "intersesting" is misspelled in the input, so it is presumably
# outside the fitted vocabulary and does not influence the prediction -- confirm

array([1])

In [None]:
clf.predict(["Worst quality"])
# output is 0, that means review is negative

array([0])