In [1]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [11]:
from spacy.lang.en.stop_words import STOP_WORDS

In [12]:
stopwords = list(STOP_WORDS)

In [13]:
stopwords

['same',
 'must',
 'how',
 'most',
 'between',
 'from',
 'along',
 'otherwise',
 'where',
 'much',
 'mostly',
 'this',
 'though',
 'somehow',
 'never',
 'nothing',
 "'ll",
 'regarding',
 'over',
 'often',
 'whereas',
 'else',
 'am',
 'than',
 'everything',
 'every',
 'unless',
 'does',
 'himself',
 'someone',
 'whereby',
 'hundred',
 'herein',
 'during',
 'it',
 'among',
 'might',
 'becomes',
 'nowhere',
 'across',
 'various',
 'their',
 'ca',
 'around',
 'whom',
 'yet',
 'an',
 'hereby',
 'fifty',
 'thus',
 'ten',
 'moreover',
 'his',
 'not',
 'since',
 'became',
 'further',
 'whatever',
 'next',
 'two',
 'before',
 'rather',
 'about',
 'us',
 'and',
 'above',
 '‘ll',
 'done',
 'whoever',
 'if',
 'without',
 'are',
 'call',
 'into',
 'always',
 'enough',
 'those',
 '’m',
 'whose',
 'has',
 'get',
 'we',
 'n‘t',
 'eight',
 'such',
 'as',
 'nine',
 'ours',
 'at',
 'one',
 'twenty',
 'off',
 'had',
 'say',
 'even',
 'move',
 'itself',
 'although',
 'beside',
 'show',
 'whether',
 'whithe

In [14]:
len(stopwords)

326

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [17]:
data = pd.read_csv('IMDB Dataset.csv',header = None)

In [18]:
data.columns

Int64Index([0, 1], dtype='int64')

In [19]:
column_names = ['Review','Sentiment']
data.columns = column_names
data.head()

Unnamed: 0,Review,Sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [20]:
data.shape

(50000, 2)

In [21]:
data['Sentiment'].value_counts()

negative    25000
positive    25000
Name: Sentiment, dtype: int64

In [22]:
data.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [23]:
#Tokenization

In [24]:
import string

In [25]:
punc = string.punctuation

In [26]:
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [27]:
def text_data_cleaning(sentence):
    doc = nlp(sentence)
    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower() .strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punc:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [28]:
from sklearn.svm import LinearSVC

In [29]:
#Vectorization and Classification
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning)
classifier = LinearSVC()

In [30]:
X = data['Review']
y = data['Sentiment']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)

In [32]:
X_train.shape,X_test.shape

((40000,), (10000,))

In [33]:
clf = Pipeline([('tfidf',tfidf),('clf',classifier)])

In [34]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function text_data_cleaning at 0x0000019A077A0F78>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_inter

In [35]:
y_pred = clf.predict(X_test)

In [37]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

    negative       0.90      0.88      0.89      4961
    positive       0.89      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [38]:
confusion_matrix(y_test,y_pred)

array([[4381,  580],
       [ 490, 4549]], dtype=int64)

In [39]:
clf.predict(["Wow,this sucks"])

array(['negative'], dtype=object)

In [40]:
clf.predict(["Awesome movie"])

array(['positive'], dtype=object)

In [42]:
clf.predict(["This movie is the best"])

array(['positive'], dtype=object)