In [70]:
import numpy as np
import pandas as pd
import spacy # spaCy NLP library (tokenisation / lemmas); this is not NLTK
import string # provides string.punctuation for punctuation filtering
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer # bag-of-words and TF-IDF text features
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC # linear SVM for classification; logistic regression or another classification algorithm could be used instead
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [3]:
# Load the IMDB movie-review CSV into a DataFrame.
# NOTE(review): this is a machine-specific absolute path; point it at your own copy of the file.
dataset_csv_path = "/Users/syedsaifullahtarique/Desktop/Work/Nados-Pepcoding/NLP/sentiment-analysis-on-movie-reviews/IMDB Dataset.csv"
dataset = pd.read_csv(dataset_csv_path)

In [4]:
# Preview the first five rows to confirm the `review` / `sentiment` columns loaded as expected.
dataset.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Quick size check: (number of rows, number of columns) of the loaded frame.
dataset.shape

(50000, 2)

In [7]:
# ASCII punctuation characters, kept as one string; `tokenizer` tests tokens against it with `in`.
punctuations = string.punctuation
# spaCy's English stop words, materialised as a list; `tokenizer` drops any token found here.
stopwords = list(STOP_WORDS)
# English language object built directly (no trained model is loaded in this cell);
# `tokenizer` calls it to split raw text into tokens.
nlp = English()


In [8]:
# tokenizer
def tokenizer(sentence):
    """Turn raw text into a list of normalised tokens for the vectorizers.

    Each token produced by the module-level ``nlp`` object is replaced by its
    lower-cased, stripped lemma; stop words (``stopwords``), punctuation
    (``punctuations``) and empty tokens are then removed.

    Parameters
    ----------
    sentence : str
        Text to tokenise.

    Returns
    -------
    list of str
        Normalised tokens in their original order.
    """
    normalised = []
    for word in nlp(sentence):
        lemma = word.lemma_
        # Keep the lower-cased surface form in two cases:
        #   * spaCy v2 uses '-PRON-' as the lemma for all personal pronouns (me, I, ...);
        #   * a pipeline without a lemmatizer leaves `lemma_` empty, which would otherwise
        #     turn every token into '' and leave the vectorizer with an empty vocabulary.
        if not lemma or lemma == "-PRON-":
            base = word.lower_
        else:
            base = lemma.lower()
        normalised.append(base.strip())
    # `punctuations` is a string, so `in` is a substring test that '' always passes;
    # the explicit `tok and` makes the removal of empty (whitespace-only) tokens intentional.
    return [
        tok for tok in normalised
        if tok and tok not in stopwords and tok not in punctuations
    ]

- We will create a custom transformer to clean the raw text before it is tokenized
- We will create a custom `predictors` class that inherits from the `TransformerMixin` class. This class overrides the `transform`, `fit` and `get_params` methods. We’ll also create a `clean_text()` function that strips leading and trailing whitespace and converts the text to lowercase.
- In object-oriented programming languages, a mixin (or mix-in) is a class that contains methods for use by other classes without having to be the parent class of those other classes.

In [14]:
# Transformation (basically cleaning the text): every document is stripped and lower-cased.
# Note that lower-casing is unconditional, so an acronym such as "US" becomes "us".
class predictors(TransformerMixin):
    """Stateless pipeline step that applies ``clean_text`` to every document.

    The transformer learns nothing from the data: ``fit`` is a no-op and
    ``transform`` only normalises the raw strings.
    """

    def transform(self, X, **transform_params):
        """Return a list with ``clean_text`` applied to each item of ``X``."""
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``.

        ``y`` is optional so that ``fit_transform(X)`` (inherited from
        ``TransformerMixin``) also works when no labels are supplied.
        """
        return self

    def get_params(self, deep=True):
        """Return the (empty) parameter mapping of this transformer.

        scikit-learn expects a dict here; the class has no constructor
        parameters, so the dict is empty.
        """
        return {}
# Basic text normalisation used by the `predictors` transformer
def clean_text(text):
    """Return ``text`` without leading/trailing whitespace and in lower case."""
    trimmed = text.strip()
    return trimmed.lower()

In [11]:
# Two alternative text featurizers; both delegate tokenisation to the custom `tokenizer`.
# Bag-of-words counts over single tokens (unigrams only).
vectorizer = CountVectorizer(
    ngram_range=(1, 1),
    tokenizer=tokenizer,
)
# TF-IDF weighted variant.
tfvectorizer = TfidfVectorizer(
    tokenizer=tokenizer,
)

In [72]:
# Encode the target column as integers.
# LabelEncoder orders its classes by sorting, so 'negative' becomes 0 and 'positive' becomes 1.
le = preprocessing.LabelEncoder()
le.fit(dataset['sentiment'])
dataset['sentiment'] = le.transform(dataset['sentiment'])
# Class balance after encoding.
dataset['sentiment'].value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [74]:
# Hold out 20% of the reviews for evaluation; the fixed seed makes the split repeatable.
X, y = dataset['review'], dataset['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [75]:
# Pipeline: clean raw text -> bag-of-words counts -> linear SVM.
# LinearSVC shuffles the data internally when it solves the dual problem, so it is seeded
# (same seed as the train/test split) to make repeated runs give the same model.
SVCclassifier = LinearSVC(random_state=42)
SVCmodel = Pipeline([
    ('cleaner', predictors()),
    ('vectorizer', vectorizer),
    ('classifier', SVCclassifier),
])

In [76]:
# Fit the whole pipeline on the training reviews, then label the held-out reviews.
# `fit` returns the fitted pipeline itself, so the prediction can be chained onto it.
SVCpre = SVCmodel.fit(X_train, y_train).predict(X_test)



In [78]:
# Evaluation: rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=y_test, y_pred=SVCpre)

array([[4271,  690],
       [ 659, 4380]])

In [79]:
# Fraction of held-out reviews whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=SVCpre)

0.8651

In [80]:
# Per-class precision / recall / F1; printed because the report is a pre-formatted string.
print(classification_report(y_true=y_test, y_pred=SVCpre))

              precision    recall  f1-score   support

           0       0.87      0.86      0.86      4961
           1       0.86      0.87      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [85]:
# Spot check with a clearly positive review; the pipeline takes a list of raw strings.
sample_reviews = ['It was very good']
pre = SVCmodel.predict(sample_reviews)
print("prediction: ", pre[0])

prediction:  1


In [86]:
# Spot check with a clearly negative review; the pipeline takes a list of raw strings.
sample_reviews = ['It was horrible.. wanted to leave in the middle']
pre = SVCmodel.predict(sample_reviews)
print("prediction: ", pre[0])

prediction:  0
