In [None]:
import string

import numpy as np
import pandas as pd
import spacy
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
# Read the IMDB review CSV into a DataFrame.
# NOTE(review): the absolute path looks like a Colab Google Drive mount —
# adjust it when running in another environment.
dataset = pd.read_csv(
    '/content/drive/MyDrive/pep video/IMDB Dataset.csv'
)

In [None]:
# Preview the first few rows to sanity-check what was loaded.
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

(50000, 2)

In [None]:
# Column labels available for selecting the features and the target.
dataset.columns

Index(['review', 'sentiment'], dtype='object')

### Tokenizer

In [None]:
# Characters treated as punctuation when filtering tokens.
punctuations = string.punctuation
# spaCy's English stop-word collection, materialised as a list.
stopwords = [*STOP_WORDS]
# English language object used by `tokenizer`; no extra components are added here.
nlp = English()

In [None]:
def tokenizer(sentence):
    """Turn raw text into a list of normalised tokens for the vectorizers.

    The text is run through the module-level ``nlp`` object. Each token is
    reduced to its lemma (lower-cased and stripped); stop words, empty tokens
    and tokens made up only of punctuation characters are discarded.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The surviving tokens, in document order.
    """
    kept = []
    for word in nlp(sentence):
        lemma = word.lemma_
        # `nlp` may carry no lemmatizer (e.g. a blank pipeline on spaCy v3), in
        # which case `lemma_` is empty; fall back to the lower-cased surface
        # form, exactly as is done for the legacy "-PRON-" placeholder lemma.
        if not lemma or lemma == "-PRON-":
            lemma = word.lower_
        token = lemma.lower().strip()
        if token in stopwords:
            continue
        # Test character by character: `token in punctuations` would be a
        # substring check against the punctuation string, which lets runs such
        # as "..." or "!!!" through. `all()` on an empty token is True, so
        # whitespace-only tokens are dropped here as well.
        if all(char in punctuations for char in token):
            continue
        kept.append(token)
    return kept

### Vectorization

In [None]:
class predictors(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that applies ``clean_text`` to every document.

    The class has no hyper-parameters. ``get_params`` (which returns an empty
    dict here) and ``set_params`` are inherited from ``BaseEstimator`` so the
    step can be cloned and reconfigured like any other scikit-learn estimator.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; there is nothing to learn.

        ``y`` is optional so that ``fit_transform(X)`` also works when no
        labels are supplied.
        """
        return self

    def transform(self, X, **transform_params):
        """Return a list containing the cleaned version of each text in ``X``."""
        return [clean_text(text) for text in X]


def clean_text(text):
    """Return ``text`` with surrounding whitespace removed and lower-cased."""
    trimmed = text.strip()
    return trimmed.lower()

In [None]:
# Bag-of-words (unigram counts) and TF-IDF vectorizers sharing the custom tokenizer.
# token_pattern=None: the default regex is never used once a tokenizer callable
# is supplied, and leaving it set makes scikit-learn emit a UserWarning on fit.
vectorizer = CountVectorizer(
    tokenizer=tokenizer,
    ngram_range=(1, 1),
    token_pattern=None,
)
tfvectorizer = TfidfVectorizer(
    tokenizer=tokenizer,
    token_pattern=None,
)

### Split the Dataset

In [None]:
# Features are the review texts; targets are the sentiment labels.
X, y = dataset['review'], dataset['sentiment']
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)

### LinearSVC

In [None]:
# Seed the classifier as well as the split: LinearSVC's solver uses random
# numbers, so without a fixed random_state repeated runs can differ slightly.
SVCclassifier = LinearSVC(random_state=69)

# clean text -> token counts -> linear SVM
SVCmodel = Pipeline([
    ("cleaner", predictors()),
    ('vectorizer', vectorizer),
    ('classifier', SVCclassifier),
])

# Fit on the training reviews, then score the held-out reviews.
SVCmodel.fit(X_train, y_train)
SVCpred = SVCmodel.predict(X_test)
print(f'Confusion Matrix:\n{confusion_matrix(y_test,SVCpred)}')
print(f'Accuracy: {accuracy_score(y_test,SVCpred)*100}%')
print(f'\nClassification Report:\n{classification_report(y_test,SVCpred)}')





Confusion Matrix:
[[4330  641]
 [ 635 4394]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.87      0.87      4971
    positive       0.87      0.87      0.87      5029

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

Accuracy: 87.24%


In [None]:
# Spot-check the fitted pipeline on a hand-written review.
pre = SVCmodel.predict(
    [
        "Selmon bhoi you rocked it ..loved the movie ..a treat for the audience",
    ]
)
predicted_label = pre[0]
print('Prediction:' + predicted_label)

Prediction:positive


In [None]:
# Spot-check the fitted pipeline on a second hand-written review.
pre = SVCmodel.predict(
    [
        "All I wanted throughout the movie was refund !!! Why you do this Selmon Bhoi ??? The movie was awful",
    ]
)
predicted_label = pre[0]
print('Prediction:' + predicted_label)

Prediction:negative
