# 1) Importing the datasets

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import spacy
# Load spaCy's small English pipeline once; `nlp` is reused by the cleaning
# function below to tokenize and lemmatize each review.
nlp = spacy.load('en_core_web_sm')

In [None]:
import pandas as pd
from google.colab import drive

# Location of the IMDB reviews CSV inside the mounted Google Drive.
# Kept as a named constant so the path can be changed in one place.
IMDB_CSV_PATH = '/content/gdrive/MyDrive/Colab Notebooks/BERT/IMDB-Dataset.csv'

# Mount Google Drive so the dataset file is reachable from the Colab runtime,
# then load the reviews into a DataFrame.
drive.mount('/content/gdrive')
data_imdb = pd.read_csv(IMDB_CSV_PATH)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
data_imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,"Taut and organically gripping, Edward Dmytryk'...",positive
2,"""Ardh Satya"" is one of the finest film ever ma...",positive
3,My first exposure to the Templarios & not a go...,positive
4,One of the most significant quotes from the en...,negative


In [None]:
data_imdb.shape

(9998, 2)

In [None]:
data_imdb['sentiment'].value_counts()

positive    5027
negative    4971
Name: sentiment, dtype: int64

In [None]:
# check for null values
data_imdb.isnull().sum()

# no null values in the data

review       0
sentiment    0
dtype: int64

In [None]:
# Features: the raw review text. Target: the sentiment label
# ('positive' / 'negative', see the value counts above).
x = data_imdb['review']
y = data_imdb['sentiment']

# 2) Data Cleaning

Remove stopwords and punctuation,
and apply lemmatization.

## Create a function to clean the data

In [None]:
import string

In [None]:
punct = string.punctuation

In [None]:
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
stopwords = list(STOP_WORDS) # list of stopwords

In [None]:
# creating a function for data cleaning
def text_data_cleaning(sentence):
  """Tokenize, lemmatize and filter a piece of text.

  The text is run through the spaCy pipeline `nlp`. Each token is replaced
  by its lower-cased, stripped lemma. If the lemma is the placeholder
  "-PRON-" (the lemma spaCy 2.x assigns to pronouns; newer spaCy versions
  return a real lemma, so that branch is then simply never taken), the
  lower-cased token text is used instead.

  Tokens are then discarded when they are:
    * empty after stripping (e.g. whitespace tokens),
    * listed in `stopwords`, or
    * made up entirely of characters from `punct` (so "...", "--" and
      "!!!" are removed as well as single punctuation marks).

  Parameters
  ----------
  sentence : str
      Raw text to clean.

  Returns
  -------
  list of str
      The remaining normalized tokens, in their original order.
  """
  cleaned_tokens = []
  for token in nlp(sentence):
    if token.lemma_ != "-PRON-":
      text = token.lemma_.lower().strip()
    else:
      # pronoun placeholder: fall back to the lower-cased surface form
      text = token.lower_

    # Drop empty strings explicitly instead of relying on `"" in punct`.
    if not text:
      continue
    if text in stopwords:
      continue
    # `text in punct` would be a substring test on the punctuation string and
    # would miss runs such as "..." or "!!!"; test character by character.
    if all(ch in punct for ch in text):
      continue
    cleaned_tokens.append(text)
  return cleaned_tokens

In [None]:
text_data_cleaning("Hello all, It's a beautiful day outside there!")
# stopwords and punctuations removed

['hello', 'beautiful', 'day', 'outside']

## Vectorization Feature Engineering (TF-IDF)

In [None]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [None]:
# Tokenization is delegated to text_data_cleaning, so the TF-IDF vocabulary is
# built from its cleaned, lemmatized tokens.
# token_pattern=None: the default regex pattern is ignored whenever a custom
# tokenizer is supplied; passing None makes that explicit and avoids
# scikit-learn's "token_pattern will not be used" warning.
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning, token_pattern=None)

In [None]:
# Linear support-vector classifier; random_state is fixed (same seed as the
# train/test split) so repeated runs of the notebook give the same model.
classifier = LinearSVC(random_state=0)

# 3) Train the model

## Splitting the dataset into the Train and Test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for testing; random_state fixes the shuffle so
# the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [None]:
x_train.shape, x_test.shape
# 7998 samples in the training set and 2000 in the test set (80/20 split)

((7998,), (2000,))

In [None]:
x_train.head()

4755    Kristine Watts (Molie Weeks) is broken apart, ...
2994    Three sergeants in the British army stationed ...
5867    Historically accurate? Hmm... Perhaps... if yo...
356     I was truly looking forward to this title. It ...
49      The few scenes that actually attempt a depicti...
Name: review, dtype: object

## Fit the pipeline on x_train and y_train

In [None]:
clf = Pipeline([('tfidf',tfidf), ('clf',classifier)])
# The pipeline vectorizes raw text with the 'tfidf' step first and then feeds
# the result to the 'clf' classifier step, so fit/predict accept plain strings.

In [None]:
clf.fit(x_train, y_train)



# 4) Predict the Test set results

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
y_pred = clf.predict(x_test)

In [None]:
# Confusion matrix: rows are the true labels, columns the predicted labels,
# both in sorted label order (negative, positive).
confusion_matrix(y_test, y_pred)

array([[533, 441],
       [518, 508]])

In [None]:
# classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.51      0.55      0.53       974
    positive       0.54      0.50      0.51      1026

    accuracy                           0.52      2000
   macro avg       0.52      0.52      0.52      2000
weighted avg       0.52      0.52      0.52      2000



In [None]:
# NOTE(review): an accuracy close to 0.5 on an almost perfectly balanced
# two-class dataset is chance level. Verify that the sentiment labels in the
# CSV actually line up with their reviews before trusting this model.
accuracy_score(y_test, y_pred)

0.5205

In [None]:
clf.predict(["Wow, this is a great movie"])

array(['positive'], dtype=object)

In [None]:
clf.predict(["It's hard to enjoy these things!"])

array(['negative'], dtype=object)