TRAINING A MODEL FOR SENTIMENT ANALYSIS ON IMDB DATASET USING:
KNN
LOGISTIC REGRESSION
DECISION TREE CLASSIFIER
RANDOM FOREST CLASSIFIER AND
RECURRENT NUERAL NETWORK 

In [1]:
#Extracting data from zipped file and reading it
import tarfile
import os

#specifying file_path
file_path = 'Dataset.tar.gz'

# Open and extract the .tar.gz file
with tarfile.open(file_path, 'r:gz') as tar:
    tar.extractall(path="extracted_files")  # Extract files to the directory

# Check the contents of the extracted folder
extracted_files = os.listdir("extracted_files/aclImdb")
print(extracted_files[:10])  # Prints only the first 10 entries

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']


In [2]:
#Converting the training dataset into a pandas dataset
import pandas as pd
import os

# Function to read reviews and labels from a directory
def load_data(directory, label):
    reviews = []
    for file_name in os.listdir(directory):
        file_path = os.path.join(directory, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            reviews.append(file.read())
    return pd.DataFrame({'review': reviews, 'sentiment': label})

# Load positive and negative reviews for training
train_pos = load_data('extracted_files/aclImdb/train/pos', label=1)
train_neg = load_data('extracted_files/aclImdb/train/neg', label=0)

# Combine into one DataFrame
train_df = pd.concat([train_pos, train_neg], ignore_index=True)

# Display the first few rows
print(train_df.head())


                                              review  sentiment
0  Bromwell High is a cartoon comedy. It ran at t...          1
1  Homelessness (or Houselessness as George Carli...          1
2  Brilliant over-acting by Lesley Ann Warren. Be...          1
3  This is easily the most underrated film inn th...          1
4  This is not the typical Mel Brooks film. It wa...          1


In [3]:
#Shuffling the training dataset
train_df = train_df.sample(frac = 1, random_state = 42).reset_index(drop = True)
print(train_df.head())

                                              review  sentiment
0  In Panic In The Streets Richard Widmark plays ...          1
1  If you ask me the first one was really better ...          0
2  I am a big fan a Faerie Tale Theatre and I've ...          1
3  I just finished reading a book about Dillinger...          0
4  Greg Davis and Bryan Daly take some crazed sta...          0


In [4]:
#Converting the test dataset into a pandas dataset
import pandas as pd
import os

# Function to read reviews and labels from a directory
def load_data(directory, label):
    reviews = []
    for file_name in os.listdir(directory):
        file_path = os.path.join(directory, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            reviews.append(file.read())
    return pd.DataFrame({'review': reviews, 'sentiment': label})

# Load positive and negative reviews for training
test_pos = load_data('extracted_files/aclImdb/test/pos', label=1)
test_neg = load_data('extracted_files/aclImdb/test/neg', label=0)

# Combine into one DataFrame
test_df = pd.concat([test_pos, test_neg], ignore_index=True)

# Display the first few rows
print(test_df.head())

                                              review  sentiment
0  I went and saw this movie last night after bei...          1
1  Actor turned director Bill Paxton follows up h...          1
2  As a recreational golfer with some knowledge o...          1
3  I saw this film in a sneak preview, and it is ...          1
4  Bill Paxton has taken the true story of the 19...          1


In [5]:
#Shuffling test dataset
test_df = test_df.sample(frac = 1, random_state = 42).reset_index(drop = True)

print(test_df.head())

                                              review  sentiment
0  When I was a kid, I loved "Tiny Toons". I espe...          1
1  The setup for "Nature of the Beast" is ingenio...          0
2  I do not have much to say than this is a great...          1
3  Extremely formulaic with cosmic-sized logic ho...          0
4  I actually liked certain things about this gam...          0


In [6]:
#EDA on Training dataset
#checking the distributions
train_df["sentiment"].value_counts()

sentiment
1    12500
0    12500
Name: count, dtype: int64

In [7]:
#checking the missing values
print(train_df.isna().sum())

review       0
sentiment    0
dtype: int64


DATA PREPROCESSING AND TOKENIZATION, LEMMATIZATION

In [8]:
import spacy
import re
import nltk
from nltk.stem import SnowballStemmer
from spacy.lang.en.stop_words import STOP_WORDS

#load english model for spacy
nlp = spacy.load("en_core_web_sm")

#get the stop words list from spacy
stop_words = list(STOP_WORDS)

#words to exclude from stop_words
excluding = ['against', 'not', 'don', "don't", 'ain', 'aren', "aren't", 'couldn', "couldn't",
             'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 
             'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't",
             'needn', "needn't",'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', 
             "weren't", 'won', "won't", 'wouldn', "wouldn't"]

#final stop_words list
final_stop_words = [word for word in stop_words if word not in excluding]

#initialzing the stemmer
snow = SnowballStemmer('english')

#function for processing the text
def process_text(texts):
    final_text_list = []

    for sent in texts:
        #set sent to empty if not a string
        if not isinstance(sent,str):
            final_text_list.append("")
            continue

        #basic processing steps before tokenization
        sent = sent.lower().strip()
        sent = re.sub(r'\s+', ' ', sent) # Multiple spaces and tabs into one
        sent = re.sub('<.*?>', '', sent) #remove html tags

        #applying spacy nlp pipeline to sent
        doc = nlp(sent)

        filtered_sentence = []

        for token in doc:
            if len(token.text)>2 and token.text not in final_stop_words and not token.is_digit:
                filtered_sentence.append(snow.stem(token.text))

        #join final string of cleaned sentences
        final_list = " ".join(filtered_sentence)
        final_text_list.append(final_list)

    return final_text_list

Train - validation split

In [9]:
#splitting training dataset into train and validation
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(train_df["review"],
                                                 train_df["sentiment"],
                                                 test_size = 0.10,
                                                 shuffle = True,
                                                 random_state = 324)

In [10]:
#checking the datatype of x_train
print(type(x_train))

<class 'pandas.core.series.Series'>


In [11]:
print(x_train.head())  # Print the first few rows
print(x_train.index)  # Print index if it's Series

15851    Police story brought Hong Kong movies to moder...
2202     Everyone we meet influences our thinking, modi...
9248     I think this film has been somewhat overrated ...
17845    This was great. When I saw the Japanese versio...
2872     This movie had no parts that were hilarious, m...
Name: review, dtype: object
Index([15851,  2202,  9248, 17845,  2872, 16882, 14786, 15533, 12138, 23059,
       ...
        3953,  2800, 19491, 18104, 14967, 17136, 22800,   600, 17292, 13044],
      dtype='int64', length=22500)


In [12]:
#calling the function to process the columns
print("processing the review column")
x_train = process_text(x_train.tolist())
x_val = process_text(x_val.tolist())

processing the review column


Pipeline Creation

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier

#######PIPELINE########

pipeline = Pipeline([
    ('text_vect', CountVectorizer(binary = True,
                                 max_features = 15)),
    ('knn', KNeighborsClassifier())
])

#display pipeline
from sklearn import set_config
set_config(display = 'diagram')
pipeline

In [14]:
print(x_train[ :5])

['polic stori brought hong kong movi modern day cinema.jacki play policeman tri catch drug dealer time care young woman bad guy care relationship girlfriend selina brigitt lin).th movi featur plenti stunt not jacki actor jacki stunt club).thre jacki stunt member went hospit film film.th movi incred fight scene like car park fight shoppingm fight rank jacki finest.th movi won award best movi best action design jacki chan hong kong film awards.everyon love jacki chan and/or martial art movi shud', 'meet influenc think modifi way littl bit person rub eighth day take theme compar rainmak film harri daniel auteuil businessman expert sale psycholog meet georg pascal duquenn syndrom child run win perform actor film main strength open sequenc excel georg relat theori creation world close scene discov god creat eighth day moment stori frustrat harri exampl georg complet uninhibit demand pair expens shoe money kind scene laugh tear like scene syndrom group trip art galleri escap bus gate crash p

Fit The Pipeline

In [15]:
pipeline.fit(x_train, y_train.values)

Test the classifier on validation dataset

In [16]:
from sklearn.metrics import confusion_matrix,  classification_report, accuracy_score

#predicting on validation set
pred_values = pipeline.predict(x_val)

print(confusion_matrix(y_val.values, pred_values))
print(classification_report(y_val.values, pred_values))
print('Accuracy score:', accuracy_score(y_val.values, pred_values))

[[709 527]
 [583 681]]
              precision    recall  f1-score   support

           0       0.55      0.57      0.56      1236
           1       0.56      0.54      0.55      1264

    accuracy                           0.56      2500
   macro avg       0.56      0.56      0.56      2500
weighted avg       0.56      0.56      0.56      2500

Accuracy score: 0.556


Tuning the model using RandomSearchCV

In [17]:
from sklearn.model_selection import RandomizedSearchCV 
import numpy as np

#parameteres distributions
param_distributions = {
    'knn__n_neighbors': np.arange(3, 20, 2), #odd values for neighbors
    'knn__weights': ['uniform', 'distance'], #weighting methods
    'knn__metric': ['minkowski', 'euclidean', 'manhattan'] #distance methods
}

#initializing the randomsearchCV
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions,
    n_iter = 20, #no. of random combinations to try
    n_jobs = -1,
    random_state = 42,
    cv = 5, #5 fold cross validation
    scoring = 'accuracy'
)

In [18]:
#fitting the random_serach
random_search.fit(x_train, y_train)

In [19]:
#print the best parameters
print('Best parameters found:', random_search.best_params_)

#use the best model
best_model = random_search.best_estimator_

Best parameters found: {'knn__weights': 'uniform', 'knn__n_neighbors': 19, 'knn__metric': 'manhattan'}


In [20]:
#make predictions with val set with best model
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

pred_values = best_model.predict(x_val)

print(confusion_matrix(y_val.values, pred_values))
print(classification_report(y_val.values, pred_values))
print('Accuracy_score:', accuracy_score(y_val.values, pred_values))

[[746 490]
 [566 698]]
              precision    recall  f1-score   support

           0       0.57      0.60      0.59      1236
           1       0.59      0.55      0.57      1264

    accuracy                           0.58      2500
   macro avg       0.58      0.58      0.58      2500
weighted avg       0.58      0.58      0.58      2500

Accuracy_score: 0.5776


In [21]:
#Using the knn model on test data
print(test_df)

                                                  review  sentiment
0      When I was a kid, I loved "Tiny Toons". I espe...          1
1      The setup for "Nature of the Beast" is ingenio...          0
2      I do not have much to say than this is a great...          1
3      Extremely formulaic with cosmic-sized logic ho...          0
4      I actually liked certain things about this gam...          0
...                                                  ...        ...
24995  Start with the premise that you will do anythi...          0
24996  This movie gives us some WWII history along wi...          1
24997  In my opinion this is the best Oliver Stone fl...          1
24998  It's certainly a direct-to-video, but the stor...          0
24999  This movie was obscenely obvious and predictab...          0

[25000 rows x 2 columns]


In [22]:
x_test = process_text(test_df["review"].tolist())
y_test = test_df["sentiment"].values

In [23]:
print(x_test[ :5])

['kid love tini toon especi love tini toon spent summer vacat thought laugh floor funni year later friend video figur watch good old day floor laugh opinion plucki hampton skit best decid happi world land end have crazi adventur skit funni look video tip write funniest cartoon seen.10/10', 'setup natur beast ingeni simpl fraught limitless potenti suspens harri salesman jack domest lanc henriksen pick troubl form hitchhik adrian eric robert possess incrimin inform against jack million dollar stolen casino dub hatchet man dismemb peopl desert sound great right sort like hitcher meet psycho men secret unfortun writer director victor salva jeeper creeper powder fame idea movi go scenario arous tension suspens poison suppos thriller inclus under homo erot tone place henriksen evok unusu not earthbound everyman sport ampl gut robert threaten scari extra death wish suck point actual care happen conclus slide improb territori kept think go fight club mistaken bad natur beast ... instead', 'not

In [24]:
#testing the best model on test data
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

test_predictions = best_model.predict(x_test)

print(confusion_matrix(y_test, test_predictions))
print(classification_report(y_test, test_predictions))
print('Accuracy_score:', accuracy_score(y_test, test_predictions))

[[7719 4781]
 [5357 7143]]
              precision    recall  f1-score   support

           0       0.59      0.62      0.60     12500
           1       0.60      0.57      0.58     12500

    accuracy                           0.59     25000
   macro avg       0.59      0.59      0.59     25000
weighted avg       0.59      0.59      0.59     25000

Accuracy_score: 0.59448


In [25]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

#creating pipeline
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('scaler', StandardScaler(with_mean = False)),
    ('classifier', LogisticRegression(random_state = 42))
])

#defining parameter_grid
param_grid = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__solver': ['lbfgs', 'liblinear'],
    'classifier__max_iter': [100,200,300],
}

#initializing the randomsearchCV
random_search_lr = RandomizedSearchCV(
    pipeline,
    param_grid,
    n_iter = 6,
    n_jobs = -1,
    cv = 5,
    random_state = 42
)

#fit on train data
random_search_lr.fit(x_train, y_train)

#get the best model from RandomCV
best_model_lr = random_search_lr.best_estimator_

#predict on test data
pred_val_lr = best_model_lr.predict(x_test)

print(confusion_matrix(y_test, pred_val_lr))
print(classification_report(y_test, pred_val_lr))
print('Accuracy_score:', accuracy_score(y_test, pred_val_lr))

[[10307  2193]
 [ 3001  9499]]
              precision    recall  f1-score   support

           0       0.77      0.82      0.80     12500
           1       0.81      0.76      0.79     12500

    accuracy                           0.79     25000
   macro avg       0.79      0.79      0.79     25000
weighted avg       0.79      0.79      0.79     25000

Accuracy_score: 0.79224


In [26]:
#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

#pipeline 
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', DecisionTreeClassifier(random_state = 42))
])

#defining paramater distributions
param_grid = {
    'classifier__max_depth': [None,10,20,30],
    'classifier__min_samples_split': [2,5,10],
    'classifier__min_samples_leaf': [1,2,4],
    'classifier__criterion': ['gini', 'entropy']
}

#initialize the random search cv
random_search_dts = RandomizedSearchCV(
    pipeline,
    param_grid,
    n_iter = 10,
    n_jobs = -1,
    cv = 5,
    random_state = 42
)

#fit on train data
random_search_dts.fit(x_train, y_train)

#get the best model from random search cv
best_model_dts = random_search_dts.best_estimator_

#predict on test data
pred_val_dts = best_model_dts.predict(x_test)

print(confusion_matrix(y_test, pred_val_dts))
print(classification_report(y_test, pred_val_dts))
print('Accuracy_score:', accuracy_score(y_test, pred_val_dts))

[[8473 4027]
 [2562 9938]]
              precision    recall  f1-score   support

           0       0.77      0.68      0.72     12500
           1       0.71      0.80      0.75     12500

    accuracy                           0.74     25000
   macro avg       0.74      0.74      0.74     25000
weighted avg       0.74      0.74      0.74     25000

Accuracy_score: 0.73644


In [27]:
#Random Forest Classifier
#importing libraries for pipeline, model, tuning and evaluation
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Pipeline creation for RandomForest
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', RandomForestClassifier(random_state=42))  
])

# Defining param_grid
param_grid = {
    'classifier__n_estimators': [100,150,200],  # List format for single value
    'classifier__max_depth': [10, 20, 30],
    'classifier__min_samples_leaf': [5, 10]
}

# Apply RandomizedSearchCV
random_search_rf = RandomizedSearchCV(
    pipeline,
    param_grid,
    cv=5,
    n_iter=10,
    random_state=42
)

# Fit on training data
random_search_rf.fit(x_train, y_train)

# Get the best model from RandomizedSearchCV
best_model_rf = random_search_rf.best_estimator_

# Test the best model on test data
pred_val_rf = best_model_rf.predict(x_test)

# Evaluate model performance
print(confusion_matrix(y_test, pred_val_rf)) 
print(classification_report(y_test, pred_val_rf))  
print('Accuracy_score:', accuracy_score(y_test, pred_val_rf))  

[[10264  2236]
 [ 1708 10792]]
              precision    recall  f1-score   support

           0       0.86      0.82      0.84     12500
           1       0.83      0.86      0.85     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000

Accuracy_score: 0.84224


RECURRENT NEURAL NETWORK

In [28]:
#importing required libraries
import time
import pandas as pd
import numpy as np
import torch, torchtext
from collections import Counter
from torch import nn, optim
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import TensorDataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [29]:
#reading the training dataset
train_df.head()

Unnamed: 0,review,sentiment
0,In Panic In The Streets Richard Widmark plays ...,1
1,If you ask me the first one was really better ...,0
2,I am a big fan a Faerie Tale Theatre and I've ...,1
3,I just finished reading a book about Dillinger...,0
4,Greg Davis and Bryan Daly take some crazed sta...,0


In [30]:
#EDA on train dataset
train_df["sentiment"].value_counts()

sentiment
1    12500
0    12500
Name: count, dtype: int64

In [31]:
print(train_df.isna().sum()) #no missing values in the training dataset

review       0
sentiment    0
dtype: int64


In [32]:
#reading the testing dataset
test_df.head()

Unnamed: 0,review,sentiment
0,"When I was a kid, I loved ""Tiny Toons"". I espe...",1
1,"The setup for ""Nature of the Beast"" is ingenio...",0
2,I do not have much to say than this is a great...,1
3,Extremely formulaic with cosmic-sized logic ho...,0
4,I actually liked certain things about this gam...,0


In [33]:
#EDA on test data
test_df["sentiment"].value_counts()

sentiment
1    12500
0    12500
Name: count, dtype: int64

In [34]:
print(test_df.isna().sum()) #no misisng values in the test dataset

review       0
sentiment    0
dtype: int64


SPLITTING DATA

In [35]:
#SPLITTING TRAINING DATA INTO TRAIN-VALIDATION
x_train, x_val, y_train, y_val = train_test_split(
    train_df["review"].tolist(),
    train_df["sentiment"].tolist(),
    test_size = 0.10,
    shuffle = True,
    random_state = 324
)

In [36]:
#test dataset
x_test = test_df["review"]
y_test = test_df["sentiment"]

TEXT PREPROCESSING

In [37]:
#TEXT PREPROCESSING & TRANSFORMATION
#creating a vocab 
tokenizer = get_tokenizer("basic_english") #using built-in tokenizer for lowercase, punctuation

#initializing counter for counting token frequency
counter = Counter()

#loop for updating token count frequency
for line in x_train:
    counter.update(tokenizer(line))

#creating vocab with words seen atleast 5 times
vocab = torchtext.vocab.vocab(counter, min_freq = 5)

#unknown tokens for handling unknown words
unk_token = '<unk>'
vocab.insert_token(unk_token, 0) #insert at index 0
vocab.set_default_index(0) #set default index at 0 for missing words

#padding token for handling different lenghts of text sequences during batching
pad_token = '<pad>'
vocab.insert_token(pad_token, 1) #insert at index 1

#checking with examples
print(f"'home' --> {vocab['home']}")
print(f"'bat' --> {vocab['bat']}")
print(f"'frfrrg' --> {vocab['frfrrg']}")
print(f"'instantiation' --> {vocab['instantiation']}")

'home' --> 1316
'bat' --> 4711
'frfrrg' --> 0
'instantiation' --> 0


TRANSFORMING DATA

In [38]:
#mapper for transforming text data
text_transform_pipeline = lambda x : [vocab[token] for token in tokenizer(x)]

In [39]:
#tranformation example on a portion of training data
x_train[21]

"Jackie Chan name is synonomus to stunts. This movie never let you down.The opening best chase scene and last roll down scene from the pole is so risky than one wonder ,if he knows the meaning of fear.This movie comes very close to Jackie's best which is PROJECT A.But the main difference being that PROJECT A contains three stars where as in this movie Jackie carries the film entirely on his shoulders.This is perhaps the main reason that this movie made jackie an biggest martial arts star followed by Bruce Lee.The film has nice comic touches too. What makes this film work is Jakie's ability to show his venerable side which his in contract to the typical martial arts action hero.This movie was followed by a sequel which was good but was quite tame in comparison to its predecessor."

In [40]:
print(f"Before Transforming:\t{x_train[21]}")
print(f"After Tranforming:\t{text_transform_pipeline(x_train[21])}")

Before Transforming:	Jackie Chan name is synonomus to stunts. This movie never let you down.The opening best chase scene and last roll down scene from the pole is so risky than one wonder ,if he knows the meaning of fear.This movie comes very close to Jackie's best which is PROJECT A.But the main difference being that PROJECT A contains three stars where as in this movie Jackie carries the film entirely on his shoulders.This is perhaps the main reason that this movie made jackie an biggest martial arts star followed by Bruce Lee.The film has nice comic touches too. What makes this film work is Jakie's ability to show his venerable side which his in contract to the typical martial arts action hero.This movie was followed by a sequel which was good but was quite tame in comparison to its predecessor.
After Tranforming:	[13, 91, 1583, 144, 0, 8, 49, 12, 99, 46, 726, 646, 174, 130, 12, 25, 142, 87, 1584, 152, 23, 387, 1585, 130, 152, 33, 25, 829, 144, 386, 1586, 405, 82, 1587, 36, 252, 317

In [41]:
#def a function for text transformation
def transformText(text_list, max_len):
    #transforms text wrt to max_len
    transformed_data = [text_transform_pipeline(text)[:max_len] for text in text_list]

    #for loop for padding text_list shorter than max_len
    for data in transformed_data:
        data[len(data):max_len] = np.ones(max_len - len(data))

    #returning the final tensor ready for medol input
    return torch.tensor(transformed_data, dtype = torch.int64)

In [42]:
#testing on a portion of training dataset with max_len = 50
text = x_train[5:7]
print(f"text:{text}\n")
print(f"num sentences:\t{len(text)}")
tt = transformText(text, max_len = 50)
print(f"tt:\n{tt}\n")
print(f"shape of tt:{tt.shape}")

text:["''The Sentinel'' is one of the best horror movies already made in the movie's Industry! I think it is very scary as very few movies actually are. Alison Parker is a model with some fame. She dates a lawyer called Michael Lerman, and has as a best friend, another model called Jennifer. Everything was great in her life, until she decides to live alone for some time and rents a beautiful and old apartment.<br /><br />The problem are her neighbors, who are very, VERY strange. Suddenly Alison starts to have health problems and faints with frequency; She also remembers some painful facts about her past that makes her have nightmares or illusions. But everything has a reason, and it has to do with the new house she is living...<br /><br />I personally find ''The Sentinel'' a very creepy movie, and along with ''The Exorcist'' they are two of the scariest movies I already watched. When we discover that Alison's house is only occupied by the priest and herself my blood froze! It's also ho

In [43]:
#creating tensor dataset, data loader
#with max_len = 100
max_len = 100
batch_size = 16
#for TRAIN - VAL DATASET
train_dataset = TensorDataset(transformText(x_train, max_len), torch.tensor(y_train))
#Data Loader for train data
train_loader = DataLoader(train_dataset, batch_size = batch_size)
#Tensor Dataset for Val data
val_dataset = TensorDataset(transformText(x_val, max_len), torch.tensor(y_val))
#Data Loader for val data
val_loader = DataLoader(val_dataset, batch_size = batch_size)


#For TEST DATASET
#Tensor dataset and Data Loader for test dataset
test_dataset = TensorDataset(transformText(x_test, max_len), torch.tensor(y_test, dtype = torch.int64))
test_loader = DataLoader(test_dataset, batch_size = batch_size)

LEVERAGING GloVe

In [44]:
#def embedding matrix by integrating pre-trained word embeddings(GloVe)
#for better performance instead of training word embeddings from scratch
glove = GloVe(name = '6B', dim = 300)
embedding_matrix = glove.get_vecs_by_tokens(vocab.get_itos())

MODEL CREATION

In [45]:
#defining hyperparameters
hidden_size = 8 #representaion of learned input passed between RNN layers
#basic hyperparameters
learning_rate = 0.001 
epochs = 25
#embedding vect and vocab size
embed_size = 300
vocab_size = len(vocab.get_itos())

In [46]:
#defining model architecture
#specifying model layers
#def class Net
class Net(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers = 1):
        super().__init__() #initializes the nn.Module, essential for defining layers
        #defining layers
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, num_layers=num_layers)
        self.linear = nn.Linear(hidden_size*max_len, 1)
        self.act = nn.Sigmoid()
    #def forward pass
    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        #call RNN Layer
        outputs, _ = self.rnn(embeddings)
        #take outputs of each time step and send it to linear layer
        outs = self.linear(outputs.reshape(outputs.shape[0], -1))
        return self.act(outs)

#model Instantiation
model = Net(vocab_size, embed_size, hidden_size, num_layers = 2)

#initializing model weights
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)

    if isinstance(m, nn.RNN):
        for param in m._flat_weights_names:
            if "weight" in param:
                nn.init.xavier_uniform_(m._parameters[param])

In [47]:
#loading pre-trained GloVe embeddings into embedding layer
model.embedding.weight.data.copy_(embedding_matrix)

#freezing embedding layer so it remains unchanged during training
model.embedding.weight.requires_grad = False

TRAINING PHASE

In [48]:
#defining trainer(optimizer)
trainer = torch.optim.SGD(model.parameters(), lr = learning_rate)

#defining loss function
cross_ent_loss = nn.BCELoss(reduction = 'sum')

In [49]:
#starting the training process
#get the compute device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#applying initialized weights
model.apply(init_weights)
model.to(device)

#early stopping parameters
patience = 1 #num of epochs to wait for improvement
best_loss = float('inf') #lowest val loss
epochs_without_imporvement = 0

#outer loop for epoch
for epoch in range(epochs):
    start = time.time() #for tracking time from start
    #for tracking training and val loss
    training_loss = 0
    val_loss = 0
    
    #inner loop, where model is actually trained
    for data, target in train_loader:
        trainer.zero_grad() #resets gradients from previous step
        data = data.to(device) #move data to appropriate device
        target = target.to(device) #move target label to appropiate device
        output = model(data) #passes data thru model 
        L = cross_ent_loss(output.squeeze(1).float(), target.float()) #calculates BCELoss
        training_loss += L.item() #accumulates training loss
        L.backward() #backward propagation for calculating gradient
        trainer.step() #updates model params using optimizer

    #loop for validating data, no training(no weight updates)
    for data, target in val_loader:
        val_predictions = model(data.to(device)) #passes data for val_pred
        L = cross_ent_loss(val_predictions.squeeze(1).float(), target.to(device).float()) #calculates BCELoss 
        val_loss += L.item() #accumulates val loss

    #avg of losses
    training_loss = training_loss / len(y_train)
    val_loss = val_loss / len(y_val)

    #check for val loss improvement
    if val_loss < best_loss:
        best_loss = val_loss #replacing val loss as new best loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), 'best_model.pth') #save the best model
        print(f"Epoch{epoch}: Validation loss improved to {val_loss:.4f}")

    else:
        epochs_without_improvement += 1
        print(f"Epoch{epoch}: No improvement in Validation loss. Count:{epochs_without_improvement}")

    #check for early stopping
    if epochs_without_improvement >= patience:
        print(f"Early stopping triggered at epoch{epoch}")
        break
        
    end = time.time() #for tracking the end time

    #printing epoch number, training loss, val loss and time taken to complete one epoch
    print(f"Epoch:{epoch}. Training loss: {training_loss}. Validation loss: {val_loss}. Time taken: {start-end}")

Epoch0: Validation loss improved to 0.6327
Epoch:0. Training loss: 0.6997358768781026. Validation loss: 0.6327233093738556. Time taken: -12.285032033920288
Epoch1: Validation loss improved to 0.5628
Epoch:1. Training loss: 0.6104613204850091. Validation loss: 0.5628359375953674. Time taken: -12.444028377532959
Epoch2: Validation loss improved to 0.5407
Epoch:2. Training loss: 0.5654280639754401. Validation loss: 0.540670844078064. Time taken: -12.119035959243774
Epoch3: Validation loss improved to 0.5300
Epoch:3. Training loss: 0.5422936745219761. Validation loss: 0.5300084643363953. Time taken: -12.085025310516357
Epoch4: Validation loss improved to 0.5246
Epoch:4. Training loss: 0.5272447934097714. Validation loss: 0.5246472107887268. Time taken: -12.30503797531128
Epoch5: Validation loss improved to 0.5202
Epoch:5. Training loss: 0.5161007715119256. Validation loss: 0.5201531267166137. Time taken: -12.187032699584961
Epoch6: Validation loss improved to 0.5161
Epoch:6. Training loss:

PREDICTIONS ON VALIDATION DATA

In [50]:
val_predictions = [] #initializes an empty list
for data, target in val_loader:
    val_preds = model(data.to(device))
    val_predictions.extend(
        np.rint(val_pred)[0] for val_pred in val_preds.detach().cpu().numpy())
#checking the predictions
print(val_predictions[:10])

[0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0]


In [51]:
#predictions on validation dataset
print(confusion_matrix(y_val, val_predictions))
print(classification_report(y_val, val_predictions))
print('Accuracy_score:', accuracy_score(y_val, val_predictions))

[[969 267]
 [357 907]]
              precision    recall  f1-score   support

           0       0.73      0.78      0.76      1236
           1       0.77      0.72      0.74      1264

    accuracy                           0.75      2500
   macro avg       0.75      0.75      0.75      2500
weighted avg       0.75      0.75      0.75      2500

Accuracy_score: 0.7504


PREDICTIONS ON TEST DATA

In [52]:
test_predictions = [] #initializing an empty list
for data, target in test_loader:
    test_preds = model(data.to(device))
    test_predictions.extend(
        np.rint(test_pred)[0] for test_pred in test_preds.detach().cpu().numpy())
#checking the predictions
print(test_predictions[:10])

[1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0]


In [53]:
#make predictions on test data
print(confusion_matrix(y_test, test_predictions))
print(classification_report(y_test, test_predictions))
print('Accuracy_score:', accuracy_score(y_test, test_predictions))

[[9903 2597]
 [3758 8742]]
              precision    recall  f1-score   support

           0       0.72      0.79      0.76     12500
           1       0.77      0.70      0.73     12500

    accuracy                           0.75     25000
   macro avg       0.75      0.75      0.75     25000
weighted avg       0.75      0.75      0.75     25000

Accuracy_score: 0.7458
