TRAINING A KNN MODEL FOR THE IMDB DATASET

In [1]:
#Extracting data from zipped file and reading it
import tarfile
import os

#specifying file_path
file_path = 'Dataset.tar.gz'

# Open and extract the .tar.gz file
with tarfile.open(file_path, 'r:gz') as tar:
    tar.extractall(path="extracted_files")  # Extract files to the directory

# Check the contents of the extracted folder
extracted_files = os.listdir("extracted_files/aclImdb")
print(extracted_files[:10])  # Prints only the first 10 entries

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']


In [2]:
#Converting the training dataset into a pandas dataset
import pandas as pd
import os

# Function to read reviews and labels from a directory
def load_data(directory, label):
    reviews = []
    for file_name in os.listdir(directory):
        file_path = os.path.join(directory, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            reviews.append(file.read())
    return pd.DataFrame({'review': reviews, 'sentiment': label})

# Load positive and negative reviews for training
train_pos = load_data('extracted_files/aclImdb/train/pos', label=1)
train_neg = load_data('extracted_files/aclImdb/train/neg', label=0)

# Combine into one DataFrame
train_df = pd.concat([train_pos, train_neg], ignore_index=True)

# Display the first few rows
print(train_df.head())


                                              review  sentiment
0  Bromwell High is a cartoon comedy. It ran at t...          1
1  Homelessness (or Houselessness as George Carli...          1
2  Brilliant over-acting by Lesley Ann Warren. Be...          1
3  This is easily the most underrated film inn th...          1
4  This is not the typical Mel Brooks film. It wa...          1


In [3]:
#Shuffling the training dataset
train_df = train_df.sample(frac = 1, random_state = 42).reset_index(drop = True)
print(train_df.head())

                                              review  sentiment
0  In Panic In The Streets Richard Widmark plays ...          1
1  If you ask me the first one was really better ...          0
2  I am a big fan a Faerie Tale Theatre and I've ...          1
3  I just finished reading a book about Dillinger...          0
4  Greg Davis and Bryan Daly take some crazed sta...          0


In [4]:
#Converting the test dataset into a pandas dataset
import pandas as pd
import os

# Function to read reviews and labels from a directory
def load_data(directory, label):
    reviews = []
    for file_name in os.listdir(directory):
        file_path = os.path.join(directory, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            reviews.append(file.read())
    return pd.DataFrame({'review': reviews, 'sentiment': label})

# Load positive and negative reviews for training
test_pos = load_data('extracted_files/aclImdb/test/pos', label=1)
test_neg = load_data('extracted_files/aclImdb/test/neg', label=0)

# Combine into one DataFrame
test_df = pd.concat([test_pos, test_neg], ignore_index=True)

# Display the first few rows
print(test_df.head())

                                              review  sentiment
0  I went and saw this movie last night after bei...          1
1  Actor turned director Bill Paxton follows up h...          1
2  As a recreational golfer with some knowledge o...          1
3  I saw this film in a sneak preview, and it is ...          1
4  Bill Paxton has taken the true story of the 19...          1


In [5]:
#Shuffling test dataset
test_df = test_df.sample(frac = 1, random_state = 42).reset_index(drop = True)

print(test_df.head())

                                              review  sentiment
0  When I was a kid, I loved "Tiny Toons". I espe...          1
1  The setup for "Nature of the Beast" is ingenio...          0
2  I do not have much to say than this is a great...          1
3  Extremely formulaic with cosmic-sized logic ho...          0
4  I actually liked certain things about this gam...          0


In [6]:
#EDA on Training dataset
#checking the distributions
train_df["sentiment"].value_counts()

sentiment
1    12500
0    12500
Name: count, dtype: int64

In [7]:
#checking the missing values
print(train_df.isna().sum())

review       0
sentiment    0
dtype: int64


DATA PREPROCESSING AND TOKENIZATION, LEMMATIZATION

In [8]:
import spacy
import re
import nltk
from nltk.stem import SnowballStemmer
from spacy.lang.en.stop_words import STOP_WORDS

#load english model for spacy
nlp = spacy.load("en_core_web_sm")

#get the stop words list from spacy
stop_words = list(STOP_WORDS)

#words to exclude from stop_words
excluding = ['against', 'not', 'don', "don't", 'ain', 'aren', "aren't", 'couldn', "couldn't",
             'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 
             'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't",
             'needn', "needn't",'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', 
             "weren't", 'won', "won't", 'wouldn', "wouldn't"]

#final stop_words list
final_stop_words = [word for word in stop_words if word not in excluding]

#initialzing the stemmer
snow = SnowballStemmer('english')

#function for processing the text
def process_text(texts):
    final_text_list = []

    for sent in texts:
        #set sent to empty if not a string
        if not isinstance(sent,str):
            final_text_list.append("")
            continue

        #basic processing steps before tokenization
        sent = sent.lower().strip()
        sent = re.sub(r'\s+', ' ', sent) # Multiple spaces and tabs into one
        sent = re.sub('<.*?>', '', sent) #remove html tags

        #applying spacy nlp pipeline to sent
        doc = nlp(sent)

        filtered_sentence = []

        for token in doc:
            if len(token.text)>2 and token.text not in final_stop_words and not token.is_digit:
                filtered_sentence.append(snow.stem(token.text))

        #join final string of cleaned sentences
        final_list = " ".join(filtered_sentence)
        final_text_list.append(final_list)

    return final_text_list

Train - validation split

In [9]:
#splitting training dataset into train and validation
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(train_df["review"],
                                                 train_df["sentiment"],
                                                 test_size = 0.10,
                                                 shuffle = True,
                                                 random_state = 324)

In [10]:
#checking the datatype of x_train
print(type(x_train))

<class 'pandas.core.series.Series'>


In [11]:
print(x_train.head())  # Print the first few rows
print(x_train.index)  # Print index if it's Series

15851    Police story brought Hong Kong movies to moder...
2202     Everyone we meet influences our thinking, modi...
9248     I think this film has been somewhat overrated ...
17845    This was great. When I saw the Japanese versio...
2872     This movie had no parts that were hilarious, m...
Name: review, dtype: object
Index([15851,  2202,  9248, 17845,  2872, 16882, 14786, 15533, 12138, 23059,
       ...
        3953,  2800, 19491, 18104, 14967, 17136, 22800,   600, 17292, 13044],
      dtype='int64', length=22500)


In [12]:
#calling the function to process the columns
print("processing the review column")
x_train = process_text(x_train.tolist())
x_val = process_text(x_val.tolist())

processing the review column


Pipeline Creation

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier

#######PIPELINE########

pipeline = Pipeline([
    ('text_vect', CountVectorizer(binary = True,
                                 max_features = 15)),
    ('knn', KNeighborsClassifier())
])

#display pipeline
from sklearn import set_config
set_config(display = 'diagram')
pipeline

In [14]:
print(x_train[ :5])

['polic stori brought hong kong movi modern day cinema.jacki play policeman tri catch drug dealer time care young woman bad guy care relationship girlfriend selina brigitt lin).th movi featur plenti stunt not jacki actor jacki stunt club).thre jacki stunt member went hospit film film.th movi incred fight scene like car park fight shoppingm fight rank jacki finest.th movi won award best movi best action design jacki chan hong kong film awards.everyon love jacki chan and/or martial art movi shud', 'meet influenc think modifi way littl bit person rub eighth day take theme compar rainmak film harri daniel auteuil businessman expert sale psycholog meet georg pascal duquenn syndrom child run win perform actor film main strength open sequenc excel georg relat theori creation world close scene discov god creat eighth day moment stori frustrat harri exampl georg complet uninhibit demand pair expens shoe money kind scene laugh tear like scene syndrom group trip art galleri escap bus gate crash p

Fit The Pipeline

In [15]:
pipeline.fit(x_train, y_train.values)

Test the classifier on validation dataset

In [16]:
from sklearn.metrics import confusion_matrix,  classification_report, accuracy_score

#predicting on validation set
pred_values = pipeline.predict(x_val)

print(confusion_matrix(y_val.values, pred_values))
print(classification_report(y_val.values, pred_values))
print('Accuracy score:', accuracy_score(y_val.values, pred_values))

[[696 540]
 [579 685]]
              precision    recall  f1-score   support

           0       0.55      0.56      0.55      1236
           1       0.56      0.54      0.55      1264

    accuracy                           0.55      2500
   macro avg       0.55      0.55      0.55      2500
weighted avg       0.55      0.55      0.55      2500

Accuracy score: 0.5524


Tuning the model using RandomSearchCV

In [19]:
from sklearn.model_selection import RandomizedSearchCV 
import numpy as np

#parameteres distributions
param_distributions = {
    'knn__n_neighbors': np.arange(3, 20, 2), #odd values for neighbors
    'knn__weights': ['uniform', 'distance'], #weighting methods
    'knn__metric': ['minkowski', 'euclidean', 'manhattan'] #distance methods
}

#initializing the randomsearchCV
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions,
    n_iter = 20, #no. of random combinations to try
    n_jobs = -1,
    random_state = 42,
    cv = 5, #5 fold cross validation
    scoring = 'accuracy'
)

In [21]:
#fitting the random_serach
random_search.fit(x_train, y_train)

In [22]:
#print the best parameters
print('Best parameters found:', random_search.best_params_)

#use the best model
best_model = random_search.best_estimator_

Best parameters found: {'knn__weights': 'uniform', 'knn__n_neighbors': np.int64(19), 'knn__metric': 'manhattan'}


In [23]:
#make predictions with val set with best model
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

pred_values = best_model.predict(x_val)

print(confusion_matrix(y_val.values, pred_values))
print(classification_report(y_val.values, pred_values))
print('Accuracy_score:', accuracy_score(y_val.values, pred_values))

[[736 500]
 [573 691]]
              precision    recall  f1-score   support

           0       0.56      0.60      0.58      1236
           1       0.58      0.55      0.56      1264

    accuracy                           0.57      2500
   macro avg       0.57      0.57      0.57      2500
weighted avg       0.57      0.57      0.57      2500

Accuracy_score: 0.5708


Using the knn model on test data

In [26]:
print(test_df)

                                                  review  sentiment
0      When I was a kid, I loved "Tiny Toons". I espe...          1
1      The setup for "Nature of the Beast" is ingenio...          0
2      I do not have much to say than this is a great...          1
3      Extremely formulaic with cosmic-sized logic ho...          0
4      I actually liked certain things about this gam...          0
...                                                  ...        ...
24995  Start with the premise that you will do anythi...          0
24996  This movie gives us some WWII history along wi...          1
24997  In my opinion this is the best Oliver Stone fl...          1
24998  It's certainly a direct-to-video, but the stor...          0
24999  This movie was obscenely obvious and predictab...          0

[25000 rows x 2 columns]


In [27]:
x_test = process_text(test_df["review"].tolist())
y_test = test_df["sentiment"].values

In [29]:
print(x_test[ :5])

['kid love tini toon especi love tini toon spent summer vacat thought laugh floor funni year later friend video figur watch good old day floor laugh opinion plucki hampton skit best decid happi world land end have crazi adventur skit funni look video tip write funniest cartoon seen.10/10', 'setup natur beast ingeni simpl fraught limitless potenti suspens harri salesman jack domest lanc henriksen pick troubl form hitchhik adrian eric robert possess incrimin inform against jack million dollar stolen casino dub hatchet man dismemb peopl desert sound great right sort like hitcher meet psycho men secret unfortun writer director victor salva jeeper creeper powder fame idea movi go scenario arous tension suspens poison suppos thriller inclus under homo erot tone place henriksen evok unusu not earthbound everyman sport ampl gut robert threaten scari extra death wish suck point actual care happen conclus slide improb territori kept think go fight club mistaken bad natur beast ... instead', 'not

In [30]:
#testing the best model on test data
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

test_predictions = best_model.predict(x_test)

print(confusion_matrix(y_test, test_predictions))
print(classification_report(y_test, test_predictions))
print('Accuracy_score:', accuracy_score(y_test, test_predictions))

[[7590 4910]
 [5220 7280]]
              precision    recall  f1-score   support

           0       0.59      0.61      0.60     12500
           1       0.60      0.58      0.59     12500

    accuracy                           0.59     25000
   macro avg       0.59      0.59      0.59     25000
weighted avg       0.59      0.59      0.59     25000

Accuracy_score: 0.5948
