TRAINING A KNN MODEL FOR THE IMDB DATASET

In [None]:
#Extracting data from zipped file and reading it
import tarfile
import os

# Location of the compressed dataset and the directory it is unpacked into
file_path = 'Dataset.tar.gz'
extract_dir = "extracted_files"

# Open and extract the .tar.gz file
with tarfile.open(file_path, 'r:gz') as tar:
    # extractall() trusts member paths by default. Where the running Python ships
    # extraction filters, use the 'data' filter so absolute paths, '..' traversal
    # and special files inside the archive cannot write outside extract_dir.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(path=extract_dir, filter='data')
    else:
        tar.extractall(path=extract_dir)  # older Python: no filter support

# Check the contents of the extracted folder.
# os.listdir returns entries in arbitrary order, so sort for a stable preview.
extracted_files = sorted(os.listdir(os.path.join(extract_dir, "aclImdb")))
print(extracted_files[:10])  # Prints only the first 10 entries

In [None]:
#Converting the training dataset into a pandas dataset
import pandas as pd
import os

# Function to read reviews and labels from a directory
def load_data(directory, label):
    """Read every review file in `directory` into a labelled DataFrame.

    Parameters
    ----------
    directory : str
        Folder whose regular files each contain one UTF-8 encoded review.
    label : int
        Sentiment label assigned to every review read from `directory`.

    Returns
    -------
    pandas.DataFrame
        Columns 'review' (file contents) and 'sentiment' (`label`), one row
        per file, ordered by file name.
    """
    reviews = []
    # os.listdir gives no ordering guarantee; sorting makes the row order (and
    # therefore any later seeded shuffle/split) reproducible across machines.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        # Skip sub-directories and other non-file entries instead of crashing in open()
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            reviews.append(file.read())
    return pd.DataFrame({'review': reviews, 'sentiment': label})

# Training split: positive reviews are labelled 1, negative reviews 0
train_pos, train_neg = (
    load_data('extracted_files/aclImdb/train/pos', label=1),
    load_data('extracted_files/aclImdb/train/neg', label=0),
)

# Stack the positives on top of the negatives and renumber the rows from 0
train_df = pd.concat(objs=[train_pos, train_neg], ignore_index=True)

# Peek at the first five rows as a sanity check
print(train_df.head(5))


In [None]:
#Shuffle the training rows with a fixed seed, then renumber the index from 0
train_df = (
    train_df
    .sample(frac=1.0, random_state=42)  # frac=1.0 draws every row, i.e. a full permutation
    .reset_index(drop=True)             # discard the pre-shuffle row labels
)
print(train_df.head(5))

In [None]:
#Converting the test dataset into a pandas dataset
import pandas as pd
import os

# Function to read reviews and labels from a directory
def load_data(directory, label):
    """Read every review file in `directory` into a labelled DataFrame.

    A function with this name is also defined in the training-data cell; it is
    defined again here so this cell can run on its own.

    Parameters
    ----------
    directory : str
        Folder whose regular files each contain one UTF-8 encoded review.
    label : int
        Sentiment label assigned to every review read from `directory`.

    Returns
    -------
    pandas.DataFrame
        Columns 'review' (file contents) and 'sentiment' (`label`), one row
        per file, ordered by file name.
    """
    reviews = []
    # os.listdir gives no ordering guarantee; sorting makes the row order (and
    # therefore any later seeded shuffle) reproducible across machines.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        # Skip sub-directories and other non-file entries instead of crashing in open()
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            reviews.append(file.read())
    return pd.DataFrame({'review': reviews, 'sentiment': label})

# Test split: positive reviews are labelled 1, negative reviews 0
test_pos, test_neg = (
    load_data('extracted_files/aclImdb/test/pos', label=1),
    load_data('extracted_files/aclImdb/test/neg', label=0),
)

# Stack the positives on top of the negatives and renumber the rows from 0
test_df = pd.concat(objs=[test_pos, test_neg], ignore_index=True)

# Peek at the first five rows as a sanity check
print(test_df.head(5))

In [None]:
#Shuffle the test rows with a fixed seed, then renumber the index from 0
test_df = (
    test_df
    .sample(frac=1.0, random_state=42)  # frac=1.0 draws every row, i.e. a full permutation
    .reset_index(drop=True)             # discard the pre-shuffle row labels
)

print(test_df.head(5))

In [None]:
#EDA on the training dataset
#Class-balance check: count how many reviews carry each sentiment label
#(1 = positive, 0 = negative, as assigned when the reviews were loaded)
train_df["sentiment"].value_counts()

In [None]:
#Missing-value check: number of NaN/None entries in each column of the training set
print(train_df.isna().sum())

DATA PREPROCESSING: TOKENIZATION, STOP-WORD REMOVAL AND STEMMING

In [None]:
import spacy
import re
import nltk
from nltk.stem import SnowballStemmer
from spacy.lang.en.stop_words import STOP_WORDS

#load english model for spacy
nlp = spacy.load("en_core_web_sm")

#get the stop words list from spacy
stop_words = list(STOP_WORDS)

#words to exclude from stop_words: negations carry sentiment, so they must
#survive stop-word filtering
excluding = ['against', 'not', 'don', "don't", 'ain', 'aren', "aren't", 'couldn', "couldn't",
             'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't",
             'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't",
             'needn', "needn't",'shouldn', "shouldn't", 'wasn', "wasn't", 'weren',
             "weren't", 'won', "won't", 'wouldn', "wouldn't",
             # spaCy's tokenizer splits contractions ("don't" -> "do" + "n't"), and the
             # "n't" piece (in each apostrophe variant) is itself a spaCy stop word.
             # Without these entries the negation is still filtered out even though
             # the whole-word contractions above are excluded.
             "n't", "n’t", "n‘t"]

#final stop_words list (spaCy stop words minus the negations above)
final_stop_words = [word for word in stop_words if word not in excluding]

#initializing the stemmer
snow = SnowballStemmer('english')

#function for processing the text
def process_text(texts):
    """Clean, tokenize, filter and stem a collection of raw review texts.

    For each string: lower-case it, replace HTML tags with a space, collapse
    whitespace, tokenize with the spaCy tokenizer, drop tokens that are at most
    two characters long, are in `final_stop_words`, or are purely digits, and
    Snowball-stem what remains.

    Parameters
    ----------
    texts : iterable
        Raw documents. Any element that is not a `str` yields an empty string.

    Returns
    -------
    list of str
        One space-joined string of stemmed tokens per input element, in order.
    """
    # Build a set once: membership tests against the stop-word list would be a
    # linear scan for every token.
    stop_lookup = set(final_stop_words)
    final_text_list = []

    for sent in texts:
        #set sent to empty if not a string
        if not isinstance(sent, str):
            final_text_list.append("")
            continue

        #basic processing steps before tokenization
        sent = sent.lower()
        # Replace tags with a space (not ''), otherwise "great.<br /><br />the"
        # fuses into "great.the" and the words on both sides are lost as one token.
        sent = re.sub(r'<[^>]*>', ' ', sent)
        # Collapse whitespace after tag removal so the inserted spaces are normalised too
        sent = re.sub(r'\s+', ' ', sent).strip()

        # Only token text and the lexical is_digit flag are used below, so the
        # tokenizer alone is enough; this skips the rest of the spaCy pipeline.
        doc = nlp.make_doc(sent)

        filtered_sentence = [
            snow.stem(token.text)
            for token in doc
            if len(token.text) > 2
            and token.text not in stop_lookup
            and not token.is_digit
        ]

        #join final string of cleaned tokens
        final_text_list.append(" ".join(filtered_sentence))

    return final_text_list

Train - validation split

In [None]:
#splitting training dataset into train and validation
from sklearn.model_selection import train_test_split

# Hold out 10% of the training reviews for validation; the seed fixes the partition
x_train, x_val, y_train, y_val = train_test_split(
    train_df["review"],
    train_df["sentiment"],
    test_size=0.1,
    random_state=324,
    shuffle=True,
)

In [None]:
#checking the datatype of x_train (the container type returned by the split)
print(type(x_train))

In [None]:
# Inspect the raw training reviews and the row labels they kept from train_df
print(x_train.head())  # Print the first few rows
print(x_train.index)  # Print index if it's Series

In [None]:
#calling the function to process the review texts of the train and validation splits
#NOTE: x_train / x_val are rebound here to the plain lists returned by process_text,
#so re-running this cell without re-running the split cell fails on .tolist()
#(a list has no such method). y_train / y_val are left untouched.
print("processing the review column")
x_train = process_text(x_train.tolist())
x_val = process_text(x_val.tolist())

Pipeline Creation

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier

#######PIPELINE########

# Two-stage model: binary bag-of-words features feeding a k-nearest-neighbours classifier
# NOTE(review): max_features = 15 keeps only 15 vocabulary terms — confirm this is intentional
pipeline = Pipeline(steps=[
    ('text_vect', CountVectorizer(max_features=15, binary=True)),
    ('knn', KNeighborsClassifier()),
])

# Render estimators as a diagram, then show the pipeline as the cell output
from sklearn import set_config
set_config(display='diagram')
pipeline

In [None]:
# Preview the first five processed training reviews (x_train is a list of strings at this point)
print(x_train[ :5])

Fit The Pipeline

In [None]:
# Fit the vectorizer and the KNN classifier on the processed training reviews;
# labels are passed as a plain array via .values
pipeline.fit(x_train, y_train.values)

Test the classifier on validation dataset

In [None]:
from sklearn.metrics import confusion_matrix,  classification_report, accuracy_score

#Score the untuned pipeline on the held-out validation reviews
pred_values = pipeline.predict(x_val)

#Confusion matrix followed by the per-class precision/recall report
print(confusion_matrix(y_val.values, pred_values),
      classification_report(y_val.values, pred_values),
      sep='\n')
print('Accuracy score:', accuracy_score(y_val.values, pred_values))

Tuning the model using RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV 
import numpy as np

#parameter distributions sampled by the randomized search
param_distributions = {
    'knn__n_neighbors': np.arange(3, 20, 2), #odd values for neighbors (3, 5, ..., 19)
    'knn__weights': ['uniform', 'distance'], #weighting methods
    # 'minkowski' is not listed: with KNeighborsClassifier's default p=2 it is the
    # same distance as 'euclidean', so it only added duplicate candidates.
    'knn__metric': ['euclidean', 'manhattan'] #distance methods
}

#initializing the RandomizedSearchCV over the whole pipeline, so the vectorizer
#is re-fitted inside every cross-validation fold
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions,
    n_iter = 20, #no. of random combinations to try
    n_jobs = -1, #use all available cores
    random_state = 42, #fixes which combinations are sampled
    cv = 5, #5 fold cross validation
    scoring = 'accuracy'
)

In [None]:
#fitting the random search: cross-validates each sampled parameter combination on the training data
random_search.fit(x_train, y_train)

In [None]:
#print the best parameters (the sampled combination with the highest mean CV accuracy)
print('Best parameters found:', random_search.best_params_)

#use the best model: the pipeline configured with those parameters, as exposed by the search
best_model = random_search.best_estimator_

In [None]:
#make predictions with val set with best model
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

#Score the tuned pipeline on the same validation reviews as the baseline
pred_values = best_model.predict(x_val)

#Confusion matrix followed by the per-class precision/recall report
print(confusion_matrix(y_val.values, pred_values),
      classification_report(y_val.values, pred_values),
      sep='\n')
print('Accuracy_score:', accuracy_score(y_val.values, pred_values))

Using the knn model on test data

In [None]:
# Show the shuffled test DataFrame (raw reviews and labels) before preprocessing
print(test_df)

In [None]:
# Apply the same text processing used for the training data to the test reviews,
# and take the test labels as a plain array
x_test = process_text(test_df["review"].tolist())
y_test = test_df["sentiment"].values

In [None]:
# Preview the first five processed test reviews
print(x_test[ :5])

In [None]:
#testing the best model on test data
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

#Final check: score the tuned pipeline on the untouched test reviews
test_predictions = best_model.predict(x_test)

#Confusion matrix followed by the per-class precision/recall report
print(confusion_matrix(y_test, test_predictions),
      classification_report(y_test, test_predictions),
      sep='\n')
print('Accuracy_score:', accuracy_score(y_test, test_predictions))