###   Import libraries

In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

###  Load the train data

In [2]:
# Read the training data file
train_data = []
with open("train_data.txt", encoding="utf-8") as file:
    for line in file:
        parts = line.strip().split(" ::: ")
        if len(parts) == 4:
            train_data.append(parts)

# Create DataFrame
train_df = pd.DataFrame(train_data, columns=["ID", "Title", "Genre", "Description"])
train_df.head()


Unnamed: 0,ID,Title,Genre,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


###  Load the test data

In [3]:
test_data = []
with open("test_data.txt", encoding="utf-8") as file:
    for line in file:
        parts = line.strip().split(" ::: ")
        if len(parts) == 3:
            test_data.append(parts)

test_df = pd.DataFrame(test_data, columns=["ID", "Title", "Description"])
test_df.head()

Unnamed: 0,ID,Title,Description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),Before he was known internationally as a marti...


### Load test data solutions for evaluation later

In [28]:
# Read and split lines
with open("test_data_solution.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Each line has 4 components, but we only want ID and GENRE
fixed_data = [line.strip().split(" ::: ") for line in lines]

# Convert to DataFrame with all 4 columns
solution_df = pd.DataFrame(fixed_data, columns=["ID", "TITLE", "GENRE", "DESCRIPTION"])

# Keep only ID and GENRE
solution_df = solution_df[["ID", "GENRE"]].rename(columns={"GENRE": "Genre"})

# Ensure ID is string for merging
solution_df["ID"] = solution_df["ID"].astype(str)


In [29]:
solution_df.head()

Unnamed: 0,ID,Genre
0,1,thriller
1,2,comedy
2,3,documentary
3,4,drama
4,5,drama


#### now we will do some Basic cleaning 

In [5]:

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\welcome\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\welcome\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\welcome\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
# Clean the text data
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()                                  # Lowercase
    text = re.sub(r"[^a-zA-Z]", " ", text)               # Remove punctuation & digits
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

In [7]:
# Apply cleaning
train_df["Clean_Description"] = train_df["Description"].apply(clean_text)
test_df["Clean_Description"] = test_df["Description"].apply(clean_text)

train_df[["Description", "Clean_Description"]].head()

Unnamed: 0,Description,Clean_Description
0,Listening in to a conversation between his doc...,listening conversation doctor parent year old ...
1,A brother and sister with a past incestuous re...,brother sister past incestuous relationship cu...
2,As the bus empties the students for their fiel...,bus empty student field trip museum natural hi...
3,To help their unemployed father make ends meet...,help unemployed father make end meet edith twi...
4,The film's title refers not only to the un-rec...,film title refers un recovered body ground zer...


In [8]:
# CLeaning has been applied, now we can proceed with the next steps.

#### Now we will do Feature Extraction using TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [10]:
# Fit on training descriptions
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df["Clean_Description"])

# Transform test descriptions
X_test_tfidf = tfidf_vectorizer.transform(test_df["Clean_Description"])


In [11]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df["Genre"])

# For mapping predicted labels back to genre later
genre_classes = label_encoder.classes_


#### let's check for some shapes

In [12]:
print("Train TF-IDF shape:", X_train_tfidf.shape)
print("Test TF-IDF shape:", X_test_tfidf.shape)
print("Target vector shape:", y_train.shape)


Train TF-IDF shape: (54214, 5000)
Test TF-IDF shape: (54200, 5000)
Target vector shape: (54214,)


#### we are now moving on to training and evaluation

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train_split, y_val = train_test_split(
    X_train_tfidf, y_train, test_size=0.2, random_state=42, stratify=y_train
)


In [14]:
# firstly we will use Logistic Regression as a baseline model

In [15]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train_split)

In [16]:
from sklearn.metrics import classification_report, accuracy_score

y_pred_val = model.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, y_pred_val))
print("\nClassification Report:\n", classification_report(y_val, y_pred_val, target_names=genre_classes))

Validation Accuracy: 0.5799133081250576

Classification Report:
               precision    recall  f1-score   support

      action       0.49      0.27      0.35       263
       adult       0.71      0.29      0.41       118
   adventure       0.51      0.12      0.20       155
   animation       0.56      0.09      0.16       100
   biography       0.00      0.00      0.00        53
      comedy       0.52      0.59      0.55      1490
       crime       0.20      0.03      0.05       101
 documentary       0.67      0.85      0.75      2619
       drama       0.54      0.76      0.63      2723
      family       0.44      0.08      0.13       157
     fantasy       0.00      0.00      0.00        65
   game-show       0.90      0.46      0.61        39
     history       0.00      0.00      0.00        49
      horror       0.66      0.59      0.62       441
       music       0.65      0.42      0.51       146
     musical       0.75      0.05      0.10        55
     mystery    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### we got a pretty decent accuracy as 58%

In [17]:
test_predictions = model.predict(X_test_tfidf)

# Convert encoded predictions to genre labels
test_genre_preds = label_encoder.inverse_transform(test_predictions)


#### Let's now create a submission dataframe to compare our predictions with the provided test solutions

In [18]:
test_results_df = test_df.copy()
test_results_df["Predicted_Genre"] = test_genre_preds

test_results_df[["ID", "Predicted_Genre"]].to_csv("genre_predictions.csv", index=False)


#### now finally at last we will load our predictions.csv and compare it with solution_df

In [19]:
pred_df = pd.read_csv("genre_predictions.csv")

In [30]:
# Ensure both IDs are strings
pred_df["ID"] = pred_df["ID"].astype(str)
solution_df["ID"] = solution_df["ID"].astype(str)


In [31]:

# Merge predictions with actual genres
merged_df = pd.merge(pred_df, solution_df, on="ID", how="inner")
merged_df.rename(columns={"Genre": "Actual_Genre"}, inplace=True)

# Clean up genres
merged_df["Predicted_Genre"] = merged_df["Predicted_Genre"].str.strip().str.lower()
merged_df["Actual_Genre"] = merged_df["Actual_Genre"].str.strip().str.lower()


In [32]:
# Accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(merged_df["Actual_Genre"], merged_df["Predicted_Genre"])
print("🎯 Final Test Accuracy:", accuracy)


🎯 Final Test Accuracy: 0.5795018450184501


#### Saving the model and code 

In [33]:
import joblib
joblib.dump(model, "genre_classifier.pkl")


['genre_classifier.pkl']

In [34]:
merged_df.to_csv("final_predictions.csv", index=False)
