In [None]:
#Importing the essential Libraries and tools
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Defining file paths and checking that the expected data files exist.
# (The stray `from numpy.random import test` was removed: nothing used it.)
DATA_DIR = "/content/Data"  # absolute path; edit here if the data lives elsewhere

train_path = os.path.join(DATA_DIR, "train_data.txt")
test_path = os.path.join(DATA_DIR, "test_data.txt")
solution_path = os.path.join(DATA_DIR, "test_data_solution.txt")

# Report up front whether each input file is present, so a missing file is
# noticed here rather than as a read error further down.
print("Train File Exists: ", os.path.exists(train_path))
print("Test File Exists: ", os.path.exists(test_path))
print("Solution:", os.path.exists(solution_path))


Train File Exists:  True
Test File Exists:  True
Solution: True


In [None]:
# Loading the dataset
def load_genre_file(path):
    """Read one ':::'-delimited data file into a DataFrame indexed by record id.

    The column names are chosen from the number of fields actually found in
    the file instead of forcing one fixed name list onto every file:

    * four fields  -> id, title, genre, plot
    * three fields -> id, title, plot
      (NOTE(review): assumed layout for a file without genre labels --
      confirm against the dataset description)

    The first field becomes the index, and leading/trailing whitespace is
    stripped from every string value so that labels and text do not carry the
    padding that surrounds the ':::' separator.

    Args:
        path: Location of the text file to read.

    Returns:
        A DataFrame indexed by ``id`` whose columns are ``title``, ``plot``
        and, for four-field files, ``genre``.

    Raises:
        ValueError: If the file has neither three nor four fields per row.
    """
    frame = pd.read_csv(path, sep=":::", engine="python", header=None)

    layouts = {
        4: ["id", "title", "genre", "plot"],
        3: ["id", "title", "plot"],
    }
    columns = layouts.get(frame.shape[1])
    if columns is None:
        raise ValueError(
            f"Unexpected number of fields ({frame.shape[1]}) in {path!r}; "
            "expected 3 or 4."
        )

    frame.columns = columns
    frame = frame.set_index("id")

    # Only strip real strings; missing values (NaN) are passed through as-is.
    for col in frame.columns:
        frame[col] = frame[col].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )
    return frame


train_df = load_genre_file(train_path)
test_df = load_genre_file(test_path)
solution_df = load_genre_file(solution_path)

print("Train shape:", train_df.shape)
print("Test  shape:", test_df.shape)
print("Solution shape:", solution_df.shape)

Train shape: (54214, 3)
Test  shape: (54200, 3)
Solution shape: (54200, 3)


In [None]:
# Quick look at the data.
# A bare last expression uses the notebook's rich table display; the plain
# print() text repr was width-limited and hid the middle column behind "...".
train_df.head()

                                title  ...                                               plot
1       Oscar et la dame rose (2009)   ...   Listening in to a conversation between his do...
2                       Cupid (1997)   ...   A brother and sister with a past incestuous r...
3   Young, Wild and Wonderful (1980)   ...   As the bus empties the students for their fie...
4              The Secret Sin (1915)   ...   To help their unemployed father make ends mee...
5             The Unrecovered (2007)   ...   The film's title refers not only to the un-re...

[5 rows x 3 columns]


In [None]:
# Show how often each of the ten most frequent genres occurs in the training data.
genre_counts = train_df["genre"].value_counts()
print("\nGenre value counts (train):")
print(genre_counts.iloc[:10])


Genre value counts (train):
genre
drama           13613
documentary     13096
comedy           7447
short            5073
horror           2204
thriller         1591
action           1315
western          1032
reality-tv        884
family            784
Name: count, dtype: int64


In [None]:
# Hold out 20% of the labelled data for validation.
# The split is stratified on the genre label and uses a fixed seed.
X, y = train_df["plot"], train_df["genre"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\nTrain samples:", len(X_train))
print("Val samples:", len(X_val))


Train samples: 43371
Val samples: 10843


In [None]:
# Defining the candidate classifiers to compare (display name -> estimator)
model = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    # Seeded with the same value as the train/validation split so that
    # repeated runs of the notebook produce the same fit.
    "Linear SVC": LinearSVC(random_state=42),
}

# Bookkeeping filled in by the training loop:
results = {}        # display name -> validation accuracy
best_model = None   # fitted pipeline with the highest validation accuracy so far
best_name = None    # display name of that pipeline
best_acc = 0        # its validation accuracy

In [None]:
# Training each model on the training split and scoring it on the validation split.
#
# Every classifier is wrapped in the same TF-IDF pipeline, so the only thing
# that varies between runs of the loop is the estimator itself. The pipeline
# with the highest validation accuracy is kept in `best_model`.

# Reset the bookkeeping here as well, so re-running this cell on its own
# starts from a clean slate instead of comparing against stale results.
results = {}
best_model = None
best_name = None
best_acc = 0

for name, clf in model.items():
    print("Training:", name)

    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(
            stop_words="english",
            max_features=20000,
            ngram_range=(1, 2),
        )),
        ("clf", clf),
    ])

    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_val)

    acc = accuracy_score(y_val, preds)
    print(f"Validation Accuracy = {acc:.4f}")
    # zero_division=0 reports 0.00 for genres that are never predicted without
    # emitting an undefined-metric warning for each of them.
    print(classification_report(y_val, preds, zero_division=0))

    results[name] = acc

    # Strict '>' keeps the earlier model when two accuracies tie.
    if acc > best_acc:
        best_acc = acc
        best_model = pipeline
        best_name = name

Training: Logistic Regression
Validation Accuracy = 0.5817
               precision    recall  f1-score   support

      action        0.55      0.24      0.33       263
       adult        0.74      0.26      0.39       118
   adventure        0.67      0.12      0.20       155
   animation        0.57      0.04      0.07       100
   biography        0.00      0.00      0.00        53
      comedy        0.52      0.58      0.55      1490
       crime        0.25      0.01      0.02       101
 documentary        0.66      0.87      0.75      2619
       drama        0.54      0.79      0.64      2723
      family        0.57      0.08      0.13       157
     fantasy        0.00      0.00      0.00        65
   game-show        1.00      0.33      0.50        39
     history        0.00      0.00      0.00        49
      horror        0.69      0.56      0.62       441
       music        0.70      0.35      0.47       146
     musical        1.00      0.02      0.04        55
     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Accuracy = 0.5017
               precision    recall  f1-score   support

      action        1.00      0.02      0.03       263
       adult        0.50      0.01      0.02       118
   adventure        0.67      0.03      0.05       155
   animation        0.00      0.00      0.00       100
   biography        0.00      0.00      0.00        53
      comedy        0.54      0.38      0.44      1490
       crime        0.00      0.00      0.00       101
 documentary        0.55      0.91      0.69      2619
       drama        0.44      0.84      0.58      2723
      family        0.00      0.00      0.00       157
     fantasy        0.00      0.00      0.00        65
   game-show        0.00      0.00      0.00        39
     history        0.00      0.00      0.00        49
      horror        0.85      0.15      0.25       441
       music        1.00      0.01      0.03       146
     musical        0.00      0.00      0.00        55
     mystery        0.00      0.00 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Accuracy = 0.5682
               precision    recall  f1-score   support

      action        0.43      0.37      0.40       263
       adult        0.64      0.43      0.52       118
   adventure        0.40      0.22      0.28       155
   animation        0.33      0.15      0.21       100
   biography        0.00      0.00      0.00        53
      comedy        0.52      0.57      0.54      1490
       crime        0.24      0.07      0.11       101
 documentary        0.68      0.81      0.74      2619
       drama        0.56      0.69      0.62      2723
      family        0.31      0.13      0.19       157
     fantasy        0.14      0.05      0.07        65
   game-show        0.82      0.59      0.69        39
     history        0.29      0.04      0.07        49
      horror        0.60      0.62      0.61       441
       music        0.57      0.47      0.52       146
     musical        0.38      0.05      0.10        55
     mystery        0.13      0.03 

In [None]:
# Summarise the validation accuracy of every model, then name the winner.
for model_name, val_accuracy in results.items():
    print(f"{model_name:20s} : {val_accuracy:.4f}")

print(f"Best Model: {best_name} with accuracy = {best_acc:.4f}")

Logistic Regression  : 0.5817
Naive Bayes          : 0.5017
Linear SVC           : 0.5682
Best Model: Logistic Regression with accuracy = 0.5817


In [None]:
# Final evaluation on the test set
X_test = test_df["plot"]
y_test = solution_df["genre"]

# Predictions and labels are compared position by position, so the two files
# must at least contain the same number of rows.
if len(X_test) != len(y_test):
    raise ValueError(
        f"Test descriptions ({len(X_test)}) and solution labels ({len(y_test)}) "
        "have different lengths; they cannot be compared row by row."
    )

# best_model is whichever pipeline scored highest on the validation split.
test_preds = best_model.predict(X_test)

test_acc = accuracy_score(y_test, test_preds)
print(f"Test Accuracy = {test_acc:.4f}")
# zero_division=0 reports 0.00 for genres that are never predicted without
# emitting an undefined-metric warning.
print(classification_report(y_test, test_preds, zero_division=0))

Test Accuracy = 0.5843


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

      action        0.54      0.25      0.34      1314
       adult        0.64      0.19      0.30       590
   adventure        0.72      0.14      0.24       775
   animation        0.53      0.03      0.06       498
   biography        0.00      0.00      0.00       264
      comedy        0.53      0.58      0.56      7446
       crime        0.50      0.02      0.03       505
 documentary        0.66      0.87      0.75     13096
       drama        0.53      0.80      0.64     13612
      family        0.57      0.06      0.11       783
     fantasy        0.75      0.01      0.02       322
   game-show        0.93      0.45      0.61       193
     history        0.00      0.00      0.00       243
      horror        0.67      0.55      0.60      2204
       music        0.72      0.39      0.50       731
     musical        0.50      0.00      0.01       276
     mystery        1.00      0.00      0.01       318
        n

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
