In [15]:
# ============================================
# Movie Genre Classification using TF-IDF
# Models: Naive Bayes, Logistic Regression, SVM
# Dataset: IMDb Genre Classification (Kaggle)
# ============================================

# --------- 1. Imports ---------
import pandas as pd
import numpy as np
import re
import nltk
import kagglehub
import os

from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, classification_report

# Download stopwords
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

# --------- 2. Download & Load Dataset (kagglehub) ---------
path = kagglehub.dataset_download("hijest/genre-classification-dataset-imdb")
print("Path to dataset files:", path)

# Dataset file (IMDb format)
DATA_PATH = os.path.join(path, "Genre Classification Dataset", "train_data.txt")

df = pd.read_csv(
    DATA_PATH,
    sep=":::",
    engine="python",
    names=["id", "title", "genre", "description"]
)

# Keep required columns only
df = df[['description', 'genre']].dropna()

print("Dataset Shape:", df.shape)
print(df.head())

# --------- 3. Text Preprocessing ---------
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    words = [word for word in text.split() if word not in STOPWORDS]
    return " ".join(words)

df['clean_description'] = df['description'].apply(clean_text)

# --------- 4. Train-Test Split ---------
X = df['clean_description']
y = df['genre']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# --------- 5. TF-IDF Vectorization ---------
tfidf = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 2),
    min_df=2
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("TF-IDF Feature Shape:", X_train_tfidf.shape)

# --------- 6. Model Training & Evaluation ---------
def evaluate_model(model, name):
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    print("\n" + "=" * 50)
    print(f"Model: {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n")
    print(classification_report(y_test, y_pred))
    print("=" * 50)

# ---- Naive Bayes ----
nb_model = MultinomialNB()
evaluate_model(nb_model, "Multinomial Naive Bayes")

# ---- Logistic Regression ----
lr_model = LogisticRegression(max_iter=300, n_jobs=-1)
evaluate_model(lr_model, "Logistic Regression")

# ---- Support Vector Machine ----
svm_model = LinearSVC()
evaluate_model(svm_model, "Linear SVM")

# --------- 7. Sample Prediction ---------
def predict_genre(text):
    text = clean_text(text)
    vector = tfidf.transform([text])
    return svm_model.predict(vector)[0]

sample_text = "A group of astronauts travel through space to save humanity"
print("\nSample Prediction:", predict_genre(sample_text))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using Colab cache for faster access to the 'genre-classification-dataset-imdb' dataset.
Path to dataset files: /kaggle/input/genre-classification-dataset-imdb
Dataset Shape: (54214, 2)
                                         description       genre
0   Listening in to a conversation between his do...      drama 
1   A brother and sister with a past incestuous r...   thriller 
2   As the bus empties the students for their fie...      adult 
3   To help their unemployed father make ends mee...      drama 
4   The film's title refers not only to the un-re...      drama 
TF-IDF Feature Shape: (43371, 50000)

Model: Multinomial Naive Bayes
Accuracy: 0.4688739278797381
Classification Report:

               precision    recall  f1-score   support

      action        0.00      0.00      0.00       263
       adult        1.00      0.01      0.02       118
   adventure        0.00      0.00      0.00       155
   animation        0.00      0.00      0.00       100
   biography        0.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Model: Logistic Regression
Accuracy: 0.5824033938946785
Classification Report:



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

      action        0.57      0.21      0.31       263
       adult        0.74      0.24      0.36       118
   adventure        0.70      0.10      0.18       155
   animation        0.75      0.03      0.06       100
   biography        0.00      0.00      0.00        53
      comedy        0.53      0.59      0.56      1490
       crime        0.50      0.01      0.02       101
 documentary        0.66      0.88      0.75      2619
       drama        0.53      0.80      0.64      2723
      family        0.59      0.06      0.11       157
     fantasy        0.00      0.00      0.00        65
   game-show        1.00      0.38      0.56        39
     history        0.00      0.00      0.00        49
      horror        0.69      0.54      0.61       441
       music        0.66      0.30      0.41       146
     musical        0.00      0.00      0.00        55
     mystery        0.00      0.00      0.00        64
        n

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
