In [None]:
# This notebook uses TF-IDF and machine learning models (Logistic Regression, SVM, Naive Bayes) to predict movie genres
# from the combined title and description of movies.

Importing Libraries

In [None]:
import numpy as np
import nltk
import string
import pandas as pd
import joblib
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Fetch the NLTK 'stopwords' corpus so that stopwords.words('english') can be
# read later in the notebook. The call's return value is this cell's output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Cleaning Features

In [None]:
def clean_text(text):
    """Normalize raw text before stop-word removal and vectorization.

    Lower-cases the input, deletes every character listed in
    ``string.punctuation`` and collapses all whitespace runs (including
    leading and trailing whitespace) into single spaces.

    Args:
        text: The raw text. ``None`` and float NaN (the value pandas uses for
            an empty CSV cell) are treated as missing text; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned text, or ``''`` when the input is missing.
    """
    # NaN is the only float that is not equal to itself, so this detects a
    # missing cell without requiring pandas/numpy inside the helper.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower().translate(str.maketrans('', '', string.punctuation))

    # Whitespace is normalized only AFTER punctuation is gone: deleting the
    # "!" in "the end !" would otherwise leave a trailing space behind.
    return ' '.join(text.split())

In [None]:
# Read the training split and the labelled test split from the working directory.
train_df = pd.read_csv('train_data.csv')
test_df = pd.read_csv('test_data_solution.csv')

# Run clean_text over both free-text columns of each split, overwriting the
# raw values in place (train first, then test; Description before Title).
for frame in (train_df, test_df):
    for column in ('Description', 'Title'):
        frame[column] = frame[column].apply(clean_text)

FileNotFoundError: [Errno 2] No such file or directory: 'train_data.csv'

Remove Stopwords

In [None]:
# English stop words as a set, so each membership test below is a hash lookup.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` with every stop word dropped.

    The text is split on whitespace, tokens found in ``stop_words`` are
    discarded, and the remaining tokens are re-joined with single spaces.
    Matching is exact and case-sensitive.

    NOTE(review): NLTK's list presumably contains contractions with
    apostrophes; once clean_text has stripped punctuation those forms can no
    longer match here - confirm whether that matters for this model.
    """
    kept_tokens = [token for token in text.split() if token not in stop_words]
    return ' '.join(kept_tokens)

In [None]:


# Strip stop words from BOTH text columns of each split.
# The prediction cell at the end of the notebook removes stop words from the
# whole "title + description" string, so the titles used for training must be
# filtered the same way; otherwise the vectorizer learns title n-grams
# (e.g. ones starting with "the") that can never occur at inference time.
train_df['Description'] = train_df['Description'].apply(remove_stopwords)
train_df['Title'] = train_df['Title'].apply(remove_stopwords)

test_df['Description'] = test_df['Description'].apply(remove_stopwords)
test_df['Title'] = test_df['Title'].apply(remove_stopwords)

Encoding Dependent Variable

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the genre -> integer mapping from the training labels only, then apply
# that same fitted mapping to both splits so the codes agree between them.
le = LabelEncoder().fit(train_df['Genre'])

y_train = le.transform(train_df['Genre'])
y_test = le.transform(test_df['Genre'])

Encoding Features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer  # text -> numeric TF-IDF features

# One document per movie: the title and the description joined by a single space.
train_df['text'] = train_df['Title'].str.cat(train_df['Description'], sep=' ')
test_df['text'] = test_df['Title'].str.cat(test_df['Description'], sep=' ')

tfidf = TfidfVectorizer(
    ngram_range=(1, 3),   # unigrams, bigrams and trigrams
    min_df=3,             # drop terms found in fewer than 3 documents
    max_df=0.9,           # drop terms found in more than 90% of documents
    max_features=20000,   # cap on the vocabulary size
)

# The vocabulary and IDF weights come from the training text only; the test
# text is projected onto that fitted vocabulary.
X_train = tfidf.fit_transform(train_df['text'])
X_test = tfidf.transform(test_df['text'])

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Hyper-parameters explored by the grid search: regularization strength C
# crossed with two solvers.
lr_param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs', 'saga']
}

# Changes relative to a plain LogisticRegression(multi_class='multinomial', n_jobs=-1):
# * `multi_class` is no longer passed. It is deprecated in recent scikit-learn
#   releases, and both solvers in the grid already fit a multinomial model for
#   a multi-class target by default.
# * `n_jobs` is left off the estimator: GridSearchCV below already fans out
#   over all cores, so nesting a second level of parallelism only
#   oversubscribes the CPU.
# * `random_state` is fixed because 'saga' is a stochastic solver; without a
#   seed the selected parameters and the accuracy can differ between runs.
lr_model = LogisticRegression(max_iter=2000, random_state=42)
lr_grid_search = GridSearchCV(lr_model, lr_param_grid, cv=3, scoring='accuracy', n_jobs=-1)
lr_grid_search.fit(X_train, y_train)

# Best Parameters (best_estimator_ is the winning configuration refit on all of X_train)
lr_best_model = lr_grid_search.best_estimator_
print(f"Best Parameters: {lr_grid_search.best_params_}")

# Predict on the held-out test split and report accuracy
y_pred_lr = lr_best_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy: {accuracy_lr:.4f}")

# Detailed classification report (uncomment for per-genre precision/recall)
# print(classification_report(y_test, y_pred_lr, target_names=le.classes_))



Best Parameters: {'C': 1, 'solver': 'saga'}
Accuracy: 0.5987


Support Vector Machine

In [None]:
from sklearn.svm import LinearSVC

# Only the regularization strength C is tuned for the linear SVM.
svm_param_grid = {
    'C': [0.1, 1, 10]
}

# random_state is fixed so that repeated runs give the same model whenever the
# underlying solver shuffles the data (it has no effect otherwise).
svm_model = LinearSVC(max_iter=1000, random_state=42)
svm_grid_search = GridSearchCV(svm_model, svm_param_grid, cv=3, scoring='accuracy', n_jobs=-1)
svm_grid_search.fit(X_train, y_train)

# Best Parameters (best_estimator_ is the winning configuration refit on all of X_train)
svm_best_model = svm_grid_search.best_estimator_
print(f"Best Parameters: {svm_grid_search.best_params_}")

# Predict
y_pred_svm = svm_best_model.predict(X_test)

# Accuracy
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy: {accuracy_svm:.4f}")

# Detailed classification report (uncomment for per-genre precision/recall)
# print(classification_report(y_test, y_pred_svm, target_names=le.classes_))

Best Parameters: {'C': 0.1}
Accuracy: 0.6003


Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Baseline with default hyper-parameters and no grid search; fit() returns the
# fitted estimator, so construction and training are chained.
nv_model = MultinomialNB().fit(X_train, y_train)

# Score the baseline on the held-out test split.
y_pred_nv = nv_model.predict(X_test)
accuracy_nv = accuracy_score(y_test, y_pred_nv)
print(f"Accuracy: {accuracy_nv:.4f}")

# Detailed classification report (uncomment for per-genre precision/recall)
# print(classification_report(y_test, y_pred_nv, target_names=le.classes_))

Accuracy: 0.5124


Saving Best Model

In [None]:
# Persist everything needed to serve predictions later: the tuned SVM, the
# fitted TF-IDF vectorizer that produces its input features, and the label
# encoder that maps predicted integers back to genre names.
# The files are written to the current working directory.
joblib.dump(svm_best_model, 'svm_genre_prediction_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
joblib.dump(le, 'label_encoder.pkl')

['label_encoder.pkl']

Loading the Best Model (SVM) and Making a Custom Prediction

In [None]:
# Restore the three persisted artifacts (classifier, vectorizer, label encoder)
# in that order. joblib.load unpickles its input, so only load files that this
# notebook itself produced.
model, vectorizer, le = (
    joblib.load(artifact_path)
    for artifact_path in (
        'svm_genre_prediction_model.pkl',
        'tfidf_vectorizer.pkl',
        'label_encoder.pkl',
    )
)

# Ask for free text, then push it through the same steps used on the training
# text: clean -> drop stop words -> TF-IDF -> classify -> decode the label.
text = input("Enter Title and Description:")
text_cleaned = clean_text(text)
text_no_stop = remove_stopwords(text_cleaned)

X_input = vectorizer.transform([text_no_stop])  # a batch containing one document
pred = model.predict(X_input)
genre = le.inverse_transform(pred)

print(genre[0])

Enter Title and Description:Iron man Since that first suit built in a cave, Tony has created dozens of new suits and upgrades over the years. However, throughout the 50-plus Iron Man models, there are common offensive and defense capabilities found in most iterations.  The primary weapon contained within every suit, the repulsor rays use energy pulses to repel and disrupt enemies and are generated through the suit’s gauntlets. The suit’s booster jets enable Stark to fly fast enough to break the sound barrier, and maneuver more quickly than any fighter jet.  Iron Man’s helmet provides Tony with a heads-up display that gives him 360-degree vision, access to information about his surroundings and enemies, and the ability to transmit and block transmissions along any frequency. The helmet also gives Tony a degree of resistance to EMP and psychic-based attacks.  A weapon centered in Iron Man’s chest, the unibeam is capable of projecting dazzling light, and can also be used as a powerful for