# Movie Genre Prediction


> Presented by: Tanisha Verma


Aim: Predict the genre of a movie based on its plot summary and other features.

Importing Libraries

In [30]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

from tqdm import tqdm

List of genres from the training dataset

In [31]:
# Known genre labels, kept in alphabetical order.
# NOTE(review): `Genre` is not referenced by any other cell visible in this
# notebook; only `Unknown_Genre` is used (as the fallback label for movies
# for which the model predicts no genre).
Genre = [
    'action',
    'adult',
    'adventure',
    'animation',
    'biography',
    'comedy',
    'crime',
    'documentary',
    'family',
    'fantasy',
    'game-show',
    'history',
    'horror',
    'music',
    'musical',
    'mystery',
    'news',
    'reality-tv',
    'romance',
    'sci-fi',
    'short',
    'sport',
    'talk-show',
    'thriller',
    'war',
    'western',
]

# Placeholder written out when a movie receives an empty prediction.
Unknown_Genre = 'Unknown'

Load Training Dataset

In [32]:
# Location and column layout of the labelled training file (it has no header row).
train_path = '/content/drive/MyDrive/DataSets/Movie Genre Prediction/train_data (1).txt'
train_columns = ['SerialNumber', 'MOVIE_NAME', 'GENRE', 'MOVIE_PLOT']

try:
    # The bar is advanced once, in a single step, after the read has finished.
    with tqdm(total=50, desc="Loading Train Data") as progress:
        # A multi-character separator is interpreted as a regex by pandas,
        # which is why the python parsing engine is requested explicitly.
        train_data = pd.read_csv(
            train_path,
            sep=':::',
            header=None,
            names=train_columns,
            engine='python',
        )
        progress.update(50)
except Exception as load_error:
    # Report the failure, then re-raise so the notebook stops here.
    print(f"Error loading train_data: {load_error}")
    raise

Loading Train Data: 100%|██████████| 50/50 [00:02<00:00, 19.32it/s]


PreProcessing training data

In [33]:
# Plot summaries as lower-cased strings (non-string cells are stringified first).
X_train = train_data['MOVIE_PLOT'].astype(str).str.lower()

# Turn each GENRE cell into a list of clean labels.
# The file is parsed with the bare separator ':::', so any spaces around the
# delimiter stay inside the field. Splitting on ', ' alone would keep that
# padding in the label text (and therefore in mlb.classes_ and in the written
# predictions), so every label is stripped and empty fragments are dropped.
genre_labels = [
    [label.strip() for label in genre.split(',') if label.strip()]
    for genre in train_data['GENRE']
]

# One binary indicator column per distinct genre label.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(genre_labels)

In [34]:
# Vocabulary cap for the bag-of-words features: only the most frequent terms are kept.
MAX_FEATURES = 5000
count_vectorizer = CountVectorizer(max_features=MAX_FEATURES)

In [35]:
# Learn the vocabulary from the training plots and build their term-count
# matrix; the bar is advanced in one step once that has finished.
with tqdm(total=50, desc="Vectorizing Training Data") as progress:
    X_train_count = count_vectorizer.fit_transform(X_train)
    progress.update(50)

Vectorizing Training Data: 100%|██████████| 50/50 [00:05<00:00,  9.20it/s]


In [36]:
# MultiOutputClassifier fits one copy of the base estimator per column of
# y_train, i.e. one Naive Bayes model per genre indicator.
naive_bayes = MultinomialNB()
multi_output_classifier = MultiOutputClassifier(naive_bayes)

# Fit on the training term counts; the bar is advanced in one step afterwards.
with tqdm(total=50, desc="Training Model") as progress:
    multi_output_classifier.fit(X_train_count, y_train)
    progress.update(50)

Training Model: 100%|██████████| 50/50 [00:00<00:00, 76.87it/s]


Load Test Dataset

In [37]:
# Location and column layout of the test file (no header row, no GENRE column).
test_path = '/content/drive/MyDrive/DataSets/Movie Genre Prediction/test_data (1).txt'
test_columns = ['SerialNumber', 'MOVIE_NAME', 'MOVIE_PLOT']

try:
    # The bar is advanced once, in a single step, after the read has finished.
    with tqdm(total=50, desc="Loading Test Data") as progress:
        # A multi-character separator is interpreted as a regex by pandas,
        # which is why the python parsing engine is requested explicitly.
        test_data = pd.read_csv(
            test_path,
            sep=':::',
            header=None,
            names=test_columns,
            engine='python',
        )
        progress.update(50)
except Exception as load_error:
    # Report the failure, then re-raise so the notebook stops here.
    print(f"Error loading test_data: {load_error}")
    raise

Loading Test Data: 100%|██████████| 50/50 [00:00<00:00, 57.20it/s]


PreProcessing Test Dataset

In [38]:
# Same text normalisation as the training plots: stringify, then lower-case.
X_test = test_data['MOVIE_PLOT'].astype(str).map(str.lower)

Applying the Transformation

In [39]:
# Only transform() here: the test plots are encoded with the vocabulary that
# was learned from the training plots, so both matrices share columns.
with tqdm(total=50, desc="Vectorizing Test Data") as progress:
    X_test_count = count_vectorizer.transform(X_test)
    progress.update(50)

Vectorizing Test Data: 100%|██████████| 50/50 [00:00<00:00, 80.60it/s]


Predict Genre

In [40]:
# Binary genre-indicator predictions for every test plot; the bar is advanced
# in one step once prediction has finished.
with tqdm(total=50, desc="Predicting on Test Data") as progress:
    y_pred = multi_output_classifier.predict(X_test_count)
    progress.update(50)

Predicting on Test Data: 100%|██████████| 50/50 [00:00<00:00, 820.30it/s]


Create DataFrame for Test data

In [41]:
def _with_unknown_fallback(genres):
    """Return `genres` unchanged, or `[Unknown_Genre]` when it is empty."""
    if len(genres) == 0:
        return [Unknown_Genre]
    return genres


test_movie_names = test_data['MOVIE_NAME']

# Map each row of the indicator matrix back to its genre labels.
predicted_genres = mlb.inverse_transform(y_pred)

test_results = pd.DataFrame(
    {
        'MOVIE_NAME': test_movie_names,
        'PREDICTED_GENRES': predicted_genres,
    }
)

# Movies for which no genre was predicted get the placeholder label.
test_results['PREDICTED_GENRES'] = test_results['PREDICTED_GENRES'].apply(_with_unknown_fallback)

Writing results to an output text file

In [42]:
# Write one "<movie name> ::: <comma-separated genres>" line per test movie.
# Mode "w" truncates any existing file at this path.
results_path = "/content/drive/MyDrive/DataSets/Movie Genre Prediction/model_evaluation.txt"
with open(results_path, "w", encoding="utf-8") as output_file:
    output_file.writelines(
        f"{title} ::: {', '.join(genres)}\n"
        for title, genres in zip(test_results['MOVIE_NAME'], test_results['PREDICTED_GENRES'])
    )

In [43]:
# Predict on the very matrix the model was fitted on (X_train_count).
# Anything computed from y_train_pred therefore describes training-set fit,
# not performance on unseen data.
y_train_pred = multi_output_classifier.predict(X_train_count)

Calculate Evaluation Metrics

In [44]:
# Both arrays refer to the training set: y_train_pred was predicted from the
# training matrix, so these scores measure fit rather than generalisation.
labels_true, labels_pred = y_train, y_train_pred

# For multilabel indicator input, accuracy_score is subset accuracy: a sample
# counts as correct only if its whole label set matches exactly.
accuracy = accuracy_score(labels_true, labels_pred)

# Micro averaging pools every (sample, genre) decision before scoring.
precision, recall, f1 = (
    scorer(labels_true, labels_pred, average='micro')
    for scorer in (precision_score, recall_score, f1_score)
)

Append evaluation metrics to output file


In [45]:
# Append the metrics to the SAME file the per-movie predictions were written to.
# The previous relative path "model_evaluation.txt" resolved against the
# current working directory, so the metrics ended up in a different file from
# the predictions stored under the Drive dataset folder.
evaluation_path = "/content/drive/MyDrive/DataSets/Movie Genre Prediction/model_evaluation.txt"
with open(evaluation_path, "a", encoding="utf-8") as output_file:
    output_file.write("\n\nModel Evaluation Metrics:\n")
    # Accuracy is reported as a percentage; the other scores stay on a 0-1 scale.
    output_file.write(f"Accuracy: {accuracy * 100:.2f}%\n")
    output_file.write(f"Precision: {precision:.2f}\n")
    output_file.write(f"Recall: {recall:.2f}\n")
    output_file.write(f"F1-score: {f1:.2f}\n")

In [46]:
# Confirmation for the reader; the message names the output file by its bare
# file name only, not by its full path.
print("Model evaluation results and metrics have been saved to 'model_evaluation.txt'.")

Model evaluation results and metrics have been saved to 'model_evaluation.txt'.
