<a href="https://colab.research.google.com/github/VaibhavR2004/Array/blob/main/MovieGenre.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Movie Genre Classification


Importing the necessary libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

List of genres learnt from the training data


In [None]:
# Reference list of genre labels, in alphabetical order.
# NOTE(review): the visible cells never read `genre_list`; the label set used
# for training is the one learned by the MultiLabelBinarizer.
genre_list = [
    'action', 'adult', 'adventure', 'animation', 'biography', 'comedy',
    'crime', 'documentary', 'family', 'fantasy', 'game-show', 'history',
    'horror', 'music', 'musical', 'mystery', 'news', 'reality-tv',
    'romance', 'sci-fi', 'short', 'sport', 'talk-show', 'thriller',
    'war', 'western',
]


In [None]:
# Placeholder label used for movies for which no genre is predicted.
fallback_genre = "Unknown"

In [None]:
# Load the training file into `train_data`.
# `pd` and `tqdm` come from the notebook's import cell; they are not
# re-imported here so that all dependencies stay declared in one place.
# The progress bar is advanced once, after the read has finished, so it only
# marks completion of the step.
try:
    with tqdm(total=50, desc="Loading Train Data") as pbar:
        train_data = pd.read_csv(
            r"/content/train_data.txt",
            sep=':::',  # multi-character separator, handled by the python engine
            header=None,  # the file has no header row; column names are given below
            names=['SerialNumber', 'MOVIE_NAME', 'GENRE', 'MOVIE_PLOT'],
            engine='python',
        )
        pbar.update(50)
except Exception as e:
    # Report the failure, then re-raise so later cells do not run on missing data.
    print(f"Error loading train_data: {e}")
    raise


Loading Train Data: 100%|██████████| 50/50 [00:00<00:00, 70.91it/s]


## Data preprocessing for the training data


In [None]:
# Plot text: cast every cell to str, then lower-case it.
X_train = train_data['MOVIE_PLOT'].astype(str).str.lower()

# Labels: each GENRE cell is split on commas into a list of labels.
genre_labels = [label_text.split(',') for label_text in train_data['GENRE'].tolist()]

# Encode the label lists as a binary indicator matrix (one column per label).
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(genre_labels)

## TF-IDF Vectorization

In [None]:
# TF-IDF vectorizer whose vocabulary is capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
)

## Fit and transform the training data with a progress bar

In [None]:
# Fit the vectorizer on the training plots and vectorize them in one call.
# The bar jumps straight to its total once the work has finished.
with tqdm(desc="Transforming Training Data", total=50) as pbar:
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    pbar.update(pbar.total)

Transforming Training Data: 100%|██████████| 50/50 [00:16<00:00,  3.05it/s]


## Train a multi-output Naive Bayes classifier using the training data

In [None]:
# Wrap MultinomialNB in MultiOutputClassifier and fit it on the TF-IDF
# features against the binarized labels.
with tqdm(desc="Training Naive Bayes Classifier", total=50) as pbar:
    # `fit` returns the fitted estimator, so construction and training chain.
    multi_output_classifier = MultiOutputClassifier(MultinomialNB()).fit(
        X_train_tfidf, y_train
    )
    pbar.update(pbar.total)

Training Naive Bayes Classifier: 100%|██████████| 50/50 [00:02<00:00, 23.47it/s]


## Load the test dataset from test_data.txt

In [None]:
# Load the test file into `test_data`.
# The test file is read with three columns (no GENRE). The previous version
# pointed at the training file, which has four ':::'-separated fields, so the
# three names were misaligned and predictions were made on training rows.
try:
    with tqdm(total=50, desc="Loading Test Data") as pbar:
        test_data = pd.read_csv(
            r'/content/test_data.txt',
            sep=':::',  # multi-character separator, handled by the python engine
            header=None,  # the file has no header row; column names are given below
            names=['SerialNumber', 'MOVIE_NAME', 'MOVIE_PLOT'],
            engine='python',
        )
        pbar.update(50)
except Exception as e:
    # Report the failure, then re-raise so later cells do not run on missing data.
    print(f"Error loading test_data: {e}")
    raise

Loading Test Data: 100%|██████████| 50/50 [00:01<00:00, 48.89it/s]


## Data preprocessing for the test data

In [None]:
# Plot text of the test set: cast every cell to str, then lower-case it.
X_test = test_data['MOVIE_PLOT'].astype(str).str.lower()

## Transform the test data with a progress bar


In [None]:
# Vectorize the test plots with the already-fitted vectorizer
# (transform only; the vectorizer is not refitted here).
with tqdm(desc="Transforming Test Data", total=50) as pbar:
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    pbar.update(pbar.total)

Transforming Test Data: 100%|██████████| 50/50 [00:07<00:00,  6.27it/s]


## Predict genres on the test data


In [None]:
# Run the fitted classifier on the test features; `y_pred` is later decoded
# back to genre labels with `mlb.inverse_transform`.
with tqdm(desc="Predicting Genres", total=50) as pbar:
    y_pred = multi_output_classifier.predict(X_test_tfidf)
    pbar.update(pbar.total)

Predicting Genres: 100%|██████████| 50/50 [00:00<00:00, 94.30it/s]


## Create a DataFrame for the test data with movie name and predicted genre

In [None]:
# Decode the binary predictions back into label tuples (empty when nothing
# was predicted for a row).
predicted_genres = mlb.inverse_transform(y_pred)

# Pair each movie title with its predicted labels.
test_movie_names = test_data['MOVIE_NAME']
test_results = pd.DataFrame(
    {
        'MOVIE_NAME': test_movie_names,
        'GENRE': predicted_genres,
    }
)

## Replace empty (unpredicted) genres with the fallback genre

In [None]:
# Rows with no predicted label get a one-element list holding the fallback genre;
# every other row keeps its predicted labels unchanged.
test_results['GENRE'] = test_results['GENRE'].apply(
    lambda genres: genres if len(genres) != 0 else [fallback_genre]
)

# Carry the serial number and plot over from the test data for the output file.
test_results['MOVIE_PLOT'] = test_data['MOVIE_PLOT']
test_results['Serial_Number'] = test_data['SerialNumber']

## Write the result to an output file with proper formatting

In [None]:
# Write one line per movie: serial number, title, comma-joined genres and plot,
# joined by ':::'.
# The serial number is held in `serial_number` rather than `id`, so the builtin
# `id()` is not overwritten in the notebook's global namespace.
with open("test_data_solution.txt", "w", encoding="utf-8") as output_file:
    # Iterate the four columns in lockstep instead of building a Series per
    # row with iterrows(); the text written is the same.
    for serial_number, movie_name, genres, plot in zip(
        test_results['Serial_Number'],
        test_results['MOVIE_NAME'],
        test_results['GENRE'],
        test_results['MOVIE_PLOT'],
    ):
        genre_str = ','.join(genres)
        output_file.write(f"{serial_number}:::{movie_name}:::{genre_str}:::{plot}\n")


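Calculate evaluation metrics using the training labels (as a proxy). These scores are measured on the same data the model was trained on, so they are likely optimistic.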

In [None]:
# Predictions on the training features, compared against `y_train` when scoring.
y_train_pred = multi_output_classifier.predict(
    X_train_tfidf
)

## Calculate evaluation metrics

In [None]:
# Accuracy takes no averaging argument, so it is computed on its own.
accuracy = accuracy_score(y_train, y_train_pred)

# Precision, recall and F1 share the same call shape, all micro-averaged.
precision, recall, f1 = (
    score_fn(y_train, y_train_pred, average='micro')
    for score_fn in (precision_score, recall_score, f1_score)
)

## Print accuracy, precision, recall and F1-score

In [None]:
# Report the evaluation scores, each with two decimal places.
# The recall spec was " 2f" (space sign, width 2, default six decimals) and
# printed " 0.280629"; it is now ".2f" like the other scores. The stray space
# flag in the accuracy spec, which produced a double space, is removed too.
print("\n\nModel Evaluation Metrics:")
print(f"Accuracy: {accuracy*100:.2f}%\n")
print(f"Precision: {precision:.2f}\n")
print(f"Recall: {recall:.2f}\n")
print(f"F1-score: {f1:.2f}\n")



Model Evaluation Metrics:
Accuracy:  27.73%

Precision: 0.72

Recall:  0.280629

F1-score: 0.40

