In [1]:
# Step 1: Importing All Necessary Libraries :

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm


In [2]:
# Load the Training dataset from train_data.txt
try:
    with tqdm(total=50, desc="Loading Train Data") as pbar:
        train_data = pd.read_csv(r'D:\C_INTERSHIPS\CODSOFT\PROJECTS\MOVIE GENRE CLASSIFICATION\train_data.txt'
, sep=':::', header=None, names=['SerialNumber', 'MOVIE_NAME', 'GENRE', 'MOVIE_PLOT'], engine='python')
        pbar.update(50)
except Exception as e:
    print(f"Error loading train_data: {e}")
    raise

Loading Train Data: 100%|██████████| 50/50 [00:00<00:00, 188.06it/s]


In [3]:
# Data preprocessing for training data
X_train = train_data['MOVIE_PLOT'].astype(str).apply(lambda doc: doc.lower())
genre_labels = [genre.split(', ') for genre in train_data['GENRE']]
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(genre_labels)

In [4]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features

In [5]:
# Fit and transform the training data with progress bar
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [6]:
# Train a MultiOutput Naive Bayes classifier using the training data
naive_bayes = MultinomialNB()
multi_output_classifier = MultiOutputClassifier(naive_bayes)
multi_output_classifier.fit(X_train_tfidf, y_train)

In [7]:
# Load your test dataset from test_data.txt
test_data = pd.read_csv('test_data.txt', sep=':::', header=None, names=['SerialNumber', 'MOVIE_NAME', 'MOVIE_PLOT'], engine='python')


In [8]:
# Data preprocessing for test data
X_test = test_data['MOVIE_PLOT'].astype(str).apply(lambda doc: doc.lower())

In [9]:
# Transform the test data with progress bar
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [10]:
# Predict genres on the test data
y_pred = multi_output_classifier.predict(X_test_tfidf)

In [11]:
fallback_genre = 'Unknown'

In [12]:
# Create a DataFrame for test data with movie names and predicted genres
test_movie_names = test_data['MOVIE_NAME']
predicted_genres = mlb.inverse_transform(y_pred)
test_results = pd.DataFrame({'MOVIE_NAME': test_movie_names, 'PREDICTED_GENRES': predicted_genres})

In [13]:
# Replace empty unpredicted genres with the fallback genre
test_results['PREDICTED_GENRES'] = test_results['PREDICTED_GENRES'].apply(lambda genres: [fallback_genre] if len(genres) == 0 else genres)

In [14]:
# Write the results to an output text file with proper formatting
with open("model_evaluation.txt", "w", encoding="utf-8") as output_file:
    for _, row in test_results.iterrows():
        movie_name = row['MOVIE_NAME']
        genre_str = ', '.join(row['PREDICTED_GENRES'])
        output_file.write(f"{movie_name} ::: {genre_str}\n")


In [15]:
# Calculate evaluation metrics using training labels (as a proxy)
y_train_pred = multi_output_classifier.predict(X_train_tfidf)

In [16]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_train, y_train_pred)
precision = precision_score(y_train, y_train_pred, average='micro')
recall = recall_score(y_train, y_train_pred, average='micro')
f1 = f1_score(y_train, y_train_pred, average='micro')

In [17]:
# Append the evaluation metrics to the output file
with open("model_evaluation.txt", "a", encoding="utf-8") as output_file:
    output_file.write("\n\nModel Evaluation Metrics:\n")
    output_file.write(f"Accuracy: {accuracy * 100:.2f}%\n")
    output_file.write(f"Precision: {precision:.2f}\n")
    output_file.write(f"Recall: {recall:.2f}\n")
    output_file.write(f"F1-score: {f1:.2f}\n")

In [18]:
print("Model evaluation results and metrics have been saved to 'model_evaluation.txt'.")


Model evaluation results and metrics have been saved to 'model_evaluation.txt'.
