In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

# Closed vocabulary of genre labels handed to MultiLabelBinarizer(classes=...),
# plus the placeholder written out when the model predicts no genre at all.
# 'drama' has to be listed here: it occurs in the training labels (visible in
# the train_data.head() output), and the binarizer ignores any label that is
# not in `classes`, which would leave every drama row with an all-zero target.
# The list is kept in alphabetical order.
genre_list = ['action', 'adult', 'adventure', 'animation', 'biography', 'comedy',
              'crime', 'documentary', 'drama', 'family', 'fantasy', 'game-show',
              'history', 'horror', 'music', 'musical', 'mystery', 'news',
              'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show',
              'thriller', 'war', 'western']
fallback_genre = 'unknown'

# Read the training set: ':::'-separated fields, no header row in the file.
# The tqdm bar is purely cosmetic -- it jumps from 0 to 50 once read_csv returns.
try:
    with tqdm(total=50, desc='Loading Train Data') as progress:
        train_data = pd.read_csv(
            '/content/train_data.txt',
            sep=':::',
            header=None,
            names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'],
            engine='python',
        )
        progress.update(50)
except Exception as load_error:
    # Report the problem, then re-raise: nothing below can run without the data.
    print("Error loading train data:", load_error)
    raise
print(train_data.head())

# Normalise the genre column in place: trim surrounding whitespace, lower-case.
train_data['GENRE'] = train_data['GENRE'].str.strip().str.lower()

# Model input is the lower-cased description text.
X_train = train_data['DESCRIPTION'].str.lower()

# Each genre string becomes a list of labels (split on ', '), which is then
# encoded as a binary indicator matrix with one column per entry of genre_list.
genre_labels = train_data['GENRE'].str.split(', ')
mlb = MultiLabelBinarizer(classes=genre_list)
mlb.fit(genre_labels)
y_train = mlb.transform(genre_labels)

# TF-IDF features from the descriptions, vocabulary capped via max_features=5000.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
with tqdm(total=50, desc='Vectorizing Training Data') as progress:
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    progress.update(50)

# MultinomialNB wrapped in MultiOutputClassifier so that every column of
# y_train (one per genre) is learned as its own binary target.
naive_bayes = MultinomialNB()
multi_output_classifier = MultiOutputClassifier(naive_bayes)
with tqdm(total=50, desc='Training Model') as progress:
    multi_output_classifier.fit(X_train_tfidf, y_train)
    progress.update(50)

# Read the test set: same ':::'-separated layout as the training file, but
# without a GENRE column.
try:
    with tqdm(total=50, desc="Loading Test Data") as progress:
        test_data = pd.read_csv(
            '/content/test_data.txt',
            sep=':::',
            header=None,
            names=['ID', 'TITLE', 'DESCRIPTION'],
            engine='python',
        )
        progress.update(50)
except Exception as load_error:
    # Report the problem, then re-raise so the cell stops here.
    print(f"Error loading test_data: {load_error}")
    raise

# Lower-case the test descriptions and project them onto the vocabulary that
# was learned from the training set (transform only -- no refitting here).
X_test = test_data['DESCRIPTION'].str.lower()
with tqdm(total=50, desc="Vectorizing Test Data") as progress:
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    progress.update(50)

# Binary indicator predictions, one column per genre.
with tqdm(total=50, desc="Predicting on Test Data") as progress:
    y_pred = multi_output_classifier.predict(X_test_tfidf)
    progress.update(50)

# Map the indicator rows back to genre names and pair them with the titles.
test_movie_names = test_data['TITLE']
predicted_genres = mlb.inverse_transform(y_pred)
test_results = pd.DataFrame({'MOVIE_NAME': test_movie_names, 'PREDICTED_GENRES': predicted_genres})


def _genres_or_fallback(genres):
    """Return `genres` unchanged, or [fallback_genre] when nothing was predicted."""
    if genres:
        return genres
    return [fallback_genre]


# Rows for which no genre was predicted get the fallback label instead.
test_results['PREDICTED_GENRES'] = test_results['PREDICTED_GENRES'].apply(_genres_or_fallback)

# One "<title>::: <genre>, <genre>" line per movie.
with open("/content/test_data_solution.txt", "w", encoding="utf-8") as solution_file:
    for movie_name, genres in zip(test_results['MOVIE_NAME'], test_results['PREDICTED_GENRES']):
        genre_str = ', '.join(genres)
        solution_file.write(f"{movie_name}::: {genre_str}\n")

# Score the model on the same rows it was fitted on, so these figures describe
# fit to the training data, not performance on unseen data.
y_train_pred = multi_output_classifier.predict(X_train_tfidf)

# Accuracy plus micro-averaged precision / recall / F1. zero_division=0 makes
# an undefined ratio count as 0 instead of triggering a warning.
accuracy = accuracy_score(y_train, y_train_pred)
precision = precision_score(y_train, y_train_pred, average='micro', zero_division=0)
recall = recall_score(y_train, y_train_pred, average='micro', zero_division=0)
f1 = f1_score(y_train, y_train_pred, average='micro', zero_division=0)

# The file is opened in append mode, so every run adds another metrics section.
report_lines = [
    "\n\nModel Evaluation Metrics:\n",
    f"Accuracy: {accuracy * 100:.2f}%\n",
    f"Precision: {precision:.2f}\n",
    f"Recall: {recall:.2f}\n",
    f"F1-score: {f1:.2f}\n",
]
with open("model_evaluation.txt", "a", encoding="utf-8") as metrics_file:
    metrics_file.writelines(report_lines)

print("Model evaluation results and metrics have been saved to 'model_evaluation.txt'.")


Loading Train Data: 100%|██████████| 50/50 [00:00<00:00, 143.79it/s]


   ID                               TITLE       GENRE  \
0   1       Oscar et la dame rose (2009)       drama    
1   2                       Cupid (1997)    thriller    
2   3   Young, Wild and Wonderful (1980)       adult    
3   4              The Secret Sin (1915)       drama    
4   5             The Unrecovered (2007)       drama    

                                         DESCRIPTION  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  
2   As the bus empties the students for their fie...  
3   To help their unemployed father make ends mee...  
4   The film's title refers not only to the un-re...  


Vectorizing Training Data: 100%|██████████| 50/50 [00:16<00:00,  3.12it/s]
Training Model: 100%|██████████| 50/50 [00:01<00:00, 27.31it/s]
Loading Test Data: 100%|██████████| 50/50 [00:01<00:00, 45.46it/s]
Vectorizing Test Data: 100%|██████████| 50/50 [00:13<00:00,  3.62it/s]
Predicting on Test Data: 100%|██████████| 50/50 [00:01<00:00, 31.32it/s]


Model evaluation results and metrics have been saved to 'model_evaluation.txt'.


In [None]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed random_state
# keeps the split the same from run to run.
# NOTE: X_train_tfidf comes from a vectorizer that was fitted on *all* training
# rows, so the held-out rows already contributed to the TF-IDF vocabulary and
# IDF weights. The scores below are therefore somewhat optimistic.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_tfidf, y_train, test_size=0.2, random_state=42)

# Fit an unfitted copy (same hyper-parameters) on the 80% split. Calling .fit()
# on multi_output_classifier itself would overwrite the model trained on the
# full training set, silently changing what any later or re-run cell predicts.
validation_classifier = clone(multi_output_classifier)
validation_classifier.fit(X_train_split, y_train_split)

# Predict on the validation set
y_val_pred = validation_classifier.predict(X_val_split)

# Validation metrics, micro-averaged; zero_division=0 matches the metric calls
# used for the training-set evaluation.
val_accuracy = accuracy_score(y_val_split, y_val_pred)
val_precision = precision_score(y_val_split, y_val_pred, average='micro', zero_division=0)
val_recall = recall_score(y_val_split, y_val_pred, average='micro', zero_division=0)
val_f1 = f1_score(y_val_split, y_val_pred, average='micro', zero_division=0)

print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")
print(f"Validation Precision: {val_precision:.2f}")
print(f"Validation Recall: {val_recall:.2f}")
print(f"Validation F1-score: {val_f1:.2f}")


Validation Accuracy: 41.10%
Validation Precision: 0.74
Validation Recall: 0.23
Validation F1-score: 0.35
