<a href="https://colab.research.google.com/github/Prakriti1103/CampusValleyDataSet/blob/main/campusTask1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm


# Genre vocabulary and fallback label. Neither name is referenced by the code
# in this cell; they are kept because other cells may rely on them.
genre_list = [ 'action', 'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'documentary', 'family', 'fantasy', 'game-show', 'history', 'horror', 'music', 'musical', 'mystery', 'news', 'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show', 'thriller', 'war', 'western' ]

fallback_genre = 'Unknown'

# Location of the training file (Colab upload directory). Kept in one place so
# the notebook can be pointed at another copy of the data without editing the
# loading code.
TRAIN_DATA_PATH = "/content/train_data.txt"

# Load train data
try:
    # read_csv is a single blocking call, so this bar only jumps from 0 to 50
    # once the file has been parsed; it marks completion, not real progress.
    with tqdm(total=50, desc="Loading Train Data") as pbar:
        # A multi-character separator is treated as a regular expression,
        # which is why the (slower) python parsing engine is requested.
        train_data = pd.read_csv(TRAIN_DATA_PATH, sep=':::', header=None, names=['SerialNumber', 'MOVIE_NAME', 'GENRE', 'MOVIE_PLOT'], engine='python')
        pbar.update(50)
except Exception as e:
    # Report the problem, then re-raise so the notebook stops instead of
    # continuing without `train_data`.
    print(f"Error loading train_data: {e}")
    raise

# Preprocess train data
# Vectorised lowercasing; equivalent to applying str.lower() element by element.
X_train = train_data['MOVIE_PLOT'].astype(str).str.lower()
# Each GENRE cell becomes a list of stripped labels. A missing genre is filled
# with '' and therefore yields an empty label list (an all-zero target row).
genre_labels = train_data['GENRE'].str.split(',').fillna('').apply(lambda x: [genre.strip() for genre in x])
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(genre_labels)

# TF-IDF vectorization
# Only the vectorizer is configured here. It is fitted, and X_train_tfidf is
# built, by the "Vectorizing Training Data" step that follows; fitting it here
# as well would repeat the same work and be overwritten immediately.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', token_pattern=r'\b\w{2,}\b')


Loading Train Data: 100%|██████████| 50/50 [00:00<00:00, 599.71it/s]


In [None]:
# Learn the TF-IDF vocabulary from the training plots and build the sparse
# feature matrix. The bar is advanced in one step after the call returns.
with tqdm(desc="Vectorizing Training Data", total=50) as vectorize_bar:
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    vectorize_bar.update(50)

# Train Random Forest Classifier
# MultiOutputClassifier fits one copy of the base forest per column of y_train;
# random_state pins the forests so a re-run reproduces the same model.
with tqdm(desc="Training Model", total=50) as training_bar:
    rf_classifier = RandomForestClassifier(random_state=42, n_estimators=50)
    multi_output_classifier = MultiOutputClassifier(estimator=rf_classifier)
    multi_output_classifier.fit(X_train_tfidf, y_train)
    training_bar.update(50)

Vectorizing Training Data: 100%|██████████| 50/50 [00:01<00:00, 49.55it/s]
Training Model: 100%|██████████| 50/50 [00:49<00:00,  1.01it/s]


In [None]:
# Show the loaded training frame as the cell's output for a visual sanity check
# of the parsed columns (SerialNumber, MOVIE_NAME, GENRE, MOVIE_PLOT).
train_data

Unnamed: 0,SerialNumber,MOVIE_NAME,GENRE,MOVIE_PLOT
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...,...
9555,9556,No Stone Unturned (2017),documentary,Alex Gibney reopens the mysterious unsolved c...
9556,9557,La vraie nature de Bernadette (1972),drama,A woman imbued with naturalistic and libertar...
9557,9558,The Comedy of Errors (2012),comedy,Mendocino College Theatre Department presents...
9558,9559,Beyond Good & Evil (2008),drama,Chris Peterson is an aging homicide detective...


In [None]:
# Preprocess test data
# NOTE(review): the "test" inputs are taken from `train_data`, the same frame
# the model was fitted on. Every score reported below is therefore a
# training-set (resubstitution) score and says nothing about generalisation.
# Point these two lines at a held-out frame to obtain a real test score; the
# rest of the cell then evaluates that data without further changes.
X_test = train_data['MOVIE_PLOT'].astype(str).str.lower()
genre_labels_test = train_data['GENRE'].str.split(',').fillna('').apply(lambda x: [genre.strip() for genre in x])

# Encode the true labels with the binarizer fitted on the training labels so
# the columns line up with the model's outputs.
y_test = mlb.transform(genre_labels_test)

# TF-IDF vectorization for test data
# transform (not fit_transform): reuse the vocabulary learned during training.
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Predict on test data
# Predictions are computed once and reused for every metric below; asking the
# model to predict again inside each metric call repeats the same slow pass.
with tqdm(total=50, desc="Predicting on Test Data") as pbar:
    y_pred = multi_output_classifier.predict(X_test_tfidf)
    pbar.update(50)

# Inverse transform predictions to genre labels
predicted_genres = mlb.inverse_transform(y_pred)

# Evaluate model performance
# accuracy_score on a label-indicator matrix is subset accuracy: a row counts
# as correct only if every label in it matches. The other metrics are
# micro-averaged over all (sample, label) pairs.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
f1 = f1_score(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")

Predicting on Test Data: 100%|██████████| 50/50 [00:05<00:00,  8.45it/s]


Accuracy: 99.12%
Precision: 1.00
Recall: 0.99
F1-score: 1.00
