In [98]:
#import all the nessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

In [99]:
#creating genere for all types
genre_list = [ 'action', 'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'documentary', 'family', 'fantasy', 'game-show', 'history', 'horror', 'music', 'musical', 'mystery', 'news', 'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show', 'thriller', 'war', 'western' ]


In [100]:
fallback_genre = 'Unknown'

In [101]:

#reading the trained data

df = pd.read_csv('/content/train_data.txt', usecols=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [102]:
df.head()

Unnamed: 0,1 ::: Oscar et la dame rose (2009) ::: drama ::: Listening in to a conversation between his doctor and parents,10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious,he refuses to speak to anyone except straight-talking Rose,the lady in pink he meets on the hospital stairs. As Christmas approaches,Rose uses her fantastical experiences as a professional wrestler,her imagination,wit and charm to allow Oscar to live life and love to the full,in the company of his friends Pop Corn,Einstein,Bacon and childhood sweetheart Peggy Blue.
0,2 ::: Cupid (1997) ::: thriller ::: A brother ...,,,,,,,,,
1,3 ::: Young,Wild and Wonderful (1980) ::: adult ::: As th...,little does the tour guide suspect that the s...,during the lecture films,the coeds drift into dreams of the most eroti...,they release the emotion of the fantasies in ...,but as the bus departs,everyone admits it was quite an education.,,
2,4 ::: The Secret Sin (1915) ::: drama ::: To h...,Edith and her twin sister Grace work as seams...,Grace falls prey to the temptations of Chinat...,a condition worsened by a misguided physician...,the family enjoys a new prosperity and the si...,a fellow oil prospector. To Grace's shock,Jack falls in love with Edith and in her jeal...,Grace tells Jack that Edith,not she,has a drug problem. Hinting that her sister w...
3,5 ::: The Unrecovered (2007) ::: drama ::: The...,but also to the state of the nation at large....,The Unrecovered examines the effect of terror...,the way a state of heightened anxiety and/or ...,empathy,and paranoia relate to one another in the wak...,,,,
4,6 ::: Quality Control (2011) ::: documentary :...,over a two day period,in a dry cleaners facility in Pritchard,Alabama,near Mobile,Quality Control exhibits the acts as well the...,"in Everson's words ""the fine folks of Alabama...",in form and rhythm,to certain scenarios in Everson's award-winni...,including Erie (IFFR 2010) and in thematic co...


**Loading Train Data**


In [103]:
#training the data with tqdm
try:
    with tqdm(total=50, desc="Loading Train Data") as pbar:
        train_data = pd.read_csv('/content/train_data.txt', sep=':::', header=None, names=['SerialNumber', 'MOVIE_NAME', 'GENRE', 'MOVIE_PLOT'], engine='python')
        pbar.update(50)
except Exception as e:
    print(f"Error loading train_data: {e}")
    raise

Loading Train Data: 100%|██████████| 50/50 [00:00<00:00, 102.92it/s]


In [104]:
X_train = train_data['MOVIE_PLOT'].astype(str).apply(lambda doc: doc.lower())
genre_labels = [genre.split(', ') for genre in train_data['GENRE']]
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(genre_labels)

In [105]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features


**Vectorizing Training Data**

In [106]:
with tqdm(total=100, desc="Vectorizing Training Data") as pbar:
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    pbar.update(100)


Vectorizing Training Data: 100%|██████████| 100/100 [00:06<00:00, 15.39it/s]


In [107]:
with tqdm(total=100, desc="Training Model") as pbar: # training the model with navies bayes classifier
    naive_bayes = MultinomialNB()
    multi_output_classifier = MultiOutputClassifier(naive_bayes)
    multi_output_classifier.fit(X_train_tfidf, y_train)
    pbar.update(100)

Training Model: 100%|██████████| 100/100 [00:00<00:00, 119.47it/s]


**Loading test Data**








In [108]:
try:
    with tqdm(total=100, desc="Loading Test Data") as pbar:
        test_data = pd.read_csv('test_data.txt', sep=':::', header=None, names=['SerialNumber', 'MOVIE_NAME', 'MOVIE_PLOT'], engine='python')
        pbar.update(100)
except Exception as e:
    print(f"Error loading test_data: {e}")
    raise

Loading Test Data: 100%|██████████| 100/100 [00:00<00:00, 193.32it/s]


In [109]:
X_test = test_data['MOVIE_PLOT'].astype(str).apply(lambda doc: doc.lower())


**Vectorizing Test Data**

In [111]:
with tqdm(total=100, desc="Vectorizing Test Data") as pbar:
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    pbar.update(100)

Vectorizing Test Data: 100%|██████████| 100/100 [00:06<00:00, 15.48it/s]


**Predicting Test Data**

In [112]:
with tqdm(total=100, desc="Predicting on Test Data") as pbar:
    y_pred = multi_output_classifier.predict(X_test_tfidf)
    pbar.update(100)

Predicting on Test Data: 100%|██████████| 100/100 [00:00<00:00, 197.58it/s]


In [113]:
test_movie_names = test_data['MOVIE_NAME']
predicted_genres = mlb.inverse_transform(y_pred)
test_results = pd.DataFrame({'MOVIE_NAME': test_movie_names, 'PREDICTED_GENRES': predicted_genres})

In [114]:
test_results['PREDICTED_GENRES'] = test_results['PREDICTED_GENRES'].apply(lambda genres: [fallback_genre] if len(genres) == 0 else genres)


In [115]:
with open("model_classification.txt", "w", encoding="utf-8") as output_file:
    for _, row in test_results.iterrows():
        movie_name = row['MOVIE_NAME']
        genre_str = ', '.join(row['PREDICTED_GENRES'])
        output_file.write(f"{movie_name} ::: {genre_str}\n")

In [116]:
y_train_pred = multi_output_classifier.predict(X_train_tfidf)

In [117]:
accuracy = accuracy_score(y_train, y_train_pred)
precision = precision_score(y_train, y_train_pred, average='micro')
recall = recall_score(y_train, y_train_pred, average='micro')
f1 = f1_score(y_train, y_train_pred, average='micro')


**Saving the model in text file**

In [118]:
with open("model_classification.txt", "a", encoding="utf-8") as output_file:
    output_file.write("\n\nModel Evaluation Metrics:\n")
    output_file.write(f"Accuracy: {accuracy * 100:.2f}%\n")
    output_file.write(f"Precision: {precision:.2f}\n")
    output_file.write(f"Recall: {recall:.2f}\n")
    output_file.write(f"F1-score: {f1:.2f}\n")