In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from tqdm import tqdm

In [None]:
# Reference vocabulary of genre names. It is not read by any of the visible
# cells; the labels used for training are taken from the GENRE column.
genre_list = [
  'action', 'adult', 'adventure', 'animation', 'biography', 'comedy',
  'crime', 'documentary', 'family', 'fantasy', 'game-show', 'history',
  'horror', 'music', 'musical', 'mystery', 'news', 'reality-tv', 'romance',
  'sci-fi', 'short', 'sport', 'talk-show', 'thriller', 'war', 'western',
]

In [None]:
# Placeholder label substituted for movies whose predicted genre set is empty.
fallback_genre = "Unknown"

In [None]:
# Read the training file into `train_data`. The file has no header row, so the
# four column names are supplied here.
# NOTE(review): fields keep any whitespace that surrounds ':::' in the file -
# confirm whether the delimiter is padded with spaces.
train_columns = ['SerialNumber', 'MOVIE_NAME', 'GENRE', 'MOVIE_PLOT']
try:
  # The bar is cosmetic: it is advanced to its full total in a single step
  # once read_csv has returned.
  with tqdm(total=50, desc="Loding Train Data") as progress:
    train_data = pd.read_csv(
      'train_data.txt',
      sep=':::',  # multi-character separator, hence engine='python'
      header=None,
      names=train_columns,
      engine='python',
    )
    progress.update(50)
except Exception as load_error:
  # Report the failure, then re-raise the same exception unchanged.
  print(f"ERROR loading train_data: {load_error}")
  raise


Loding Train Data: 100%|██████████| 50/50 [00:00<00:00, 92.01it/s]


In [None]:
# Model inputs: plot text coerced to str (missing values become text rather
# than raising) and lower-cased.
X_train = train_data['MOVIE_PLOT'].astype(str).str.lower()

# Model targets: one list of genre tokens per row, split on commas.
# Tokens are stripped so that padded variants of the same genre (e.g. ' drama'
# vs 'drama') collapse into one class, and empty tokens are dropped. A missing
# GENRE value yields an empty label list instead of raising on `.split`.
genre_labels = [
  [token.strip() for token in str(genre).split(',') if token.strip()]
  for genre in train_data['GENRE'].fillna('')
]

# Encode the label lists as a binary indicator matrix with one column per
# distinct genre. `mlb` is kept so predictions can be mapped back to names.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(genre_labels)

In [None]:
# TF-IDF featurizer with its vocabulary capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5_000)

In [None]:
# Fit the vectorizer on the training plots and encode them in one pass.
# The bar is cosmetic; it jumps to its total after fit_transform returns.
with tqdm(total=50, desc="Vectorizing Training Data") as progress:
  X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
  progress.update(50)


Vectorizing Training Data: 100%|██████████| 50/50 [00:05<00:00,  9.32it/s]


In [None]:
# Wrap a MultinomialNB base estimator in MultiOutputClassifier and fit it on
# the TF-IDF features against the binarized genre matrix.
# The bar is cosmetic; it jumps to its total after fit returns.
with tqdm(total=50, desc="Training Model") as progress:
  naive_bayes = MultinomialNB()
  naive_bayes_classifier = MultiOutputClassifier(naive_bayes).fit(
    X_train_tfidf,
    y_train,
  )
  progress.update(50)

Training Model: 100%|██████████| 50/50 [00:03<00:00, 16.02it/s]


In [None]:
# Read the test file into `test_data`, using the same separator and the same
# four column names as the training file.
# NOTE(review): this assumes the test file also has four ':::'-separated
# fields per row - confirm, otherwise MOVIE_PLOT will not line up with the
# plot text.
test_columns = ['SerialNumber', 'MOVIE_NAME', 'GENRE', 'MOVIE_PLOT']
try:
  # The bar is cosmetic: it is advanced to its full total in a single step
  # once read_csv has returned.
  with tqdm(total=50, desc="Loding Test Data") as progress:
    test_data = pd.read_csv(
      'test_data.txt',
      sep=':::',  # multi-character separator, hence engine='python'
      header=None,
      names=test_columns,
      engine='python',
    )
    progress.update(50)
except Exception as load_error:
  # Report the failure, then re-raise the same exception unchanged.
  print(f"ERROR loading test_data: {load_error}")
  raise

Loding Test Data: 100%|██████████| 50/50 [00:00<00:00, 67.62it/s]


In [None]:
# Test plots, coerced to str and lower-cased the same way as the training plots.
X_test = test_data['MOVIE_PLOT'].astype(str).str.lower()

In [None]:
# Encode the test plots with the already-fitted vectorizer (transform only,
# no refit). The bar is cosmetic; it jumps to its total after transform returns.
with tqdm(total=50, desc="Vectorizing Test Data") as progress:
  X_test_tfidf = tfidf_vectorizer.transform(X_test)
  progress.update(50)

Vectorizing Test Data: 100%|██████████| 50/50 [00:00<00:00, 283.74it/s]


In [None]:
# Predict the genre indicator matrix for the test features.
# The bar is cosmetic; it jumps to its total after predict returns.
with tqdm(total=50, desc="Predicting Test Data") as progress:
  y_pred = naive_bayes_classifier.predict(X_test_tfidf)
  progress.update(50)

Predicting Test Data: 100%|██████████| 50/50 [00:00<00:00, 232.08it/s]


In [None]:
# Convert each predicted indicator row back into its genre names, then pair
# the result with the movie titles in a two-column frame.
predicted_genres = mlb.inverse_transform(y_pred)
test_movie_names = test_data['MOVIE_NAME']
test_results = pd.DataFrame(
  {
    'MOVIE_NAME': test_movie_names,
    'PREDICTED_GENRE': predicted_genres,
  }
)


In [None]:
def _genres_or_fallback(genres):
  """Return `genres` unchanged, or a one-element fallback list when it is empty."""
  if len(genres) != 0:
    return genres
  return [fallback_genre]


# Replace every empty prediction with the fallback label.
test_results['PREDICTED_GENRE'] = test_results['PREDICTED_GENRE'].apply(_genres_or_fallback)

In [None]:
# Write one "<movie name>:::<comma-separated genres>" line per test movie.
# Mode "w" starts the report afresh. The file name is 'model_evaluation.txt',
# the same file the metrics are appended to and the final message reports;
# the earlier spelling 'model_evalution.txt' sent predictions to a separate file.
with open("model_evaluation.txt", "w", encoding="utf-8") as output_file:
  for movie_name, genres in zip(test_results['MOVIE_NAME'], test_results['PREDICTED_GENRE']):
    genre_str = ','.join(genres)
    output_file.write(f"{movie_name}:::{genre_str}\n")

In [None]:
# Predict on the same training features the classifier was fitted on; these
# predictions feed the metric calculations.
y_train_pred = naive_bayes_classifier.predict(X_train_tfidf)

In [None]:
# Score the training-set predictions against the training labels.
# NOTE(review): both arguments come from the training data, so these are
# fit-quality numbers rather than held-out performance.
# For a multilabel indicator matrix, accuracy_score is subset accuracy: a row
# counts as correct only when every label matches.
accuracy = accuracy_score(y_train, y_train_pred)

# Macro averaging gives every genre column equal weight. zero_division=0 scores
# an undefined per-label ratio (e.g. a genre that is never predicted) as 0
# explicitly, which is the value the default produces but without the warning.
precision = precision_score(y_train, y_train_pred, average='macro', zero_division=0)
recall = recall_score(y_train, y_train_pred, average='macro', zero_division=0)
f1 = f1_score(y_train, y_train_pred, average='macro', zero_division=0)

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Append the metric summary to the report file. Accuracy is written as a
# percentage; the other three are written as plain two-decimal values.
with open("model_evaluation.txt", "a", encoding="utf-8") as report:
  report.writelines([
    "\n\n model evaluation metrices:\n",
    f"accuracy: {accuracy * 100:.2f}%\n",
    f"precision: {precision:.2f}\n",
    f"recall: {recall:.2f}\n",
    f"f1: {f1:.2f}\n",
  ])

In [None]:
# Tell the user which file holds the report.
completion_message = "model evaluation results and metrices have been saved to 'model_evaluation.txt' ."
print(completion_message)

model evaluation results and metrices have been saved to 'model_evaluation.txt' .
