In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
data_overview = pd.read_csv('./Data/movies_overview.csv')
data_tags = pd.read_csv('./Data/movies_genres.csv')
data_overview.head()

Unnamed: 0,title,overview,genre_ids
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"[18, 80]"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[18, 80]"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[18, 80]"
3,Schindler's List,The true story of how businessman Oskar Schind...,"[18, 36, 10752]"
4,12 Angry Men,The defense and the prosecution have rested an...,[18]


In [7]:
data_tags.head()
tag_map = dict(zip(data_tags['id'], data_tags['name']))
print(tag_map)

{28: 'Action', 12: 'Adventure', 16: 'Animation', 35: 'Comedy', 80: 'Crime', 99: 'Documentary', 18: 'Drama', 10751: 'Family', 14: 'Fantasy', 36: 'History', 27: 'Horror', 10402: 'Music', 9648: 'Mystery', 10749: 'Romance', 878: 'Science Fiction', 10770: 'TV Movie', 53: 'Thriller', 10752: 'War', 37: 'Western'}


In [16]:
import ast

In [17]:
if isinstance(data_overview['genre_ids'].iloc[0], str):
    data_overview['genre_ids'] = data_overview['genre_ids'].apply(ast.literal_eval)

In [18]:
def ids_to_names(tag_id_str):
    return [tag_map.get(tag_id, "UNKNOWN") for tag_id in tag_id_str]
    

In [19]:
data_overview['tag_names'] = data_overview['genre_ids'].apply(ids_to_names)

In [20]:
data_overview.head()

Unnamed: 0,title,overview,genre_ids,tag_names
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"[18, 80]","[Drama, Crime]"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[18, 80]","[Drama, Crime]"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[18, 80]","[Drama, Crime]"
3,Schindler's List,The true story of how businessman Oskar Schind...,"[18, 36, 10752]","[Drama, History, War]"
4,12 Angry Men,The defense and the prosecution have rested an...,[18],[Drama]


In [22]:
def clean_text(text):
    text = text.lower()
    return text

In [26]:
data_overview['overview'] = data_overview['overview'].apply(clean_text)

In [27]:
data_overview.head()

Unnamed: 0,title,overview,genre_ids,tag_names
0,The Shawshank Redemption,imprisoned in the 1940s for the double murder ...,"[18, 80]","[Drama, Crime]"
1,The Godfather,"spanning the years 1945 to 1955, a chronicle o...","[18, 80]","[Drama, Crime]"
2,The Godfather Part II,in the continuing saga of the corleone crime f...,"[18, 80]","[Drama, Crime]"
3,Schindler's List,the true story of how businessman oskar schind...,"[18, 36, 10752]","[Drama, History, War]"
4,12 Angry Men,the defense and the prosecution have rested an...,[18],[Drama]


In [48]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y = pd.DataFrame(mlb.fit_transform(data_overview['tag_names']), columns=mlb.classes_)

print(y)

      Action  Adventure  Animation  Comedy  Crime  Drama  Family  Fantasy  \
0          0          0          0       0      1      1       0        0   
1          0          0          0       0      1      1       0        0   
2          0          0          0       0      1      1       0        0   
3          0          0          0       0      0      1       0        0   
4          0          0          0       0      0      1       0        0   
...      ...        ...        ...     ...    ...    ...     ...      ...   
9975       1          1          0       1      0      0       0        1   
9976       0          0          0       0      1      0       0        0   
9977       0          0          0       0      0      1       0        0   
9978       1          1          0       0      0      0       0        0   
9979       0          0          0       1      0      0       0        0   

      History  Horror  Music  Mystery  Romance  Science Fiction  TV Movie  

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(data_overview['overview'])
print(x)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 366833 stored elements and shape (9980, 27298)>
  Coords	Values
  (0, 12171)	0.14579400665265557
  (0, 12193)	0.06750699106969224
  (0, 24407)	0.1012447889279244
  (0, 157)	0.16459891381466452
  (0, 9503)	0.14384278709273546
  (0, 7366)	0.13739937343449538
  (0, 16334)	0.09770021474792584
  (0, 17138)	0.05893577017422688
  (0, 11458)	0.1594406338801949
  (0, 26695)	0.0850288548785995
  (0, 1229)	0.05773192049187382
  (0, 11301)	0.05301270013095928
  (0, 14555)	0.12470827717475987
  (0, 25778)	0.1880118279335787
  (0, 2219)	0.1628930954291103
  (0, 1257)	0.15717602695634336
  (0, 7574)	0.4055674641380299
  (0, 2518)	0.0902999682325825
  (0, 16703)	0.06579211250579142
  (0, 14229)	0.06235014305146205
  (0, 1812)	0.06326287820807458
  (0, 21933)	0.20278373206901495
  (0, 18980)	0.21237580996986788
  (0, 26617)	0.08008738504975998
  (0, 11153)	0.05049313568352747
  :	:
  (9978, 7706)	0.14029241708735496
  (9978, 18280)	0.2093926

In [50]:
from sklearn.model_selection import train_test_split

x_temp, x_test, y_temp, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_temp, y_temp, test_size=0.2, random_state=42)


In [51]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

cls = MultiOutputClassifier(RandomForestClassifier())
cls.fit(x_train, y_train)

0,1,2
,estimator,RandomForestClassifier()
,n_jobs,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [53]:
y_valid_pred = cls.predict(x_val)
y_valid_pred_f = pd.DataFrame(y_valid_pred, columns=y.columns)

In [55]:
from sklearn.metrics import classification_report

print("📊 Validation Results:")
for col in y.columns:
    print(f"\n--- {col} ---")
    print(classification_report(y_val[col], y_valid_pred_f[col]))

📊 Validation Results:

--- Action ---
              precision    recall  f1-score   support

           0       0.79      0.99      0.88      1229
           1       0.82      0.14      0.24       368

    accuracy                           0.79      1597
   macro avg       0.81      0.56      0.56      1597
weighted avg       0.80      0.79      0.73      1597


--- Adventure ---
              precision    recall  f1-score   support

           0       0.83      1.00      0.91      1324
           1       1.00      0.01      0.02       273

    accuracy                           0.83      1597
   macro avg       0.92      0.51      0.46      1597
weighted avg       0.86      0.83      0.76      1597


--- Animation ---
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1470
           1       0.80      0.06      0.12       127

    accuracy                           0.92      1597
   macro avg       0.86      0.53      0.54      159

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [57]:
y_test_pred = cls.predict(x_test)
print("📊 Test Results:")
for col in y.columns:
    print(f"\n--- {col} ---")
    print(classification_report(y_test[col], y_test_pred[:, y.columns.get_loc(col)]))


📊 Test Results:

--- Action ---
              precision    recall  f1-score   support

           0       0.79      0.99      0.88      1520
           1       0.85      0.16      0.27       476

    accuracy                           0.79      1996
   macro avg       0.82      0.58      0.57      1996
weighted avg       0.81      0.79      0.73      1996


--- Adventure ---
              precision    recall  f1-score   support

           0       0.84      1.00      0.91      1670
           1       0.83      0.03      0.06       326

    accuracy                           0.84      1996
   macro avg       0.84      0.51      0.49      1996
weighted avg       0.84      0.84      0.77      1996


--- Animation ---
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      1805
           1       1.00      0.07      0.13       191

    accuracy                           0.91      1996
   macro avg       0.96      0.53      0.54      1996
weig

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])

In [58]:
from sklearn.metrics import accuracy_score

# Label-wise exact match
print("Exact Match Accuracy (Validation):", accuracy_score(y_val, y_valid_pred))


Exact Match Accuracy (Validation): 0.053224796493425174


In [59]:
import joblib
joblib.dump(cls, "multi_label_model.pkl")

['multi_label_model.pkl']