In [37]:
import pandas as pd
import numpy as np

In [38]:
# Input files: one row per movie (title, overview, genre_ids) and the genre
# lookup table whose 'id' / 'name' columns are used to build the id -> name map.
overview_csv = './Data/movies_overview.csv'
genres_csv = './Data/movies_genres.csv'

data_overview = pd.read_csv(overview_csv)
data_tags = pd.read_csv(genres_csv)

# Last expression of the cell, so the notebook renders the first rows.
data_overview.head()

Unnamed: 0,title,overview,genre_ids
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"[18, 80]"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[18, 80]"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[18, 80]"
3,Schindler's List,The true story of how businessman Oskar Schind...,"[18, 36, 10752]"
4,12 Angry Men,The defense and the prosecution have rested an...,[18]


In [39]:
# Build the genre-id -> genre-name lookup from the genres table.
# A bare `data_tags.head()` used to precede this line; a notebook only displays
# the LAST expression of a cell, so that call computed a preview and threw it
# away. It is removed rather than kept as a silent no-op.
tag_map = dict(zip(data_tags['id'], data_tags['name']))
print(tag_map)

{28: 'Action', 12: 'Adventure', 16: 'Animation', 35: 'Comedy', 80: 'Crime', 99: 'Documentary', 18: 'Drama', 10751: 'Family', 14: 'Fantasy', 36: 'History', 27: 'Horror', 10402: 'Music', 9648: 'Mystery', 10749: 'Romance', 878: 'Science Fiction', 10770: 'TV Movie', 53: 'Thriller', 10752: 'War', 37: 'Western'}


In [40]:
import ast

In [41]:
# When the genre lists are still text (e.g. "[18, 80]"), parse them into real
# Python lists. Inspecting the first row makes the cell safe to re-run: once the
# column has been parsed its entries are no longer strings and the step is skipped.
first_genre_entry = data_overview['genre_ids'].iloc[0]
genre_ids_still_text = isinstance(first_genre_entry, str)
if genre_ids_still_text:
    data_overview['genre_ids'] = data_overview['genre_ids'].apply(ast.literal_eval)

In [42]:
def ids_to_names(tag_id_str):
    """Translate genre ids into genre names using the module-level ``tag_map``.

    Args:
        tag_id_str: Iterable of genre ids. Despite the parameter name, the
            notebook passes the already-parsed lists from ``genre_ids``.

    Returns:
        list: One name per id, in input order. Ids that are not present in
        ``tag_map`` are replaced by the placeholder ``"UNKNOWN"``.
    """
    names = []
    for genre_id in tag_id_str:
        names.append(tag_map.get(genre_id, "UNKNOWN"))
    return names
    

In [43]:
# Add a 'tag_names' column: each movie's list of genre ids mapped to genre names.
data_overview['tag_names'] = data_overview['genre_ids'].apply(ids_to_names)

In [44]:
# Preview the first rows; as the cell's last expression the frame is displayed.
data_overview.head()

Unnamed: 0,title,overview,genre_ids,tag_names
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"[18, 80]","[Drama, Crime]"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[18, 80]","[Drama, Crime]"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[18, 80]","[Drama, Crime]"
3,Schindler's List,The true story of how businessman Oskar Schind...,"[18, 36, 10752]","[Drama, History, War]"
4,12 Angry Men,The defense and the prosecution have rested an...,[18],[Drama]


In [45]:
def clean_text(text):
    """Normalise a text field by lower-casing it.

    Args:
        text: The string to normalise. Missing values (``None``, float ``NaN``,
            ``pd.NA``) are also accepted.

    Returns:
        str: The lower-cased text, or ``""`` when the value is missing.

    Raises:
        AttributeError: If ``text`` is a non-missing value without ``.lower()``.
    """
    # Empty CSV cells are loaded as float NaN, which has no ``.lower()``. Map
    # missing values to an empty string so a single blank overview or title
    # cannot abort the whole ``Series.apply``. NaN is detected via ``x != x``.
    if text is None or text is pd.NA or (isinstance(text, float) and text != text):
        return ""
    return text.lower()

In [58]:
# Lower-case both free-text columns in place: the overview first, then the title.
for text_column in ('overview', 'title'):
    data_overview[text_column] = data_overview[text_column].apply(clean_text)

In [47]:
# Preview the first rows again to check the lower-cased title and overview text.
data_overview.head()

Unnamed: 0,title,overview,genre_ids,tag_names
0,the shawshank redemption,imprisoned in the 1940s for the double murder ...,"[18, 80]","[Drama, Crime]"
1,the godfather,"spanning the years 1945 to 1955, a chronicle o...","[18, 80]","[Drama, Crime]"
2,the godfather part ii,in the continuing saga of the corleone crime f...,"[18, 80]","[Drama, Crime]"
3,schindler's list,the true story of how businessman oskar schind...,"[18, 36, 10752]","[Drama, History, War]"
4,12 angry men,the defense and the prosecution have rested an...,[18],[Drama]


In [48]:
from sklearn.preprocessing import MultiLabelBinarizer

# Turn each movie's list of genre names into a 0/1 indicator row, one column per
# genre; the column labels come from the classes the binarizer learned.
mlb = MultiLabelBinarizer()
label_matrix = mlb.fit_transform(data_overview['tag_names'])
y = pd.DataFrame(label_matrix, columns=mlb.classes_)

print(y)

      Action  Adventure  Animation  Comedy  Crime  Drama  Family  Fantasy  \
0          0          0          0       0      1      1       0        0   
1          0          0          0       0      1      1       0        0   
2          0          0          0       0      1      1       0        0   
3          0          0          0       0      0      1       0        0   
4          0          0          0       0      0      1       0        0   
...      ...        ...        ...     ...    ...    ...     ...      ...   
9975       1          1          0       1      0      0       0        1   
9976       0          0          0       0      1      0       0        0   
9977       0          0          0       0      0      1       0        0   
9978       1          1          0       0      0      0       0        0   
9979       0          0          0       1      0      0       0        0   

      History  Horror  Music  Mystery  Romance  Science Fiction  TV Movie  

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Text -> TF-IDF features (unigrams and bigrams, English stop words removed,
# vocabulary capped at 20,000 terms) -> one binary logistic regression per genre.
pipe = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=20000, ngram_range=(1,2), stop_words='english')),
        ('clf', OneVsRestClassifier(
            # The 'saga' solver shuffles the data, so the seed is pinned to make
            # the reported scores repeatable from run to run.
            LogisticRegression(solver='saga', max_iter=1000, random_state=42),
            # Parallelise across the per-genre estimators. n_jobs on the inner
            # LogisticRegression only parallelises over classes, and each inner
            # problem here is binary, so it gave no speed-up there.
            n_jobs=-1,
        ))
    ])

In [60]:
# Two candidate model inputs: the overview alone, and the title followed by the overview.
x = data_overview['overview']

# Trim whitespace around the title, then join it to the overview with a single space.
title_stripped = data_overview['title'].str.strip()
xt = title_stripped + ' ' + data_overview['overview']

print(x)


0       imprisoned in the 1940s for the double murder ...
1       spanning the years 1945 to 1955, a chronicle o...
2       in the continuing saga of the corleone crime f...
3       the true story of how businessman oskar schind...
4       the defense and the prosecution have rested an...
                              ...                        
9975    two brothers have half of a powerful ancient c...
9976    a rabid film fan stalks his favorite action he...
9977    18-year-old penny cooper spent years pining fo...
9978    four young outsiders teleport to a dangerous u...
9979    carlo, on vacation in south africa with his se...
Name: overview, Length: 9980, dtype: object


In [55]:
from sklearn.model_selection import train_test_split

# Two-stage split of the overview-only input: hold out 20% as the test set, then
# hold out 20% of the remainder as the validation set. Both stages use seed 42.
split_options = dict(test_size=0.2, random_state=42)
x_temp, x_test, y_temp, y_test = train_test_split(x, y, **split_options)
x_train, x_val, y_train, y_val = train_test_split(x_temp, y_temp, **split_options)


In [None]:
from sklearn.metrics import f1_score, hamming_loss

# Experiment 1: fit and score the pipeline on the overview text alone.
print("Only overview")
print(f"Train set size: {len(x_train)}")
print(f"Validation set size: {len(x_val)}")
print(f"Test set size: {len(x_test)}")
print(f"Number of classes: {len(y.columns)}")

pipe.fit(x_train, y_train)

y_valid_pred = pipe.predict(x_val)
y_test_pred = pipe.predict(x_test)

# Averaging modes, in the order the two F1 variables below are unpacked.
f1_averages = ('micro', 'macro')

# Validation split: micro/macro F1 and Hamming loss.
val_micro_f1, val_macro_f1 = (
    f1_score(y_val, y_valid_pred, average=averaging) for averaging in f1_averages
)
val_hamming_loss = hamming_loss(y_val, y_valid_pred)

print(f"Validation Micro F1: {val_micro_f1}")
print(f"Validation Macro F1: {val_macro_f1}")
print(f"Validation Hamming Loss: {val_hamming_loss}")

# Test split: the same three metrics.
test_micro_f1, test_macro_f1 = (
    f1_score(y_test, y_test_pred, average=averaging) for averaging in f1_averages
)
test_hamming_loss = hamming_loss(y_test, y_test_pred)
print(f"Test Micro F1: {test_micro_f1}")
print(f"Test Macro F1: {test_macro_f1}")
print(f"Test Hamming Loss: {test_hamming_loss}")

Validation Micro F1: 0.3716971094165632
Validation Macro F1: 0.18273811267482173
Validation Hamming Loss: 0.12325193070340221
Test Micro F1: 0.3874404261283992
Test Macro F1: 0.19566661015775558
Test Hamming Loss: 0.12163215319527945


In [62]:
# Two-stage split of the title + overview input, using the same proportions and
# seed as the overview-only split (20% test, then 20% of the rest as validation).
xt_split_options = dict(test_size=0.2, random_state=42)
xt_temp, xt_test, yt_temp, yt_test = train_test_split(xt, y, **xt_split_options)
xt_train, xt_val, yt_train, yt_val = train_test_split(xt_temp, yt_temp, **xt_split_options)
print("Title + Overview")

Title + Overview


In [63]:
# Experiment 2: refit the same pipeline on title + overview and score it.
print("Title + Overview")
print(f"Train set size: {len(xt_train)}")
print(f"Validation set size: {len(xt_val)}")
print(f"Test set size: {len(xt_test)}")

pipe.fit(xt_train, yt_train)

yt_valid_pred = pipe.predict(xt_val)
yt_test_pred = pipe.predict(xt_test)

# Averaging modes, in the order the two F1 variables below are unpacked.
xt_f1_averages = ('micro', 'macro')

# Validation split: micro/macro F1 and Hamming loss.
val_micro_f1_xt, val_macro_f1_xt = (
    f1_score(yt_val, yt_valid_pred, average=averaging) for averaging in xt_f1_averages
)
val_hamming_loss_xt = hamming_loss(yt_val, yt_valid_pred)
print(f"Validation Micro F1 (Title + Overview): {val_micro_f1_xt}")
print(f"Validation Macro F1 (Title + Overview): {val_macro_f1_xt}")
print(f"Validation Hamming Loss (Title + Overview): {val_hamming_loss_xt}")

# Test split: the same three metrics.
test_micro_f1_xt, test_macro_f1_xt = (
    f1_score(yt_test, yt_test_pred, average=averaging) for averaging in xt_f1_averages
)
test_hamming_loss_xt = hamming_loss(yt_test, yt_test_pred)
print(f"Test Micro F1 (Title + Overview): {test_micro_f1_xt}")
print(f"Test Macro F1 (Title + Overview): {test_macro_f1_xt}")
print(f"Test Hamming Loss (Title + Overview): {test_hamming_loss_xt}")
# This code is for multi-label classification of movie tags using TF-IDF and logistic regression.


Title + Overview
Train set size: 6387
Validation set size: 1597
Test set size: 1996
Validation Micro F1 (Title + Overview): 0.3878809230227233
Validation Macro F1 (Title + Overview): 0.1972600435663337
Validation Hamming Loss (Title + Overview): 0.12088638419258331
Test Micro F1 (Title + Overview): 0.3990492170022371
Test Macro F1 (Title + Overview): 0.20620775511251552
Test Hamming Loss (Title + Overview): 0.11962814517924739


In [None]:
# Disabled experiment: the code below sits inside a bare string literal, so none
# of it executes (running the cell only evaluates the string itself).
# NOTE(review): x_train is the raw overview text; RandomForestClassifier expects
# numeric features, so this presumably needs a vectorising step before it can be
# re-enabled — confirm.
'''
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

cls = MultiOutputClassifier(RandomForestClassifier())
cls.fit(x_train, y_train)
'''

In [None]:
# Disabled (string literal, not executed): validation predictions from `cls`,
# wrapped in a DataFrame that reuses y's column labels.
'''
y_valid_pred = cls.predict(x_val)
y_valid_pred_f = pd.DataFrame(y_valid_pred, columns=y.columns)
'''

In [None]:
# Disabled (string literal, not executed): prints one classification_report per
# label column for the validation split.
'''
from sklearn.metrics import classification_report

print("📊 Validation Results:")
for col in y.columns:
    print(f"\n--- {col} ---")
    print(classification_report(y_val[col], y_valid_pred_f[col]))
    '''

In [None]:
# Disabled (string literal, not executed): predicts on the test split with `cls`
# and prints one classification_report per label column, selecting each
# prediction column by the label's position in y.columns.
'''
y_test_pred = cls.predict(x_test)
print("📊 Test Results:")
for col in y.columns:
    print(f"\n--- {col} ---")
    print(classification_report(y_test[col], y_test_pred[:, y.columns.get_loc(col)]))
'''

In [None]:
# Disabled (string literal, not executed): accuracy_score of the validation
# predictions against y_val, reported as exact-match accuracy.
'''
from sklearn.metrics import accuracy_score

# Label-wise exact match
print("Exact Match Accuracy (Validation):", accuracy_score(y_val, y_valid_pred))
'''

In [None]:
# Disabled (string literal, not executed): saves the fitted `cls` model to
# "multi_label_model.pkl" with joblib.
'''
import joblib
joblib.dump(cls, "multi_label_model.pkl")
#Would Comments on what to do next
'''


In [None]:
#review