# Training baseline model

This notebook shows the implementation of a baseline model for our movie genre classification problem.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import json
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [3]:
# Read the movie table from disk.
path = '../data/movies_data_ready.csv'
df = pd.read_csv(path)

# Each genres entry is a single comma-separated string;
# replace it with the list of individual genre names.
df['genres'] = df['genres'].apply(lambda genre_csv: genre_csv.split(','))
df.head()

Unnamed: 0,title,overview,genres
0,Toy Story,lead woody andys toy live happily room andys b...,"[Animation, Comedy, Family]"
1,Jumanji,sibling judy peter discover enchant board game...,"[Adventure, Fantasy, Family]"
2,Grumpier Old Men,family wed reignites ancient feud next door ne...,"[Romance, Comedy]"
3,Waiting to Exhale,cheat mistreat stepped woman hold breath wait ...,"[Comedy, Drama, Romance]"
4,Father of the Bride Part II,george bank recover daughter wed receives news...,[Comedy]


To keep the title information, we prepend the lowercased title to the overview and then delete the title column.

In [4]:
# Prepend the lowercased title to the overview, then drop the title column.
# The guard makes the cell safe to re-run: once `title` has been removed there
# is nothing left to merge, so the step is skipped instead of raising KeyError.
if 'title' in df.columns:
    # Coerce to str *before* lowercasing so a missing or non-string title
    # cannot raise AttributeError; missing titles contribute an empty prefix.
    titles_lower = df['title'].fillna('').astype(str).str.lower()
    df['overview'] = titles_lower + ' ' + df['overview']
    del df['title']
df.head()

Unnamed: 0,overview,genres
0,toy story lead woody andys toy live happily ro...,"[Animation, Comedy, Family]"
1,jumanji sibling judy peter discover enchant bo...,"[Adventure, Fantasy, Family]"
2,grumpier old men family wed reignites ancient ...,"[Romance, Comedy]"
3,waiting to exhale cheat mistreat stepped woman...,"[Comedy, Drama, Romance]"
4,father of the bride part ii george bank recove...,[Comedy]


In [5]:
# Learn the set of genres and encode every movie's genre list as a
# binary indicator row (the target variable) in one step.
multilabel_binarizer = MultiLabelBinarizer()
y = multilabel_binarizer.fit_transform(df['genres'])

# Show the learned genre classes and how many there are.
genre_classes = multilabel_binarizer.classes_
print(genre_classes)
print('size = ', len(genre_classes))

['Action' 'Adventure' 'Animation' 'Comedy' 'Crime' 'Documentary' 'Drama'
 'Family' 'Fantasy' 'Foreign' 'History' 'Horror' 'Music' 'Mystery'
 'Romance' 'Science Fiction' 'TV Movie' 'Thriller' 'War' 'Western']
size =  20


### Split the data into train, validation, and test sets

In [6]:
# Hold out 15% of all samples as the test set.
testing_size = 0.15
x_train, x_test, y_train, y_test = train_test_split(
    df['overview'],
    y,
    test_size=testing_size,
    random_state=42,
)

# Express the same fraction relative to what remains after the test split,
# so the validation set is carved out of the remaining training data.
validation_size_relative = testing_size / (1 - testing_size)
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=validation_size_relative,
    random_state=42,
)

In [7]:
# Sanity check: the validation and test features/targets all have the same length.
len({len(x_val), len(y_val), len(x_test), len(y_test)}) == 1

True

### Text vectorization

In [8]:
# Ignore terms that appear in more than 80% of the documents and cap the
# vocabulary at 10,000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, max_df=0.8)

# Cast to unicode so every entry handed to the vectorizer is a string.
train_texts = x_train.values.astype('U')
val_texts = x_val.values.astype('U')

# Fit the vocabulary on the training split only, then reuse it for validation.
x_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
x_val_tfidf = tfidf_vectorizer.transform(val_texts)

### Build ML model

In [9]:
# Binary Relevance: one-vs-rest wrapper around a logistic regression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

lr = LogisticRegression()

# Train on the TF-IDF features. fit() returns the fitted wrapper, so the
# cell still ends by displaying the estimator.
clf = OneVsRestClassifier(lr).fit(x_train_tfidf, y_train)
clf



OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [10]:
# Predict genre indicators for the validation split.
y_pred = clf.predict(x_val_tfidf)

# Micro-averaged F1 of the predictions against the validation targets.
f1_score(y_true=y_val, y_pred=y_pred, average='micro')

0.4808521179093386

In [11]:
# Decode the predictions back into genre names and look at one validation sample.
sample_position = 2155
decoded_predictions = multilabel_binarizer.inverse_transform(y_pred)
decoded_predictions[sample_position]

('Comedy',)

In [12]:
# Get per-genre probabilities instead of hard 0/1 decisions.
y_pred_prob = clf.predict_proba(x_val_tfidf)

# Assign a genre whenever its probability reaches the threshold t.
t = 0.25
meets_threshold = y_pred_prob >= t
y_pred_new = meets_threshold.astype(int)

f1_score(y_true=y_val, y_pred=y_pred_new, average="micro")

0.5942203699238797

### Random forest

In [None]:
from skmultilearn.problem_transform import BinaryRelevance

# Depth-limited, seeded random forest used as the base classifier.
forest = RandomForestClassifier(random_state=0, max_depth=10)

# NOTE: this rebinds `clf`, replacing whatever classifier it referred to before.
clf = BinaryRelevance(classifier=forest)
clf.fit(x_train_tfidf, y_train)



In [None]:
# Make predictions for the validation set with the classifier currently bound
# to `clf`. The TF-IDF validation matrix is named `x_val_tfidf`; the previous
# spelling `xval_tfidf` was never defined and raised NameError.
y_pred = clf.predict(x_val_tfidf)

# Micro-averaged F1 of the predictions against the validation targets.
f1_score(y_val, y_pred, average='micro')