# Training baseline model

This notebook shows the implementation of a baseline model for our movie genre classification problem.

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import json
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_multilabel_classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import multilabel_confusion_matrix

Load the data and keep the title information: we will add the title to the overview, and delete the title column. 

In [6]:
path = '../data/movies_data_ready.csv'
df = pd.read_csv(path)
df['genres'] = df['genres'].apply(lambda x: x.split(','))
df['overview'] = df['title'].apply(lambda x: x.lower()).astype(str) + ' ' + df['overview']
del df['title']
df.head()

Unnamed: 0,overview,genres
0,toy story toy story lead woody andys toy live ...,"[Animation, Comedy, Family]"
1,jumanji jumanji sibling judy peter discover en...,"[Adventure, Fantasy, Family]"
2,grumpier old men grumpier old men family wed r...,"[Romance, Comedy]"
3,waiting to exhale waiting to exhale cheat mist...,"[Comedy, Drama, Romance]"
4,father of the bride part ii father of the brid...,[Comedy]


### Split the data in train, validate, test

In [7]:
testing_size = 0.15
x_train, x_test, y_train, y_test = train_test_split(df['overview'], y, test_size=testing_size, random_state=42)
validation_size_relative = testing_size/(1-testing_size)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=validation_size_relative, random_state=42)
len(x_val) == len(y_val) == len(x_test) == len(y_test)

True

### One-hot vector representation and TF-IDF:

In [8]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df['genres'])

# transform target variable
y = multilabel_binarizer.transform(df['genres'])
print(multilabel_binarizer.classes_)
print('size = ', len(multilabel_binarizer.classes_))

['Action' 'Adventure' 'Animation' 'Comedy' 'Crime' 'Documentary' 'Drama'
 'Family' 'Fantasy' 'Foreign' 'History' 'Horror' 'Music' 'Mystery'
 'Romance' 'Science Fiction' 'TV Movie' 'Thriller' 'War' 'Western']
size =  20


In [9]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=100000)
# create TF-IDF features
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train.values.astype('U'))
x_val_tfidf = tfidf_vectorizer.transform(x_val.values.astype('U'))

### ML model : Logistic regression

In [10]:
lr = LogisticRegression(solver='saga', n_jobs=-1, max_iter=1000)
clf = OneVsRestClassifier(lr)

# fit model on train data
clf.fit(x_train_tfidf, y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='warn', n_jobs=-1,
                                                 penalty='l2',
                                                 random_state=None,
                                                 solver='saga', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [11]:
# predict probabilities
y_pred_prob = clf.predict_proba(x_val_tfidf)

t = 0.25 # threshold value
y_pred_new = (y_pred_prob >= t).astype(int)

f1_score(y_val, y_pred_new, average="micro")

0.5903856825749167

In [12]:
precision_recall_fscore_support(y_val, y_pred_new, average='micro')

(0.5540654905279604, 0.6318016479845594, 0.5903856825749167, None)

### ML model: SVC

In [13]:
svc = LinearSVC()
clf = OneVsRestClassifier(svc)

# fit model on train data
clf.fit(x_train_tfidf, y_train)

OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                        fit_intercept=True, intercept_scaling=1,
                                        loss='squared_hinge', max_iter=1000,
                                        multi_class='ovr', penalty='l2',
                                        random_state=None, tol=0.0001,
                                        verbose=0),
                    n_jobs=None)

In [14]:
# make predictions for validation set
y_pred = clf.predict(x_val_tfidf)

f1_score(y_val, y_pred, average='micro')

0.5442039779199159

In [15]:
precision_recall_fscore_support(y_val, y_pred, average='micro')

(0.6639230358097274, 0.4610645089451414, 0.5442039779199159, None)