# Training baseline model

This notebook shows the implementation of a baseline model for our movie genre classification problem.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import json
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.datasets import make_multilabel_classification

In [2]:
path = '../data/movies_data_ready.csv'
df = pd.read_csv(path)
df['genres'] = df['genres'].apply(lambda x: x.split(','))
df.head()

Unnamed: 0,title,overview,genres
0,Toy Story,led woody andys toys live happily room andys b...,"[Animation, Comedy, Family]"
1,Jumanji,siblings judy peter discover enchanted board g...,"[Adventure, Fantasy, Family]"
2,Grumpier Old Men,family wedding reignites ancient feud next doo...,"[Romance, Comedy]"
3,Waiting to Exhale,cheated mistreated stepped women holding breat...,"[Comedy, Drama, Romance]"
4,Father of the Bride Part II,george banks recovered daughters wedding recei...,[Comedy]


To keep the title information: we will add the title to the overview, and delete the title column. 

In [3]:
df['overview'] = df['title'].apply(lambda x: x.lower()).astype(str) + ' ' + df['overview']
del df['title']
df.head()

Unnamed: 0,overview,genres
0,toy story led woody andys toys live happily ro...,"[Animation, Comedy, Family]"
1,jumanji siblings judy peter discover enchanted...,"[Adventure, Fantasy, Family]"
2,grumpier old men family wedding reignites anci...,"[Romance, Comedy]"
3,waiting to exhale cheated mistreated stepped w...,"[Comedy, Drama, Romance]"
4,father of the bride part ii george banks recov...,[Comedy]


In [4]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df['genres'])

# transform target variable
y = multilabel_binarizer.transform(df['genres'])
print(multilabel_binarizer.classes_)
print('size = ', len(multilabel_binarizer.classes_))

['Action' 'Adventure' 'Animation' 'Comedy' 'Crime' 'Documentary' 'Drama'
 'Family' 'Fantasy' 'Foreign' 'History' 'Horror' 'Music' 'Mystery'
 'Romance' 'Science Fiction' 'TV Movie' 'Thriller' 'War' 'Western']
size =  20


### Split the data in train, validate, test

In [5]:
testing_size = 0.15
x_train, x_test, y_train, y_test = train_test_split(df['overview'], y, test_size=testing_size, random_state=42)
validation_size_relative = testing_size/(1-testing_size)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=validation_size_relative, random_state=42)

In [6]:
len(x_val) == len(y_val) == len(x_test) == len(y_test)

True

In [7]:
type(x_val)

pandas.core.series.Series

In [8]:
type(y_val)

numpy.ndarray

### Text vectorization

In [9]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=100000)
# create TF-IDF features
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train.values.astype('U'))
x_val_tfidf = tfidf_vectorizer.transform(x_val.values.astype('U'))

### Build ML model

In [10]:
# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

lr = LogisticRegression(solver='saga', n_jobs=-1, max_iter=1000)
clf = OneVsRestClassifier(lr)

# fit model on train data
clf.fit(x_train_tfidf, y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='warn', n_jobs=-1,
                                                 penalty='l2',
                                                 random_state=None,
                                                 solver='saga', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [11]:
# make predictions for validation set
y_pred = clf.predict(x_val_tfidf)

f1_score(y_val, y_pred, average='micro')

0.4578485405184731

In [12]:
multilabel_binarizer.inverse_transform(y_pred)[2155]

('Comedy',)

In [13]:
# predict probabilities
y_pred_prob = clf.predict_proba(x_val_tfidf)

t = 0.25 # threshold value
y_pred_new = (y_pred_prob >= t).astype(int)

f1_score(y_val, y_pred_new, average="micro")

0.5889712049203244

In [14]:
# predict single observation:
desc_clean = 'othello evil iago pretend friend othello order manipulate serve end film version shakespeare classic'
f = []
f.append(desc_clean)
f = tfidf_vectorizer.transform(f)
pred = clf.predict(f)
tags = multilabel_binarizer.inverse_transform(pred)
type(pred)

numpy.ndarray

In [None]:
# o = 'Othello The evil Iago pretends to be friend of Othello in order to manipulate him to serve his own end in the film version of this Shakespeare classic.'
# o = o.lower()
# print(o)
# f_tras = tfidf_vectorizer.fit_transform(list(o))


In [35]:
X, y = make_multilabel_classification(n_samples=1000, n_classes=10, n_labels=3, allow_unlabeled=False, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.3)
model = base_classifier = LogisticRegression(solver='saga', n_jobs=1, max_iter=100, verbose=False)
model = OneVsRestClassifier(base_classifier)
model.fit(x_train, y_train)



OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn', n_jobs=1,
                                                 penalty='l2',
                                                 random_state=None,
                                                 solver='saga', tol=0.0001,
                                                 verbose=False,
                                                 warm_start=False),
                    n_jobs=None)

In [36]:
ypred = model.predict(x_test)
f1_score(y_test, ypred, average='micro')

0.6508595139300534

### Random forest

In [None]:
from sklearn.multioutput import MultiOutputClassifier
forest = RandomForestClassifier(n_estimators=100, random_state=0, verbose=1, n_jobs=-1)
clf = MultiOutputClassifier(forest)
clf.fit(x_train_tfidf, y_train)

In [None]:
# make predictions for validation set
y_pred = clf.predict(x_val_tfidf)

f1_score(y_val, y_pred, average='micro')