In [1]:
import pandas as pd

# Development-set ground truth, indexed by trailer file name.
movies_dev = pd.read_excel(
    'Dev_Set/dev_set_groundtruth_and_trailers.xls',
    usecols=['movie', 'goodforairplane', 'filename'],
    index_col='filename',
)

# The test sheet's first row is skipped and the columns are named explicitly.
# `sep` is a read_csv option that read_excel does not accept (it raises
# TypeError on current pandas), so it is no longer passed.
# NOTE(review): this path uses 'Test_set' while the XML directory read further
# down is 'Test_Set/XML' - confirm the directory's real casing.
movies_test = pd.read_excel(
    'Test_set/dataset_complete.xlsx',
    names=['movie', 'filename', 'goodforairplane'],
    header=None,
    skiprows=1,
    index_col='filename',
)

# Development rows come first, test rows after them.
movies = pd.concat([movies_dev, movies_test])


In [2]:
import os
import numpy as np
# xml.etree.cElementTree was removed in Python 3.9; ElementTree is the
# supported module and keeps the same API.
import xml.etree.ElementTree as et


def _load_xml_attributes(frame, xml_dir):
    """Copy per-movie XML attributes into ``frame`` (modified in place).

    For every ``.xml`` file in ``xml_dir`` the attributes of the root's first
    child element are written to the row whose index label is the file name
    without its extension, one column per attribute. Values are stored as the
    strings found in the XML.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to update; rows/columns that do not exist yet are created.
    xml_dir : str
        Directory containing one XML file per movie.
    """
    # os.listdir order is arbitrary; sorting keeps runs reproducible.
    for xml_name in sorted(os.listdir(xml_dir)):
        stem, extension = os.path.splitext(xml_name)
        if extension.lower() != '.xml':
            # Skip stray non-XML entries instead of failing in the parser.
            continue
        first_child = et.parse(os.path.join(xml_dir, xml_name)).getroot()[0]
        for attr_name, attr_value in first_child.attrib.items():
            frame.loc[stem, attr_name] = attr_value


for xml_dir in ('Dev_Set/XML', 'Test_Set/XML'):
    _load_xml_attributes(movies, xml_dir)
        

In [3]:
# Columns that are not used as features further down.
unused_columns = [
    'released', 'Website', 'imdbID', 'poster', 'tomatoConsensus',
    'writer', 'DVD', 'plot', 'title', 'awards',
]
movies = movies.drop(columns=unused_columns)

In [4]:
# Missing values are the literal string 'N/A'; turning them into the string
# 'NaN' lets astype(float) parse them as not-a-number.
movies = movies.replace('N/A', 'NaN')

# Strip formatting so the remaining text is a plain number:
# thousands separators in the vote count, whitespace and the letters
# m/i/n in the runtime.
movies['imdbVotes'] = movies['imdbVotes'].replace('[,]', '', regex=True)
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
movies['runtime'] = movies['runtime'].replace(r'[\smin]', '', regex=True)

# Unrated movies get an explicit category instead of the 'NaN' placeholder.
movies['rated'] = movies['rated'].replace('NaN', 'NOT RATED')

# The np.float alias was removed in NumPy 1.24; the builtin float is the
# same dtype and works on every version.
numeric_columns = (
    'imdbRating', 'imdbVotes', 'runtime', 'year', 'metascore',
    'tomatoRating', 'tomatoUserRating', 'tomatoMeter', 'tomatoUserMeter',
)
for column in numeric_columns:
    movies[column] = movies[column].astype(float)



In [5]:
# Every value that is still missing becomes 0.
movies = movies.fillna(0)

In [6]:
# Column groups used to build the three feature sets.
metadata_columns = ['language', 'year', 'genre', 'country', 'runtime', 'rated']
userrating_columns = ['imdbRating', 'metascore', 'tomatoRating', 'tomatoUserRating',
                      'tomatoMeter', 'tomatoUserMeter']

# Explicit copies: these frames are modified in place later (one-hot encoding
# and column drops). Working on independent copies avoids pandas'
# SettingWithCopyWarning and guarantees `movies` itself is never touched.
metadata = movies[metadata_columns].copy()
userrating = movies[userrating_columns].copy()
meta_and_user = movies[metadata_columns + userrating_columns].copy()

In [7]:
import Encoding as enc
import warnings
# Categorical columns of `metadata`, in the order they are encoded.
metadata_categoricals = ('language', 'genre', 'country', 'rated')

# Encoding.one_hot_encode is a project-local helper; warnings raised while it
# runs are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for categorical in metadata_categoricals:
        enc.one_hot_encode(metadata, categorical, True)

# The raw categorical columns are removed once they have been encoded.
metadata.drop(columns=list(metadata_categoricals), inplace=True)

In [8]:
# Categorical columns of `meta_and_user`, in the order they are encoded.
meta_and_user_categoricals = ('language', 'country', 'genre', 'rated')

# Encoding.one_hot_encode is a project-local helper; warnings raised while it
# runs are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for categorical in meta_and_user_categoricals:
        enc.one_hot_encode(meta_and_user, categorical, True)

# The raw categorical columns are removed once they have been encoded.
meta_and_user.drop(columns=list(meta_and_user_categoricals), inplace=True)

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

# Base estimators, one per algorithm.
#
# Only settings that matter are spelled out. Arguments that merely repeated
# the library default were dropped because several of them have since been
# removed or renamed in scikit-learn (presort, base_estimator,
# max_features='auto', loss='deviance', decision_function_shape=None,
# algorithm='SAMME.R', multi_class) and make the constructors fail on current
# releases. Values whose library default has changed over time
# (solver, gamma, n_estimators of the random forest) stay explicit so the
# models are the same on every version.
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
nc = NearestCentroid()
tree = DecisionTreeClassifier(random_state=0)
# n_jobs is not passed: it has no effect with the liblinear solver.
log = LogisticRegression(C=1.0, solver='liblinear', random_state=0)
svm = SVC(C=1.0, kernel='rbf', gamma='auto', random_state=0)
bag = BaggingClassifier(n_estimators=10, n_jobs=-1, random_state=0)
# 'sqrt' is what max_features='auto' meant for classifiers.
rf = RandomForestClassifier(n_estimators=10, max_features='sqrt', n_jobs=-1, random_state=0)
ada = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=0)
gb = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, max_depth=3, random_state=0)
gauss = GaussianNB()

In [10]:
from sklearn.model_selection import cross_val_score, cross_validate

algo_pool = [knn, nc, tree, log, svm, bag, rf, ada, gb, gauss]

# Features and labels are the same for every estimator, so slice them once.
# NOTE(review): 95 is presumably the number of development-set rows at the top
# of `movies` - confirm against the dev sheet.
cv_features = metadata[0:95]
cv_labels = movies[0:95].loc[:, 'goodforairplane']

# One line of output per estimator, in `algo_pool` order.
for estimator in algo_pool:
    # A single multi-metric run fits each fold once instead of once per
    # metric; the per-fold scores are the same as three separate
    # cross_val_score calls because the folds and seeds are deterministic.
    fold_scores = cross_validate(estimator=estimator, X=cv_features, y=cv_labels,
                                 scoring=['f1', 'precision', 'recall'], cv=10, n_jobs=-1,
                                 return_train_score=False)
    f1 = np.mean(fold_scores['test_f1'])
    precision = np.mean(fold_scores['test_precision'])
    recall = np.mean(fold_scores['test_recall'])
    print(f'Precision: {precision}, Recall: {recall}, f1: {f1}')

Precision: 0.5956349206349205, Recall: 0.6366666666666666, f1: 0.5642607392607393
Precision: 0.5595238095238094, Recall: 0.5366666666666666, f1: 0.5382478632478632
Precision: 0.49000000000000005, Recall: 0.5666666666666668, f1: 0.5061033411033411
Precision: 0.5125, Recall: 0.56, f1: 0.5244599844599845
Precision: 0.5000396825396825, Recall: 0.6766666666666667, f1: 0.5602319902319903
Precision: 0.5275, Recall: 0.47000000000000003, f1: 0.4561627261627262
Precision: 0.42444444444444446, Recall: 0.42666666666666664, f1: 0.40921356421356425
Precision: 0.5001587301587301, Recall: 0.5033333333333333, f1: 0.47556943056943063
Precision: 0.48908730158730157, Recall: 0.56, f1: 0.5094194694194694
Precision: 0.45, Recall: 0.13333333333333336, f1: 0.19523809523809527


In [11]:
from sklearn.model_selection import cross_validate

# Features and labels are the same for every estimator, so slice them once.
# NOTE(review): 95 is presumably the number of development-set rows at the top
# of `movies` - confirm against the dev sheet.
cv_features = meta_and_user[0:95]
cv_labels = movies[0:95].loc[:, 'goodforairplane']

# One line of output per estimator, in `algo_pool` order.
for estimator in algo_pool:
    # A single multi-metric run fits each fold once instead of once per
    # metric; the per-fold scores are the same as three separate
    # cross_val_score calls because the folds and seeds are deterministic.
    fold_scores = cross_validate(estimator=estimator, X=cv_features, y=cv_labels,
                                 scoring=['f1', 'precision', 'recall'], cv=10, n_jobs=-1,
                                 return_train_score=False)
    f1 = np.mean(fold_scores['test_f1'])
    precision = np.mean(fold_scores['test_precision'])
    recall = np.mean(fold_scores['test_recall'])
    print(f'Precision: {precision}, Recall: {recall}, f1: {f1}')

Precision: 0.6058333333333332, Recall: 0.6566666666666665, f1: 0.6193706293706293
Precision: 0.5913095238095238, Recall: 0.6133333333333334, f1: 0.5925019425019424
Precision: 0.4910714285714285, Recall: 0.53, f1: 0.479965034965035
Precision: 0.5650793650793652, Recall: 0.5833333333333334, f1: 0.5491286491286491
Precision: 0.5140873015873015, Recall: 0.73, f1: 0.6006327006327006
Precision: 0.5066666666666666, Recall: 0.45000000000000007, f1: 0.457965367965368
Precision: 0.5066666666666666, Recall: 0.48, f1: 0.48386169386169386
Precision: 0.46464285714285714, Recall: 0.45, f1: 0.44125485625485633
Precision: 0.4966666666666667, Recall: 0.48999999999999994, f1: 0.47472027972027975
Precision: 0.5, Recall: 0.17, f1: 0.23968253968253966


In [None]:
import LVW

# LVW.lvw is a project-local helper; it receives the feature frame, the labels
# and two numeric settings (74 and 0.) whose meaning is not visible here.
# NOTE(review): presumably it returns the selected feature columns - confirm.
metadata_selected_features = LVW.lvw(metadata[0:95], movies[0:95].loc[:, 'goodforairplane'], 74, 0.)
meta_and_user_selected_features = LVW.lvw(meta_and_user[0:95], movies[0:95].loc[:, 'goodforairplane'], 74, 0.)

# The result was originally stored under a misspelled name; keep it as an
# alias so anything that still refers to it continues to work.
meta_and_user_selected_fetures = meta_and_user_selected_features

In [None]:
# Base estimators grouped into one pool per feature family.
meta_algo_pool = [
    knn, nc, tree,
    log, svm, bag,
    rf, ada, gb,
]
text_algo_pool = [
    gauss, knn, svm,
]
visual_algo_pool = [
    knn, tree, log, svm,
    rf, ada, gb,
]
audio_algo_pool = [
    log, gb,
]

In [None]:
from sklearn.model_selection import cross_validate

# Restrict the development rows to the selected feature COLUMNS. The column
# axis has to be addressed explicitly: a bare `.loc[selection]` indexes rows.
# NOTE(review): assumes LVW.lvw returns column labels or a boolean column
# mask - confirm against the LVW module.
cv_features = metadata[0:95].loc[:, metadata_selected_features]
cv_labels = movies[0:95].loc[:, 'goodforairplane']

# One line of output per estimator, in `meta_algo_pool` order.
for estimator in meta_algo_pool:
    # A single multi-metric run fits each fold once instead of once per metric.
    fold_scores = cross_validate(estimator=estimator, X=cv_features, y=cv_labels,
                                 scoring=['f1', 'precision', 'recall'], cv=10, n_jobs=-1,
                                 return_train_score=False)
    f1 = np.mean(fold_scores['test_f1'])
    precision = np.mean(fold_scores['test_precision'])
    recall = np.mean(fold_scores['test_recall'])
    print(f'Precision: {precision}, Recall: {recall}, f1: {f1}')
    

In [None]:
from sklearn.base import clone

# Every entry of the stacking ensemble needs its own estimator object.
# Each one uses exactly the hyper-parameters of the matching base estimator,
# so it is produced with clone(), which returns a new, unfitted estimator
# with the same parameters, instead of repeating the constructor call.

# Estimators named for the metadata features.
knn_meta = clone(knn)
nc_meta = clone(nc)
tree_meta = clone(tree)
log_meta = clone(log)
svm_meta = clone(svm)
bag_meta = clone(bag)
rf_meta = clone(rf)
ada_meta = clone(ada)
gb_meta = clone(gb)

# Estimators named for the text features.
gauss_text = clone(gauss)
knn_text = clone(knn)
svm_text = clone(svm)

# Estimators named for the visual features.
knn_vis = clone(knn)
tree_vis = clone(tree)
log_vis = clone(log)
svm_vis = clone(svm)
rf_vis = clone(rf)
ada_vis = clone(ada)
gb_vis = clone(gb)

# Estimators named for the audio features.
log_audio = clone(log)
gb_audio = clone(gb)

# (name, estimator) pairs in the order: metadata, text, visual, audio.
estimators = [('knn_meta',knn_meta),('nc_meta',nc_meta),('tree_meta',tree_meta),('log_meta',log_meta),('svm_meta',svm_meta),
              ('bag_meta',bag_meta),('rf_meta',rf_meta),('ada_meta',ada_meta),('gb_meta',gb_meta),('gauss_text',gauss_text),
              ('knn_text',knn_text),('svm_text',svm_text),('knn_vis',knn_vis),('tree_vis',tree_vis),('log_vis',log_vis),
              ('svm_vis',svm_vis),('rf_vis',rf_vis),('ada_vis',ada_vis),('gb_vis',gb_vis),('log_audio',log_audio),('gb_audio',gb_audio)]


In [None]:
from stacking_classifiers import stacking_classifier_performance_cv

# Labels of the first 95 rows of `movies`.
stacking_labels = movies[0:95].loc[:, 'goodforairplane']

# Evaluate the stacked ensemble with warnings silenced and show the result.
# NOTE(review): the same, unsliced `metadata` frame is passed four times while
# the labels cover only 95 rows - presumably placeholders for the per-modality
# feature sets; confirm against stacking_classifier_performance_cv.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    res = stacking_classifier_performance_cv(
        estimators,
        metadata,
        metadata,
        metadata,
        metadata,
        stacking_labels,
    )
    print(res)
