In [2]:
import pandas as pd
import numpy as np

# First the movie titles and the labels are loaded from their respective files and then concatenated to one dataframe

movies_dev = pd.read_excel('Dev_Set/dev_set_groundtruth_and_trailers.xls',usecols=['movie','goodforairplane','filename'],index_col='filename')
movies_test = pd.read_csv('Test_set/test_set_labels.csv',names=['movie','filename','goodforairplane'],header=None, skiprows=1,index_col='filename',sep=';',dtype={'goodforairplane':int})
movies_test.sort_index(inplace=True)

movies = movies_dev.append(movies_test, sort=False)


In [3]:
import os
import xml.etree.cElementTree as et

# Now all the metadata and userratings are loaded from the xml files

for filename in os.listdir('Dev_Set/XML'):
    tree = et.parse('Dev_Set/XML/'+filename)
    root = tree.getroot()
    tmp = root[0]
    
    for i in tmp.attrib:
        movies.loc[filename[:-4],i] = tmp.get(i)

for filename in os.listdir('Test_Set/XML'):
    tree = et.parse('Test_Set/XML/'+filename)
    root = tree.getroot()
    tmp = root[0]
    for i in tmp.attrib:
        movies.loc[filename[:-4],i] = tmp.get(i)
        
        

In [4]:
# Here the visual data is loaded.

visuals = pd.DataFrame()
names = []
for file in os.listdir("Dev_Set/vis_descriptors"):
    data = pd.read_csv(os.path.join(r"Dev_Set/vis_descriptors",file),header=None)
    name = file.rsplit('.',1)
    name = name[0]
    names.append(name)
    data = data.mean(axis=0)
    data = data.transpose()
    visuals = visuals.append(data,ignore_index=True)
    
for file in os.listdir("Test_Set/vis_descriptors"):
    data = pd.read_csv(os.path.join(r"Test_Set/vis_descriptors",file),header=None)
    name = file.rsplit('.',1)
    name = name[0]
    names.append(name)
    data = data.mean(axis=0)
    data = data.transpose()
    visuals = visuals.append(data,ignore_index=True)
    
visuals.insert(0,'filename',names)
visuals.set_index('filename',inplace=True)

In [5]:
# Here the audio data

audio = pd.DataFrame()
names_train = []
names = []

for file in os.listdir("Dev_Set/audio_descriptors"):
    data = pd.read_csv(os.path.join(r"Dev_Set/audio_descriptors",file),header=None)
    name = file.rsplit('.',1)
    name = name[0]
    names_train.append(name)
    names.append(name)
    data = data.mean(axis=1)
    audio = audio.append(data,ignore_index=True)
 
names_test = []

for file in os.listdir("Test_Set/audio_descriptors"):
    data = pd.read_csv(os.path.join(r"Test_Set/audio_descriptors",file),header=None)
    name = file.rsplit('.',1)
    name = name[0]
    names_test.append(name)
    names.append(name)
    data = data.mean(axis=1)
    audio = audio.append(data,ignore_index=True)
    
audio.insert(0,'filename',names)
audio.set_index('filename',inplace=True)

In [6]:
# And finally the text data

data_text_train = pd.read_csv('Dev_Set/text_descriptors/tdf_idf_dev.csv',header=0)

data_text_train.insert(0,'filename',names_train)
data_text_train.set_index('filename',inplace=True)

data_text_test = pd.read_csv('Test_Set/text_descriptors/tdf_idf_test.csv',header=0)

data_text_test.insert(0,'filename',names_test)
data_text_test.set_index('filename',inplace=True)

text = pd.concat([data_text_train,data_text_test], sort=False, join='outer')
        

In [7]:
# Drop columns that are not used

movies.drop(['released','Website','imdbID','poster','tomatoConsensus','writer','DVD','plot','title','awards'], axis=1, inplace=True)

In [8]:
# All missing values are labelled with N/A in the data, this is changed to NaN to work with numpy. Also the following
# columns need to formatted properly and converted to float.

movies = movies.replace('N/A','NaN')
movies['imdbRating'] = movies['imdbRating'].astype(np.float)
movies['imdbVotes'] = movies['imdbVotes'].replace('[,]','',regex=True).astype(float)
movies['runtime'] = movies['runtime'].replace('[\smin]','',regex=True).astype(float)
movies['rated'] = movies['rated'].replace('NaN','NOT RATED')
movies['year'] = movies['year'].astype(float)
movies['metascore'] = movies['metascore'].astype(float)
movies['tomatoRating'] = movies['tomatoRating'].astype(float)
movies['tomatoUserRating'] = movies['tomatoUserRating'].astype(float)
movies['tomatoMeter'] = movies['tomatoMeter'].astype(float)
movies['tomatoUserMeter'] = movies['tomatoUserMeter'].astype(float)



In [9]:
# All missing values are filled with zeroes, which is in part the same as stated in the paper, for others it is not 
# specified

movies.fillna(0,inplace=True)
visuals.fillna(0,inplace=True)
audio.fillna(0,inplace=True)
text.fillna(0,inplace=True)

In [10]:
# Use the definitions from the dataset paper of what is considered metadata to create the different feature sets used
# in the experiments

metadata = movies[['language','year','genre','country','runtime','rated']]
userrating = movies[['imdbRating','metascore','tomatoRating','tomatoUserRating','tomatoMeter','tomatoUserMeter']]
meta_and_user = movies[['language','year','genre','country','runtime','rated','imdbRating','metascore','tomatoRating',
                       'tomatoUserRating','tomatoMeter','tomatoUserMeter']]

In [11]:
# One hot encode the categeorical features. Here a custom defined function is used, because in many columns there are 
# more than one value in the cell, that are extracted through the function.

import Encoding as enc
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    enc.one_hot_encode(metadata,'language',True)
    enc.one_hot_encode(metadata,'genre',True)
    enc.one_hot_encode(metadata, 'country',True)
    enc.one_hot_encode(metadata, 'rated',True)

metadata.drop(['language','country','rated','genre'], axis=1, inplace=True)

In [12]:
# Same for meta- and user data combined.

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    enc.one_hot_encode(meta_and_user,'language',True)
    enc.one_hot_encode(meta_and_user, 'country',True)
    enc.one_hot_encode(meta_and_user, 'genre',True)
    enc.one_hot_encode(meta_and_user, 'rated',True)

meta_and_user.drop(['language','country','rated','genre'], axis=1, inplace=True)

In [13]:
# Define the classifiers that are used in the classifier pool. The parameters used were the default parameters in the
# scikit-learn version that was most likely used by the paper. The source for that was the corresponding documentation
# on the scikit-learn website.

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski',
                           metric_params=None,n_jobs=-1)
nc = NearestCentroid(metric='euclidean', shrink_threshold=None)
tree = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1,
                              min_weight_fraction_leaf=0.0, max_features=None, random_state=0, max_leaf_nodes=None,
                              class_weight=None, presort=False)
log = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1,
                         class_weight=None, random_state=0, solver='liblinear', max_iter=100, multi_class='ovr', 
                         verbose=0, warm_start=False, n_jobs=-1)
svm = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, 
          cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=None, random_state=0)
bag = BaggingClassifier(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, 
                        bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=-1, random_state=0, verbose=0)
rf = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, 
                            min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=True, 
                            oob_score=False, n_jobs=-1, random_state=0, verbose=0, warm_start=False, class_weight=None)
ada = AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=0)
gb = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2,
                                min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, init=None, random_state=0,
                                max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
gauss = GaussianNB()

In [14]:
# Calculate the cross validation score for all the classifiers and print the name, precision, recall and f1-score of 
# that run to perform the classifier selection described in the paper.

from sklearn.model_selection import cross_val_score

algo_pool = [knn,nc,tree,log,svm,bag,rf,ada,gb,gauss]

for i in algo_pool:
    f1 = np.mean(cross_val_score(estimator=i,X=metadata.loc[names_train,:],y=movies.loc[names_train,:].loc[:,'goodforairplane'],
                         scoring='f1',cv=10,n_jobs=-1))
    precision = np.mean(cross_val_score(estimator=i,X=metadata.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='precision',cv=10,n_jobs=-1))
    recall = np.mean(cross_val_score(estimator=i,X=metadata.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='recall',cv=10,n_jobs=-1))
    print(str(i).split('(')[0])
    print(f'Precision: {precision}, Recall: {recall}, f1: {f1}')
    print()

KNeighborsClassifier
Precision: 0.5545634920634921, Recall: 0.6166666666666667, f1: 0.5469336219336218

NearestCentroid
Precision: 0.5315873015873015, Recall: 0.5366666666666666, f1: 0.525

DecisionTreeClassifier
Precision: 0.5644047619047619, Recall: 0.5433333333333333, f1: 0.528034188034188

LogisticRegression
Precision: 0.5078571428571429, Recall: 0.5433333333333333, f1: 0.5150427350427351

SVC
Precision: 0.496468253968254, Recall: 0.6933333333333334, f1: 0.5682631257631259

BaggingClassifier
Precision: 0.5261904761904762, Recall: 0.4833333333333334, f1: 0.48704351204351204

RandomForestClassifier
Precision: 0.4978571428571428, Recall: 0.4833333333333333, f1: 0.48464646464646466

AdaBoostClassifier
Precision: 0.48841269841269846, Recall: 0.5266666666666666, f1: 0.48779220779220783

GradientBoostingClassifier
Precision: 0.5429761904761905, Recall: 0.5800000000000001, f1: 0.5425252525252525

GaussianNB
Precision: 0.35, Recall: 0.13333333333333336, f1: 0.18571428571428572



In [15]:
for i in algo_pool:
    f1 = np.mean(cross_val_score(estimator=i,X=meta_and_user.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='f1',cv=10,n_jobs=-1))
    precision = np.mean(cross_val_score(estimator=i,X=meta_and_user.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='precision',cv=10,n_jobs=-1))
    recall = np.mean(cross_val_score(estimator=i,X=meta_and_user.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='recall',cv=10,n_jobs=-1))
    print(str(i).split('(')[0])
    print(f'Precision: {precision}, Recall: {recall}, f1: {f1}')
    print()

KNeighborsClassifier
Precision: 0.6154761904761904, Recall: 0.6733333333333333, f1: 0.6349317349317348

NearestCentroid
Precision: 0.5651190476190476, Recall: 0.6166666666666667, f1: 0.5791686091686092

DecisionTreeClassifier
Precision: 0.4482142857142857, Recall: 0.5433333333333332, f1: 0.48295648795648793

LogisticRegression
Precision: 0.5498412698412699, Recall: 0.5600000000000002, f1: 0.5356943056943057

SVC
Precision: 0.5090873015873015, Recall: 0.73, f1: 0.5953729603729603

BaggingClassifier
Precision: 0.47619047619047616, Recall: 0.4066666666666666, f1: 0.4141192141192141

RandomForestClassifier
Precision: 0.43047619047619046, Recall: 0.43, f1: 0.4232323232323233

AdaBoostClassifier
Precision: 0.4349206349206349, Recall: 0.44666666666666666, f1: 0.4341414141414142

GradientBoostingClassifier
Precision: 0.5854761904761905, Recall: 0.6199999999999999, f1: 0.5896192696192697

GaussianNB
Precision: 0.4, Recall: 0.17, f1: 0.23015873015873015



In [16]:
for i in algo_pool:
    f1 = np.mean(cross_val_score(estimator=i,X=visuals.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='f1',cv=10,n_jobs=-1))
    precision = np.mean(cross_val_score(estimator=i,X=visuals.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='precision',cv=10,n_jobs=-1))
    recall = np.mean(cross_val_score(estimator=i,X=meta_and_user.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='recall',cv=10,n_jobs=-1))
    print(str(i).split('(')[0])
    print(f'Precision: {precision}, Recall: {recall}, f1: {f1}')
    print()

KNeighborsClassifier
Precision: 0.5435714285714286, Recall: 0.6733333333333333, f1: 0.5627994227994229

NearestCentroid
Precision: 0.48166666666666663, Recall: 0.6166666666666667, f1: 0.3773448773448773

DecisionTreeClassifier
Precision: 0.5638095238095236, Recall: 0.5433333333333332, f1: 0.6145609945609944

LogisticRegression
Precision: 0.6105555555555555, Recall: 0.5600000000000002, f1: 0.6330194805194805

SVC
Precision: 0.5434343434343434, Recall: 0.73, f1: 0.6997549019607844

BaggingClassifier
Precision: 0.6163095238095239, Recall: 0.4066666666666666, f1: 0.6207326007326007

RandomForestClassifier
Precision: 0.6029761904761906, Recall: 0.43, f1: 0.6190043290043291

AdaBoostClassifier
Precision: 0.585, Recall: 0.44666666666666666, f1: 0.634054834054834

GradientBoostingClassifier
Precision: 0.590952380952381, Recall: 0.6199999999999999, f1: 0.6367171717171718

GaussianNB
Precision: 0.617936507936508, Recall: 0.17, f1: 0.6833225108225108



In [17]:
for i in algo_pool:
    f1 = np.mean(cross_val_score(estimator=i,X=audio.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='f1',cv=10,n_jobs=-1))
    precision = np.mean(cross_val_score(estimator=i,X=audio.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='precision',cv=10,n_jobs=-1))
    recall = np.mean(cross_val_score(estimator=i,X=audio.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='recall',cv=10,n_jobs=-1))
    print(str(i).split('(')[0])
    print(f'Precision: {precision}, Recall: {recall}, f1: {f1}')
    print()
    

KNeighborsClassifier
Precision: 0.5038095238095238, Recall: 0.5900000000000001, f1: 0.5353515928515928

NearestCentroid
Precision: 0.6266666666666667, Recall: 0.26333333333333336, f1: 0.35930735930735935

DecisionTreeClassifier
Precision: 0.41916666666666663, Recall: 0.48666666666666664, f1: 0.44152763902763903

LogisticRegression
Precision: 0.5471428571428572, Recall: 0.5166666666666666, f1: 0.5134776334776334

SVC
Precision: 0.42507936507936506, Recall: 0.5733333333333333, f1: 0.48578088578088574

BaggingClassifier
Precision: 0.46333333333333326, Recall: 0.37, f1: 0.40296536796536797

RandomForestClassifier
Precision: 0.542142857142857, Recall: 0.4800000000000001, f1: 0.4966522366522367

AdaBoostClassifier
Precision: 0.5516666666666665, Recall: 0.5533333333333333, f1: 0.5473304473304473

GradientBoostingClassifier
Precision: 0.505952380952381, Recall: 0.49666666666666676, f1: 0.4928321678321678

GaussianNB
Precision: 0.5730952380952382, Recall: 0.43, f1: 0.46351037851037846



In [18]:
for i in algo_pool:
    f1 = np.mean(cross_val_score(estimator=i,X=text.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='f1',cv=10,n_jobs=-1))
    precision = np.mean(cross_val_score(estimator=i,X=text.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='precision',cv=10,n_jobs=-1))
    recall = np.mean(cross_val_score(estimator=i,X=text.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='recall',cv=10,n_jobs=-1))
    print(str(i).split('(')[0])
    print(f'Precision: {precision}, Recall: {recall}, f1: {f1}')
    print()

KNeighborsClassifier
Precision: 0.5479797979797979, Recall: 1.0, f1: 0.7078431372549019

NearestCentroid
Precision: 0.5313131313131313, Recall: 0.9199999999999999, f1: 0.6711798005915652

DecisionTreeClassifier
Precision: 0.5428571428571429, Recall: 0.5033333333333334, f1: 0.4956099456099456

LogisticRegression
Precision: 0.5479797979797979, Recall: 1.0, f1: 0.7078431372549019

SVC
Precision: 0.5479797979797979, Recall: 1.0, f1: 0.7078431372549019

BaggingClassifier
Precision: 0.5176190476190475, Recall: 0.7166666666666666, f1: 0.5926673326673326

RandomForestClassifier
Precision: 0.42630952380952386, Recall: 0.5, f1: 0.43717365967365973

AdaBoostClassifier
Precision: 0.5033333333333333, Recall: 0.47333333333333333, f1: 0.46584582084582093

GradientBoostingClassifier
Precision: 0.41904761904761906, Recall: 0.5866666666666667, f1: 0.481981351981352

GaussianNB
Precision: 0.556031746031746, Recall: 0.7133333333333334, f1: 0.612969252969253



In [19]:
# Use selfmade Las Vegas Wrapper implementation to do the feature selection. Since the runtime is rather high, the 
# function is commented out by default and in the next cell the resulting features are assigned. For the visual features
# runtime was not feasible. Also the number of iterations was set to 10,000 on the metadata, taking about an hour and to
# 20 on text features also taking already about an hour.

import LVW

#metadata_selected_features = LVW.lvw(metadata.loc[names_train],movies.loc[names_train].loc[:,'goodforairplane'],0.)
#text_selected_features = LVW.lvw(text.loc[names_train],movies.loc[names_train].loc[:,'goodforairplane'],0.,max_tries=20)

#print(f'Meta features: {metadata_selected_features}')
#print(f'Text features {text_selected_features}')

In [20]:
metadata_selected_features = ['runtime','Greek','year','R','Icelandic','Korean']
audio_selected_features = [8]
text_selected_features = ['vs','teaming','heather','la','salman','bromance','academic','explorers','flight','closed','bennett','catskill','decision','make','selma.1','classic','downhill','hundreds','shiftless','police.1','faces','lands','sweethearts','lucky','resents','saginowski','leah','vivian','jobswell','hands','kyla.1','create','solution.1','tortured','siren','levine','beverly.1','unleash','red','quick','ethic','passive','moved','abuse','honor','train','remain','woman','feast','drama','neighbor','mrs','informing','assigned','sven','lights','counterplans','continues','famous.1','carefully','scheming','invite','theologian','conversations','considerable','jacket','mayor.1','montgomery','fallen','spindle','styles','inhabitants','lover.1','dwarf','rebel.1','live.1','maze','fake','mankinds','battle.2','contract','mercenary','cargill','cypher','castorini','taker','buying','0','savings','social','heat','1973','lame','lands.1','seoul','harboring','arends','optimus.1','lizzy','gradeb','mintzplasse','13','flood','jenko','mayera','close','fr','documentary','swan.2','wild.1','mother.1','history','born','expendables.1','legs.1','mn','kindnesses','astronomer','unfortunately.1','amelia','zombiehating','pig','die','gluing','carefree.1','slump','arrival','originally','instead','greenleafs','teams.1','callahan.1','logan','bay','midnight','hills','talented','dating','avidor','augustus','burglar','franco','dassies','great','south.1','cope','activist','developer','recovered','pressures','involving','apologize','examining','maxwell','obstacle','sean','disappeared','sunday','sydney.1','drugs.3','uncertainties','risk','narcotics','macedonia','visit','cart','paddington','fame','ananya.1','label.1','online','subtleties','predestination','entertaining','cooper','fulfilling','dilemma','eurotrip','je','misses','endless','look.2','chipper','hero.2','obvious','athletes','prisoners.1','junkie','karenina','got','producer','rallies','plot','financial','bhai','extinction','billionaire','invaluable','fires','spats','doomed','kurt','citys','result','stag','killer','rinku','oz.1','sex','yves','location','prem.1','makers','relying','theory','mavis.1','mind.2','collapses','opt','anymore','humiliation','sleep','karan','gali','mount','19yearold','restrictive','thomson','constantly','kowalczyk','illegal','thirtyyear','asks','nefarious','english.1','wonders','roxanne.1','kaushalya','epa','south','anxious','beneath','learns','stand','doubletap','christin','enters','completely','televised','companion','bhoothnath','tendencies','numerous','seeks','ring','different','matrix','dismissing','offensive','product','david','karl','amid','treadstone','picking','buell','upscale','proper','shambles','clumsy','stories','distance','prisoner','delhi.1','journey.2','monarchy','isabelle','zhuo','minutes','manic','vikings','regrette','lt','idly','artistic','circumstances','certainty','mildmannered','marines','tropic','games','ward','conflict','colorful','deliver','powers','order','owner.1','lisbeth','effort','initiated','protocol','home.2','prem','sheik.1','work','guile','roman','confident','aside','patient','prison','fault','game','station','jackie','bronson.1','speech','jason','ball.1','lonely','complex','affair','kind','haunted','identity.3','stepson','crosses','overwhelming.1','tyrants','tactics','earlyonset','canadian','turing','crew','jedi','asked','preparation','exploring','format','maintains','threatens','donnies','weiwei','yeshiva.1','patience','captures','dirk','identities','mara','bodies','sister.1','friends.1','actiondrama','idyllic','crete','thing.1','drake.1','heavyweight','professional','invades','workers','roads','called','choice.1','arterton','good.1','immigrantaspiring','cade','money','apart','dog.2','far.1','filming','boats','retireleaving','devastating','21','youths','dress','comrades','mars','titans','source','sale','teacher.1','meet.1','mary','merino','sneaking','kurylenko','big.1','peaceful','vegetable','marcy','pocahontas','nyc','granted','words.1','parties','9yearold','address.1','fundamental','alaska','unimaginable','magnate','taika','prove','murder.2','binoche','chimpanzee','dowager','situation.1','store','nightmares','2003','30s','knee','futuristic','images','souls','hunter','navy','stardom','target','reindeer','diagnosis','trusty','addict','loan','upheavals','punish','position','difficulties','sidecar','devoted','wreckage','100000','neglects','3.1','riding','damian','encounters','story.1','comforts','fr.1','attraction','jungle','land.2','dont.1','balan','ultimate','derk','superior','mccarthy','fuhrer','rhinos','brittle','programmer','entourage','feed','mankind','license','humans.1','competition.1','pierced','love.3','alanadale','boy.1','million','corner','destitute.1','right','hound','character.1','creates','diabolical','babysitter','chalne','rating','respond','durham','way.2','settlement','everyday','shield','linguistics','supported','masterpiece','loved','clubs','living.1','curious','bonding','inspiration','oedipus','poland','gal','avoid','oversharing','sadder','reveals','buttermaker','banking','triangle','bored','surgery','green','decisive','calendar','baker','junior','emphasizes','bury','pows','deinstitutionalization','surroundings','employer.2','treatment','market','suitors','pulls','period','fears.1','couple','experimental','void','filch','foreign','solar','fail','mystic','ambushed','extinction.1','sarajevo','programming.1','powerful.1','recesses','miami.1','withstand','party','asgard','telling','future.1','twists','angelenos','merger','ape','duksoo','school.2','bravery','whales','youngest','tasked','queens','concert','dickie','iowa.1','ensure','contained','twisted','thomas','pat','include.1','lengths','destroy','bartender','sounded','bridge.1','wear','bilbo','100','jill','agencys','shangrila','crash','mother','dictate','colin','waterholes','neglect','notion','migration','brutal.1','dour','osgood','syndicate','invites','people','raise','lawyer','blunt','level','lesson.1','defend','politician','evidence','entertain','bala.1','blamed','inside','alliance','believes','mccall.1','gig','credit','denial','lawsuit','intrigued','abortion.1','important.1','live','knows','fears','boisterous','sarm','mirandas','corruption','map','dutch','represents','bradley','send','tossed','rien','gunnery','consequently','funneling','undercover','assassin.1','vindicated','beirut','silly','shivani','jake','elena','band','convicted','engages','reich','adventure.1','repulses','red.1','mental','adventures','business.1','wife.2','narrative','turn.1','rich.1','raises','mermaid','isabella.1','godlike','director','cricket','upset','dropped','naekyung.1','gluttony','meal','legend.1','immediately','unwilling','supplies','lounges','observe','frenzy','bid','morning.1','bloom','lieutenant','making','dedicated','samuels','known.1','corps','cart.1','frances','tears','dumbo.1','climate','service','fathers','indispensable','flip','policeman','assassination.1','border','answer','portrait','dolphin.1','century','traitor.1','threaten','mikailavich','psychiatrist.1','salesman','teach','president.1','windfall','search.1','hard','backing','trio','died','burned','nath','funeral','sandra','return.2','ignoring','wins','argos','life.1','butch','bolivia','male','woman.2','explored','transported','worlds','lucys','ellie','amanda','strangely','enhanced','russell','1981','outrace','dentists','partylife','partner.2','current','brushes','brabant','z','awards','erik','disasters','french.1','memory','mason','comedyoferrors','separation','lady','intending','expendables','digs','transform','match','fiveman','march','ailment','vietnam','magneto.1','proof','homer','priestly','desert','door','bitten','ronny.1','loft','players','richie','beating','sneaks','dread','guide','palmer','kyles','realms','fraternitys','consume','theres','dent','khoobsurat','hangover','ghost.1','ideally','voices','georges','befriended','lisa.1','corrupted','rab','cammareri','saying','unhappy','leadership','laketown','case.1','indulges','cool','groovy','restaurant.1','came','jules','hunters','brutally','newlyminted','sustain','moms','career.1','murder.1','divided','opposite','decorate','seventh','dragon','custody','roberts.1','selves','coveted','read.1','moose','willful','lestat','2010','options','brittany','date.1','unmapped','leader','industry','banter','austens','penny','intrigue.1','carolina','humans.2','bookie','busboy','play.3','distracted','perceptive','magic','want.1','ruby','participants','sins.1','planned.1','cabbie','films','discovery','lizewski','marvels','death','oil','controlling','turner','lecter','tracy','difference','bucket','otto','reeling','vader','befriends','granddaughter.1','sport.2','seventyseven','fiddle','encourages','hypochondriac','crowe','coup','sent','chronicle','nightcrawler','kenny','ahead','mint','cerebral','hiding','terrorism.1','beverly','inmates','carries','unable','stop.1','mogul','zeus','shopping','morphs','stripes','worms','tie','investor','intense','chaos.2','ensues','ayshe','sports','mast','riches','chosen.1','formation','chris.1','madyanov','order.1','heatedly','voldemort','jamie.1','deniro','baymax.1','gutierrez','participant','purpose.1','insists','quickly','man','obstacles','stood','action','doesnt','blessings','aaron','roy','recourse','madness','buys','man.3','imminent','gangsters','bodine','world','edinburgh','charlie','therapy','family.1','joe','ago.2','krish','toon','mikes','christopher','survived','plans','accepts.1','fiancee','kittens','doug.1','paul','skies.1','opportunistic','creating','bukater','outside','culturally','count','preying','encountering','rest.1','talk','sits','briefly','people.3','evening.1','decided','creditcardjunkies','truly','survive','youth.2','willis','remains','aristocratic','118th','separate','finn','protective.1','officer.1','woods.1','pehli','anarchic','agatha','plays','thijs','outlook.1','problem.2','minute.1','nishapooja','whats','successfully','martin','incident','apparent','andy.2','paris.1','lookout','raymond','ingenuity','murderous','housekeeper.1','mayhem','plussized','mission.3','operation.1','bird','supervisor','rio','lives','anjali','disorder.1','30','unrelenting','chelsea','jealous','debut','mandatory','epidemic','foundation','leopard','minute','streets.2','matthew','loves.1','dr','pad','burlesque','bruce','corrupt','headed','college.1','lie','military.2','lionsgates','itallian','tahir','metroville','1899','animals.1','absolution','revered','c','course.1','subsequently','assault.1','possessions','almasy','obsessive','families.1','nasas','lynd','garage','face.1','lizzie.1','knew','fate','concentration','powhatan','later','ransom','escape.1','dimension','bennetts','desire.1','present.1','boot','longterm','helped','grimm','kenai','paranoia','showing','bruno','money.2','niece','khumba','stuntman','america','eventually','salander','hassan.1','linings','sensations','forget','macks','frank','ill','god.1','bhaus','degenerative','steps','cataclysm','challenge.1','m','collateral','stash','rasputin','karoo','bright','truth','barrier','wedding.1','georgian','deliberately','mukri','monroe','african','horror','kingdom.1','sunil','howland','4195','accumulate','graces','california','hours','government','nearing','towns','escaped','entire','rip','wooed','princess','hollow.1','mouse','conrad','opened','artisttype','couple.1','faith','worse.1','enduring','loony','psychiatric','boston','pretext','stacked','hamilton','sea','earthquake','english','twin.1','cans','experience','earth','rides','success','pair','course','1970s','diversity','sentinel','ship','werewolves','approve','intent','businessman','reader','impromptu','worldclass','grade','tom','cutest','voyage','present','mavis','insight','bukater.1','heroin','omalley','prancing','condemnation','lingling','seek','surviving','july','dad.1','marjorie','job','years','instead.1','boil','shashtras','auditions','thorin','bind','wreck','lights.1','technician','benjie.1','reign','albert.1','fixated','victory','sharma','regulations','icy','consecutive','lounge','ada','resemblance','captured','paying','doc','fancier','convert','sexual','muscles','evolves','impossible','dogs','province','palma','partnerson','peeta','destroyed.2','situation','myriad','develops','iphegenia','balboa','stage','journey.1','abilities','men','violas','9','unite','sport.1','30yearold','aid','follower','cause','dangers','gangster','mind','stylish','hobbit','fills','leroy','tail','attractive','enemyand','liking','rise.1','newcomers','blonde','requires','dragons.1','impaired','chose','humpty.2','parker','temperamental','agent.1','mastering','fred.1','landscape','potential.1','expedition','cousin.1','ailing','horseman','ordered','land.1','monster.1','jacks','room','century.1','scouting','evey','guardians','looks','bounty.1','parents.1','beer.1','rarely','grandfather','fell','regina','granddaughter','maguire','happened.2','calcutta','pirate','tuco','cheerleader','hasbeen','stepping','wenneck','lad','prisoners','bourne','detention','father.3','psychiatrist.2','connection.1','fantasies.1','pittsburgh','dystopian','opinions','serebryakov','wanting','camps','neytiri','love.1','received','pirate.1','accepting','sure','ugly','persians','unauthorized','wannabe','chirpy','hope','power.1','fine','doctor','creditor','unexpected','egypt','repeated','roger','barbaric','clearwater','disapproves','deputy','question','jawani','trip.1','ichabod','selfsacrifice','duksoo.1','quits.1','mass','shut','carl','dog','cafeteria','drifting','visits','boat','simple.1','selected','truths','alis','ally','exactly','movies.2','carta','lizzie.2','dixit','delights','strutting','doug','ii.1','black','detectives.1','viago','princess.1','oakenshield','alexander','think','survival','fairy','modern','leads','extermination','terminal','wasteland','pointy','swan.1','chest','15th','yorks','wizard','bletchley','president','era','profession','vengeance','engagement','river','starring','twice','tore','flattered','jungle.1','george','veronica','gap','scheme.1','galaxy.2','released.1','lure','suicide','events.2','honey','series','press','shouldnt','hardhitting','kahena','lionel','items','job.2','journalist.1','germany','selfimposed','shy','redeem','bernstein','abducted','unspoken','expertise','mix.1','mercurial','easily','reassigned','luxury','valiant','warrior.2','soldiers.1','serial','retaliates','caught.1','bedroom','canfield','diane','streets.1','dwarves.1','calvinist','berthstein','hyperactivity','villa.1','paris.2','advises','ancestors','groundbreaking','county','grows','manager.1','12','good.2','bladewielding','rebels','agency.1','medications','greatness','scarlett','joint','newspaper','route','admitted','brashbutbrilliant','ironically','hercules','senior','rodman','dorm','ears','crevice','known','cooper.1','unfortunately','confines','little','mars.1','street','confirms','percy','wishes','admits','invading','closing','loved.1','establishment','accidentally','sewers','gone.1','evelyn.1','exmobster','selfhelp','aether.1','dominic','skater','lolas','ali','village','atlanta','grandpa','punishing','infertile','eunice','upperclass','party.1','kidnapping.1','crawford.1','kicked','halfgod','oppress','rent','kavya.1','assistant','whereabouts.1','intel','sorceress','inspire','armored','lover','inventor','code','maryland','overcome','bobby','heroes.1','business','altogether','purpose','comedy.1','refugees','elaborate','continuum','entrepreneur','initially','martins','crane','survivor','winnfield','barbara','en','sides.1','chairman','offers','haunt','tow','crafty','manny','lives.1','spiritualist','sport','shanty','starkiron','piaf','disciplinarian','tess','fantastic','streets','star.3','child','rahena','feared','invited','rescue','play','mark.1','cledus','villages','avigdor','mob','sheep','mutually','underworld.1','client','happy','patterns','attain','normalcy','husband.2','outplayed','stateoftheart','solution','guillermo','unleashing','presence','kidnapped','competition','united','killed.2','outcome','married.1','ravaged','jim.2','disaster','acquisitions','nobleman','snow.1','elle','power','sabine','wallace','launching','challenging','max','adapted','wildebeest.1','disguised','natural','fanboy','fueling','culinary','overcame','warrior','ultimately','gregory','according','carrie.1','defense','mantis','visiting','zebra','jimmy','defeats','crushed','introduce','dwindling','dissatisfaction','banker','madison','blown','dork','bears.1','slaves','attracts','importantly','banarasi','problem','original','seals','shoes.1','meera','referred','oldschool','repulse','argentinean','steal','girlfriend','interpreted','usda','bank','fearless.1','remaining','girlfriend.2','care','rat.1','pinto','brutal','jockey','brad','shielded','undermine','prize.1','door.3','nikki.1','question.1','gotten','animation.1','discharges','f','dolly','londoners','sergeant','heirloom','kolya.1','oliver.1','mole','woodland','surrounds','chaudharys','familys','speaks','ancestor','singing','intense.1','aurora','dramatization','darth','movie.2','hillard','succeed.1','literary','hearing','effect','unless','range','sr','spending','familiar.2','trip','equals','parker.1','indians','police','50','unorthodox','mail','sexmad','mutual','daniels','various','duo','weaver','knowing','soundtrack','gambler','fan','bridge.2','review','barbossa','briar.1','bridge','unflinching','islandset','binds','university','mature','richard','regina.2','home','neighbors.1','illness','attitude','destructionbent','peace','regime','battle.1','minds','overcompensates','dome','troupe.1','magician','highspeed','billy','yellow','fall.1','erratic','mr','suffering.1','pryde','rift','oscar','cruise','merciless','adams','juvenile','studio','john.1','judge','trials','following.1','simian','dates','artifact','andromeda','perilous','girls.2','vice','silver','tests','away.2','choices','rossi','farmers','witness','series.1','bitterly','spirit.1','branded','saga','exhibit','12th','satine','befalls','plus.1','hope.2','jmw','commando','light','bringing','ruthless','christopher.1','marlin','reawakening','discovers','villain','wanted','father','dark','powhatan.1','capitol','tammy.2','rebbe','serve','affairs','happens','deeper','beatboxing','singular','ice','bosom','kevin','springfield.1','cracks','friendship.1','cruelty','rebel','sixteen','raj','jailthe','unveiling','burden','entered','brydon','burlesque.1','blazes','society.1','rehearsals','districts.1','kidnapping','wardrobe','albeit','university.1','thought','joes','religious','heartland','humor.1','sung.1','background','bureaucratic','coming','corporation','elders','city.2','doubts','havana.1','backdrop.1','younger.1','golden','meds','kill.1','temptation','occupied','aap','wars','halfmortal','odds.1','airs','talmud.1','quite','curse','lord','endurance','wolf','grandmother.1','classier','kumar','sabine.1','jang','chamber','galvanized','key','universe','banku','bunny','peaceable','marylouis','king.1','summer','time.2','deficit','chart','destroyed.1','needed','grad','direct','wrong','rocky','4','unseen','used','led','hungarian','decades','nsf','skylab','thing','helen','conflicts','spies','rumors','tom.1','reward','poker','budapest','erode','malevolent','widower','war','successor','panem.1','trained','bestselling','fragile','kinky','arise','gotham','stauffenberg','adelaide','exgirlfriends','rajesh.2','uncovered','sully','bigfoot','culminated','set','threatened','xavier','matrix.1','wilde','arts','investigation','goodnatured','lies','foster','comprising','remy','bridetobe','charter','dirty','val','lingling.2','hammond','lounge.1','ostrich','rigs','jackjack','reef','soaks','walking','harvard','matthews','unravel','supernatural','times','committed','tsar','law.1','koda.1','imagined','andersen','sung','teacher','frank.1','scathing','italy.2','gold.1','wrestles','rightful','exes','infatuated','causing','read','houghton','stayathome','greener','havana','assistance','1953','hangout','sings','xmen.1','artist','occasionally','royals','pitt','recounts','reappears','breaks','bandersnatches','johnny.2','megamillions','falcon','word','romantically','identity','blood','physicist','odds','decay','carnahan','hatch','pension','unbeknownst','dewey','suggests','projected','dubbing','best.1','involves','belong','marsellus','managed','loose','exec','matthew.1','mask','1874','subverted','sanctimonious','unlock','sets','faster','klm','vanishes','dramatic','village.1','cause.1','grounds','theyve','egyptian','gave','spider','file','resorts','uncover','forecasting','veronicas','sydney','whilst','costellos','loyal','works','metcalfe','mayor','sangha','procedures','psycho','tests.1','nfls','laser','guided','lower','son','pig.1','abusive','halfchechen','threemonth','neo','twentieth','retreat','pauper','friends','exciting','steffen','cambodian','film','lisa','hop','mastermind','intricate','accident','assigning','tempt','stripes.1','15year','loki','warrior.1','amar','endearing','migrating','underdog','infant','huge','attaches','detritus','thank','look','teenagers.1','history.2','disobeys','loosely','dealing','tends','alternate','fossilized','breadwinning','eye','crap','tracks','mom','thakur','coward','hoeks','fabled','potter','cuca','baseball','key.1','moustachery','tracys','mother.2','teams','hide','dance','stay','receives','tommy.1','pierre','recording','fantastical','confrontation','wedding.2','policemen','faster.1','pretending','primatologist','unwittingly','skeptical','arendelle','allies','anshel.1','craig','hardships','profound','joan','brodys','xiv','refuses','expert','constant','detectives','tension','birthday.1','jazz','animation','briar','convinced','begbie','alqaeda','elude','ohio.1','adversarythe','actions','bikram','diabetes','mechanic.1','cecelia','reality','arjun','1912','intensity','olivier','exhibition','soared','wheel','major','bumpy','ram','tunnel','mistresses','ego.1','attempts','armands','successful','emory','particular','lines','wreckit','dale','designer.1','thelma.1','dom','doctor.1','busan','birthday','carter','serving','pynchon','pepper','bakery','feisty','tournament','amber.1','announce','keldysh','teenaged','usher','demands','quill','scott','bereft','happening','mulan','larry','lonely.1','earth.1','hacker','immortal','antics','remind','line.1','remarried','monsters','dwarves','actor.1','enemies','exodus','just','laurence','twist','jobs','mans','humor','knight','culprit','giant','schuiten','grand','crimefighting','cancer','glimpsed','trade.1','popular','knowledge','week','mystery','twentyfirst','herd','security','jumps','broadway','wargs','vladimir','militia','firewood','simkin','intimate','hysterical','leader.1','wild','incredible','partnership','vampire.1','authors','witherspoon','ruse','temporal','jongseo','crew.1','witch','eat','hercules.1','danielles','president.2','tangled','air.1','charm','toro','truck','routine','awarding','repair','andrea','godfrey','karl.1','cecil','jisshu','danger.1','translate','thors','lee','guys','discretely','importing','overthrow','naming','pooja','bad.3','personality.1','katalins','representative','place.1','darkness','vegas','europe','1984','irresponsible','media','monsters.1','zeus.2','kidnap','james','works.1','james.1','lot','owner.2','grave','rationalizes','receiving','earl','mallory','furys','intensity.1','meets','brilliant','slovenly','perseus','rabelaisian','russ','aether','engine','marv','unwitting','triangle.1','tv.1','pastures','assume','large','provide','misunderstanding','fails','hitchhiking','portland','recovering','connection','carrying','burgeoning','inevitable','working.1','haddas','carrie','forge','operation','lovett','selfconfessed','trash','car','crusaders','sheds','reacquaint','partying','defied','seemingly','magneto','doors','capsize','employer.1','cameron','ostrich.1','lizzie','graduating','isabellas','reviled','troupe','decamp','plainview.1','startling','announces','hand','carolls','ashes','street.1','host','spell.1','tones','non','term','judy','del','alibi','sorority','follow','job.1','governor','insecure','disposal','aaj','ready','stranger.1','space','dearly','west','sisters.2','redemption','mens','history.1','vince','stafford','greater','superior.1','promised','diamond','produced','englishlanguage','looked','stammer','intellect','yearold','preacher','protective','students.2','stop','passed.1','efforts','strengthens','establish','beautiful.1','remove','70s','embarrass','dancers.1','choosing','helpless','ponts','dangerous.1','downfall','unusual','thirties','seeing','disc','jim','weiwei.1','lovers','new.2','bhola.1','reach','woods','robin','indictment','1915','stupidity','joker','clocking','neighborhoods','entertainment.1','everdeen','enthusiasm.1','nasa','blair','shyam','decommissioned','seaside','journals','stir','cremated','misinterpretation','flames','behaviour','meteoric','leaders','police.2','unliked','investigation.1','freds','turing.1','right.1','cambridge','compete','bird.1','process.1','shaikh','document','served','majestic','soldier','door.1','summons','heads','ability.1','handsome','price.1','tennessee','prejudices','possesses','shotgun','oblonsky','slowly','departure','brody','jedediah','oceans','courtship','selfmade','initiated.1','kept','pks','fight.1','retains','bump','dickinson','sick','kim','yeager','funeral.1','diana','anshel','phil.1','quirky.1','alike','12yearolds','daring','simultaneously','exwife','chronicles','boyfriend','nation','miraculous','hi','underbelly','texan','worthwhile','dhabi','tail.1','consciousness','vronsky','wrecks','safe','blondie','cooking','outbursts','condition','poolside','spurge','bros','requested','eli','crime','stunned','sorcerers','1963','tame','percys','slick','class.1','opens','deceits','intersecting','superhero','wouldnt','world.1','denahi','novel','changed','inspired','extraordinary','wrestler','deadly','desires','forcing','long.1','walt.1','inexplicably','record','responsible.2','roses','dancer.1','johnson','funding','simply','edge','hell','cousin','woven','shortterm','13year','highschool','olaf','addiction','cats','task','wb','science','readymade','insurmountable','plan','enemy.1','rivalry.1','sahil','vesper','event','thatcher','destroyed','slave.1','hivaids','baymax','electric','faded','runs','sweat','surface','nearinevitable','durjan','apprentice','v','maleficents','bipolar','directions']


In [21]:
meta_algo_pool = [knn,nc,tree,log,svm,bag,rf,ada,gb]
text_algo_pool = [gauss,knn,svm]
visual_algo_pool = [knn,tree,log,svm,rf,ada,gb]
audio_algo_pool = [log,gb]

In [26]:
from scipy import stats
# Use the algorithms chosen in the paper on the preselected features and print performance again
ttestreport={}

distros=[]
algos=[]
report=[]

for i in meta_algo_pool:
    distros.append(cross_val_score(estimator=i,X=metadata.loc[names_train].loc[:,metadata_selected_features],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='f1',cv=10,n_jobs=-1))
    f1 = np.mean(cross_val_score(estimator=i,X=metadata.loc[names_train].loc[:,metadata_selected_features],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='f1',cv=10,n_jobs=-1))
    f1_sd=np.std(cross_val_score(estimator=i,X=metadata.loc[names_train].loc[:,metadata_selected_features],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='f1',cv=10,n_jobs=-1))
    precision = np.mean(cross_val_score(estimator=i,X=metadata.loc[names_train].loc[:,metadata_selected_features],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='precision',cv=10,n_jobs=-1))
    precision_sd=np.std(cross_val_score(estimator=i,X=metadata.loc[names_train].loc[:,metadata_selected_features],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='precision',cv=10,n_jobs=-1))
    recall = np.mean(cross_val_score(estimator=i,X=metadata.loc[names_train].loc[:,metadata_selected_features],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='recall',cv=10,n_jobs=-1))
    recall_sd= np.std(cross_val_score(estimator=i,X=metadata.loc[names_train].loc[:,metadata_selected_features],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='recall',cv=10,n_jobs=-1))
    algo_name=str(i).split('(')[0]
    algos.append(algo_name)
    print(str(i).split('(')[0])
    pool='Meta Algo'
    print(f'Precision: {precision}, Recall: {recall}, f1: {f1},Precision_STD:{precision_sd},Recall_STD:{recall_sd},f1_std:{f1_sd}')
    print()
    report.append([algo_name,pool,precision,recall,f1,precision_sd,recall_sd,f1_sd])
ttests=[]
for index1 in range(0,len(algos)-1):
     distro1=distros[index1]
     algo1=algos[index1]
     for index2 in range(0,len(algos)-1):
         algo2=algos[index2]
         distro2=distros[index2]
         if algo1 is not algo2:
             tvalue,pvalue=stats.ttest_rel(distro1,distro2)
             ttests.append([algo1,algo2,tvalue,pvalue])
for test in ttests:
    tests=test[0:1]
    for t in ttests:
        if t[0]==test[1] and t[1]==test[0]:
            ttests.remove(t)
ttestreport.update({'Meta Algo Pool':ttests})
distros=[]
algos=[]

        
             

KNeighborsClassifier
Precision: 0.5492063492063493, Recall: 0.6, f1: 0.537043512043512,Precision_STD:0.11018891677338796,Recall_STD:0.32795663669996916,f1_std:0.2028830412913345

NearestCentroid
Precision: 0.5315873015873015, Recall: 0.5366666666666666, f1: 0.525,Precision_STD:0.1755593587234077,Recall_STD:0.1320353488022557,f1_std:0.1204736024566746

DecisionTreeClassifier
Precision: 0.39595238095238094, Recall: 0.42000000000000004, f1: 0.39731934731934726,Precision_STD:0.23021741178577784,Recall_STD:0.29522119767312704,f1_std:0.2494816159892371

LogisticRegression
Precision: 0.5273809523809524, Recall: 0.8533333333333333, f1: 0.6472985347985348,Precision_STD:0.06573901076152007,Recall_STD:0.19561867668161617,f1_std:0.10447952454037389

SVC
Precision: 0.5712301587301587, Recall: 0.8533333333333333, f1: 0.6797402597402598,Precision_STD:0.06556901789207933,Recall_STD:0.12927146286443544,f1_std:0.06782078764966885

BaggingClassifier
Precision: 0.5025, Recall: 0.44666666666666666, f1: 0.4

In [27]:
for i in visual_algo_pool:
    f1 = np.mean(cross_val_score(estimator=i,X=visuals.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='f1',cv=10,n_jobs=-1))
    distros.append(cross_val_score(estimator=i,X=metadata.loc[names_train].loc[:,metadata_selected_features],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='f1',cv=10,n_jobs=-1))
    precision = np.mean(cross_val_score(estimator=i,X=visuals.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='precision',cv=10,n_jobs=-1))
    recall = np.mean(cross_val_score(estimator=i,X=visuals.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='recall',cv=10,n_jobs=-1))
    f1_sd = np.std(cross_val_score(estimator=i,X=visuals.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='f1',cv=10,n_jobs=-1))
    precision_sd = np.std(cross_val_score(estimator=i,X=visuals.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='precision',cv=10,n_jobs=-1))
    recall_sd = np.std(cross_val_score(estimator=i,X=visuals.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='recall',cv=10,n_jobs=-1))
    print(str(i).split('(')[0])
    algo_name=str(i).split('(')[0]
    algos.append(str(i).split('(')[0])
    print(f'Precision: {precision}, Recall: {recall}, f1: {f1},Precision_STD:{precision_sd},Recall_STD:{recall_sd},f1_std:{f1_sd}')
    print()
    pool='Meta Algo'
    report.append([algo_name,pool,precision,recall,f1,precision_sd,recall_sd,f1_sd])
ttests=[]
for index1 in range(0,len(algos)-1):
     distro1=distros[index1]
     algo1=algos[index1]
     for index2 in range(0,len(algos)-1):
         algo2=algos[index2]
         distro2=distros[index2]
         if algo1 is not algo2:
             tvalue,pvalue=stats.ttest_rel(distro1,distro2)
             ttests.append([algo1,algo2,tvalue,pvalue])
for test in ttests:
    tests=test[0:1]
    for t in ttests:
        if t[0]==test[1] and t[1]==test[0]:
            ttests.remove(t)
ttestreport.update({'Visual Algo Pool':ttests})
distros=[]
algos=[]

KNeighborsClassifier
Precision: 0.5435714285714286, Recall: 0.5966666666666667, f1: 0.5627994227994229,Precision_STD:0.11159557082864636,Recall_STD:0.18284784202536636,f1_std:0.12941544262534985

DecisionTreeClassifier
Precision: 0.5638095238095236, Recall: 0.69, f1: 0.6145609945609944,Precision_STD:0.10117674976929014,Recall_STD:0.17,f1_std:0.11675825083215426

LogisticRegression
Precision: 0.6105555555555555, Recall: 0.6933333333333334, f1: 0.6330194805194805,Precision_STD:0.10802520282065868,Recall_STD:0.17243356208503416,f1_std:0.0970080809013451

SVC
Precision: 0.5434343434343434, Recall: 0.9833333333333334, f1: 0.6997549019607844,Precision_STD:0.021921751933546174,Recall_STD:0.04999999999999999,f1_std:0.02863334399842051

RandomForestClassifier
Precision: 0.6029761904761906, Recall: 0.6533333333333332, f1: 0.6190043290043291,Precision_STD:0.08957769718071318,Recall_STD:0.14,f1_std:0.09591308423719447

AdaBoostClassifier
Precision: 0.585, Recall: 0.7166666666666667, f1: 0.63405483

In [29]:
for i in audio_algo_pool:
    f1 = np.mean(cross_val_score(estimator=i,X=audio.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='f1',cv=10,n_jobs=-1))
    distros.append(cross_val_score(estimator=i,X=metadata.loc[names_train].loc[:,metadata_selected_features],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='f1',cv=10,n_jobs=-1))
    precision = np.mean(cross_val_score(estimator=i,X=audio.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='precision',cv=10,n_jobs=-1))
    recall = np.mean(cross_val_score(estimator=i,X=audio.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='recall',cv=10,n_jobs=-1))
    f1_sd = np.std(cross_val_score(estimator=i,X=audio.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='f1',cv=10,n_jobs=-1))
    precision_sd = np.std(cross_val_score(estimator=i,X=audio.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='precision',cv=10,n_jobs=-1))
    recall_sd = np.std(cross_val_score(estimator=i,X=audio.loc[names_train],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='recall',cv=10,n_jobs=-1))
    algo_name=str(i).split('(')[0]
    print(str(i).split('(')[0])
    algos.append(str(i).split('(')[0])
    print(f'Precision: {precision}, Recall: {recall}, f1: {f1},Precision_STD:{precision_sd},Recall_STD:{recall_sd},f1_std:{f1_sd}')
    print()
    pool='Meta Algo'
    report.append([algo_name,pool,precision,recall,f1,precision_sd,recall_sd,f1_sd])
ttests=[]
for index1 in range(0,len(algos)-1):
     distro1=distros[index1]
     algo1=algos[index1]
     for index2 in range(0,len(algos)-1):
         algo2=algos[index2]
         distro2=distros[index2]
         if algo1 is not algo2:
             tvalue,pvalue=stats.ttest_rel(distro1,distro2)
             ttests.append([algo1,algo2,tvalue,pvalue])
for test in ttests:
    tests=test[0:1]
    for t in ttests:
        if t[0]==test[1] and t[1]==test[0]:
            ttests.remove(t)
ttestreport.update({'Audio Algo Pool':ttests})
distros=[]
algos=[]

LogisticRegression
Precision: 0.5471428571428572, Recall: 0.5166666666666666, f1: 0.5134776334776334,Precision_STD:0.21630477364598502,Recall_STD:0.18574175621006708,f1_std:0.1572583899450941

GradientBoostingClassifier
Precision: 0.505952380952381, Recall: 0.49666666666666676, f1: 0.4928321678321678,Precision_STD:0.20106645938205453,Recall_STD:0.21419098435223127,f1_std:0.19765832679173387



In [41]:
for i in text_algo_pool:
    f1 = np.mean(cross_val_score(estimator=i,X=text.loc[names_train].loc[:,text_selected_features],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='f1',cv=10,n_jobs=-1))
    distros.append(cross_val_score(estimator=i,X=metadata.loc[names_train].loc[:,metadata_selected_features],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='f1',cv=10,n_jobs=-1))
    precision = np.mean(cross_val_score(estimator=i,X=text.loc[names_train].loc[:,text_selected_features],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='precision',cv=10,n_jobs=-1))
    recall = np.mean(cross_val_score(estimator=i,X=text.loc[names_train].loc[:,text_selected_features],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='recall',cv=10,n_jobs=-1))
    f1 = np.mean(cross_val_score(estimator=i,X=text.loc[names_train].loc[:,text_selected_features],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='f1',cv=10,n_jobs=-1))
    precision = np.mean(cross_val_score(estimator=i,X=text.loc[names_train].loc[:,text_selected_features],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='precision',cv=10,n_jobs=-1))
    recall = np.mean(cross_val_score(estimator=i,X=text.loc[names_train].loc[:,text_selected_features],y=movies.loc[names_train].loc[:,'goodforairplane'],
                         scoring='recall',cv=10,n_jobs=-1))
    algo_name=str(i).split('(')[0]
    print(str(i).split('(')[0])
    algos.append(str(i).split('(')[0])
    print(f'Precision: {precision}, Recall: {recall}, f1: {f1},Precision_STD:{precision_sd},Recall_STD:{recall_sd},f1_std:{f1_sd}')
    print()
    pool='Meta Algo'
    report.append([algo_name,pool,precision,recall,f1,precision_sd,recall_sd,f1_sd])
    
ttests=[]
for index1 in range(0,len(algos)-1):
     distro1=distros[index1]
     algo1=algos[index1]
     for index2 in range(0,len(algos)-1):
         algo2=algos[index2]
         distro2=distros[index2]
         if algo1 is not algo2:
             tvalue,pvalue=stats.ttest_rel(distro1,distro2)
             ttests.append([algo1,algo2,tvalue,pvalue])
for test in ttests:
    tests=test[0:1]
    for t in ttests:
        if t[0]==test[1] and t[1]==test[0]:
            ttests.remove(t)
ttestreport.update({'Text Algo Pool':ttests})
distros=[]
algos=[]    
columns1=['Algorithm','Pool','Precision','Recall','F1','Precision STD','Recall STD','F1 STD']
report_frame=pd.DataFrame(report,columns=columns1)
print(report_frame)
columns2=['Pool','Algorithm 1','Algorithm 2','T-Value','p-Value']
temp=[]
for key in ttestreport:
    print(ttestreport[key])
   # t=ttestreport[key]
    #temp.append([key,t[0],t[1],t[2],t[3]])
#ttest_frame=pd.DataFrame(temp,columns=columns2)

GaussianNB
Precision: 0.5726984126984126, Recall: 0.6799999999999999, f1: 0.6103330003330003,Precision_STD:0.20106645938205453,Recall_STD:0.21419098435223127,f1_std:0.19765832679173387

KNeighborsClassifier
Precision: 0.49444444444444446, Recall: 0.31333333333333335, f1: 0.31309523809523815,Precision_STD:0.20106645938205453,Recall_STD:0.21419098435223127,f1_std:0.19765832679173387

SVC
Precision: 0.5479797979797979, Recall: 1.0, f1: 0.7078431372549019,Precision_STD:0.20106645938205453,Recall_STD:0.21419098435223127,f1_std:0.19765832679173387

                     Algorithm       Pool  Precision    Recall        F1  \
0         KNeighborsClassifier  Meta Algo   0.549206  0.600000  0.537044   
1              NearestCentroid  Meta Algo   0.531587  0.536667  0.525000   
2       DecisionTreeClassifier  Meta Algo   0.395952  0.420000  0.397319   
3           LogisticRegression  Meta Algo   0.527381  0.853333  0.647299   
4                          SVC  Meta Algo   0.571230  0.853333  0.67974

In [None]:
knn_meta = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski',
                           metric_params=None,n_jobs=-1)

nc_meta = NearestCentroid(metric='euclidean', shrink_threshold=None)

tree_meta = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1,
                              min_weight_fraction_leaf=0.0, max_features=None, random_state=0, max_leaf_nodes=None,
                              class_weight=None, presort=False)

log_meta = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1,
                         class_weight=None, random_state=0, solver='liblinear', max_iter=100, multi_class='ovr', 
                         verbose=0, warm_start=False, n_jobs=-1)

svm_meta = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, 
          cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=None, random_state=0)

bag_meta = BaggingClassifier(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, 
                        bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=-1, random_state=0, verbose=0)

rf_meta = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, 
                            min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=True, 
                            oob_score=False, n_jobs=-1, random_state=0, verbose=0, warm_start=False, class_weight=None)

ada_meta = AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=0)

gb_meta = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2,
                                min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, init=None, random_state=0,
                                max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')

gauss_text = GaussianNB()

knn_text = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski',
                           metric_params=None,n_jobs=-1)

svm_text = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, 
          cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=None, random_state=0)

knn_vis = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski',
                           metric_params=None,n_jobs=-1)

tree_vis = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1,
                              min_weight_fraction_leaf=0.0, max_features=None, random_state=0, max_leaf_nodes=None,
                              class_weight=None, presort=False)

log_vis = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1,
                         class_weight=None, random_state=0, solver='liblinear', max_iter=100, multi_class='ovr', 
                         verbose=0, warm_start=False, n_jobs=-1)

svm_vis = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, 
          cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=None, random_state=0)

rf_vis = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, 
                            min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=True, 
                            oob_score=False, n_jobs=-1, random_state=0, verbose=0, warm_start=False, class_weight=None)

ada_vis = AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=0)

gb_vis = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2,
                                min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, init=None, random_state=0,
                                max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')

log_audio = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1,
                         class_weight=None, random_state=0, solver='liblinear', max_iter=100, multi_class='ovr', 
                         verbose=0, warm_start=False, n_jobs=-1)

gb_audio = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2,
                                min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, init=None, random_state=0,
                                max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')

estimators = [('knn_meta',knn_meta),('nc_meta',nc_meta),('tree_meta',tree_meta),('log_meta',log_meta),('svm_meta',svm_meta),
              ('bag_meta',bag_meta),('rf_meta',rf_meta),('ada_meta',ada_meta),('gb_meta',gb_meta),('gauss_text',gauss_text),
              ('knn_text',knn_text),('svm_text',svm_text),('knn_vis',knn_vis),('tree_vis',tree_vis),('log_vis',log_vis),
              ('svm_vis',svm_vis),('rf_vis',rf_vis),('ada_vis',ada_vis),('gb_vis',gb_vis),('log_audio',log_audio),('gb_audio',gb_audio)]

In [None]:
# Use selfmade stacking classifier with all majority voting, label stacking and label and feature stacking to test
# performance on training set using cross validation.

from stacking_classifiers import stacking_classifier_performance_cv

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    res = stacking_classifier_performance_cv(estimators,pd.DataFrame(metadata.loc[names_train,metadata_selected_features].values),pd.DataFrame(visuals.loc[names_train,:].values),pd.DataFrame(text.loc[names_train,text_selected_features].values),pd.DataFrame(audio.loc[names_train,audio_selected_features].values),pd.DataFrame(movies.loc[names_train,'goodforairplane'].values))
    print(res)

In [None]:
# The same for the whole data set, the classifiers were learned on the training set.

from stacking_classifiers import stacking_classifier_performance_on_test_set

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    res = stacking_classifier_performance_on_test_set(estimators,pd.DataFrame(metadata.loc[:,metadata_selected_features].values),pd.DataFrame(visuals.values),pd.DataFrame(text.loc[:,text_selected_features].values),pd.DataFrame(audio.values),pd.DataFrame(movies.loc[:,'goodforairplane'].values),95)
    print(res)
