In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import tree
from sklearn import ensemble
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cross_validation import KFold
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
import StringIO
import time
import pickle
from sklearn.datasets import make_multilabel_classification
from sklearn.cross_decomposition import CCA

In [2]:
# Multi Label Model
# Load the pickled dataset and report how many rows and classes it contains.
print('Loading data...')

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a 'full_dataset.pickle' that comes from a trusted source.
with open('full_dataset.pickle', 'rb') as handle:
    dataset = pickle.load(handle)

# Each message is built as a single string so the output reads the same whether
# print is a statement (Python 2) or a function (Python 3). Passing several
# arguments to print(...) under Python 2 prints a tuple such as
# (5120, 'train sequences') instead of the intended sentence.
print('{} train sequences'.format(len(dataset['train_x'])))
print('{} test sequences'.format(len(dataset['test_x'])))

num_classes = len(dataset['labels'])
print('{} classes.'.format(num_classes))

print('Data loaded.')

Loading data...
(5120, 'train sequences')
(1704, 'test sequences')
(8, 'classes.')
Data loaded.


In [3]:
# Adapted from a nice example of extracting the dominant color palette from an image; modified slightly for our project.
#http://stackoverflow.com/a/16216866/190597 (Jaime)
#http://stackoverflow.com/a/16840350/190597 (Jaime)
def palette(img):
    """Return the distinct colors of ``img``, most frequent first.

    ``img`` is converted with ``np.asarray``; every run of values along its
    last axis is treated as one color. The result is a 2-D array with one row
    per distinct color, ordered by descending occurrence count.
    """
    pixels = np.asarray(img)
    # Pack each color into one opaque (void) item so np.unique de-duplicates
    # whole colors rather than individual channel values.
    packed = asvoid(pixels).ravel()
    unique_packed, inverse = np.unique(packed, return_inverse=True)
    # Unpack the void items back into rows of channel values.
    colors = unique_packed.view(pixels.dtype).reshape(-1, pixels.shape[-1])
    # `inverse` maps every pixel to the index of its unique color, so bincount
    # yields the number of pixels per color.
    frequency = np.bincount(inverse)
    descending = np.argsort(frequency)[::-1]
    return colors[descending]

In [4]:
# Generate a palette score for the top 5 dominant colors in this image:
# calculate the color intensity of each of the top 5 dominant colors,
# and calculate the average red, blue, green levels across them.
def getPaletteScore(img, top_n=5):
    """Build a fixed-length color feature vector for one image.

    Parameters
    ----------
    img : image-like
        Anything ``palette`` accepts.
    top_n : int, optional
        Number of most frequent colors to use. Defaults to 5, which matches the
        8-column feature layout used by the rest of the notebook.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``top_n + 3``: the per-color intensity (sum
        of the color's channel values) for the ``top_n`` most frequent colors,
        followed by the first three components of the mean of those colors.
        If the image has fewer than ``top_n`` distinct colors, the missing
        intensity slots are filled with 0 so the length stays constant.
    """
    # Extract the palette once; it runs np.unique over the whole image.
    top_colors = palette(img)[:top_n]

    intensity = np.sum(top_colors, axis=1)
    # Zero-pad so callers that reshape into a fixed number of columns do not
    # break on images with only a few distinct colors.
    padded_intensity = np.zeros(top_n)
    padded_intensity[:len(intensity)] = intensity

    # Adding to a float zero vector keeps exactly three float components.
    avg_channels = np.zeros(3) + np.mean(top_colors, axis=0)[:3]
    return np.concatenate((padded_intensity, avg_channels))
def asvoid(arr):
    """View each run of values along the last axis of ``arr`` as one void item.

    The input is first made C-contiguous. The returned view has dtype
    ``np.void`` whose size is the byte length of one full last-axis run, so the
    last axis of the result has length 1.
    """
    contiguous = np.ascontiguousarray(arr)
    run_bytes = contiguous.dtype.itemsize * contiguous.shape[-1]
    void_dtype = np.dtype((np.void, run_bytes))
    return contiguous.view(void_dtype)

In [5]:
# Build the training feature matrix: one palette-score vector per poster.
# Scores are collected in a list and joined once; calling np.concatenate inside
# the loop re-copies the whole accumulated array on every iteration.
train_scores = [getPaletteScore(poster) for poster in dataset['train_x']['poster']]
# The leading empty float array keeps the result a float array even with no rows.
train_vals = np.concatenate([np.array([])] + train_scores)
# 8 columns = 5 palette intensities + 3 average channel values per poster.
train_x = train_vals.reshape(dataset['train_x'].shape[0], 8)
train_x.shape

(5120, 8)

In [6]:
# Build the test feature matrix: one palette-score vector per poster.
# Scores are collected in a list and joined once; calling np.concatenate inside
# the loop re-copies the whole accumulated array on every iteration.
test_scores = [getPaletteScore(poster) for poster in dataset['test_x']['poster']]
# The leading empty float array keeps the result a float array even with no rows.
test_vals = np.concatenate([np.array([])] + test_scores)
# 8 columns = 5 palette intensities + 3 average channel values per poster.
test_x = test_vals.reshape(dataset['test_x'].shape[0], 8)
test_x.shape

(1704, 8)

In [7]:
# Targets for the train and test splits, taken straight from the loaded dataset.
train_y, test_y = dataset['train_y'], dataset['test_y']

In [8]:
# Hyper-parameter grid for the random forest.
n_trees = np.arange(1, 101, 5)
depths = np.arange(1, 15)

# To keep track of the best model (lowest mean Hamming loss seen so far)
best_score_mml = np.inf
best_trees = 0
best_depth = 0

# Build the 3 cross-validation folds once, with a fixed seed, so that every
# (trees, depth) combination is scored on exactly the same splits. Re-creating
# a shuffled KFold per combination would compare scores measured on different
# random partitions.
k_folds_mml = list(KFold(train_x.shape[0], n_folds=3, shuffle=True, random_state=0))

# Run grid search for model with 3-fold cross validation
print('3-fold cross validation:')

for trees in n_trees:
    print('Evaluating trees : ' + str(trees))
    for depth in depths:
        # Cross validation for every experiment
        scores_mml = []
        for train_indices, validation_indices in k_folds_mml:
            # Generate training data
            x_train_cv_mml = train_x[train_indices]
            y_train_cv_mml = train_y[train_indices]
            # Generate validation data
            x_validate_mml = train_x[validation_indices]
            y_validate_mml = train_y[validation_indices]

            # Fit random forest on training data
            model_mml = OneVsRestClassifier(ensemble.RandomForestClassifier(n_estimators=trees, max_depth=depth))
            model_mml.fit(x_train_cv_mml, y_train_cv_mml)
            # Score on validation data (Hamming loss: lower is better)
            y_pred_cv_mml = model_mml.predict(x_validate_mml)
            scores_mml.append(hamming_loss(y_validate_mml, y_pred_cv_mml))

        # Mean Hamming loss across the folds for this (trees, depth) pair
        average_score_mml = np.mean(scores_mml)

        # Update our record of the best parameters seen so far
        if average_score_mml < best_score_mml:
            best_score_mml = average_score_mml
            best_trees = trees
            best_depth = depth
    # Progress report: best combination found up to and including this tree count
    print('Best trees : ' + str(best_trees) + ' Best Depth : ' + str(best_depth))

3-fold cross validation:
Evaluating trees : 1
Best trees : 1 Best Depth : 3
Evaluating trees : 6
Best trees : 6 Best Depth : 2
Evaluating trees : 11
Best trees : 11 Best Depth : 2
Evaluating trees : 16
Best trees : 16 Best Depth : 5
Evaluating trees : 21
Best trees : 16 Best Depth : 5
Evaluating trees : 26
Best trees : 16 Best Depth : 5
Evaluating trees : 31
Best trees : 16 Best Depth : 5
Evaluating trees : 36
Best trees : 16 Best Depth : 5
Evaluating trees : 41
Best trees : 16 Best Depth : 5
Evaluating trees : 46
Best trees : 16 Best Depth : 5
Evaluating trees : 51
Best trees : 16 Best Depth : 5
Evaluating trees : 56
Best trees : 16 Best Depth : 5
Evaluating trees : 61
Best trees : 16 Best Depth : 5
Evaluating trees : 66
Best trees : 16 Best Depth : 5
Evaluating trees : 71
Best trees : 16 Best Depth : 5
Evaluating trees : 76
Best trees : 16 Best Depth : 5
Evaluating trees : 81
Best trees : 16 Best Depth : 5
Evaluating trees : 86
Best trees : 16 Best Depth : 5
Evaluating trees : 91
Bes

In [9]:
# Refit on the full training set using the tree count and depth selected by the grid search
model_mml = OneVsRestClassifier(
    estimator=ensemble.RandomForestClassifier(
        max_depth=best_depth,
        n_estimators=best_trees
    )
)
model_mml.fit(train_x, train_y)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=16, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          n_jobs=1)

In [10]:
# Report the chosen hyper-parameters and the test-set metrics.
# Predict once and reuse the result for both metrics instead of running the
# model over the test set twice.
test_pred_mml = model_mml.predict(test_x)
# Single-string print(...) calls produce the same text under Python 2 and 3.
print('Chosen number of trees, depth: {} , {}'.format(best_trees, best_depth))
print('Test F1 Score: {}'.format(f1_score(test_y, test_pred_mml, average='samples')))
print('Hamming Loss: {}'.format(hamming_loss(test_y, test_pred_mml)))

Chosen number of trees, depth: 16 , 5
Test F1 Score: 0.360680751174
Hamming Loss: 0.206866197183


  'precision', 'predicted', average, warn_for)


In [11]:
# Store this model's predicted probabilities in the pickled dataset for future use.
# NOTE(review): pickle.load can execute arbitrary code from the file; only use
# a trusted 'full_dataset.pickle'.
with open('full_dataset.pickle', 'rb') as pickle_in:
    dataset = pickle.load(pickle_in)

# NOTE(review): the keys say 'Metadata' although train_x/test_x in this
# notebook hold poster palette features; confirm the key names are intended.
dataset['Metadata_Train_Probabilities'] = model_mml.predict_proba(train_x)
dataset['Metadata_Test_Probabilities'] = model_mml.predict_proba(test_x)

# Overwrite the same file with the augmented dataset.
with open('full_dataset.pickle', 'wb') as pickle_out:
    pickle.dump(dataset, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)