In [39]:
import numpy as np
import pandas as pd

def blend_csv(csv_paths, output_path='blend_with_gbm.csv'):
    """Average several prediction CSV files element-wise and save the result.

    Every file is read with its first column as the row index. The frames
    are summed (pandas aligns them on row and column labels, so row order
    does not matter) and the sum is divided by the number of files. The
    blend is written to ``output_path`` and its first 10 rows are printed.

    Parameters
    ----------
    csv_paths : sequence of str
        Paths of the CSV files to blend. Fewer than two paths prints a
        message and returns without writing anything.
    output_path : str, optional
        Where the blended CSV is written. Defaults to
        ``'blend_with_gbm.csv'``, the file name this function always used.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a file's row labels or column labels differ from those of the
        first file. Without this check the label alignment done by
        ``DataFrame.add`` would fill the unmatched cells with NaN and the
        broken blend would be saved silently.
    """
    if len(csv_paths) < 2:
        print("Blending takes two or more csv files!")
        return

    # The first file defines the expected set of rows and columns.
    df_blend = pd.read_csv(csv_paths[0], index_col=0)
    expected_rows = set(df_blend.index)
    expected_cols = set(df_blend.columns)

    # Accumulate the remaining files into a running sum.
    for csv_file in csv_paths[1:]:
        df = pd.read_csv(csv_file, index_col=0)
        if set(df.index) != expected_rows or set(df.columns) != expected_cols:
            raise ValueError(
                'Cannot blend %r with %r: row or column labels differ'
                % (csv_file, csv_paths[0]))
        df_blend = df_blend.add(df)

    # Turn the sum into a mean.
    df_blend = df_blend.div(len(csv_paths))

    # Save the blend file
    df_blend.to_csv(output_path)
    print(df_blend.head(10))

# Put two or more of your own prediction files here. Note that sub_gbm.csv
# is the file written by the GBM cell of this notebook, so it has to exist
# before this call is run.
blend_csv([
    'fizznet.csv',
    'sub_384.csv',
    'sub_512.csv',
    'sub_gbm.csv',
])

                    ALB       BET       DOL       LAG       NoF     OTHER  \
image                                                                       
img_00005.jpg  0.031250  0.031250  0.031250  0.031250  0.968750  0.031250   
img_00007.jpg  0.919766  0.031250  0.031250  0.059532  0.031250  0.031250   
img_00009.jpg  0.509277  0.031250  0.031250  0.031250  0.246870  0.092620   
img_00018.jpg  0.858484  0.070906  0.045191  0.031250  0.031250  0.065899   
img_00027.jpg  0.224785  0.031250  0.031250  0.031250  0.031250  0.274752   
img_00030.jpg  0.921048  0.031250  0.031250  0.031250  0.062182  0.031250   
img_00040.jpg  0.679315  0.031250  0.031250  0.031250  0.031250  0.059490   
img_00046.jpg  0.635043  0.041827  0.031250  0.031250  0.251255  0.031250   
img_00053.jpg  0.943885  0.031250  0.031250  0.031250  0.031250  0.031250   
img_00071.jpg  0.267057  0.032795  0.031250  0.512851  0.043380  0.067864   

                  SHARK       YFT  
image                              
img

In [18]:
# Load the three per-model training tables. Only the fizznet table's
# 'label' column is used as the target; it is removed from the features.
X_train_512 = pd.read_csv('512_train.csv')
X_train_384 = pd.read_csv('384_train.csv')
X_train_fizznet = pd.read_csv('fizznet_train.csv')
y_train = X_train_fizznet['label']
X_train_fizznet = X_train_fizznet.drop('label', axis=1)

# Stack the tables side by side. Each column gets a suffix naming the table
# it came from, and columns whose name starts with 'image' are left out.
# The order (512, then 384, then fizznet) fixes the feature order.
X_train = pd.DataFrame()
for suffix, frame in (('_mtl_512', X_train_512),
                      ('_mtl_384', X_train_384),
                      ('_fizznet', X_train_fizznet)):
    for col in frame.columns:
        if col.startswith('image'):
            continue
        X_train[col + suffix] = frame[col]

In [19]:
# Preview the first five rows of the stacked training features.
X_train.head(n=5)

Unnamed: 0,ALB_mtl_512,BET_mtl_512,DOL_mtl_512,LAG_mtl_512,NoF_mtl_512,OTHER_mtl_512,SHARK_mtl_512,YFT_mtl_512,ALB_mtl_384,BET_mtl_384,...,SHARK_mtl_384,YFT_mtl_384,ALB_fizznet,BET_fizznet,DOL_fizznet,LAG_fizznet,NoF_fizznet,OTHER_fizznet,SHARK_fizznet,YFT_fizznet
0,0.998631,2.4e-05,3e-05,5.3e-05,0.000394,0.000788,4.7e-05,3.3e-05,0.998118,0.000895,...,4.5e-05,8.4e-05,0.863409,0.046416,0.0004083966,0.0003794101,0.018741,0.044688,0.000108,0.02585
1,0.996498,0.000106,1.2e-05,1.9e-05,0.001397,5e-06,8e-06,0.001955,0.99732,0.000112,...,2.9e-05,0.002098,0.882347,0.006735,0.0001513449,1.30899e-05,0.046912,0.060557,1.1e-05,0.003275
2,0.991656,0.000134,0.000143,1.9e-05,0.007128,0.000206,0.000202,0.000513,0.996961,1.8e-05,...,0.000342,0.001569,0.120173,0.010642,9.325516e-05,1.399238e-06,6.5e-05,0.002904,1.9e-05,0.866103
3,0.955588,0.000282,0.000114,0.000212,0.037758,0.000222,0.000145,0.005679,0.997763,0.000316,...,4.7e-05,0.000166,0.897598,0.00018,1.310661e-05,3.93086e-07,0.099063,2.6e-05,2e-06,0.003116
4,0.999209,4.7e-05,4.2e-05,2.1e-05,0.000166,0.000285,3.7e-05,0.000193,0.998163,4.5e-05,...,0.000263,0.000173,0.996421,0.00042,3.049111e-07,4.301197e-08,5.5e-05,2.6e-05,1e-06,0.003076


In [22]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Fixed seed so the split, and therefore the scores, feature importances and
# predictions derived from clf, are the same on every run.
SPLIT_SEED = 42

# Hold out part of the stacked features to check the meta-classifier.
# NOTE: this rebinds X_train / y_train to the training part of the split, so
# running this cell twice without rebuilding X_train splits an already
# split frame.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, random_state=SPLIT_SEED)

clf = GradientBoostingClassifier(random_state=SPLIT_SEED)
clf.fit(X_train, y_train)

# print(...) with one argument prints the same on Python 2 and Python 3;
# the bare `print x` statement only parses on Python 2.
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

1.0
0.988700564972


In [27]:
# List every stacked feature with its importance in the fitted booster,
# most important first.
importances = clf.feature_importances_  # read once instead of once per feature
max_imp = np.argsort(importances)[::-1]
for i in max_imp:
    # One pre-formatted string prints "name value" on both Python 2 and 3;
    # the bare `print a, b` statement only parses on Python 2.
    print('%s %s' % (X_train.columns[i], importances[i]))

NoF_fizznet 0.0690036083919
ALB_mtl_384 0.0659014877398
OTHER_mtl_384 0.0599358964147
YFT_mtl_384 0.0586518857658
LAG_mtl_384 0.0464713066853
SHARK_mtl_384 0.0442685283798
DOL_mtl_384 0.0411098496138
BET_mtl_384 0.0402586288851
BET_fizznet 0.0343670198628
DOL_mtl_512 0.0245332405703
SHARK_mtl_512 0.0223956423434
YFT_mtl_512 0.0201875179011
NoF_mtl_384 0.0164252085551
NoF_mtl_512 0.0136180035333
LAG_fizznet 0.0131454366549
BET_mtl_512 0.00703326044801
ALB_fizznet 0.00580840717729
YFT_fizznet 0.00389483202794
DOL_fizznet 0.00289617845487
OTHER_fizznet 0.00270244411674
ALB_mtl_512 0.00206610771373
LAG_mtl_512 0.00196246318237
OTHER_mtl_512 0.00130127859539
SHARK_fizznet 0.000811766986718


In [30]:
# Load the three per-model test prediction tables.
X_test_512 = pd.read_csv('sub_512.csv')
X_test_384 = pd.read_csv('sub_384.csv')
X_test_fizznet = pd.read_csv('fizznet.csv')

# Stack them side by side with the same suffixes and the same table order
# (512, then 384, then fizznet) as the training features. Columns whose name
# starts with 'image' are left out.
X_test = pd.DataFrame()
for suffix, frame in (('_mtl_512', X_test_512),
                      ('_mtl_384', X_test_384),
                      ('_fizznet', X_test_fizznet)):
    for col in frame.columns:
        if col.startswith('image'):
            continue
        X_test[col + suffix] = frame[col]

In [33]:
# Predict class probabilities for every row of the stacked test features.
# NOTE(review): X_test has to carry the same columns, in the same order, as
# the frame clf was fitted on. Both frames are built 512 -> 384 -> fizznet in
# this notebook; confirm the underlying CSVs list their columns identically.
preds = clf.predict_proba(X_test)

In [34]:
# Keep every predicted probability inside [0.05, 0.95] so no class is ever
# given exactly 0 or 1. The rows are not re-normalised afterwards.
preds = np.clip(preds, a_min=0.05, a_max=0.95)

In [37]:
import glob
import os

# Image files of the two test stages; only their names are used, to label
# the rows of the submission.
test_stg1_files = glob.glob('E:/Data/test_stg1/*.jpg')
test_stg2_files = glob.glob('E:/Data/test_stg2/*.jpg')

stg1_sorted = sorted(test_stg1_files)
stg2_sorted = sorted(test_stg2_files)
n_stg1 = len(stg1_sorted)

# zip() stops at the shorter input, so a count mismatch would silently drop
# rows or pair predictions with the wrong image. Fail loudly instead.
if n_stg1 + len(stg2_sorted) != len(preds):
    raise ValueError(
        'Found %d stage-1 and %d stage-2 images but %d prediction rows'
        % (n_stg1, len(stg2_sorted), len(preds)))

# NOTE(review): assumes the rows of preds are ordered as the sorted stage-1
# files followed by the sorted stage-2 files. Confirm against the row order
# of the sub_*.csv files the test features were read from.
with open('sub_gbm.csv', 'w') as sub_file:
    sub_file.write('image,ALB,BET,DOL,LAG,NoF,OTHER,SHARK,YFT\n')

    # Stage-1 rows are labelled with the bare file name.
    for img, pred in zip(stg1_sorted, preds[:n_stg1]):
        line = '%s,%s\n' % (os.path.basename(img), ','.join(str(f) for f in pred))
        sub_file.write(line)

    # Stage-2 rows carry a 'test_stg2/' prefix in front of the file name.
    for img, pred in zip(stg2_sorted, preds[n_stg1:]):
        line = 'test_stg2/%s,%s\n' % (os.path.basename(img), ','.join(str(f) for f in pred))
        sub_file.write(line)