In [90]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split

# Calculate multiclass logloss

In [71]:
sample_sub = "submissions/sampleSubmission.csv"
sample_sub_df = pd.read_csv(sample_sub)

def normalize(row, epsilon=1e-15):
    
    row = row / np.sum(row)
    row = np.maximum(epsilon, row)
    row = np.minimum(1 - epsilon, row)
    
def logloss_mc(y_true, y_probs):
    
    # Normalize probability data frame
    y_probs.apply(normalize, axis=1)
        
    log_vals = []
        
    for i, y in enumerate(y_true):
        c = int(y.split("_")[1])
        log_vals.append(- np.log(y_probs.iloc[i,c - 1]))
        
    print(log_vals)
        
    return -np.mean(log_vals)
        
        

# Split into train and test set

In [83]:
import random

def train_test_split(df, leave_out_size = 0.2):
    
    num_samples = int(len(df) * (1 - leave_out_size))
    sample_rows = random.sample(df.index, num_samples)
    
    train = df.ix[sample_rows].drop(["target"], axis = 1)
    test = df.ix[-sample_rows].drop(["target"], axis = 1)
    train_labels = df.ix[sample_rows].target
    test_labels = df.ix[-sample_rows].target
    return train, test, train_labels, test_labels 

In [91]:
def load_train_data(df, train_size=0.8, percentage=1, standardize=False):

    if standardize:
        X = df.drop(['id', 'target'], axis=1).apply(func=log_normalize, axis=1)
        X = StandardScaler().fit_transform(X)
        X = pd.DataFrame(X)
        X.loc[:, 'id'] = df.loc[:, 'id']
        X.loc[:, 'target'] = df.loc[:, 'target']
        df = X
        
    num_samples = int(len(df) * percentage)
    
    sample_rows = random.sample(df.index, num_samples)

    df_sampled = df.ix[sample_rows]
    
    X_train, X_valid, y_train, y_valid = train_test_split(df_sampled.drop(['id', 'target'], axis = 1),
                                                          df_sampled.target, 
                                                          train_size=train_size)

    return (X_train.astype(float), X_valid.astype(float),
            y_train.astype(str), y_valid.astype(str))

In [100]:
X_train, X_valid, y_train, y_valid = load_train_data(pd.read_csv("data/train.csv"))

In [44]:
sub = pd.read_csv("submissions/polishedAndBlended.csv")

# Cross-Validation

## Get true values:

In [50]:
y_true = pd.read_csv("data/train.csv").loc[:,"target"]

In [43]:
sub.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,1,0.0,0.135398,0.0,0.864602,0,0.0,0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0,0.603785,0,0.396215,0.0
2,3,0.0,0.0,0.0,0.0,0,1.0,0,0.0,0.0
3,4,0.0,0.573719,0.426281,0.0,0,0.0,0,0.0,0.0
4,5,0.164559,0.0,0.0,0.0,0,0.0,0,0.0,0.835441


In [72]:
logloss_mc(y_true, sub.iloc[:,1:])

[inf, inf, inf, inf, 1.8044851650585982, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, 2.1198930215241316, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, 2.0679767449024058, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, 1.0546265556284316, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, 1.2701543137596758, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, 1.6219641345032814, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, 1.3246187578448225, inf,

-inf