In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Load the dataset

In [77]:
# Read the training table (with the `species` column) and the test table from
# the local dataset/ folder, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/train.csv', 'dataset/test.csv')
)

In [78]:
# Preview the first three rows to sanity-check how the CSV was parsed.
train.head(3)

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,Acer_Opalus,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,Pterocarya_Stenoptera,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,Quercus_Hartwissiana,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293


In [79]:
# Show both column indexes, one after the other, to compare the train and
# test schemas.
print(train.columns, test.columns, sep='\n')

Index(['id', 'species', 'margin1', 'margin2', 'margin3', 'margin4', 'margin5',
       'margin6', 'margin7', 'margin8',
       ...
       'texture55', 'texture56', 'texture57', 'texture58', 'texture59',
       'texture60', 'texture61', 'texture62', 'texture63', 'texture64'],
      dtype='object', length=194)
Index(['id', 'margin1', 'margin2', 'margin3', 'margin4', 'margin5', 'margin6',
       'margin7', 'margin8', 'margin9',
       ...
       'texture55', 'texture56', 'texture57', 'texture58', 'texture59',
       'texture60', 'texture61', 'texture62', 'texture63', 'texture64'],
      dtype='object', length=193)


# Dataset processing

In [80]:
from sklearn.preprocessing import LabelEncoder

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `sklearn.model_selection` is its replacement. Fall back to the old
# module only for installations that predate 0.18.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [81]:
def encode(train, test):
    """Split the raw frames into model inputs and submission metadata.

    Parameters
    ----------
    train : DataFrame
        Must contain the ``species`` and ``id`` columns; all remaining
        columns are kept as features.
    test : DataFrame
        Must contain the ``id`` column; all remaining columns are kept as
        features.

    Returns
    -------
    tuple
        ``(train_features, labels, test_features, test_ids, classes)``:
        the training frame without ``species``/``id``, the label-encoded
        species, the test frame without ``id``, the test ``id`` column, and
        the list of species names in the encoder's class order (used as the
        submission column names).
    """
    species_encoder = LabelEncoder()
    species_encoder.fit(train.species)

    # Species strings -> encoded labels; the class list keeps the name order
    # that those labels refer to.
    labels = species_encoder.transform(train.species)
    classes = list(species_encoder.classes_)

    # Keep the test ids before the column is dropped; the submission needs them.
    test_ids = test.id

    train_features = train.drop(['species', 'id'], axis=1)
    test_features = test.drop(['id'], axis=1)

    return train_features, labels, test_features, test_ids, classes

# NOTE(review): this rebinds `train` and `test` to the feature-only frames, so
# running this line a second time without reloading the CSVs fails inside
# encode(): the `species` and `id` columns it reads have already been dropped.
train, labels, test, test_ids, classes = encode(train, test)

In [82]:
# Hold out 20% of the labelled rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    labels,
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up in each partition.
print(len(y_train), len(y_test))

792 198


# Models

In [128]:
from sklearn.metrics import accuracy_score, log_loss, f1_score, precision_score, recall_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.calibration import CalibratedClassifierCV

In [84]:
def print_performance(y_true, X_test, model):
    """Print the accuracy of ``model`` on ``X_test`` measured against ``y_true``.

    Parameters
    ----------
    y_true : true labels for the rows of ``X_test``.
    X_test : feature rows passed to ``model.predict``.
    model : fitted estimator exposing ``predict``.

    Returns
    -------
    None
        The score is written to stdout as ``Acc: <value>``.
    """
    predicted = model.predict(X_test)
    accuracy = accuracy_score(y_true, predicted)
    print('Acc:', accuracy)
    # Log-loss reporting is currently disabled:
    # print('Log loss:', log_loss(y_true, model.predict_proba(X_test)))

In [85]:
# Carve a validation set (20%) out of the current training partition.
# NOTE(review): X_train / y_train are rebound in place, so every re-run of this
# line splits the already-reduced training partition again. Re-run the cell
# that first creates X_train / y_train before re-running this one.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [86]:
# Baseline: a seeded 500-tree random forest (entropy criterion, 5 candidate
# features per split), scored on the validation split.
model = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    max_features=5,
    random_state=42,
    n_jobs=8,
).fit(X_train, y_train)
print_performance(y_val, X_val, model)

Acc: 0.937106918239


In [136]:
# Plain LDA with the least-squares solver, scored on the validation split.
model = LinearDiscriminantAnalysis(solver='lsqr').fit(X_train, y_train)
print_performance(y_val, X_val, model)

# The same estimator wrapped in 3-fold isotonic probability calibration.
# NOTE(review): the recorded output shows accuracy dropping from ~0.93 to ~0.03
# once the calibration wrapper is applied. Presumably the per-fold calibration
# sets are too small for isotonic regression here — confirm before relying on
# this configuration.
model = CalibratedClassifierCV(model, cv=3, method='isotonic').fit(X_train, y_train)
print_performance(y_val, X_val, model)



Acc: 0.930817610063
Acc: 0.0314465408805




In [195]:
# Import VotingClassifier from the public `sklearn.ensemble` namespace; the
# private `sklearn.ensemble.voting_classifier` module path is not importable on
# newer scikit-learn releases.
from sklearn.ensemble import VotingClassifier

# Base estimators for the soft-voting ensemble.
m1 = RandomForestClassifier(n_estimators=30, random_state=42, n_jobs=8, criterion='entropy', max_features=5)
m2 = LinearDiscriminantAnalysis()
m3 = KNeighborsClassifier(n_neighbors=6)
# Seeded like m1 so the validation score and the submission built from this
# ensemble are reproducible across kernel restarts.
m4 = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=8)
# NOTE(review): scikit-learn documents `tol` as being used only by the 'svd'
# solver — confirm whether tol=2 has any effect together with solver='lsqr'.
m5 = LinearDiscriminantAnalysis(solver='lsqr', tol=2)

# Soft voting averages the predicted class probabilities of the five members.
model = VotingClassifier(
    estimators=[('rf', m1), ('lda', m2), ('KNN', m3), ('rf2', m4), ('lda2', m5)],
    voting='soft',
)

# Calibrate the ensemble's probabilities with 3-fold sigmoid calibration, then
# score it on the validation split.
model = CalibratedClassifierCV(model, cv=3, method='sigmoid')
model.fit(X_train, y_train)
print_performance(y_val, X_val, model)



Acc: 0.981132075472


# Submission

In [196]:
# Predict Test Set
# Refit the current model on every labelled row, then predict per-class
# probabilities for the test rows.
model.fit(train, labels)
predictions = model.predict_proba(test)

# Format DataFrame
# One probability column per species name, with the test ids as first column.
# The frame is built from a plain array and so already has a default
# RangeIndex; a bare `reset_index()` call would only return a discarded copy,
# so none is made here.
submission = pd.DataFrame(predictions, columns=classes)
submission.insert(0, 'id', test_ids)

# Export Submission
submission.to_csv('submission.csv', index=False)
submission.tail()



Unnamed: 0,id,Acer_Capillipes,Acer_Circinatum,Acer_Mono,Acer_Opalus,Acer_Palmatum,Acer_Pictum,Acer_Platanoids,Acer_Rubrum,Acer_Rufinerve,...,Salix_Fragilis,Salix_Intergra,Sorbus_Aria,Tilia_Oliveri,Tilia_Platyphyllos,Tilia_Tomentosa,Ulmus_Bergmanniana,Viburnum_Tinus,Viburnum_x_Rhytidophylloides,Zelkova_Serrata
589,1576,0.002841,0.663642,0.003063,0.002671,0.00395,0.003032,0.002909,0.002871,0.033758,...,0.003087,0.002878,0.003021,0.002977,0.002809,0.003148,0.005181,0.002671,0.002753,0.003562
590,1577,0.003349,0.003877,0.00355,0.003426,0.003377,0.003432,0.003407,0.004781,0.005919,...,0.003586,0.003418,0.003793,0.003692,0.005247,0.008564,0.003959,0.003161,0.003257,0.004564
591,1579,0.002688,0.003292,0.003042,0.002659,0.003545,0.003231,0.002912,0.002715,0.002533,...,0.00307,0.002881,0.002948,0.002903,0.002997,0.002777,0.003091,0.002714,0.002765,0.003414
592,1580,0.002388,0.002659,0.002966,0.002659,0.002563,0.002566,0.00277,0.002406,0.002252,...,0.003118,0.004231,0.002539,0.002585,0.00249,0.002589,0.002816,0.003056,0.00243,0.003021
593,1583,0.002996,0.003453,0.003421,0.003033,0.003239,0.00334,0.003303,0.003178,0.002883,...,0.003407,0.003283,0.00324,0.003279,0.003151,0.003163,0.003571,0.003143,0.003148,0.00426
