In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Load the dataset

In [124]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

In [125]:
train.head(3)

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,Acer_Opalus,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0,...,0.007812,0,0.00293,0.00293,0.035156,0,0,0.004883,0.0,0.025391
1,2,Pterocarya_Stenoptera,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0,...,0.000977,0,0.0,0.000977,0.023438,0,0,0.000977,0.039062,0.022461
2,3,Quercus_Hartwissiana,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0,...,0.1543,0,0.005859,0.000977,0.007812,0,0,0.0,0.020508,0.00293


In [126]:
print (train.columns)
print (test.columns)

Index(['id', 'species', 'margin1', 'margin2', 'margin3', 'margin4', 'margin5',
       'margin6', 'margin7', 'margin8',
       ...
       'texture55', 'texture56', 'texture57', 'texture58', 'texture59',
       'texture60', 'texture61', 'texture62', 'texture63', 'texture64'],
      dtype='object', length=194)
Index(['id', 'margin1', 'margin2', 'margin3', 'margin4', 'margin5', 'margin6',
       'margin7', 'margin8', 'margin9',
       ...
       'texture55', 'texture56', 'texture57', 'texture58', 'texture59',
       'texture60', 'texture61', 'texture62', 'texture63', 'texture64'],
      dtype='object', length=193)


# Dataset processing

In [127]:
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split

In [128]:
def encode(train, test):
    le = LabelEncoder().fit(train.species) 
    labels = le.transform(train.species)           # encode species strings
    classes = list(le.classes_)                    # save column names for submission
    test_ids = test.id                             # save test ids for submission
    
    train = train.drop(['species', 'id'], axis=1)  
    test = test.drop(['id'], axis=1)
    
    return train, labels, test, test_ids, classes

train, labels, test, test_ids, classes = encode(train, test)

In [129]:
X_train, X_test, y_train, y_test = train_test_split(train, labels, test_size=0.2, random_state=42)
print (len(y_train), len(y_test))

print (len(classes))

792 198
99


# Models

In [130]:
from sklearn.metrics import accuracy_score, log_loss, f1_score, precision_score, recall_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, BaseNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.linear_model.logistic import LogisticRegression

from sklearn.ensemble.voting_classifier import VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import xgboost as xgb

In [132]:
def print_performance(y_true, X_test, model):
    print ('Acc:', accuracy_score(y_true, model.predict(X_test)))
    #print('F1:', f1_score(y_true, model.predict(X_test), labels=labels, average='macro'))
    #print ('Log loss:', log_loss(y_true, model.predict_proba(X_test)))

In [122]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [171]:
model = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=8, criterion='entropy', max_features=5)
model.fit(X_train, y_train)
print_performance(y_test, X_test, model)    

model = LinearDiscriminantAnalysis(solver='lsqr')
model.fit(X_train, y_train)
print_performance(y_test, X_test, model)

model = LogisticRegressionCV(max_iter=20000, random_state=42)
model.fit(X_train, y_train)
print_performance(y_test, X_test, model)   

model = xgb.XGBClassifier(learning_rate=0.5, max_depth=5, seed=12, gamma=0.03, subsample=2)
model.fit(X_train, y_train)
print_performance(y_test, X_test, model)

model = GaussianNB()
model.fit(X_train, y_train)
print_performance(y_test, X_test, model)

model = MultinomialNB(alpha=0.002)
model.fit(X_train, y_train)
print_performance(y_test, X_test, model)

model = BernoulliNB(alpha=0.05, binarize=0.002)
model.fit(X_train, y_train)
print_performance(y_test, X_test, model)

Acc: 0.954545454545
Acc: 0.959595959596
Acc: 0.924242424242
Acc: 0.818181818182
Acc: 0.474747474747
Acc: 0.772727272727
Acc: 0.924242424242


In [68]:
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

params = {'C':[1, 10, 50, 100, 500, 1000, 2000], 'tol': [0.001, 0.0001, 0.005]}
log_reg = LogisticRegression(solver='newton-cg', multi_class='multinomial')
clf = GridSearchCV(log_reg, params, scoring='log_loss', refit='True', n_jobs=8, cv=3)
clf.fit(X_train, y_train)  

print("best params: " + str(clf.best_params_))
for params, mean_score, scores in clf.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std(), params))
    print(scores)

log_reg = LogisticRegression(solver='newton-cg', multi_class='multinomial', C=2000, tol=0.001)
model.fit(X_train, y_train)
print_performance(y_val, X_val, model)  

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=8,
       param_grid={'C': [1, 10, 50, 100, 500, 1000, 2000], 'tol': [0.001, 0.0001, 0.005]},
       pre_dispatch='2*n_jobs', refit='True', scoring='log_loss',
       verbose=0)

In [107]:
m1 = RandomForestClassifier(n_estimators=30, random_state=42, n_jobs=8, criterion='entropy', max_features=5)
m2 = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
m3 = KNeighborsClassifier(11)
m4 = RandomForestClassifier(n_estimators=500, n_jobs=8)
m5 = LinearDiscriminantAnalysis(solver='lsqr', tol=2)
m6 = xgb.XGBClassifier(seed=42)

model = VotingClassifier(
    estimators=[('rf', m1), ('knn', m2), ('a', m3)], voting='soft'
)

model = CalibratedClassifierCV(model, cv=4, method='sigmoid')
model.fit(X_train, y_train)
print (log_loss(y_test, model.predict_proba(X_test)))
print_performance(y_test, X_test, model) 

ValueError: y_true and y_pred have different number of classes 85, 99

In [53]:
# m1 = RandomForestClassifier(n_jobs=8)
# m2 = LogisticRegressionCV()
# m3 = KNeighborsClassifier()
# m4 = LinearDiscriminantAnalysis()
# m5 = xgb.XGBClassifier()

# model = VotingClassifier(
#     estimators=[('rf', m1), ('knn', m3), ('lda', m4), ('xgb', m5), ('logreg', m2)], voting='soft'
# )

# params = {
#     'rf__max_features': [3, 4, 5],
#     'rf__criterion': ['entropy', 'gini'],
#     'rf__n_estimators': [370, 200],
#     'logreg__max_iter': [500, 1500],
#     'xgb__n_estimators': [200, 370],
#     'knn__n_neighbors': [3, 6]
# }

# grid = GridSearchCV(estimator=model, param_grid=params, cv=3)
# grid = grid.fit(X_train, y_train)

#model = CalibratedClassifierCV(model, cv=3, method='sigmoid')
#model.fit(X_train, y_train)
#print_performance(y_val, X_val, model)    

# Submission

In [104]:
# Predict Test Set
model.fit(train, labels)
predictions = model.predict_proba(test)

# Format DataFrame
submission = pd.DataFrame(predictions, columns=classes)
submission.insert(0, 'id', test_ids)
submission.reset_index()

# Export Submission
submission.to_csv('submission.csv', index = False)
submission.tail()



Unnamed: 0,id,Acer_Capillipes,Acer_Circinatum,Acer_Mono,Acer_Opalus,Acer_Palmatum,Acer_Pictum,Acer_Platanoids,Acer_Rubrum,Acer_Rufinerve,...,Salix_Fragilis,Salix_Intergra,Sorbus_Aria,Tilia_Oliveri,Tilia_Platyphyllos,Tilia_Tomentosa,Ulmus_Bergmanniana,Viburnum_Tinus,Viburnum_x_Rhytidophylloides,Zelkova_Serrata
589,1576,0.003383,0.655793,0.003241,0.00295,0.003584,0.003441,0.003135,0.003415,0.013343,...,0.003277,0.002986,0.003301,0.003479,0.003118,0.003316,0.003455,0.002755,0.003343,0.005013
590,1577,0.003759,0.003782,0.003123,0.003136,0.00315,0.003449,0.003183,0.004109,0.004325,...,0.003311,0.003036,0.003352,0.0038,0.005294,0.004881,0.003488,0.002822,0.003343,0.004574
591,1579,0.003964,0.00434,0.003841,0.003583,0.004456,0.005025,0.003862,0.004005,0.003695,...,0.003986,0.003653,0.003969,0.004279,0.003915,0.003924,0.00428,0.003455,0.004315,0.006375
592,1580,0.005521,0.005524,0.005905,0.005385,0.005336,0.005539,0.00553,0.005659,0.005081,...,0.00611,0.009277,0.005225,0.008075,0.005161,0.005549,0.005693,0.006751,0.005464,0.005563
593,1583,0.004366,0.004658,0.004288,0.004097,0.004186,0.004725,0.004256,0.004466,0.004122,...,0.004394,0.004088,0.004244,0.004616,0.004171,0.004489,0.004557,0.003948,0.004439,0.005539
