In [1]:
# Importing required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Importing the Training dataset
data_train = pd.read_csv('train.csv')
data_train.head()

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,Acer_Opalus,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,Pterocarya_Stenoptera,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,Quercus_Hartwissiana,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293
3,5,Tilia_Tomentosa,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.0,...,0.0,0.000977,0.0,0.0,0.020508,0.0,0.0,0.017578,0.0,0.047852
4,6,Quercus_Variabilis,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.0,...,0.09668,0.0,0.021484,0.0,0.0,0.0,0.0,0.0,0.0,0.03125


In [3]:
# Importing the Test dataset

data_test = pd.read_csv('test.csv')
data_test.head()

Unnamed: 0,id,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,4,0.019531,0.009766,0.078125,0.011719,0.003906,0.015625,0.005859,0.0,0.005859,...,0.006836,0.0,0.015625,0.000977,0.015625,0.0,0.0,0.0,0.003906,0.053711
1,7,0.007812,0.005859,0.064453,0.009766,0.003906,0.013672,0.007812,0.0,0.033203,...,0.0,0.0,0.006836,0.001953,0.013672,0.0,0.0,0.000977,0.037109,0.044922
2,9,0.0,0.0,0.001953,0.021484,0.041016,0.0,0.023438,0.0,0.011719,...,0.12891,0.0,0.000977,0.0,0.0,0.0,0.0,0.015625,0.0,0.0
3,12,0.0,0.0,0.009766,0.011719,0.017578,0.0,0.003906,0.0,0.003906,...,0.012695,0.015625,0.00293,0.036133,0.013672,0.0,0.0,0.089844,0.0,0.008789
4,13,0.001953,0.0,0.015625,0.009766,0.039062,0.0,0.009766,0.0,0.005859,...,0.0,0.042969,0.016602,0.010742,0.041016,0.0,0.0,0.007812,0.009766,0.007812


In [4]:
# Label Encoding the species column
labelencoder = LabelEncoder()
data_train["species"] = labelencoder.fit_transform(data_train["species"])
data_train.species.unique()

array([ 3, 49, 65, 94, 84, 40, 54, 78, 53, 89, 98, 16, 74, 50, 58, 31, 43,
        4, 75, 44, 83, 13, 66, 15,  6, 73, 22, 36, 27, 88, 12, 28, 21, 25,
       20, 60, 69, 23, 76, 18, 52,  9, 48, 47, 64, 81, 62, 34, 92, 79, 82,
       32, 35, 72, 71, 11, 51,  5,  8, 37, 97, 33,  1, 59, 56, 57, 29, 93,
       10, 46,  0, 39,  2, 24, 26, 87, 55, 38, 45,  7, 67, 30, 61, 96, 41,
       85, 14, 17, 42, 63, 86, 80, 77, 19, 95, 70, 90, 68, 91])

In [6]:
#Extractcing fatures and target variable and splitting into train and test dataset

X = data_train.iloc[:,1:]
y = data_train["species"]
X_train, X_test, y_train, y_test  = train_test_split(X, y, random_state = 42, test_size = 0.20)

# Random Forest Classifier

In [7]:
# Checking best params for Random Forest

model_rf = RandomForestClassifier()

param_grid = { 
    'n_estimators': [100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'criterion' : ["gini", "entropy"]
}

CV_rfc = GridSearchCV(estimator=model_rf, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_) 

# Found {'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 200}



{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 100}


In [8]:
# Fitting and Validating Random Forest

model_rf_final = RandomForestClassifier(criterion = 'gini', max_features = 'sqrt', n_estimators = 200)
model_rf_final.fit(X_train, y_train)

y_predict_rf = model_rf_final.predict(X_test)
print("Random Forest Accuracy Score : ", metrics.accuracy_score(y_predict_rf, y_test))

Random Forest Accuracy Score :  0.9747474747474747


# Decision Tree Classifier

In [9]:
# Checking best params for Decision Tree
model_dt = DecisionTreeClassifier()

param_grid = { 
    'min_samples_split': [2, 3, 4],
    'max_depth': np.arange(2, 10),
    'criterion' : ["gini", "entropy"]
}

CV_dt = GridSearchCV(estimator=model_dt, param_grid=param_grid, cv= 5)
CV_dt.fit(X_train, y_train)
print(CV_dt.best_params_) 

#Found {'criterion': 'entropy', 'max_depth': 8, 'min_samples_split': 4}



{'criterion': 'entropy', 'max_depth': 9, 'min_samples_split': 4}


In [10]:
# Fitting and Validating Decision Tree

model_dt_final = RandomForestClassifier(criterion = 'entropy', max_depth = 8, min_samples_split = 4)
model_dt_final.fit(X_train, y_train)

y_predict_dt = model_dt_final.predict(X_test)
print("Decision Tree Accuracy Score : ", metrics.accuracy_score(y_predict_dt, y_test))

Decision Tree Accuracy Score :  0.9696969696969697


# Naive Bayes Classifier

In [11]:
# Checking best params for Naive Bayes
model_nb = GaussianNB()

param_grid = { 
    'var_smoothing': np.logspace(0,-9, num=100)
}

CV_nb = GridSearchCV(estimator=model_nb, param_grid=param_grid, cv= 5)
CV_nb.fit(X_train, y_train)
print(CV_nb.best_params_) 

#Found {'var_smoothing': 0.0006579332246575676}



{'var_smoothing': 0.0006579332246575676}


In [12]:
# Fitting and Validating Naive Bayes

model_nb_final = GaussianNB(var_smoothing = 0.0006579332246575676)
model_nb_final.fit(X_train, y_train)

y_predict_nb = model_nb_final.predict(X_test)
print("Naive Bayes Accuracy Score : ", metrics.accuracy_score(y_predict_nb, y_test))


Naive Bayes Accuracy Score :  1.0


# SVM Classifier

In [13]:
# Checking best params for SVM
model_svc = SVC()

param_grid = { 
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

CV_svc = GridSearchCV(estimator=model_svc, param_grid=param_grid, cv= 5)
CV_svc.fit(X_train, y_train)
print(CV_svc.best_params_) 

#Found {'C': 1, 'gamma': 1, 'kernel': 'linear'}



{'C': 1, 'gamma': 1, 'kernel': 'linear'}


In [14]:
# Fitting and Validating SVM

model_svc_final = SVC(C = 1, gamma = 1, kernel = 'linear',probability=True)
model_svc_final.fit(X_train, y_train)

y_predict_svc = model_svc_final.predict(X_test)
print("Support Vector Machine Accuracy Score : ", metrics.accuracy_score(y_predict_svc, y_test))


Support Vector Machine Accuracy Score :  1.0


# Best Model is Naive_Bayes and SVM. Both tied with 100% Accuracy

In [15]:
#Predicting with SVM
y_predict_svc_final = model_svc_final.predict(data_test)
y_predict_svc_final = labelencoder.inverse_transform(y_predict_svc_final)
print(y_predict_svc_final)

['Acer_Palmatum' 'Acer_Rubrum' 'Acer_Saccharinum' 'Alnus_Rubra'
 'Alnus_Sieboldiana' 'Betula_Austrosinensis' 'Castanea_Sativa'
 'Cornus_Controversa' 'Cornus_Macrophylla' 'Eucalyptus_Glaucescens'
 'Ilex_Aquifolium' 'Liriodendron_Tulipifera' 'Magnolia_Heptapeta'
 'Morus_Nigra' 'Populus_Adenopoda' 'Populus_Nigra' 'Prunus_Avium'
 'Quercus_Agrifolia' 'Quercus_Alnifolia' 'Quercus_Brantii'
 'Quercus_Chrysolepis' 'Quercus_Coccinea' 'Quercus_Dolicholepis'
 'Quercus_Hartwissiana' 'Quercus_Infectoria_sub' 'Quercus_Nigra'
 'Quercus_Pontica' 'Quercus_Rhysophylla' 'Quercus_Semecarpifolia'
 'Quercus_x_Hispanica' 'Salix_Fragilis' 'Salix_Intergra' 'Sorbus_Aria'
 'Tilia_Platyphyllos' 'Tilia_Tomentosa' 'Ulmus_Bergmanniana'
 'Viburnum_Tinus' 'Viburnum_x_Rhytidophylloides' 'Zelkova_Serrata'
 'Zelkova_Serrata' 'Zelkova_Serrata' 'Zelkova_Serrata' 'Zelkova_Serrata'
 'Zelkova_Serrata' 'Zelkova_Serrata' 'Zelkova_Serrata' 'Zelkova_Serrata'
 'Zelkova_Serrata' 'Zelkova_Serrata' 'Zelkova_Serrata' 'Zelkova_Serrata'


In [16]:
#Predicting with Naive Bayes

y_predict_nb_final = model_nb_final.predict(data_test)
y_predict_nb_final = labelencoder.inverse_transform(y_predict_nb_final)
print(y_predict_nb_final)

['Acer_Palmatum' 'Acer_Rubrum' 'Acer_Saccharinum' 'Alnus_Rubra'
 'Alnus_Sieboldiana' 'Betula_Austrosinensis' 'Castanea_Sativa'
 'Cornus_Controversa' 'Cornus_Macrophylla' 'Eucalyptus_Glaucescens'
 'Ilex_Aquifolium' 'Liriodendron_Tulipifera' 'Magnolia_Heptapeta'
 'Morus_Nigra' 'Populus_Adenopoda' 'Populus_Nigra' 'Prunus_Avium'
 'Quercus_Agrifolia' 'Quercus_Alnifolia' 'Quercus_Brantii'
 'Quercus_Chrysolepis' 'Quercus_Coccinea' 'Quercus_Dolicholepis'
 'Quercus_Hartwissiana' 'Quercus_Infectoria_sub' 'Quercus_Nigra'
 'Quercus_Pontica' 'Quercus_Rhysophylla' 'Quercus_Semecarpifolia'
 'Quercus_x_Hispanica' 'Salix_Fragilis' 'Salix_Intergra' 'Sorbus_Aria'
 'Tilia_Platyphyllos' 'Tilia_Tomentosa' 'Ulmus_Bergmanniana'
 'Viburnum_Tinus' 'Viburnum_x_Rhytidophylloides' 'Zelkova_Serrata'
 'Zelkova_Serrata' 'Zelkova_Serrata' 'Zelkova_Serrata' 'Zelkova_Serrata'
 'Zelkova_Serrata' 'Zelkova_Serrata' 'Zelkova_Serrata' 'Zelkova_Serrata'
 'Zelkova_Serrata' 'Zelkova_Serrata' 'Zelkova_Serrata' 'Zelkova_Serrata'
