In [27]:
import pandas as pd
import numpy as np
#Encoding
from sklearn.preprocessing import LabelEncoder
#Split the dataset
from sklearn.cross_validation import train_test_split
#Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
#Parameter Search
from sklearn.grid_search import GridSearchCV

  from collections import Mapping, namedtuple, Sized


# Load data

Dataset: https://archive.ics.uci.edu/ml/datasets/iris

In [4]:
input_data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", header=None)

In [5]:
input_data.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# Inspect data

In [6]:
input_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
0    150 non-null float64
1    150 non-null float64
2    150 non-null float64
3    150 non-null float64
4    150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB


# Model development

## Encode target value

In [9]:
#get column names
list(input_data)

[0, 1, 2, 3, 4]

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

In [10]:
#label encode target
categorical_column = [4]
data_encoded = input_data.copy(deep=True)
lab_enc = LabelEncoder()
for col in categorical_column:
        data_encoded[col] = lab_enc.fit_transform(input_data[col])
        le_name_mapping = dict(zip(lab_enc.classes_, lab_enc.transform(lab_enc.classes_)))
        print('Feature', col)
        print('mapping', le_name_mapping)

Feature 4
mapping {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}


In [13]:
predictors = data_encoded.drop([4], axis=1)
target = data_encoded[[4]]
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.2, random_state=42)

## Model 1: Logistic Regression

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [17]:
cls = LogisticRegression()
cls.fit(X_train, y_train)
pred = cls.predict(X_test)
target_names = ['yes', 'no']
print('---------------------')
print("Classification report")
print('---------------------')
print(classification_report(y_test, pred, target_names=target_names))

---------------------
Classification report
---------------------
             precision    recall  f1-score   support

        yes       1.00      1.00      1.00        10
         no       1.00      1.00      1.00         9

avg / total       1.00      1.00      1.00        30



# Model 2: Decision Tree

 https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [19]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
pred = dt.predict(X_test)
target_names = ['yes', 'no']
print('---------------------')
print("Classification report")
print('---------------------')
print(classification_report(y_test, pred, target_names=target_names))

---------------------
Classification report
---------------------
             precision    recall  f1-score   support

        yes       1.00      1.00      1.00        10
         no       1.00      1.00      1.00         9

avg / total       1.00      1.00      1.00        30



# Model 3: Random Forest

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [23]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train) 
pred = clf.predict(X_test)
target_names = ['yes', 'no']
print('---------------------')
print("Classification report")
print('---------------------')
print(classification_report(y_test, pred, target_names=target_names))

---------------------
Classification report
---------------------
             precision    recall  f1-score   support

        yes       1.00      1.00      1.00        10
         no       1.00      1.00      1.00         9

avg / total       1.00      1.00      1.00        30



  
  .format(len(labels), len(target_names))


# Model 4: Adaboost

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html

In [25]:
clf = AdaBoostClassifier()
clf.fit(X_train, y_train) 
pred = clf.predict(X_test)
target_names = ['yes', 'no']
print('---------------------')
print("Classification report")
print('---------------------')
print(classification_report(y_test, pred, target_names=target_names))

---------------------
Classification report
---------------------
             precision    recall  f1-score   support

        yes       1.00      1.00      1.00        10
         no       1.00      1.00      1.00         9

avg / total       1.00      1.00      1.00        30



  y = column_or_1d(y, warn=True)
  .format(len(labels), len(target_names))


# Search parameter for decision tree

Grid Search:  https://scikit-learn.org/0.15/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV

Grid search metric name: https://scikit-learn.org/stable/modules/model_evaluation.html

Will normal metric names like 'precision' or 'recall' work for multiclass? ---- No.

It should be used along with `micro`. Reason for using micro along with metric:

https://stackoverflow.com/questions/50290273/gridsearchcv-representation-of-each-class-in-each-part-of-the-dataframe

In [38]:
y_train.shape
y_train_arr = np.ravel(y_train[4])
y_train_arr.shape
param_test1 = {
 'n_estimators': [10, 20, 30, 40, 50]
}

gsearch1 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=5), 
 param_grid = param_test1, scoring='recall_micro',n_jobs=4,iid=False, cv=5)

#
gsearch1.fit(X_train,y_train_arr)
#
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: 0.94997, std: 0.06124, params: {'n_estimators': 10},
  mean: 0.95830, std: 0.04566, params: {'n_estimators': 20},
  mean: 0.93330, std: 0.07265, params: {'n_estimators': 30},
  mean: 0.94164, std: 0.05652, params: {'n_estimators': 40},
  mean: 0.93330, std: 0.07265, params: {'n_estimators': 50}],
 {'n_estimators': 20},
 0.9583043478260869)