## Model Selection using scikit-learn

### Loading in Datasets

In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Location of the iris data file and the column names to apply when reading it;
# the final column holds the class label.
iris_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label']

iris = pd.read_csv(iris_url, names=iris_columns)

# Replace the raw labels with integer codes; `le` retains the fitted mapping.
le = LabelEncoder()
iris['label'] = le.fit_transform(iris['label'])

iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [3]:
# Feature matrix: every column except the encoded label, copied into a NumPy array.
X = iris.drop(columns='label').to_numpy(copy=True)
# Target vector: the encoded label column, copied into a NumPy array.
y = iris['label'].to_numpy(copy=True)

### Importing Models

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Candidate classifiers, each paired with the short tag printed next to its score.
# Every model uses library defaults except the decision tree, which is seeded:
# scikit-learn permutes features randomly at each split, so an unseeded tree can
# produce different scores from run to run. The seed matches the one used for the
# train/test split and the KFold splitter in this notebook.
models = [
    ('LR', LogisticRegression()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('KNN', KNeighborsClassifier()),
    ('DT', DecisionTreeClassifier(random_state=42)),
]

### Comparing Models

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Fit each candidate on the training portion and print its score on the held-out part.
for name, model in models:
    accuracy = model.fit(X_train, y_train).score(X_test, y_test)
    print(name, accuracy)

LR 0.9666666666666667
NB 0.9666666666666667
SVM 1.0
KNN 0.9833333333333333
DT 0.9666666666666667


### Cross Validation

In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# 5-fold cross-validation of a default logistic regression over the full X / y arrays.
clf = LogisticRegression()
scores = cross_val_score(estimator=clf, X=X, y=y, cv=5)
scores

array([1.        , 0.96666667, 0.93333333, 0.9       , 1.        ])

In [9]:
# Summarise the fold scores as their mean with a band of two standard deviations.
mean_score = scores.mean()
two_std = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, two_std))

Accuracy: 0.96 (+/- 0.08)


In [10]:
from sklearn import metrics

# Repeat the 5-fold evaluation, this time scored with the 'f1_macro' metric.
scores = cross_val_score(estimator=clf, X=X, y=y, scoring='f1_macro', cv=5)
scores

array([1.        , 0.96658312, 0.93333333, 0.89769821, 1.        ])

In [11]:
 from sklearn.model_selection import cross_validate # allows us to use multiple scoring metrics
#from sklearn.metrics import recall_score

# Evaluate two metrics in a single 5-fold run; the result is a dict of per-fold arrays.
scoring = ['precision_macro', 'recall_macro']
scores = cross_validate(
    estimator=clf,
    X=X,
    y=y,
    scoring=scoring,
    cv=5,
    return_train_score=False,  # train scores can be returned too, but are switched off here
)
scores

{'fit_time': array([0.00155592, 0.00092292, 0.00094986, 0.00088692, 0.00086713]),
 'score_time': array([0.00252581, 0.00131607, 0.00201535, 0.00148106, 0.00140786]),
 'test_precision_macro': array([1.        , 0.96969697, 0.93333333, 0.92307692, 1.        ]),
 'test_recall_macro': array([1.        , 0.96666667, 0.93333333, 0.9       , 1.        ])}

In [12]:
from sklearn.model_selection import KFold

# Three shuffled folds with a fixed seed so the partition is repeatable.
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

for train_idx, test_idx in kfold.split(X):
    # Select this fold's training and test rows from the feature and target arrays.
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]
    # Preview the first five training rows of the fold.
    print(X_train[:5])

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.4 3.9 1.7 0.4]]
[[4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [4.6 3.4 1.4 0.3]]
[[5.1 3.5 1.4 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]]
