In [1]:
import numpy as np 
import pandas as pd

In [7]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

SEED = 0

In [8]:
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target 

In [9]:
from sklearn.model_selection import train_test_split 
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify=y)

# Print shapes to verify

In [11]:
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train distribution:\n{pd.Series(y_train).value_counts()}")
print(f"y_test distribution:\n{pd.Series(y_test).value_counts()}")

X_train shape: (112, 4)
X_test shape: (38, 4)
y_train distribution:
2    38
0    37
1    37
Name: count, dtype: int64
y_test distribution:
1    13
0    13
2    12
Name: count, dtype: int64


In [15]:
parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

gs_clf = GridSearchCV(DecisionTreeClassifier(),
                      parameters,
                      cv=5,
                      scoring='roc_auc_ovr',
                      n_jobs=-1)
_ = gs_clf.fit(X_train,y_train)

print(gs_clf.best_estimator_)
print(gs_clf.best_params_)
print(gs_clf.best_score_)

DecisionTreeClassifier(max_depth=20, max_features='log2', min_samples_leaf=4,
                       min_samples_split=5)
{'criterion': 'gini', 'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 5}
0.9912301587301589


In [16]:
parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 3, 5]
}

gs_clf = GridSearchCV(DecisionTreeClassifier(),
                      parameters,
                      cv=5,
                      scoring='roc_auc_ovr',
                      n_jobs=-1)
_ = gs_clf.fit(X_train,y_train)

print(gs_clf.best_estimator_)
print(gs_clf.best_params_)
print(gs_clf.best_score_)

DecisionTreeClassifier(max_depth=5, min_samples_split=5)
{'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 5}
0.9726785714285715


In [17]:
from sklearn.tree import DecisionTreeClassifier

parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 3, 5]
}

gs_clf = GridSearchCV(
    DecisionTreeClassifier(random_state=SEED),
    parameters,
    cv=5,
    scoring='roc_auc_ovr',
    n_jobs=-1
)

_ = gs_clf.fit(X_train, y_train)

print(gs_clf.best_estimator_)
print(gs_clf.best_params_)
print(gs_clf.best_score_)


DecisionTreeClassifier(max_depth=5, min_samples_split=5, random_state=0)
{'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 5}
0.9726785714285715


In [18]:
from sklearn.neighbors import KNeighborsClassifier

parameters = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

gs_clf = GridSearchCV(
    KNeighborsClassifier(),
    parameters,
    cv=5,
    scoring='roc_auc_ovr',
    n_jobs=-1
)

_ = gs_clf.fit(X_train, y_train)

print(gs_clf.best_estimator_)
print(gs_clf.best_params_)
print(gs_clf.best_score_)


KNeighborsClassifier(metric='euclidean', weights='distance')
{'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.996468253968254


In [20]:
from sklearn.neighbors import KNeighborsClassifier

parameters = {
    'n_neighbors': [2,3,4,5,6,7,8,9,10,11]
}

gs_clf = GridSearchCV(
    KNeighborsClassifier(),
    parameters,
    cv=5,
    scoring='roc_auc_ovr',
    n_jobs=-1
)

_ = gs_clf.fit(X_train, y_train)

print(gs_clf.best_estimator_)
print(gs_clf.best_params_)
print(gs_clf.best_score_)


KNeighborsClassifier(n_neighbors=11)
{'n_neighbors': 11}
0.9981944444444444


In [21]:
from sklearn.svm import LinearSVC

parameters = {
    'C': [0.01, 0.1, 1, 10],
    'max_iter': [1000, 5000, 10000],
    'loss': ['hinge', 'squared_hinge']
}

gs_clf = GridSearchCV(
    LinearSVC(random_state=SEED),
    parameters,
    cv=5,
    scoring='roc_auc_ovr',
    n_jobs=-1
)

_ = gs_clf.fit(X_train, y_train)

print(gs_clf.best_estimator_)
print(gs_clf.best_params_)
print(gs_clf.best_score_)


LinearSVC(C=0.01, loss='hinge', random_state=0)
{'C': 0.01, 'loss': 'hinge', 'max_iter': 1000}
nan


 nan nan nan nan nan nan]


In [23]:
from sklearn.svm import LinearSVC

parameters = {
    'C': np.logspace(-2,2,5)
}

gs_clf = GridSearchCV(
    LinearSVC(random_state=SEED),
    parameters,
    cv=5,
    scoring='roc_auc_ovr',
    n_jobs=-1
)

_ = gs_clf.fit(X_train, y_train)

print(gs_clf.best_estimator_)
print(gs_clf.best_params_)
print(gs_clf.best_score_)


LinearSVC(C=np.float64(0.01), random_state=0)
{'C': np.float64(0.01)}
nan


