# AST 7939 Week 3

## Hyperparameter optimization, cross-validation, and evaluation metrics

### Let's load the iris data and run cross validation.

In [None]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_validate

# Load the iris data set and set up a decision tree whose depth is capped at 4.
iris = load_iris()
model = DecisionTreeClassifier(random_state=0, max_depth=4)

# Evaluate the tree with 5-fold cross-validation (one score per fold).
scores = cross_val_score(model, X=iris.data, y=iris.target, cv=5)

# Report the individual fold scores followed by their mean and spread.
summary = (
    ("Cross-validation scores: {}", scores),
    ("Mean cross-validation scores: {:.3f}", scores.mean()),
    ("Standard deviation cross-validation scores: {:.3f}", scores.std()),
)
for template, value in summary:
    print(template.format(value))

### Let's split the data.

In [None]:
from sklearn.model_selection import train_test_split

# Features and labels of the iris data set.
X, y = iris.data, iris.target

# Fixing random_state makes the shuffled split reproducible: reuse the same
# value if you want to get the same train/test partition again.
splits = train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test = splits

### We can use for loops for hyperparameter optimization.

In [None]:
import numpy as np

# Candidate hyperparameter values: tree depths 1..10 and both split criteria.
max_depth = np.arange(10)+1
criterion = ['gini','entropy']

# Start below any attainable score and predefine best_parameters, so both names
# are always bound when the loop finishes (even if no candidate scores above 0).
best_score = -np.inf
best_parameters = None

for depth in max_depth:
    for crit in criterion:
        model = DecisionTreeClassifier(max_depth=depth, criterion=crit, random_state=0)
        # Mean 5-fold cross-validation score, computed on the training split only
        # so that the test split stays untouched for the final evaluation.
        score = np.mean(cross_val_score(model, X_train, y_train, cv=5))
        # Strict '>' keeps the first candidate encountered when scores tie.
        if score > best_score:
            best_score = score
            # Store a plain Python int so the printed dictionary stays readable.
            best_parameters = {'max_depth': int(depth), 'criterion': crit}

print("Best score: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))

### But we can take advantage of built-in modules.

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: every combination of these values is evaluated.
param_grid = dict(max_depth=np.arange(1, 11),
                  criterion=['gini', 'entropy'])

# 5-fold cross-validated search over the grid; training scores are kept too.
tree = DecisionTreeClassifier(random_state=0)
grid_search = GridSearchCV(estimator=tree, param_grid=param_grid, cv=5,
                           verbose=3, return_train_score=True)
grid_search.fit(X_train, y_train)

In [None]:
# Display the cross-validation results recorded by the fitted grid search.
grid_search.cv_results_

In [None]:
# Winning hyperparameter combination and the corresponding estimator.
best_params = grid_search.best_params_
print("Best parameters: {}".format(best_params))
best_model = grid_search.best_estimator_
print("Best model: {}".format(best_model))

# Score of the grid search object on the held-out test split.
test_score = grid_search.score(X_test, y_test)
print("Test score: {:.2f}".format(test_score))

In [None]:
# Print the built-in documentation of GridSearchCV.
help(GridSearchCV)

### Let's make a heatmap.

In [None]:
import pandas as pd
import mglearn
# If you don't have mglearn installed, run `pip install mglearn`.

# Tabulate the grid-search results (one row per hyperparameter combination).
results = pd.DataFrame(grid_search.cv_results_)

# Take the grid shape from param_grid rather than hard-coding it, so the heatmap
# keeps working if the candidate lists change. Rows correspond to 'criterion'
# and columns to 'max_depth', matching the tick labels passed below.
# NOTE(review): this layout assumes GridSearchCV lists the candidates with
# max_depth varying fastest within each criterion - confirm against ParameterGrid.
n_criteria = len(param_grid['criterion'])
n_depths = len(param_grid['max_depth'])
scores = np.array(results['mean_test_score']).reshape(n_criteria, n_depths)

mglearn.tools.heatmap(scores, xlabel='max_depth', xticklabels=param_grid['max_depth'],
                     ylabel='criterion', yticklabels=param_grid['criterion'], cmap='viridis')

### What if we need feature scaling?

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# A Pipeline chains preprocessing and a model into a single estimator:
# every step except the last must be a transformer (implements fit and transform),
# while the last step only has to implement fit.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
scaling_step = ('sc', StandardScaler())
knn_step = ('knn', KNeighborsClassifier())
pipe = Pipeline(steps=[scaling_step, knn_step])

# Parameters of a pipeline step are addressed as '<step name>__<parameter name>'.
param_grid = dict(knn__n_neighbors=np.arange(1, 11),
                  knn__weights=['uniform', 'distance'])

# 5-fold cross-validated search over the whole pipeline (scaler + classifier).
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5,
                           verbose=3, return_train_score=True)
grid_search.fit(X_train, y_train)

In [None]:
# Winning hyperparameter combination and the corresponding fitted pipeline.
best_params = grid_search.best_params_
print("Best parameters: {}".format(best_params))
best_model = grid_search.best_estimator_
print("Best model: {}".format(best_model))

# Score of the grid search object on the held-out test split.
test_score = grid_search.score(X_test, y_test)
print("Test score: {:.2f}".format(test_score))

### Let's load the SDSS data we used for homework #1 and test parallelization.

In [None]:
import pandas as pd 
import numpy as np

# Read the SDSS table from a CSV file given by a relative path
# (resolved against the notebook's working directory).
data = pd.read_csv('SDSS.csv')
# Leaving the frame as the last expression makes the notebook display it.
data

In [None]:
# X for an array containing features: one row per object, one column for each
# of the u, g, r, i, z columns of the table.
X = data[['u', 'g', 'r', 'i', 'z']].to_numpy()

# y for an array containing labels (i.e., galaxies or quasars).
# Kept one-dimensional, shape (n_samples,): that is the target layout
# scikit-learn expects for a single-output classification problem.
y = data['class'].to_numpy()

# If you want to reproduce the result, make sure you use the same random_state value.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)



### Hyperparameter optimization with 1 CPU core

In [None]:
from sklearn.model_selection import GridSearchCV
import time

# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time; unlike time.time() it is not affected by system clock changes.
tstart = time.perf_counter()

param_grid = {'max_depth': np.arange(10)+1,
              'criterion': ['gini','entropy']}

# By default, sklearn's GridSearchCV will use stratified k-fold for classification problems.
# random_state is fixed so that repeated runs select the same model.
# n_jobs is left at its default here (no parallel workers requested).
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid, cv=5,
                           return_train_score=True, verbose=3)
grid_search.fit(X_train, y_train)

print("Best parameters: {}".format(grid_search.best_params_))
print("Best model: {}".format(grid_search.best_estimator_))
print("Test score: {:.2f}".format(grid_search.score(X_test, y_test)))
# The elapsed time covers the search, the test-set scoring and the printing above.
print("Elapsed time: {:.3f}".format(time.perf_counter()-tstart) + " seconds")

### Now the same task with 2 CPU cores

In [None]:
# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time; unlike time.time() it is not affected by system clock changes.
tstart = time.perf_counter()

param_grid = {'max_depth': np.arange(10)+1,
              'criterion': ['gini','entropy']}

# n_jobs=2 asks GridSearchCV to evaluate candidates with two parallel workers.
# random_state is fixed so that repeated runs select the same model.
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid, cv=5,
                           return_train_score=True, verbose=3, n_jobs=2)
grid_search.fit(X_train, y_train)

print("Best parameters: {}".format(grid_search.best_params_))
print("Best model: {}".format(grid_search.best_estimator_))
print("Test score: {:.2f}".format(grid_search.score(X_test, y_test)))
# The elapsed time covers the search, the test-set scoring and the printing above.
print("Elapsed time: {:.3f}".format(time.perf_counter()-tstart) + " seconds")

## Imbalanced data and evaluation metric

For all of scikit-learn's available evaluation metrics, see https://scikit-learn.org/stable/modules/model_evaluation.html

In [None]:
from collections import Counter
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt

# Build a synthetic, imbalanced two-feature data set (class weights=[0.9], no label noise).
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.9], flip_y=0,
                           random_state=2)

# Count how many samples fall in each class.
counter = Counter(y)
print(counter)

# Draw one scatter series per class so each label gets its own legend entry.
for label in counter:
    in_class = y == label
    plt.scatter(X[in_class, 0], X[in_class, 1],
                label=str(label), alpha=0.5, marker='.')

plt.xlabel('feature 1')
plt.ylabel('feature 2')
plt.legend(loc='lower right')

In [None]:
# Print the built-in documentation of make_classification.
help(make_classification)

### Split the data.

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets; random_state=0 keeps the split reproducible.
# NOTE(review): no stratify= argument is passed, so the class proportions of the
# two splits are not forced to match - confirm this is intended for imbalanced data.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Same decision-tree grid as before: depths 1..10 combined with both split criteria.
param_grid = dict(max_depth=np.arange(1, 11),
                  criterion=['gini', 'entropy'])

# Candidates are ranked by plain accuracy, requested explicitly via `scoring`.
tree = DecisionTreeClassifier(random_state=0)
grid_search = GridSearchCV(estimator=tree, param_grid=param_grid, scoring='accuracy',
                           cv=5, verbose=1, return_train_score=True)

grid_search.fit(X_train, y_train)

# Summarize the winning candidate and its score on the held-out test split.
best_params = grid_search.best_params_
print("Best parameters: {}".format(best_params))
best_model = grid_search.best_estimator_
print("Best model: {}".format(best_model))
test_score = grid_search.score(X_test, y_test)
print("Test score: {:.2f}".format(test_score))

### We got 97% accuracy! But let's have a look at the result (make this a habit).

### This is the true class.

In [None]:
# Class counts among the true test labels.
counter = Counter(y_test)
print(counter)

# Plot the test points, one scatter series per true class label.
for label in counter:
    row_ix = np.flatnonzero(y_test == label)
    feature_1, feature_2 = X_test[row_ix, 0], X_test[row_ix, 1]
    plt.scatter(feature_1, feature_2, label=str(label), alpha=1, marker='.')

plt.xlabel('feature 1')
plt.ylabel('feature 2')
plt.legend(loc='lower right')

### This is our best model. Are we good with it?

In [None]:
# The ** operator unpacks a dictionary into keyword arguments of a call, so the
# best hyperparameters found by the grid search are passed straight to the tree.
best_params = grid_search.best_params_
model = DecisionTreeClassifier(random_state=0, **best_params)
model.fit(X_train, y_train)

prediction = model.predict(X_test)

# Class counts among the predicted test labels.
counter = Counter(prediction)
print(counter)

# Plot the test points, one scatter series per predicted class label.
for label in counter:
    row_ix = np.flatnonzero(prediction == label)
    feature_1, feature_2 = X_test[row_ix, 0], X_test[row_ix, 1]
    plt.scatter(feature_1, feature_2, label=str(label), alpha=1, marker='.')

plt.xlabel('feature 1')
plt.ylabel('feature 2')
plt.legend(loc='lower right')


### Although we got a pretty high accuracy, this may not necessarily be the result we wanted.

### Let's check out a different evaluation metric

In [None]:
from sklearn.metrics import classification_report

# Print the classification report comparing the true test labels with the
# predictions `model` makes on the test features.
print(classification_report(y_test, model.predict(X_test)))

### TODO: I'd like to minimize false negatives. Let's use "recall" as the evaluation metric.

### TODO: Make a plot showing your prediction. Is it any better?