## Setup

In [2]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import sklearn

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, train_test_split

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score

## Email data

In [9]:
from sklearn.linear_model import LogisticRegression

# Source file and the subset of columns used in this notebook: the response
# (`spam`) followed by five predictors.
EMAIL_URL = 'https://sta663-sp22.github.io/slides/data/email.csv'
EMAIL_COLUMNS = [
    'spam', 'exclaim_mess', 'format', 'num_char', 'line_breaks', 'number'
]

email = pd.read_csv(EMAIL_URL)[EMAIL_COLUMNS]

# Dummy-code the frame: string/categorical columns become indicator columns,
# numeric columns are passed through unchanged.
email_dc = pd.get_dummies(data=email)

# Response vector and design matrix (everything except the response).
y = email_dc["spam"]
X = email_dc.drop(columns="spam")

# Baseline logistic regression fitted without an intercept term.
# NOTE(review): presumably the intercept is dropped because the full set of
# dummy columns already sums to one -- confirm against the data.
m = LogisticRegression(fit_intercept=False).fit(X=X, y=y)

### Demo 1 - DecisionTreeClassifier
https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [4]:
from sklearn.tree import DecisionTreeClassifier

In [10]:
# Grid search over split criterion and tree depth for a single decision tree:
# 2 criteria x 9 depths = 18 candidates, each scored by ROC AUC on 10 shuffled
# folds with a fixed fold seed.
tree_param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": range(2, 11),
}
tree_folds = KFold(n_splits=10, shuffle=True, random_state=1234)

tree_gs = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=tree_param_grid,
    scoring="roc_auc",
    cv=tree_folds,
    n_jobs=4,
).fit(X, y)


In [13]:
# Winning configuration and its mean cross-validated ROC AUC, one per line.
print(tree_gs.best_estimator_, tree_gs.best_score_, sep="\n")

DecisionTreeClassifier(criterion='entropy', max_depth=5)
0.8232005546702693


In [14]:
# List every candidate in the grid next to its mean test-fold score.
tree_cv_results = tree_gs.cv_results_
for params, mean_score in zip(
    tree_cv_results["params"], tree_cv_results["mean_test_score"]
):
    print(params, "Score:", mean_score)

{'criterion': 'gini', 'max_depth': 2} Score: 0.7261528786045157
{'criterion': 'gini', 'max_depth': 3} Score: 0.7827775234240709
{'criterion': 'gini', 'max_depth': 4} Score: 0.8037480167471692
{'criterion': 'gini', 'max_depth': 5} Score: 0.8084495898099584
{'criterion': 'gini', 'max_depth': 6} Score: 0.811695318467035
{'criterion': 'gini', 'max_depth': 7} Score: 0.7976692231180442
{'criterion': 'gini', 'max_depth': 8} Score: 0.7767128257515082
{'criterion': 'gini', 'max_depth': 9} Score: 0.7728774183082037
{'criterion': 'gini', 'max_depth': 10} Score: 0.7490231219372665
{'criterion': 'entropy', 'max_depth': 2} Score: 0.771310969315679
{'criterion': 'entropy', 'max_depth': 3} Score: 0.8099438872562372
{'criterion': 'entropy', 'max_depth': 4} Score: 0.8157233519386212
{'criterion': 'entropy', 'max_depth': 5} Score: 0.8232005546702693
{'criterion': 'entropy', 'max_depth': 6} Score: 0.8131446116176873
{'criterion': 'entropy', 'max_depth': 7} Score: 0.793978863987547
{'criterion': 'entropy',

In [15]:
# Confusion matrix of the selected tree on the same X, y that the grid search
# was fitted on, i.e. an in-sample evaluation rather than a held-out one.
confusion_matrix(y_true=y, y_pred=tree_gs.best_estimator_.predict(X))

array([[3544,   10],
       [ 286,   81]])

In [18]:
# Per-class precision / recall / F1 for the selected tree, again computed on
# the data the search was fitted on (in-sample).
print(classification_report(y_true=y, y_pred=tree_gs.best_estimator_.predict(X)))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      3554
           1       0.89      0.22      0.35       367

    accuracy                           0.92      3921
   macro avg       0.91      0.61      0.66      3921
weighted avg       0.92      0.92      0.90      3921



### Demo 2 - SVC
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [19]:
from sklearn.svm import SVC

In [21]:
# Standardise the predictors, then fit a support vector classifier.
# make_pipeline names the steps after their classes, hence the "svc__" prefix
# on every tuned parameter below.
svc_pipe = make_pipeline(StandardScaler(), SVC())

# Three kernel families share the same C values; each adds its own extra knob.
# 8 (rbf) + 4 (linear) + 12 (poly) = 24 candidates, each fitted on 10 folds.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# search was presumably stopped before it finished -- confirm the run time.
svc_c_values = [1, 10, 100, 1000]
svc_param_grid = [
    {"svc__kernel": ["rbf"], "svc__C": svc_c_values, "svc__gamma": [0.001, 0.01]},
    {"svc__kernel": ["linear"], "svc__C": svc_c_values},
    {"svc__kernel": ["poly"], "svc__C": svc_c_values, "svc__degree": [2, 3, 4]},
]
svc_folds = KFold(n_splits=10, shuffle=True, random_state=1234)

svc_gs = GridSearchCV(
    estimator=svc_pipe,
    param_grid=svc_param_grid,
    scoring="roc_auc",
    cv=svc_folds,
    n_jobs=8,
).fit(X, y)


KeyboardInterrupt: 

### Exercise 1

In [27]:
from sklearn.datasets import load_digits
# Load the digits data as pandas objects and hold out 33% of it for testing.
# NOTE(review): X and y are rebound here, replacing the email design matrix
# and response defined in the "Email data" section; cells above that use X / y
# will see the digits data if they are re-run after this cell.
digits = load_digits(as_frame=True)
X = digits.data
y = digits.target

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    shuffle=True,
    random_state=1234,
)

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [26]:
# Single-step pipeline whose "model" step acts as a placeholder estimator;
# naming the step explicitly lets a grid search address it as "model".
p = Pipeline(steps=[("model", DecisionTreeClassifier())])

In [None]:
# Model selection on the digits training split: one grid search comparing a
# single decision tree with a random forest over the same criterion / depth
# grid (2 models x 2 criteria x 9 depths = 36 candidates, 5 shuffled folds).
#
# Nested pipeline parameters are addressed as "<step>__<param>" with a double
# underscore. The depth key was previously spelled "model_max_depth", which is
# not a parameter of the pipeline and is rejected as invalid when the search
# tries to set it; it is now "model__max_depth".
#
# No `scoring` is given, so candidates are ranked by the estimator's default
# score.
digit_tree = GridSearchCV(
    p,
    param_grid = {
      # Replacing the whole "model" step lets both estimator families share
      # one search. Both are seeded so repeated runs pick the same candidate.
      "model": [
          DecisionTreeClassifier(random_state=1234),
          RandomForestClassifier(random_state=1234)
      ],
      "model__criterion": ["gini", "entropy"],
      "model__max_depth": range(2,11)
    },
    cv = KFold(5, shuffle=True, random_state=12345),
    n_jobs=4
).fit(
    X_train, y_train
)

