# Model Evaluation

[Link to the dataset](https://www.kaggle.com/mikecalgary/diamond-model-optimization-for-99-accuracy)

## Loading and preparing the dataset

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
from comet_ml import Experiment
import pandas as pd

df = pd.read_csv('source/diamonds.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
df.shape

(53940, 11)

In [5]:
df = df.drop(["Unnamed: 0"], axis=1)

In [6]:
from sklearn.preprocessing import LabelEncoder

def encode_labels(data):
    categories = (data.dtypes =="object")
    cat_cols = list(categories[categories].index)
    
    feature_label_encoder_dict = {}
    for col in cat_cols:
        feature_label_encoder_dict[col] = LabelEncoder()
        X[col] = feature_label_encoder_dict[col].fit_transform(X[col])

In [7]:
from sklearn.preprocessing import StandardScaler

def scale_numerical(data):
    scaler = StandardScaler()
    data[data.columns] = scaler.fit_transform(data[data.columns])

## Classification

In [8]:
def set_target(x):
    golden_set = ['Ideal', 'Premium', 'Very Good']
    if x in golden_set:
        return 'Gold'
    return 'Silver'
df['target'] = df['cut'].apply(lambda x: set_target(x))
df.drop("cut", axis = 1,inplace=True)

In [9]:
X = df.drop("target", axis = 1)
y = df["target"]

In [9]:
encode_labels(X)
scale_numerical(X)

In [10]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [12]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def compute_metrics(y_pred, y_true):
    metrics = {}
    metrics['precision'] = precision_score(y_true, y_pred)
    metrics['recall'] = recall_score(y_true, y_pred)
    metrics['f1-score'] = f1_score(y_true, y_pred)
    metrics['accuracy'] =  accuracy_score(y_true, y_pred)
    return metrics

In [13]:
from sklearn.metrics import roc_curve
import pickle

def run_experiment(ModelClass, name):
    experiment = Experiment()
    experiment.set_name(name)
    experiment.add_tag(name)
    
    model = ModelClass()
    with experiment.train():    
        model.fit(X_train, y_train)
        y_pred = model.predict(X_train)
        metrics = compute_metrics(y_pred, y_train)
        experiment.log_metrics(metrics)
        experiment.log_confusion_matrix(y_train, y_pred)
        
        file_name = name + '.pkl'
        with open(file_name, 'wb') as file:  
            pickle.dump(model, file)
            experiment.log_model(name, file_name)
        
    
    with experiment.validate():
        y_pred = model.predict(X_test)
        metrics = compute_metrics(y_pred, y_test)
        experiment.log_metrics(metrics)
        experiment.log_confusion_matrix(y_test, y_pred)
        fpr, tpr, _ = roc_curve(y_test, y_pred)
        experiment.log_curve(name, fpr, tpr)
        
    experiment.end()

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

run_experiment(RandomForestClassifier, 'RandomForest')
run_experiment(DecisionTreeClassifier, 'DecisionTreeClassifier')
run_experiment(GaussianNB, 'GaussianNB')
run_experiment(KNeighborsClassifier, 'KNeighborsClassifier')

COMET ERROR: The given API key AzbjrGVNvJDLMGTmwEUiJW9ah is invalid, please check it against the dashboard. Your experiment would not be logged 
COMET ERROR: The given API key AzbjrGVNvJDLMGTmwEUiJW9ah is invalid, please check it against the dashboard. Your experiment would not be logged 
COMET ERROR: The given API key AzbjrGVNvJDLMGTmwEUiJW9ah is invalid, please check it against the dashboard. Your experiment would not be logged 
COMET ERROR: The given API key AzbjrGVNvJDLMGTmwEUiJW9ah is invalid, please check it against the dashboard. Your experiment would not be logged 


In [15]:

def run_experiment_with_epoch(ModelClass, name, n_epochs):
    experiment = Experiment()
    experiment.set_name(name)
    experiment.add_tag(name)
    experiment.add_tag('epoch')
    
    with experiment.train():
        for i in range(n_epochs):
            model = ModelClass(max_iter=n_epochs)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_train)
            metrics = compute_metrics(y_pred, y_train)
            experiment.log_metrics(metrics, epoch = i)
            experiment.log_confusion_matrix(y_train, y_pred, epoch=i)
    
    with experiment.validate():
        y_pred = model.predict(X_test)
        metrics = compute_metrics(y_pred, y_test)
        experiment.log_metrics(metrics)
        experiment.log_confusion_matrix(y_test, y_pred)
        
        
    experiment.end()

In [235]:
from sklearn.linear_model import SGDClassifier
run_experiment_with_epoch(SGDClassifier, 'SGD',100)

COMET INFO: Couldn't find a Git repository in '/Users/angelica/Packt/Comet/Code/03' nor in any parent directory. You can override where Comet is looking for a Git Patch by setting the configuration `COMET_GIT_DIRECTORY`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/packt/model-evaluation/1d28bf1084b14f4383cce7be5affbfdb

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/packt/model-evaluation/1d28bf1084b14f4383cce7be5affbfdb
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     train_accuracy [100]  : (0.8816509084167594, 0.9080923248053393)
COMET INFO:     train_f1-score [100]  : (0.027052771956563155, 0.4026502032826381)
COMET INFO:     train_precision [100] : (0.7996742671009772, 1.0)
COMET INFO:     train_recall [100]    : (0.01371450647092911, 0.25825767819200307)
COMET

In [236]:
def run_experiment_with_steps(ModelClass, name):
    step_size = len(X_train)
    min_steps = 20
    experiment = Experiment()
    experiment.set_name(name)
    experiment.add_tag(name)
    
    with experiment.train():
        for i in np.arange(min_steps, step_size+1, step = 5000):
            model = ModelClass()
            X_t = X_train[0:i]
            y_t = y_train[0:i]
            model.fit(X_t, y_t)
            y_pred = model.predict(X_t)
            metrics = compute_metrics(y_pred, y_t)
            experiment.log_metrics(metrics, step = i)
            experiment.log_confusion_matrix(y_t, y_pred, step=i)
    
    with experiment.validate():
        y_pred = model.predict(X_test)
        metrics = compute_metrics(y_pred, y_test)
        experiment.log_metrics(metrics)
        experiment.log_confusion_matrix(y_test, y_pred)
        fpr, tpr, _ = roc_curve(y_test, y_pred)
        experiment.log_curve(name, fpr, tpr)
        
    experiment.end()

In [237]:
run_experiment_with_steps(RandomForestClassifier, 'RandomForestWithSteps')
run_experiment_with_steps(DecisionTreeClassifier, 'DecisionTreeWithSteps')
run_experiment_with_steps(GaussianNB, 'GaussianNBWithSteps')
run_experiment_with_steps(KNeighborsClassifier, 'KNeighborsClassifierWithSteps')

COMET INFO: Couldn't find a Git repository in '/Users/angelica/Packt/Comet/Code/03' nor in any parent directory. You can override where Comet is looking for a Git Patch by setting the configuration `COMET_GIT_DIRECTORY`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/packt/model-evaluation/3cc8ae33a1934c968185a890d0175d0b

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/packt/model-evaluation/3cc8ae33a1934c968185a890d0175d0b
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     train_accuracy [9] : (0.99995004995005, 1.0)
COMET INFO:     train_f1-score [9] : (0.9997917967936706, 1.0)
COMET INFO:     train_precision    : 1.0
COMET INFO:     train_recall [9]   : (0.9995836802664446, 1.0)
COMET INFO:     validate_accuracy  : 0.9572673340748981
COMET INFO:     validate_f1-score 

COMET INFO:   Others:
COMET INFO:     Name : KNeighborsClassifierWithSteps
COMET INFO:   Parameters:
COMET INFO:     train_algorithm     : auto
COMET INFO:     train_leaf_size     : 30
COMET INFO:     train_metric        : minkowski
COMET INFO:     train_metric_params : 1
COMET INFO:     train_n_jobs        : 1
COMET INFO:     train_n_neighbors   : 5
COMET INFO:     train_p             : 2
COMET INFO:     train_weights       : uniform
COMET INFO:   Uploads:
COMET INFO:     confusion-matrix    : 10
COMET INFO:     curve               : 1
COMET INFO:     environment details : 1
COMET INFO:     filename            : 1
COMET INFO:     installed packages  : 1
COMET INFO:     notebook            : 1
COMET INFO:     source_code         : 1
COMET INFO: ---------------------------
COMET INFO: Uploading 1 metrics, params and output messages
COMET INFO: Waiting for completion of the file uploads (may take several seconds)
COMET INFO: The Python SDK has 10800 seconds to finish before aborting...
C