# Model Evaluation

[Link to the dataset](https://www.kaggle.com/mikecalgary/diamond-model-optimization-for-99-accuracy)

## Loading and preparing the dataset

In [None]:
import warnings
# Install a global "ignore" filter so warnings do not clutter the notebook output.
# NOTE(review): this also hides warnings raised by the model-fitting cells.
warnings.filterwarnings("ignore")

In [None]:
from comet_ml import Experiment
import pandas as pd

# Read the diamonds CSV; the path is relative to the current working directory.
df = pd.read_csv('source/diamonds.csv')
# Last expression of the cell: preview the first rows of the frame.
df.head()

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Remove the "Unnamed: 0" column (presumably a row index saved into the CSV — verify).
df = df.drop(["Unnamed: 0"], axis=1)

In [None]:
from sklearn.preprocessing import LabelEncoder

def encode_labels(data):
    """Label-encode every object-dtype column of ``data`` in place.

    Args:
        data: pandas DataFrame. Each column whose dtype is ``object`` is
            overwritten with the integer codes produced by its own
            ``LabelEncoder``; all other columns are left untouched.

    Returns:
        dict mapping each encoded column name to the ``LabelEncoder`` fitted
        on it, so the codes can be mapped back with ``inverse_transform``.
        The dict is empty when ``data`` has no object-dtype columns.
    """
    is_object = data.dtypes == "object"
    cat_cols = list(is_object[is_object].index)

    feature_label_encoder_dict = {}
    for col in cat_cols:
        encoder = LabelEncoder()
        # Read from and write to the frame that was passed in. The previous
        # version used the notebook-global ``X`` here, which silently ignored
        # the ``data`` argument and failed when no global ``X`` existed.
        data[col] = encoder.fit_transform(data[col])
        feature_label_encoder_dict[col] = encoder
    return feature_label_encoder_dict

In [None]:
from sklearn.preprocessing import StandardScaler

def scale_numerical(data):
    """Standardize all columns of ``data`` in place.

    A new ``StandardScaler`` is fitted on the whole frame and every column is
    overwritten with its transformed values. Nothing is returned and the
    fitted scaler is not kept.

    Args:
        data: pandas DataFrame whose columns can all be handled by
            ``StandardScaler``.
    """
    columns = data.columns
    standardized = StandardScaler().fit_transform(data[columns])
    data[columns] = standardized

## Classification

In [None]:
def set_target(x):
    """Map a cut grade to the binary target label.

    Args:
        x: value from the ``cut`` column.

    Returns:
        ``'Gold'`` when ``x`` is ``'Ideal'``, ``'Premium'`` or ``'Very Good'``;
        ``'Silver'`` for any other value.
    """
    top_grades = ('Ideal', 'Premium', 'Very Good')
    return 'Gold' if x in top_grades else 'Silver'
# Derive the binary label from the cut grade, then remove the raw grade so it
# cannot be used as a feature.
df['target'] = df['cut'].apply(set_target)
df.drop(columns=["cut"], inplace=True)

In [None]:
# Features: every column except the label. ``drop`` returns a new frame, so
# later in-place preprocessing of X does not modify df.
X = df.drop("target", axis = 1)
# Label: the 'Gold' / 'Silver' column built from the cut grade.
y = df["target"]

In [None]:
# Preprocess the feature frame: label-encode the object columns, then
# standardize. scale_numerical rescales every column, so the freshly encoded
# categorical codes are standardized as well.
# NOTE(review): both steps are fitted on the full frame before the train/test
# split further down, so test rows influence the scaler statistics.
encode_labels(X)
scale_numerical(X)

In [None]:
# Turn the 'Gold' / 'Silver' labels into integer codes; the encoder is kept in
# a global so the codes can be mapped back later.
# NOTE(review): LabelEncoder presumably orders classes alphabetically
# (Gold -> 0, Silver -> 1), which would make 'Silver' the positive class for
# precision/recall — confirm with label_encoder.classes_.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible. These four globals are read by the run_experiment* helpers.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def compute_metrics(y_pred, y_true):
    """Score predictions with four scikit-learn classification metrics.

    Note the argument order: predictions come first, ground truth second.
    Each scorer is still called as ``scorer(y_true, y_pred)`` with its
    default settings.

    Args:
        y_pred: predicted labels.
        y_true: ground-truth labels.

    Returns:
        dict with the keys ``'precision'``, ``'recall'``, ``'f1-score'`` and
        ``'accuracy'``, in that order.
    """
    scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1-score', f1_score),
        ('accuracy', accuracy_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

In [None]:
from sklearn.metrics import roc_curve
import pickle

def run_experiment(ModelClass, name):
    """Fit ``ModelClass()`` with default settings and log the run to Comet.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. In the train context it logs training-set metrics and the
    confusion matrix, pickles the fitted model to ``<name>.pkl`` in the
    working directory and uploads that file. In the validate context it logs
    test-set metrics, the confusion matrix and a ROC curve.

    Args:
        ModelClass: estimator class that can be constructed without
            arguments and exposes ``fit`` and ``predict``.
        name: used as the experiment name, tag, model name, curve name and
            pickle file stem.
    """
    experiment = Experiment()
    # try/finally so the experiment is closed even when fitting or logging fails.
    try:
        experiment.set_name(name)
        experiment.add_tag(name)

        model = ModelClass()
        with experiment.train():
            model.fit(X_train, y_train)
            y_pred = model.predict(X_train)
            experiment.log_metrics(compute_metrics(y_pred, y_train))
            experiment.log_confusion_matrix(y_train, y_pred)

            file_name = name + '.pkl'
            with open(file_name, 'wb') as file:
                pickle.dump(model, file)
            # Upload only after the file is closed: while it is still open,
            # buffered bytes may not have reached disk and the uploaded
            # pickle could be truncated.
            experiment.log_model(name, file_name)

        with experiment.validate():
            y_pred = model.predict(X_test)
            experiment.log_metrics(compute_metrics(y_pred, y_test))
            experiment.log_confusion_matrix(y_test, y_pred)

            # A ROC curve needs continuous scores; hard 0/1 predictions give
            # a single operating point. Fall back to the predictions only
            # when the model exposes no usable score.
            y_score = y_pred
            if hasattr(model, "predict_proba"):
                proba = model.predict_proba(X_test)
                if proba.shape[1] == 2:
                    y_score = proba[:, 1]
            elif hasattr(model, "decision_function"):
                y_score = model.decision_function(X_test)
            fpr, tpr, _ = roc_curve(y_test, y_score)
            experiment.log_curve(name, fpr, tpr)
    finally:
        experiment.end()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# One Comet experiment per classifier, each built with its default
# hyper-parameters; the second argument is the experiment name/tag.
run_experiment(RandomForestClassifier, 'RandomForest')
run_experiment(DecisionTreeClassifier, 'DecisionTreeClassifier')
run_experiment(GaussianNB, 'GaussianNB')
run_experiment(KNeighborsClassifier, 'KNeighborsClassifier')

In [None]:

def run_experiment_with_epoch(ModelClass, name, n_epochs):
    """Log per-epoch training metrics of an iterative model to Comet.

    For every epoch index ``i`` in ``range(n_epochs)`` a fresh
    ``ModelClass(max_iter=i + 1)`` is fitted on the notebook-level
    ``X_train`` / ``y_train``; its training metrics and confusion matrix are
    logged with ``epoch=i``. The last model (``max_iter=n_epochs``) is then
    scored on ``X_test`` / ``y_test`` in the validate context.

    Args:
        ModelClass: estimator class accepting a ``max_iter`` keyword and
            exposing ``fit`` and ``predict``.
        name: experiment name; also added as a tag next to ``'epoch'``.
        n_epochs: number of epochs to log; must be at least 1.

    Raises:
        ValueError: if ``n_epochs`` is smaller than 1, because no model
            would exist for the validation step.
    """
    if n_epochs < 1:
        raise ValueError("n_epochs must be at least 1")

    experiment = Experiment()
    # try/finally so the experiment is closed even when fitting or logging fails.
    try:
        experiment.set_name(name)
        experiment.add_tag(name)
        experiment.add_tag('epoch')

        with experiment.train():
            for i in range(n_epochs):
                # Cap training at i + 1 iterations so the metrics logged for
                # epoch i reflect that much training. Previously every
                # iteration used max_iter=n_epochs, so all logged epochs
                # described the same training budget.
                model = ModelClass(max_iter=i + 1)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_train)
                experiment.log_metrics(compute_metrics(y_pred, y_train), epoch=i)
                experiment.log_confusion_matrix(y_train, y_pred, epoch=i)

        with experiment.validate():
            # ``model`` is the estimator from the final epoch.
            y_pred = model.predict(X_test)
            experiment.log_metrics(compute_metrics(y_pred, y_test))
            experiment.log_confusion_matrix(y_test, y_pred)
    finally:
        experiment.end()

In [None]:
from sklearn.linear_model import SGDClassifier
# Epoch-wise Comet experiment named 'SGD'; the last argument is n_epochs.
run_experiment_with_epoch(SGDClassifier, 'SGD',100)

In [None]:
def run_experiment_with_steps(ModelClass, name, min_steps=20, step_increment=5000):
    """Log a learning curve over growing training subsets to Comet.

    For each size ``i`` in ``range(min_steps, len(X_train) + 1,
    step_increment)`` a fresh ``ModelClass()`` is fitted on the first ``i``
    rows of the notebook-level ``X_train`` / ``y_train``; its metrics and
    confusion matrix on that same subset are logged with ``step=i``. The
    model from the largest size is then scored on ``X_test`` / ``y_test``.

    NOTE(review): the largest size is the last value of the range, which can
    be smaller than ``len(X_train)``; the validated model is then not trained
    on the complete training set.

    Args:
        ModelClass: estimator class that can be constructed without
            arguments and exposes ``fit`` and ``predict``.
        name: used as the experiment name, tag and curve name.
        min_steps: size of the smallest training subset.
        step_increment: number of rows added between consecutive subsets.

    Raises:
        ValueError: if the training set has fewer than ``min_steps`` rows,
            because no model would be fitted.
    """
    step_size = len(X_train)
    # Built-in range replaces np.arange: numpy was never imported in this
    # notebook, and range yields plain ints for slicing and for ``step=``.
    subset_sizes = range(min_steps, step_size + 1, step_increment)
    if not subset_sizes:
        raise ValueError("training set is smaller than min_steps")

    experiment = Experiment()
    # try/finally so the experiment is closed even when fitting or logging fails.
    try:
        experiment.set_name(name)
        experiment.add_tag(name)

        with experiment.train():
            for i in subset_sizes:
                model = ModelClass()
                X_t = X_train[0:i]
                y_t = y_train[0:i]
                model.fit(X_t, y_t)
                y_pred = model.predict(X_t)
                experiment.log_metrics(compute_metrics(y_pred, y_t), step=i)
                experiment.log_confusion_matrix(y_t, y_pred, step=i)

        with experiment.validate():
            # ``model`` is the estimator fitted on the largest subset.
            y_pred = model.predict(X_test)
            experiment.log_metrics(compute_metrics(y_pred, y_test))
            experiment.log_confusion_matrix(y_test, y_pred)

            # A ROC curve needs continuous scores; hard 0/1 predictions give
            # a single operating point. Fall back to the predictions only
            # when the model exposes no usable score.
            y_score = y_pred
            if hasattr(model, "predict_proba"):
                proba = model.predict_proba(X_test)
                if proba.shape[1] == 2:
                    y_score = proba[:, 1]
            elif hasattr(model, "decision_function"):
                y_score = model.decision_function(X_test)
            fpr, tpr, _ = roc_curve(y_test, y_score)
            experiment.log_curve(name, fpr, tpr)
    finally:
        experiment.end()

In [None]:
# Learning-curve experiments: one Comet run per classifier, each refitted on
# growing prefixes of the training set.
run_experiment_with_steps(RandomForestClassifier, 'RandomForestWithSteps')
run_experiment_with_steps(DecisionTreeClassifier, 'DecisionTreeWithSteps')
run_experiment_with_steps(GaussianNB, 'GaussianNBWithSteps')
run_experiment_with_steps(KNeighborsClassifier, 'KNeighborsClassifierWithSteps')