# Dependencies

In [2]:
# dependencies
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn import linear_model as lin 
from sklearn import preprocessing as pre
from sklearn import metrics as met
from sklearn import model_selection as mod
from sklearn import pipeline as pip
from sklearn import datasets as dat
from sklearn import tree
from sklearn import preprocessing as pre
from sklearn import ensemble as ens
from sklearn import svm
from sklearn import decomposition as dec
from sklearn import manifold as man

# Suppress scientific notation when NumPy prints floating-point arrays.
np.set_printoptions(suppress=True) 

import warnings
# Silence every warning raised from here on; note this also hides library
# deprecation notices, so re-enable warnings when debugging version issues.
warnings.filterwarnings("ignore")

# Functions

In [None]:
def plotLearningCurve(est, X_train, y_train, X_test, y_test, n_iter=10, minY=0, maxY=1):
    """Plot train and test RMSE of ``est`` for growing training-set sizes.

    The estimator is refitted ``n_iter`` times on the first ``k`` rows of the
    training data, with ``k`` evenly spaced between 10 and ``len(X_train)``.
    After each fit the RMSE on that training subset and on the full test set
    is recorded, and both curves are drawn against ``k``.

    Parameters
    ----------
    est : estimator exposing ``fit`` and ``predict``; it is refitted in place.
    X_train, y_train : training data; must support ``len()`` and ``[:k]``
        slicing along the sample axis.
    X_test, y_test : held-out data scored after every fit.
    n_iter : number of training-set sizes to evaluate.
    minY, maxY : limits of the y axis (RMSE) of the plot.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """

    def rmse(y_true, y_pred):
        # The ``squared=False`` flag of ``mean_squared_error`` was deprecated
        # and later removed from scikit-learn. Taking the root of the
        # per-output MSE and averaging reproduces its result on every version.
        per_output_mse = met.mean_squared_error(y_true, y_pred, multioutput="raw_values")
        return float(np.mean(np.sqrt(per_output_mse)))

    sample_counts = []
    train_rmse = []
    test_rmse = []

    for size in np.linspace(10, len(X_train), n_iter):
        size = int(size)
        # Train on the first ``size`` samples only.
        X_subset = X_train[:size]
        y_subset = y_train[:size]
        est.fit(X_subset, y_subset)

        train_rmse.append(rmse(y_subset, est.predict(X_subset)))
        test_rmse.append(rmse(y_test, est.predict(X_test)))
        sample_counts.append(size)

    plt.figure(figsize=(10, 6))
    plt.plot(sample_counts, train_rmse, label="Train")
    plt.plot(sample_counts, test_rmse, label="Test")
    plt.legend()
    plt.ylabel("RMSE")
    plt.xlabel("# of training samples")
    plt.grid(True)
    plt.ylim(minY, maxY)
    plt.show()

In [None]:
def plotDecisionBoundaries(clf, X, y, n_classes=2):
    """Plot the decision surface of ``clf`` for every pair of the first four features.

    For each of the six pairs formed from feature columns 0-3, ``clf`` is
    refitted on just those two columns, its predictions over a dense grid are
    drawn as a filled contour in one cell of a 2x3 subplot grid, and the
    training points are overlaid, coloured by class.

    Parameters
    ----------
    clf : classifier exposing ``fit`` and ``predict``; it is refitted in place
        once per feature pair.
    X : 2-D feature table (DataFrame or array-like) with at least four
        columns; only columns 0-3 are used.
    y : class labels, expected to be the integers ``0 .. n_classes - 1``.
    n_classes : number of classes whose points are drawn.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    plot_step = 0.02
    feature_pairs = [[0, 1], [0, 2], [0, 3],
                     [1, 2], [1, 3], [2, 3]]

    # Keep the red/blue colours for the binary case; for more classes sample
    # the contour colormap so that no class is silently left out of the plot.
    if n_classes <= 2:
        class_colors = ["r", "b"][:max(n_classes, 0)]
    else:
        class_colors = [plt.cm.RdYlBu(v) for v in np.linspace(0, 1, n_classes)]

    # np.asarray accepts pandas objects as well as plain arrays/lists.
    X_all = np.asarray(X)
    y_now = np.asarray(y)

    for pairidx, pair in enumerate(feature_pairs):
        # We only take the two corresponding features
        X_now = X_all[:, pair]

        # Train
        clf.fit(X_now, y_now)

        # Plot the decision boundary
        plt.subplot(2, 3, pairidx + 1)

        x_min, x_max = X_now[:, 0].min() - 1, X_now[:, 0].max() + 1
        y_min, y_max = X_now[:, 1].min() - 1, X_now[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                             np.arange(y_min, y_max, plot_step))
        plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)

        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)

        # Plot the training points; the label is what lets plt.legend()
        # below actually show an entry per class.
        for class_id, color in enumerate(class_colors):
            members = y_now == class_id
            plt.scatter(X_now[members, 0], X_now[members, 1], color=color,
                        edgecolor='black', s=15, label=str(class_id))

    plt.suptitle("Decision surface of a decision tree using paired features")
    plt.legend(loc='lower right', borderpad=0, handletextpad=0)
    plt.axis("tight")
    plt.show()

In [None]:
def estimate_roc_curve(X_train, y_train_binary, estimators=None):
    """Plot cross-validated ROC curves and print the AUC of each estimator.

    Every estimator is wrapped in a pipeline behind a ``StandardScaler`` and
    scored with 5-fold ``cross_val_predict``. Estimators that expose
    ``predict_proba`` are scored by the probability of the positive class,
    the others by ``decision_function``.

    Parameters
    ----------
    X_train : training features.
    y_train_binary : binary target.
    estimators : flat iterable of unfitted classifiers. When omitted, a
        decision tree, an SGD classifier and a random forest
        (all with ``random_state=42``) are compared.

    Returns
    -------
    None. AUC scores are printed and the figure is displayed with ``plt.show()``.
    """
    if estimators is None:
        # Built per call: avoids a shared mutable default, and is a flat list
        # so that each loop item is an estimator rather than a list of them.
        estimators = [
            tree.DecisionTreeClassifier(random_state=42),
            lin.SGDClassifier(random_state=42),
            ens.RandomForestClassifier(random_state=42),
        ]

    for est in estimators:
        pipe = pip.Pipeline([
            ("scaler", pre.StandardScaler()),
            ("est", est),
        ])
        use_proba = hasattr(est, "predict_proba")
        score_method = "predict_proba" if use_proba else "decision_function"

        y_scores = mod.cross_val_predict(pipe, X_train, y_train_binary,
                                         cv=5, method=score_method)
        if use_proba:
            # predict_proba yields one column per class; keep the positive one.
            y_scores = y_scores[:, 1]

        fpr, tpr, _ = met.roc_curve(y_train_binary, y_scores)
        auc_score = met.roc_auc_score(y_train_binary, y_scores)
        est_name = est.__class__.__name__
        print(est_name, auc_score)
        plt.plot(fpr, tpr, label=est_name)

    plt.title("ROC Curve")
    plt.xlabel("fpr")
    plt.ylabel("tpr")
    plt.legend()
    plt.show()

In [None]:
def precision_recall_curve(est, X_train, y_train_binary):
    """Plot precision/recall against the decision threshold, then the ROC curve.

    ``est`` is wrapped in a pipeline behind a ``StandardScaler`` and scored
    with 3-fold ``cross_val_predict``, using ``predict_proba`` when the
    estimator exposes it and ``decision_function`` otherwise.

    Parameters
    ----------
    est : unfitted classifier.
    X_train : training features.
    y_train_binary : binary target.

    Returns
    -------
    None. Two figures are displayed with ``plt.show()``.
    """
    pipe = pip.Pipeline([
        ("scaler", pre.StandardScaler()),
        ("est", est),
    ])

    use_proba = hasattr(est, "predict_proba")
    score_method = "predict_proba" if use_proba else "decision_function"

    y_scores = mod.cross_val_predict(pipe, X_train, y_train_binary,
                                     cv=3, method=score_method)
    if use_proba:
        # predict_proba yields one column per class, but the metric functions
        # below need a 1-D score per sample: keep the positive-class column.
        y_scores = y_scores[:, 1]

    precision, recall, thresholds = met.precision_recall_curve(y_train_binary, y_scores)

    # precision and recall carry one more entry than thresholds, so the
    # final point is dropped to align the arrays.
    plt.figure(figsize=(10, 6))
    plt.plot(thresholds, precision[:-1], label="precision")
    plt.plot(thresholds, recall[:-1], label="recall")
    plt.legend()
    plt.xlabel("Thresholds")
    plt.ylabel("Precision / Recall")
    plt.show()

    fpr, tpr, _ = met.roc_curve(y_train_binary, y_scores)
    plt.plot(fpr, tpr)
    plt.xlabel("fpr")
    plt.ylabel("tpr")
    plt.show()
    