# 0. An end to end Scikit Learn workflow

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import pandas as pd
heart_disease = pd.read_csv('heart-disease.csv')
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [3]:
#create x(features matrix)
x= heart_disease.drop('target', axis = 1)

#create y(labels)
y=heart_disease['target']

In [4]:
#2. choose the right model and parameters
from sklearn.ensemble import RandomForestClassifier
clf  = RandomForestClassifier( n_estimators = 100)

#We'll keep the default hyperparameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [5]:
# 3. Fit the model to the training data
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)  # test_size=0.2 holds out 20% of the rows for testing; the other 80% are used for training
# e.g. for a dataset of 1000 rows: 800 go to the training set and 200 to the test set
# NOTE: no random_state is passed here, so the rows picked for each set can differ between runs


In [6]:
clf.fit(x_train, y_train);

In [7]:
#make prediction
y_preds = clf.predict(x_test);
y_preds

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1], dtype=int64)

In [8]:
y_test

190    0
205    0
290    0
214    0
197    0
      ..
36     1
63     1
118    1
181    0
100    1
Name: target, Length: 61, dtype: int64

In [9]:
#4.evaluate the model on the training data and the test data
clf.score(x_train, y_train)

1.0

In [10]:
clf.score(x_test, y_test)

0.8524590163934426

In [11]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.83      0.80      0.82        25
           1       0.86      0.89      0.88        36

    accuracy                           0.85        61
   macro avg       0.85      0.84      0.85        61
weighted avg       0.85      0.85      0.85        61



In [12]:
confusion_matrix(y_test, y_preds)

array([[20,  5],
       [ 4, 32]], dtype=int64)

In [13]:
accuracy_score(y_test, y_preds)

0.8524590163934426

In [14]:
# 5.improve a model 
#try different amount n_estimators
np.random.seed(42)
for i in range(10,100,10):
    print(f"Trying model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i).fit(x_train , y_train)
    print(f'Model accuracy on test set : {clf.score(x_test, y_test) * 100:.2f}%')
    print('')

Trying model with 10 estimators...
Model accuracy on test set : 83.61%

Trying model with 20 estimators...
Model accuracy on test set : 85.25%

Trying model with 30 estimators...
Model accuracy on test set : 77.05%

Trying model with 40 estimators...
Model accuracy on test set : 83.61%

Trying model with 50 estimators...
Model accuracy on test set : 85.25%

Trying model with 60 estimators...
Model accuracy on test set : 81.97%

Trying model with 70 estimators...
Model accuracy on test set : 81.97%

Trying model with 80 estimators...
Model accuracy on test set : 83.61%

Trying model with 90 estimators...
Model accuracy on test set : 83.61%



In [15]:
# 6. Save a model and load it
import pickle

# Use a context manager so the file is flushed and closed deterministically
# instead of leaving an open handle for the garbage collector to clean up.
with open('random_forest_model_1.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [16]:
# Load the pickled model back; the context manager closes the file handle.
# NOTE: only unpickle files you created or trust - pickle.load can execute
# arbitrary code embedded in the file.
with open('random_forest_model_1.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
loaded_model.score(x_test , y_test)

0.8360655737704918

# Every step in detail

## 1.Getting our data ready to be used with machine learning

Three main things we have to do:
    1. Split the data into features and labels ('x' and 'y')
    2. Fill (aka impute) or disregard missing values
    3. Convert non-numerical values into numerical values (aka feature encoding)

In [17]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [18]:
x = heart_disease.drop('target' , axis =1)
x.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [19]:
y = heart_disease['target']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [20]:
# split the data into train and test sets
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size = 0.2)

In [21]:
x_train.shape, x_test.shape, y_train.shape ,y_test.shape

((242, 13), (61, 13), (242,), (61,))

In [22]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(x_train, y_train)

RandomForestRegressor()

In [23]:
x.shape

(303, 13)

In [24]:
x.shape[0] * 0.8

242.4

In [25]:
242 +61

303

In [26]:
len(heart_disease)

303

## 1.1 Make sure it's all numerical

In [27]:
car_sales = pd.read_csv('car-sales-extended.csv')
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [28]:
len(car_sales)

1000

In [29]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [30]:
#split into x/y
x = car_sales.drop('Price' , axis = 1)
y = car_sales['Price']

#split into training and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [31]:
x.shape

(1000, 4)

In [32]:
x_train.shape, x_test.shape, y_train.shape , y_test.shape

((800, 4), (200, 4), (800,), (200,))

In [33]:
#build ml model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators = 100, random_state = 0)
model.fit(x_train, y_train)
model.score(x_test , y_test)

ValueError: could not convert string to float: 'Toyota'

In [None]:
car_sales['Doors'].value_counts() # Doors is stored as a number, but it only takes a few distinct values (e.g. 3, 4, 5), so each value acts like a category; this counts the cars in each one

In [None]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                   one_hot,
                                   categorical_features)],
                                   remainder="passthrough")

transformed_x = transformer.fit_transform(x)
transformed_x

In [None]:
x.head()

In [None]:
pd.DataFrame(transformed_x)

In [None]:
# Alternative way to convert categorical columns to numbers: pandas get_dummies.
# `columns=` is given explicitly because get_dummies only encodes
# object/string/category columns by default; 'Doors' is int64, so without it
# 'Doors' would be passed through as a plain number instead of being one-hot encoded.
dummies = pd.get_dummies(car_sales[['Make', 'Colour', 'Doors']],
                         columns=['Make', 'Colour', 'Doors'])
dummies

In [None]:
# Let's refit the model
np.random.seed(42)
x_train, x_test, y_train, y_test = train_test_split(transformed_x,
                                                    y, 
                                                    test_size=0.2)

model.fit(x_train, y_train)


In [None]:

model.score(x_test, y_test)

In [None]:
x.shape

In [None]:
y.shape

In [None]:
x_train.shape , y_train.shape

In [None]:
x_test.shape, y_test.shape

In [None]:
x.head()

### 1.2 What if there were missing values

1. fill them with some values(aka imputation)
2. remove the samples with missing data altogether

In [None]:
car_sales_missing = pd.read_csv('car-sales-extended-missing-data.csv')
car_sales_missing

In [None]:
car_sales_missing.isna().sum()

In [None]:
#create x and y
X = car_sales_missing.drop('Price' , axis=1)
y = car_sales_missing['Price']

In [None]:
# Let's try to convert our data (which still contains missing values) into numbers
# Turn categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

Categorical_features = ['Make', 'Colour','Doors']

# Build a fresh encoder and a fresh transformer in this cell. Previously
# `one_hot` was rebound to a ColumnTransformer wrapping itself, and the
# `transformer` fitted below was a leftover object from an earlier cell.
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, Categorical_features)],
                                remainder='passthrough')

# NOTE(review): X still contains NaN values here. Depending on the installed
# scikit-learn version this may raise or encode NaN as its own category -
# either way, the missing values are handled explicitly in the next section.
transformed_X = transformer.fit_transform(X)
transformed_X

In [None]:
car_sales_missing

In [None]:
car_sales_missing['Doors'].value_counts()

#### Option 1: Fill missing data with Pandas

In [None]:
# Assign each filled Series back to its column. Calling
# `fillna(..., inplace=True)` on a column selection is chained assignment:
# it may act on a temporary object and leave `car_sales_missing` unchanged
# (pandas warns about this, and under Copy-on-Write it never updates the frame).

# Fill the 'Make' column
car_sales_missing['Make'] = car_sales_missing['Make'].fillna('missing')

# Fill the 'Colour' column
car_sales_missing['Colour'] = car_sales_missing['Colour'].fillna('missing')

# Fill the 'Odometer (KM)' column with the column mean
car_sales_missing['Odometer (KM)'] = car_sales_missing['Odometer (KM)'].fillna(
    car_sales_missing['Odometer (KM)'].mean())

# Fill the 'Doors' column with 4
car_sales_missing['Doors'] = car_sales_missing['Doors'].fillna(4)

In [None]:
# Check the dataframe again
car_sales_missing.isna().sum()

In [None]:
# Remove rows with an empty value in the Price column (the label).
# `subset=['Price']` restricts the drop to rows missing the label, matching the
# stated intent; a bare dropna() would also discard any row that still has a
# missing feature value. Re-binding instead of `inplace=True` keeps the cell
# explicit about what it produces.
car_sales_missing = car_sales_missing.dropna(subset=['Price'])

In [None]:
car_sales_missing.isna().sum()

In [None]:
len(car_sales_missing) # We have lost 50 datas in this process

In [None]:
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [None]:
# Let's try and convert our data to numbers
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                   one_hot,
                                   categorical_features)],
                                   remainder="passthrough")

# Transform the feature matrix X, not the full `car_sales_missing` frame:
# the full frame still contains the 'Price' label, which `remainder="passthrough"`
# would copy into the features and leak the target into the model inputs.
transformed_X = transformer.fit_transform(X)
transformed_X

#### Option 2: Fill the missing values with scikit learn

In [None]:
car_sales_missing = pd.read_csv('car-sales-extended-missing-data.csv')
car_sales_missing.head()

In [None]:
car_sales_missing.isna().sum()

In [None]:
#Drop the rows with no labels
car_sales_missing.dropna(subset=['Price'], inplace=True)
car_sales_missing.isna().sum()

In [None]:
# Split into X and y
X = car_sales_missing.drop('Price', axis =1)
y = car_sales_missing['Price']

In [None]:
# Fill missing values with Scikit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with the string 'missing', 'Doors' with the constant 4,
# and numerical values with the column mean

cat_imputer = SimpleImputer(strategy='constant', fill_value = 'missing')
door_imputer = SimpleImputer(strategy= 'constant', fill_value = 4)
num_imputer = SimpleImputer(strategy= 'mean')

# Define which columns each imputer is applied to
cat_features = ['Make', 'Colour']
door_features = ['Doors']
num_features = ['Odometer (KM)']

# Create an imputer (something that fills missing data); each tuple is
# (name, imputer, columns it handles)
imputer = ColumnTransformer([('cat_imputer', cat_imputer, cat_features),
                                ('door_imputer', door_imputer,door_features),
                                ('num_imputer', num_imputer, num_features)])


# The imputer is fitted on all of X here, i.e. before any train/test split is made
filled_X = imputer.fit_transform(X)
filled_X


In [None]:
# Get our transformed data array's back into DataFrame
car_sales_filled = pd.DataFrame(filled_X, columns = ['Make','Colour','Doors','Odometer (KM)'])
car_sales_filled.isna().sum()

In [None]:
car_sales_missing.isna().sum()

In [None]:
# Now let's one-hot encode the categorical features with the same code as before
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", 
                                 one_hot, 
                                 categorical_features)],
                                 remainder="passthrough")

# Encode the whole imputed frame in one go (it has not been split into train/test sets at this point)
transformed_X = transformer.fit_transform(car_sales_filled)

# Check the transformed and filled feature matrix
transformed_X

In [None]:
# Now we've transformed X, let's see if we can fit a model
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size = 0.2)

model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
len(car_sales_filled), len(car_sales)

## 2. Choosing the right estimator/algorithm for your problem
Some things to note:

* Sklearn refers to machine learning models and algorithms as estimators.
* Classification problem - predicting a category (heart disease or not)
* Sometimes you'll see clf (short for classifier) used as a classification estimator
* Regression problem - predicting a number (selling price of a car)

If you're working on a machine learning problem and looking to use Sklearn and not sure what model you should use, refer to the sklearn machine learning map: https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

### 2.1 Picking a ML model for regression
Let's use the California Housing dataset - https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html

In [None]:
# Get the California Housing Dataset

from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing

In [None]:
housing_df = pd.DataFrame(housing['data'], columns = housing['feature_names'])
housing_df

In [None]:
housing_df['target'] = housing['target']
housing_df.head()  

In [None]:
housing_df

In [None]:
# Import algorithm/estimator
from sklearn.linear_model import Ridge

# Setup random seed
np.random.seed(42)

# Create the data
X = housing_df.drop('target', axis = 1)
y = housing_df['target']

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate and fit the model
model =Ridge()
model.fit(X_train, y_train)

# Check the score of the model
model.score(X_test, y_test)

In [None]:
# Importing algorithms/estimators
from sklearn.ensemble import RandomForestRegressor

#Setup random seed
np.random.seed(42)

#create X and y
X = housing_df.drop('target', axis = 1)
y = housing_df['target']

# Declaring the train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# Declaring the models
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Finding the scores
model.score(X_test, y_test)

# 2.2 Choosing an estimator for classification

In [None]:
heart_disease = pd.read_csv('heart-disease.csv')
heart_disease

In [None]:
len(heart_disease)

Consulting the map, it says to try `LinearSVC`.

In [None]:
# Import the  linearSVC
from sklearn.svm import LinearSVC

np.random.seed(42)

#data ready

X= heart_disease.drop('target' , axis =1)
y= heart_disease['target']

#split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# Instantiate LinearSVC
clf = LinearSVC(max_iter=10000)
clf.fit(X_train, y_train)

clf.score(X_test, y_test)

In [None]:
# Class balance of the label column (fixed the `heart_diasease` typo, which raised NameError)
heart_disease['target'].value_counts()

In [None]:
# Import the RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global RNG before the split and the fit
np.random.seed(42)

# Get the data ready: features (X) and labels (y)

X= heart_disease.drop('target' , axis =1)
y= heart_disease['target']

# Split the data (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# Instantiate RandomForestClassifier and fit it on the training set
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# Score on the held-out test set
clf.score(X_test, y_test)

In [None]:
heart_disease

# 3. Fit the model/algorithm on our data and use it to make predictions
### 3.1 Fitting the model to the data
Different names for:

X = features, features variables, data
y = labels, targets, target variables

In [None]:
# Import the RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global RNG before the split and the fit
np.random.seed(42)

# Get the data ready: features (X) and labels (y)

X= heart_disease.drop('target' , axis =1)
y= heart_disease['target']

# Split the data (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# Instantiate RandomForestClassifier and fit it on the training set
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# Score on the held-out test set
clf.score(X_test, y_test)

In [None]:
X.head()

In [None]:
y.tail()

# Random Forest model deep dive
These resources will help you understand what's happening inside the Random Forest models we've been using.

Random Forest Wikipedia
Random Forest Wikipedia (simple version)
Random Forests in Python by yhat
An Implementation and Explanation of the Random Forest in Python by Will Koehrsen

### 3.2 Make predictions using a machine learning model
2 ways to make predictions:

predict()
predict_proba()

In [None]:
# Use a trained model to make predictions
clf.predict(np.array([1, 7, 8, 3, 4])) # this doesn't work...

In [None]:
X_test.head()

In [None]:
clf.predict(X_test)

In [None]:
np.array(y_test)

In [None]:
# Compare predictions to truth labels to evaluate the model
y_preds = clf.predict(X_test)
np.mean(y_preds== y_test)

In [None]:
clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import accuracy_score
# scikit-learn metric functions take (y_true, y_pred) in that order. Accuracy
# happens to be symmetric, but keeping the documented order avoids wrong
# results when the same pattern is reused with asymmetric metrics
# (precision, recall, r2, ...).
accuracy_score(y_test, y_preds)

Make predictions with predict_proba() - use this if someone asks you "what's the probability your model is assigning to each prediction?"

In [None]:
# predict_proba() returns probabilities of a classification label
clf.predict_proba(X_test)

In [None]:
clf.predict_proba(X_test[:5])

In [None]:
clf.predict(X_test[:5])

In [None]:
X_test[:5]

In [None]:
heart_disease['target'].value_counts()

predict() can also be used for regression models.

In [None]:
housing_df.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Create the data
X = housing_df.drop("target", axis=1)
y = housing_df["target"]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create model instance
model = RandomForestRegressor()

# Fit the model to the data
model.fit(X_train, y_train)

# Make predictions
y_preds = model.predict(X_test)

In [None]:
y_preds[:5]

In [None]:
np.array(y_preds[:5])

In [None]:
# Compare the predictions to the truth
from sklearn.metrics import mean_absolute_error
# Pass arguments in the documented (y_true, y_pred) order, consistent with the
# other metric calls in this notebook.
mean_absolute_error(y_test, y_preds)

In [None]:
housing_df['target']

## 4. Evaluating a machine learning model
Three ways to evaluate Scikit-Learn models/estimators:

Estimator's built-in score() method
The scoring parameter
Problem-specific metric functions
You can read more about these here: https://scikit-learn.org/stable/modules/model_evaluation.html

### 4.1 Evaluating a model with the score method

In [None]:
heart_disease

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Setup random seed
np.random.seed(42)

# Make the data
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate Random Forest Classifier
clf = RandomForestClassifier(n_estimators=1000)

# Fit the model to the data (training the machine learning model)
clf.fit(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

Let's use the score() on our regression problem...

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Create the data
X = housing_df.drop("target", axis=1)
y = housing_df["target"]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create model instance
model = RandomForestRegressor(n_estimators=100)

# Fit the model to the data
model.fit(X_train, y_train)

In [None]:
model.score(X_test, y_test)

## 4.2 Evaluating a model using the scoring parameter

In [None]:
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = RandomForestClassifier(n_estimators=100)

clf.fit(X_train, y_train)

clf.score(X_test, y_test)

In [None]:
cross_val_score(clf,X, y, cv = 5)

In [None]:
cross_val_score(clf,X, y, cv = 10)

In [None]:
np.random.seed(42)

# Score from the single training/test split
clf_single_score = clf.score(X_test, y_test)

# Take the mean of the 10-fold cross-validation scores (cv = 10)
clf_cross_val_score = np.mean(cross_val_score(clf,X, y, cv = 10))

# Compare those two
clf_single_score, clf_cross_val_score

In [None]:
# Scoring parameter is set to none by default
cross_val_score(clf,X, y, cv = 10, scoring = None)

In [None]:
import sklearn
sklearn.__version__

## 4.2.1 Classification model evaluation metrics
Accuracy
Area under ROC curve
Confusion matrix
Classification report

### Accuracy

In [None]:
heart_disease.head()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop('target', axis = 1)
y = heart_disease['target']

clf = RandomForestClassifier(n_estimators=100)
cross_val_score = cross_val_score(clf ,X, y, cv = 5)

In [None]:
np.mean(cross_val_score)

In [None]:
print(f'Heart Disease Classifier Cross-Validated Accuracy : {np.mean(cross_val_score)*100:.2f}%')

### Area under the receiver operating characteristic curve (AUC/ROC)



Area under curve (AUC)
ROC curve
ROC curves are a comparison of a model's true positive rate (tpr) versus a model's false positive rate (fpr).

True positive = model predicts 1 when truth is 1
False positive = model predicts 1 when truth is 0
True negative = model predicts 0 when truth is 0
False negative = model predicts 0 when truth is 1

In [None]:
#Create test..
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
from sklearn.metrics import roc_curve

# Fit the classifier
clf.fit(X_train, y_train)

# Make predictions with probabilities
y_probs = clf.predict_proba(X_test)

y_probs[:10], len(y_probs)

In [None]:
y_probs_positive = y_probs[: , 1]
y_probs_positive[:10]

In [None]:
# Calculate fpr, tpr, and threshold
fpr, tpr, threshold = roc_curve(y_test, y_probs_positive)

# Check the false positive rate
fpr

In [None]:
# Create a function for plotting ROC curve
import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr):
    """
    Plots a ROC curve given the false positive rate (fpr)
    and true positive rate (tpr) of a model.

    Parameters
    ----------
    fpr : array-like
        False positive rates, plotted on the x-axis (e.g. the first value
        returned by sklearn.metrics.roc_curve).
    tpr : array-like
        True positive rates, plotted on the y-axis; same length as `fpr`.

    Returns
    -------
    None. The figure is drawn and shown with plt.show().
    """
    # Plot roc curve
    plt.plot(fpr, tpr, color = 'orange', label = 'ROC')

    # Diagonal reference line: the curve of a model that guesses at random
    plt.plot([0, 1], [0, 1], color = 'darkblue', linestyle = '--', label = 'Guessing')

    # Customise the plot - label the axes with what they actually measure
    plt.xlabel('False positive rate (fpr)')
    plt.ylabel('True positive rate (tpr)')
    plt.title('The ROC curve')
    plt.legend()
    plt.show()
    
plot_roc_curve(fpr, tpr)

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_probs_positive)

In [None]:
# Plot perfect ROC curve and AUC score
fpr, tpr, thresholds = roc_curve(y_test, y_test)
plot_roc_curve(fpr, tpr)

In [None]:
# Perfect AUC score
roc_auc_score(y_test, y_test)

### Confusion matrix
The next way to evaluate a classification model is by using a confusion matrix.

A confusion matrix is a quick way to compare the labels a model predicts and the actual labels it was supposed to predict. In essence, giving you an idea of where the model is getting confused.

In [None]:
from sklearn.metrics import confusion_matrix

y_preds = clf.predict(X_test)

confusion_matrix(y_test, y_preds)

In [None]:
pd.crosstab(y_test,
           y_preds,
           rownames= ['Actual Labels'],
           colnames= ['Predicted Labels'])

In [None]:
y_preds


In [None]:
y_test

### Creating a confusion matrix using Scikit-Learn
Scikit-Learn has multiple different implementations of plotting confusion matrices:

sklearn.metrics.ConfusionMatrixDisplay.from_estimator(estimator, X, y) - this takes a fitted estimator (like our clf model), features (X) and labels (y), it then uses the trained estimator to make predictions on X and compares the predictions to y by displaying a confusion matrix.
sklearn.metrics.ConfusionMatrixDisplay.from_predictions(y_true, y_pred) - this takes truth labels and predicted labels and compares them by displaying a confusion matrix.
Note: Both of these methods/classes require Scikit-Learn 1.0+. To check your version of Scikit-Learn run:

import sklearn
sklearn.__version__
If you don't have 1.0+, you can upgrade at: https://scikit-learn.org/stable/install.html

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Evaluate on the held-out test split only. Passing the full X / y would
# include the rows clf was trained on, giving an over-optimistic matrix that
# also disagrees with the from_predictions(y_test, y_preds) plot in the next cell.
ConfusionMatrixDisplay.from_estimator(estimator=clf, X=X_test, y=y_test)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_true = y_test, y_pred = y_preds)

### Classification Report

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_preds))

In [None]:
# Where precision and recall become valuable
disease_true = np.zeros(10000)
disease_true[0] = 1 # only one positive case

disease_preds = np.zeros(10000) # model predicts every case as 0
pd.DataFrame(classification_report(disease_true,
                                  disease_preds,
                                  output_dict = True,
                                  zero_division = 0))

## 4.2.2 Regression model evaluation metrics
Model evaluation metrics documentation - https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

The ones we're going to cover are:

R^2 (pronounced r-squared) or coefficient of determination
Mean absolute error (MAE)
Mean squared error (MSE)

### R^2

What R-squared does: Compares your model's predictions to the mean of the targets. Values can range from negative infinity (a very poor model) to 1. For example, if all your model does is predict the mean of the targets, its R^2 value would be 0. And if your model perfectly predicts a range of numbers, its R^2 value would be 1.

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = housing_df.drop("target", axis=1)
y = housing_df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train)

In [None]:
model.score(X_test, y_test)

In [None]:
housing_df.head()

In [None]:
y_test

In [None]:
y_test.mean()

In [None]:
from sklearn.metrics import r2_score

y_test_mean = np.full(len(y_test), y_test.mean())

In [None]:
y_test_mean[:10]

In [None]:
r2_score(y_true = y_test, y_pred= y_test_mean)

In [None]:
r2_score(y_true = y_test, y_pred= y_test)

### Mean absolute error (MAE)

MAE is the average of the absolute differences between predictions and actual values.

It gives you an idea of how wrong your models predictions are.

In [None]:
# MAE
from sklearn.metrics import mean_absolute_error

y_preds = model.predict(X_test)
mae = mean_absolute_error(y_test,y_preds)
mae

In [None]:
df = pd.DataFrame( data = {'actual value': y_test,
                          'predicted values': y_preds})
df['differences'] = df['predicted values'] - df['actual value']
df.head(10)

In [None]:
# MAE using formulas and differences
np.abs(df['differences']).mean()

### Mean squared error (MSE)

MSE is the mean of the square of the errors between actual and predicted values.

In [None]:
# MSE

from sklearn.metrics import mean_squared_error
y_preds = model.predict(X_test)
mse = mean_squared_error(y_test, y_preds)
mse

In [None]:
# Store the per-row squared error. Taking .mean() here would broadcast the
# single scalar MSE into every row, making the column useless for inspecting
# or modifying individual errors afterwards.
df['squared differences'] = np.square(df['differences'])
df.head()

In [None]:
#Calculate MSE by hand
squared = np.square(df["differences"])
squared.mean()

In [None]:
df_large_error = df.copy()
# Assign with a single .iloc[row, col] call. The chained form
# `df.iloc[0]['squared differences'] = 16` writes to a temporary row copy and
# leaves df_large_error unchanged.
squared_col = df_large_error.columns.get_loc('squared differences')
df_large_error.iloc[0, squared_col] = 16  # increase "squared differences" for 1 sample

In [None]:
df_large_error.head()

In [None]:
df_large_error['squared differences'].mean()

In [None]:
# Artificially increase error in the "squared differences" column for the first 100 samples.
# The original expression only selected the values without assigning anything,
# so the frame (and the MSE computed from it) never changed. 20 is an arbitrary
# large value chosen for the demonstration.
df_large_error.iloc[0:100, df_large_error.columns.get_loc('squared differences')] = 20
df_large_error

In [None]:
# Calculate MSE with large errors
df_large_error['squared differences'].mean()

## 4.2.3 Finally using the scoring parameter

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop('target', axis = 1)
y = heart_disease['target']

clf = RandomForestClassifier(n_estimators=100)

In [None]:
np.random.seed(42)

# Cross-validation accuracy
cv_acc = cross_val_score(clf, X, y, cv = 5, scoring = None) 
cv_acc

In [None]:
# Cross validated accuracy
print(f'The Cross-validated accuracy: {np.mean(cv_acc)*100:.2f}%')

In [None]:
np.random.seed(42)

# Cross-validation accuracy
cv_acc = cross_val_score(clf, X, y, cv = 5, scoring = "accuracy") 
cv_acc

In [None]:
print(f'The Cross-validated accuracy: {np.mean(cv_acc)*100:.2f}%')

In [None]:
np.random.seed(42)

# Cross-validation accuracy
cv_precision = cross_val_score(clf, X, y, cv = 5, scoring = "precision") 
cv_precision

In [None]:
print(f'The Cross-validated accuracy: {np.mean(cv_precision)*100:.2f}%')

In [None]:
np.random.seed(42)

# Cross-validation accuracy
cv_recall = cross_val_score(clf, X, y, cv = 5, scoring = "recall") 
cv_recall

In [None]:
print(f'The Cross-validated accuracy: {np.mean(cv_recall)*100:.2f}%')

Let's see the scoring parameter being used for a regression problem...

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
np.random.seed(42)
X = housing_df.drop('target', axis = 1)
y = housing_df['target']

model = RandomForestRegressor(n_estimators=100)

In [None]:
np.random.seed(42)
cv_r2 = cross_val_score(model, X, y, cv=3, scoring = None)
np.mean(cv_r2)

In [None]:
cv_r2

In [None]:
# Mean Squared Error
cv_mse = cross_val_score(model, X, y, cv=5, scoring="neg_mean_squared_error")
np.mean(cv_mse)

In [None]:
cv_mse

In [None]:
# Mean Absolute Error
cv_mae = cross_val_score(model, X, y, cv = 5, scoring = 'neg_mean_absolute_error')
np.mean(cv_mae)

In [None]:
cv_mae

### 4.3 Using different evaluation metrics as Scikit-Learn functions
The 3rd way to evaluate scikit-learn machine learning models/estimators is to use the sklearn.metrics module - https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG before the split and the fit
np.random.seed(42)

# Create X & y
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split data (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create model
clf = RandomForestClassifier()

# Fit model
clf.fit(X_train, y_train)

# Make predictions
y_preds = clf.predict(X_test)

# Evaluate model using evaluation functions (all take y_true first, then y_pred)
print("Classifier metrics on the test set")
print(f"Accuracy: {accuracy_score(y_test, y_preds)*100:.2f}%")  # fixed "Accurracy" typo in the label
print(f"Precision: {precision_score(y_test, y_preds)}")
print(f"Recall: {recall_score(y_test, y_preds)}")
print(f"F1: {f1_score(y_test, y_preds)}")

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

np.random.seed(42)

# Label column and feature matrix for the housing data
y = housing_df["target"]
X = housing_df.drop(columns="target")

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train a default random-forest regressor (fit returns the fitted estimator)
model = RandomForestRegressor().fit(X_train, y_train)

# Predict on the held-out rows
y_preds = model.predict(X_test)

# Compute each regression metric once, then report it
r2 = r2_score(y_test, y_preds)
mae = mean_absolute_error(y_test, y_preds)
mse = mean_squared_error(y_test, y_preds)

print("Regression metrics on the test set")
print(f"R2 score: {r2}")
print(f"MAE: {mae}")
print(f"MSE: {mse}")

## 5. Improving a model
First predictions = baseline predictions. First model = baseline model.

From a data perspective:

Could we collect more data? (generally, the more data, the better)
Could we improve our data?
From a model perspective:

Is there a better model we could use?
Could we improve the current model?
Hyperparameters vs. Parameters

Parameters = the patterns a model finds in data
Hyperparameters = settings on a model you can adjust to (potentially) improve its ability to find patterns
Three ways to adjust hyperparameters:

By hand
Randomly with RandomizedSearchCV
Exhaustively with GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fresh classifier whose hyperparameters are inspected with get_params() next
clf = RandomForestClassifier(n_estimators=100)

In [None]:
# List every hyperparameter currently set on clf
clf.get_params()

### 5.1 Tuning hyperparameters by hand

Let's make 3 sets: training, validation and test.

In [None]:
# Same hyperparameter listing again, as a reference for the settings tuned below
clf.get_params()

We're going to try and adjust:

max_depth               
max_features               
min_samples_leaf            
min_samples_split               
n_estimators            

In [None]:
def evaluate_preds(y_true, y_preds):
    """
    Compare true labels with predicted labels for a classification task.

    Prints accuracy (as a percentage), precision, recall and F1, each to
    two decimal places, and returns the same four scores rounded to two
    decimals in a dict keyed 'accuracy', 'precision', 'recall' and 'f1'.
    """
    # Unrounded scores, in the order they are reported
    scores = {
        'accuracy': accuracy_score(y_true, y_preds),
        'precision': precision_score(y_true, y_preds),
        'recall': recall_score(y_true, y_preds),
        'f1': f1_score(y_true, y_preds),
    }

    print(f"Accuracy : {scores['accuracy'] * 100:.2f}%")
    print(f"Precision : {scores['precision']:.2f}")
    print(f"Recall : {scores['recall']:.2f}")
    print(f"F1 : {scores['f1']:.2f}")

    # Round only for the returned summary; printing used the raw values
    return {name: round(value, 2) for name, value in scores.items()}

In [None]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

# Shuffle the data (frac=1 returns every row, in random order)
heart_disease_shuffled = heart_disease.sample(frac=1)

# Split into X & y
X = heart_disease_shuffled.drop("target", axis=1)
y = heart_disease_shuffled["target"]

# Row-position boundaries for a 70% / 15% / 15% train / validation / test split
train_split = round(0.7 * len(heart_disease_shuffled))  # end of the training rows
valid_split = round(train_split + 0.15 * len(heart_disease_shuffled))  # end of the validation rows

# .iloc slices by row position, which is what is needed here because the
# index labels are out of order after shuffling.
X_train, y_train = X.iloc[:train_split], y.iloc[:train_split]
X_valid, y_valid = X.iloc[train_split:valid_split], y.iloc[train_split:valid_split]
# The test set is everything AFTER the validation rows. Slicing [:valid_split]
# instead would return the training + validation rows and leak them into the test set.
X_test, y_test = X.iloc[valid_split:], y.iloc[valid_split:]

# The three subsets must account for every row exactly once
assert len(X_train) + len(X_valid) + len(X_test) == len(X)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Make baseline predictions
y_preds = clf.predict(X_valid)

# Evaluate the classifier on validation set
baseline_metrics = evaluate_preds(y_valid, y_preds)
baseline_metrics

In [None]:
np.random.seed(42)

# Second classifier, trained on the same training rows.
# NOTE(review): n_estimators=100 looks like the library default in the
# scikit-learn version used here, so this may match the baseline - confirm.
clf_2 = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

# Predict on the validation rows
y_preds_2 = clf_2.predict(X_valid)

# Score the second classifier on the validation set
clf_2_metrics = evaluate_preds(y_true=y_valid, y_preds=y_preds_2)

### 5.2 Hyperparameter tuning with RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter search space sampled by the randomized search below.
# 'auto' is left out of max_features: for classifiers it was an alias of
# 'sqrt' (so it duplicated that candidate), it was deprecated in
# scikit-learn 1.1 and is rejected from 1.3 onwards. 'log2' is used as the
# second option instead.
grid = {'n_estimators': [10, 100, 200, 500, 1000, 1200],
        'max_depth': [None, 5, 10, 20, 30],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 4, 6],
        'min_samples_leaf': [1, 2, 4]}

np.random.seed(42)

# Label column and feature matrix from the shuffled frame
y = heart_disease_shuffled["target"]
X = heart_disease_shuffled.drop(columns="target")

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Base estimator handed to the search
clf = RandomForestClassifier(n_jobs=1)

# Randomized search over `grid`, cross-validating each sampled candidate
rs_clf = RandomizedSearchCV(
    estimator=clf,
    param_distributions=grid,
    n_iter=10,  # number of candidate settings to try
    cv=5,       # cross-validation folds per candidate
    verbose=2,
)

# Run the search; the trailing semicolon hides the estimator repr
rs_clf.fit(X_train, y_train);

In [None]:
# Hyperparameter combination the randomized search selected as best
rs_clf.best_params_

In [None]:
# Predict on the held-out test rows through the fitted search object
rs_y_preds = rs_clf.predict(X_test)

# Score those predictions against the test labels
rs_metrics = evaluate_preds(y_true=y_test, y_preds=rs_y_preds)

### 5.3 Hyperparameter tuning with GridSearchCV

In [None]:
# Recall the search space used for the randomized search
grid

In [None]:
# Smaller search space for the exhaustive grid search (every combination is tried).
# 'auto' is left out of max_features: for classifiers it was an alias of
# 'sqrt' (so it doubled the number of fits for no new information), it was
# deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards.
# 'log2' is used as the second option instead.
grid_2 = {'n_estimators': [100, 200, 500],
          'max_depth': [None],
          'max_features': ['sqrt', 'log2'],
          'min_samples_split': [6],
          'min_samples_leaf': [1, 2]}

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split

np.random.seed(42)

# Label column and feature matrix from the shuffled frame
y = heart_disease_shuffled["target"]
X = heart_disease_shuffled.drop(columns="target")

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Base estimator handed to the search
clf = RandomForestClassifier(n_jobs=1)

# Exhaustive search: every combination in grid_2 is cross-validated
gs_clf = GridSearchCV(
    estimator=clf,
    param_grid=grid_2,
    cv=5,  # cross-validation folds per combination
    verbose=2,
)

# Run the search; the trailing semicolon hides the estimator repr
gs_clf.fit(X_train, y_train);

In [None]:
# Hyperparameter combination the grid search selected as best
gs_clf.best_params_

In [None]:
# Predict on the held-out test rows through the fitted search object
gs_y_preds = gs_clf.predict(X_test)

# Score those predictions against the test labels
gs_metrics = evaluate_preds(y_true=y_test, y_preds=gs_y_preds)

Let's compare our different models metrics.

In [None]:
# One column per model, one row per metric (each value is a metrics dict
# returned by evaluate_preds).
compare_metrics = pd.DataFrame({
    "baseline": baseline_metrics,
    "clf_2": clf_2_metrics,
    "random search": rs_metrics,
    "grid search": gs_metrics,
})

# Grouped bar chart of the metrics; the semicolon hides the Axes repr
compare_metrics.plot(kind="bar", figsize=(10, 8));

## 6. Saving and loading trained machine learning models
Two ways to save and load machine learning models:

With Python's pickle module                
With the joblib module

### Pickle

In [None]:
import pickle

# Save an existing model to file.
# The with-block closes the file handle even if pickle.dump raises.
with open("gs_random_random_forest_model_1.pkl", 'wb') as model_file:
    pickle.dump(gs_clf, model_file)

In [None]:
# Load a saved model.
# NOTE: unpickling can execute arbitrary code - only load pickle files you
# created yourself or otherwise trust.
# The with-block closes the file handle once the model has been read.
with open("gs_random_random_forest_model_1.pkl", 'rb') as model_file:
    loaded_pickle_model = pickle.load(model_file)

In [None]:
# Predict on the test rows with the model restored from disk
pickle_y_preds = loaded_pickle_model.predict(X_test)

# Score the restored model; the returned metrics dict is the cell output
evaluate_preds(y_true=y_test, y_preds=pickle_y_preds)