# Introduction to Scikit-Learn (sklearn)
This notebook demonstrate some of the most useful functions of Scikit Learn
What we are going to cover:

0. An end-to-end Scikit Learn Workflow
1. Getting Data Ready
2. Choose the right estimator/algorithm for our problems
3. Fit the model/algorithm and use it to make predictions on our data
4. Evaluating a model
5. Improve a model
6. Save and load a trained model
7. Putting it all together

# 0. An end to end Scikit Learn Workflow

In [2]:
#1. Get the data ready
import pandas as pd
import numpy as np
heart_disease = pd.read_csv("../data/heart-disease.csv")
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# 1. Create X (features matrix)
X = heart_disease.drop("target", axis=1)

# Create Y (labels)
Y = heart_disease['target']

In [4]:
# 2. Choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)

# We will keep the default hyperparameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [5]:
# 3. Fit the model to the training data
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2)

In [6]:
clf.fit(X_train, Y_train)

In [7]:
# Make a prediction
Y_preds = clf.predict(X_test)
Y_preds

array([0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1])

In [8]:
# 4. Evaluate the model on training data and test data
clf.score(X_train, Y_train)

1.0

In [9]:
clf.score(X_test, Y_test)

0.8032786885245902

In [10]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(Y_test,Y_preds))

              precision    recall  f1-score   support

           0       0.80      0.74      0.77        27
           1       0.81      0.85      0.83        34

    accuracy                           0.80        61
   macro avg       0.80      0.80      0.80        61
weighted avg       0.80      0.80      0.80        61



In [11]:
confusion_matrix(Y_test,Y_preds)

array([[20,  7],
       [ 5, 29]])

In [12]:
accuracy_score(Y_test,Y_preds)

0.8032786885245902

In [13]:
# 5. Improve a model
# Try different amount of estimators
np.random.seed(42)
for i in range(10,100,10):
    print(f'Trying model with {i} estimators..')
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, Y_train)
    print(f"Model accuracy on test set: {clf.score(X_test, Y_test) * 100:.2f}%")
    print("")

Trying model with 10 estimators..
Model accuracy on test set: 78.69%

Trying model with 20 estimators..
Model accuracy on test set: 81.97%

Trying model with 30 estimators..
Model accuracy on test set: 77.05%

Trying model with 40 estimators..
Model accuracy on test set: 81.97%

Trying model with 50 estimators..
Model accuracy on test set: 80.33%

Trying model with 60 estimators..
Model accuracy on test set: 83.61%

Trying model with 70 estimators..
Model accuracy on test set: 80.33%

Trying model with 80 estimators..
Model accuracy on test set: 77.05%

Trying model with 90 estimators..
Model accuracy on test set: 80.33%



In [14]:
# 6. Save a model and load it
import pickle
pickle.dump(clf, open("random_forest_model_1.pkl","wb"))

In [15]:
loaded_model = pickle.load(open("random_forest_model_1.pkl","rb"))
loaded_model.score(X_test, Y_test)

0.8032786885245902

# 1. Getting Data Ready
Things to remember:

    1. Split the data into features and labels usually 'X' denotes features and 'Y' denotes lables
    2. Filling also called computing and disregarding missing values
    3. Converting non numerical values to numerical values (feature encoding)

In [16]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [17]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [18]:
X = heart_disease.drop("target", axis = 1)
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [19]:
y = heart_disease['target']
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [20]:
# Split the data into training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [21]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

### 1.1 Make sure it's all numerical

In [22]:
car_sales = pd.read_csv('../data/car-sales-extended.csv')
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [23]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [24]:
# Split it into X/y
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]
# Split into training and test
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [25]:
# Build an ml model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

ValueError: could not convert string to float: 'Toyota'

In [None]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_features = ["Make","Colour","Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough")
transformed_X = transformer.fit_transform(X)
transformed_X

In [None]:
dummies = pd.get_dummies(car_sales[["Make","Colour","Doors"]])
dummies

In [None]:
# Let's refit the model
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(transformed_X,y, test_size = 0.2)
model.fit(X_train, y_train)

In [None]:
model.score(X_test,y_test)

### 1.2 What if there were missing values
1. Fill them with some value (also known as imputation).
1. Remove the samples with missing data altogether.

In [None]:
# Import car sales missing data
car_sales_missing = pd.read_csv("../data/car-sales-extended-missing-data.csv")
car_sales_missing.isna().sum()

In [None]:
# Create X and y
X = car_sales_missing.drop("Price",axis=1)
y = car_sales_missing["Price"]

In [None]:
# Let's convert data to numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_features = ["Make","Colour","Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough")
transformed_X = transformer.fit_transform(X)

In [None]:
# Filling data
car_sales_missing["Make"] = car_sales_missing["Make"].fillna("missing")
car_sales_missing["Colour"] = car_sales_missing["Colour"].fillna("missing")
car_sales_missing["Odometer (KM)"] = car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean())
car_sales_missing["Door"] = car_sales_missing["Doors"].fillna(4)

In [None]:
# Removing Rows with missing price column
car_sales_missing = car_sales_missing.dropna()

In [None]:
car_sales_missing.isna().sum()

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_features = ["Make","Colour","Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",one_hot, categorical_features)], remainder="passthrough")
transformed_X = transformer.fit_transform(X)

In [None]:
# Still same because in newer version of sklearn OneHotEncoder automatically drop na data
transformed_X

In [None]:
X = car_sales_missing.drop("Price",axis=1)
y = car_sales_missing["Price"]

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_features = ["Make","Colour","Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",one_hot, categorical_features)], remainder="passthrough")
transformed_X = transformer.fit_transform(X)

In [None]:
transformed_X

In [None]:
len(car_sales_missing)

# 2. Filling values to scikit learn

In [None]:
car_sales_missing = pd.read_csv("../data/car-sales-extended-missing-data.csv")
car_sales_missing.head()

In [None]:
car_sales_missing.isna().sum()

In [None]:
# Drop nan values from label
car_sales_missing = car_sales_missing.dropna(subset=["Price"])

In [None]:
car_sales_missing.isna().sum()

In [None]:
# Split into X and y
X = car_sales_missing.drop("Price",axis=1)
y = car_sales_missing["Price"]

In [None]:
# Fill missing values with scikit learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with 'missing' and numerical values with mean
cat_imputer = SimpleImputer(strategy="constant",fill_value="missing")
door_imputer = SimpleImputer(strategy="constant",fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Define Columns
cat_features = ["Make","Colour"]
door_features = ["Doors"]
num_features = ["Odometer (KM)"]

# Create an imputer to fill data
imputer = ColumnTransformer([("cat_imputer",cat_imputer,cat_features),("door_imputer", door_imputer, door_features),("num_imputer",num_imputer,num_features)])

# Transform data
filled_x = imputer.fit_transform(X)
filled_x

In [None]:
car_sales_filled = pd.DataFrame(filled_x,
                               columns=["Make","Colour","Doors","Odometer (KM)"])

In [None]:
car_sales_filled

In [None]:
car_sales_filled.isna().sum()

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_features = ["Make","Colour","Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",one_hot, categorical_features)], remainder="passthrough")
transformed_X = transformer.fit_transform(car_sales_filled)

In [None]:
transformed_X 

In [None]:
# Now we have got our data as numbers and filled (no missing values)
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(transformed_X,y,test_size=0.2)
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
len(car_sales_filled)

In [None]:
len(car_sales)

# Choosing the right estimator for your problem

Some things to note:
* Sklearn refers to machine learning models, alsgorithms as estimators
* Classification Problem - predicting a category (heart disease or not)
* Sometimes you will see `clf` (short for classification) used as a classification estimator
* Regression Problem - Predicting a number (selling price of a car)

Sklearn machine learning map -> https://scikit-learn.org/stable/tutorial/machine_learning_map/

### 2.1 Picking a machine learning model for a regression problem
Let's use california housing dataset

In [None]:
# Get california housing dataset
from sklearn.datasets import fetch_california_housing
housing  = fetch_california_housing()
housing

In [None]:
housing_df = pd.DataFrame(housing["data"],columns=housing["feature_names"])
housing_df.head()

In [None]:
housing_df["target"] = housing["target"]
housing_df.head()

In [None]:
housing_df = housing_df.drop("MedHouseVal",axis=1)

In [None]:
housing_df

In [None]:
# Import algorithm/estimator
from sklearn.linear_model import Ridge

# Setup random seed
np.random.seed(42)

# Create data
X = housing_df.drop("target",axis=1)
y = housing_df["target"]

# Split the data
X_train,X_test,y_train, y_test = train_test_split(X,y,test_size=0.2)

# Instantiate and fit the model
model = Ridge()
model.fit(X_train,y_train)

# Check the score
model.score(X_test,y_test)

What if `Ridge` could not work.

Well, we could always try a different model

We can try ensemble model (This model is combination of smaller models rather than a single model)

In [None]:
# Import the RandomForestRegressor from ensemble module
from sklearn.ensemble import RandomForestRegressor

# Setup random seed
np.random.seed(42)

# Create the data
X = housing_df.drop("target",axis=1)
y = housing_df['target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

# Create a random forest model
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test,y_test)

## 2.2 Picking a ml model for a classification model

In [None]:
# Data
from sklearn.datasets import load_iris
iris_data = load_iris()
iris_data

In [None]:
iris_df = pd.DataFrame(iris_data["data"],columns=iris_data["feature_names"])
iris_df 

In [None]:
iris_df["target"] = iris_data["target"]

In [None]:
iris_df

In [None]:
X = iris_df.drop('target',axis=1)
y = iris_df['target']

In [None]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
# Import the model
from sklearn.linear_model import SGDClassifier
model = SGDClassifier()
model.fit(X_train, y_train)
model.score(X_test,y_test)

In [None]:
heart_data = pd.read_csv("../data/heart-disease.csv")
heart_data

In [None]:
# Features and labels
X = heart_data.drop("target",axis=1)
y = heart_data['target']

In [None]:
# Split data
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [None]:
from sklearn import svm
model = svm.LinearSVC(dual="auto")
model.fit(X_train,y_train)
model.score(X_test,y_test)

In [None]:
# Let's try with RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)

X = heart_data.drop("target",axis=1)
y = heart_data["target"]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

# Select model
model = RandomForestClassifier()

# fitting the data
model.fit(X_train, y_train)
model.score(X_test,y_test)

## 3. Fit the model/algorithm on our data and use it to make predictions
### 3.1 Fitting the data

* `X`= feature variables and data
* `y` = Labels

In [None]:
# Import sklearn 
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)

# Make the data
X = heart_data.drop("target",axis=1)
y = heart_data["target"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

model = RandomForestClassifier()

# Fit the model to data (training ml model)
model.fit(X_train,y_train)

# Evaluate the Random Forest Classifier
model.score(X_test,y_test)

## 3.2 Make predictions using a machine learning model
2 Ways to make predictions 

1. `predict()`
2. `predict_proba()`

In [None]:
# Use a trained model to make predictions
model.predict(np.array([1,8,9,1,2,3,4])) # this does not work

In [None]:
# The shape of X_test is same as X_train
model.predict(X_test)

In [None]:
# Compare predictions to truth labels to evaluate the model
y_preds = model.predict(X_test)
np.mean(y_preds == y_test)

In [None]:
# Another way
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_preds)

### Make predictions with predict_proba

In [None]:
# it returns probabilities of a classification label
model.predict_proba(X_test[:5])

In [None]:
# Let's predict on the same data..
model.predict(X_test[:5])

`predict()` can also be used for regression model

In [27]:
from sklearn.datasets import fetch_california_housing
housing_data = fetch_california_housing()
housing_data

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [28]:
housing_df = pd.DataFrame(housing_data.data,columns=[housing_data['feature_names']])
housing_df['target'] = housing_data['target']

In [29]:
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [77]:
from sklearn.ensemble import RandomForestRegressor
np.random.seed(42)

# Create data
X = housing_df.drop("target",axis=1)
y = housing_df["target"]

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

# Create model
model = RandomForestRegressor()

# Fit the model
model.fit(X_train,y_train)

# Make predictions
y_preds = model.predict(X_test)

  X = housing_df.drop("target",axis=1)
  return fit_method(estimator, *args, **kwargs)


In [31]:
y_preds[:10]

array([0.49384  , 0.75494  , 4.9285964, 2.54029  , 2.33176  , 1.6549701,
       2.34323  , 1.66182  , 2.47489  , 4.8344779])

In [32]:
np.array(y_test[:10])

array([[0.477  ],
       [0.458  ],
       [5.00001],
       [2.186  ],
       [2.78   ],
       [1.587  ],
       [1.982  ],
       [1.575  ],
       [3.4    ],
       [4.466  ]])

In [33]:
# Compare the predictions to the truth
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test,y_preds)

0.3265721842781009

## 4. Evaluating a machine learning model

Three ways to evaluate Scikit Learn models/estimators:
 1. Estimators `buit-in score()` method
 2. The `scoring` paramter
 3. Problem-specific metric functions

### 4.1 Evaluating a model with the `score` method

In [58]:
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)

# Create X and y
X = heart_disease.drop("target",axis=1)
y = heart_disease["target"]

# Create train/test
X_train,X_test,y_train, y_test = train_test_split(X,y,test_size=0.2)

# Create classifier model instance
model = RandomForestClassifier()

# Fit classifier to training data
model.fit(X_train, y_train)

In [59]:
# The highest value for the score method is 1.0 and lowest is 0.0
model.score(X_train,y_train) * 100

100.0

In [42]:
y_train

132    1
202    0
196    0
75     1
176    0
      ..
188    0
71     1
106    1
270    0
102    1
Name: target, Length: 242, dtype: int64

In [43]:
model.score(X_test,y_test)

0.8524590163934426

Let's use the score on our regression dataset

In [62]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=80)
X = housing_df.drop("target",axis=1)
y = housing_df["target"]
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
model.fit(X_train,y_train)

  X = housing_df.drop("target",axis=1)
  return fit_method(estimator, *args, **kwargs)


In [63]:
# The default score() evaluation metric is r_squared for regression algorithm
model.score(X_test,y_test)

0.8182829949054876

## 4.2 Evaluating using `scoring` parameter

In [65]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)

# Create X and y
X = heart_disease.drop("target",axis=1)
y = heart_disease["target"]

# Create train/test
X_train,X_test,y_train, y_test = train_test_split(X,y,test_size=0.2)

# Create classifier model instance
model = RandomForestClassifier()

# Fit classifier to training data
model.fit(X_train, y_train)
model.score(X_test,y_test)

0.8524590163934426

In [67]:
cross_val_score(model,X,y,cv=5)

array([0.83606557, 0.8852459 , 0.7704918 , 0.8       , 0.8       ])

In [68]:
np.random.seed(42)

# Signle training and test split score
model_single_score = model.score(X_test,y_test)

# Take the mean of 5 fold cross-validation score
model_cross_val_score = np.mean(cross_val_score(model,X,y,cv=5))

# Compare the two
model_single_score, model_cross_val_score

(0.8524590163934426, 0.8248087431693989)

In [69]:
# Scoring parameter set to none by default
# Default scoring parameter of classifier is mean accuracy
cross_val_score(model,X,y,cv=5)

array([0.78688525, 0.86885246, 0.80327869, 0.78333333, 0.76666667])

### 4.2.1 Classification model evaluation metrics
1. Accuracy
2. Area under ROC Curve
3. Confusion Matrix
5. Classification report

**Accuracy**

In [81]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)
X = heart_disease.drop("target",axis=1)
y = heart_disease["target"]

# Accuracy
model = RandomForestClassifier()
cross_val_score = cross_val_score(model,X,y,cv=5)

In [82]:
np.mean(cross_val_score)

0.8248087431693989

In [83]:
print(f"Heart Disease Cross-Validated Classifier Accuracy {np.mean(cross_val_score)*100:.2f}%")

Heart Disease Cross-Validated Classifier Accuracy 82.48%


**Area under the receiver operating characteristic curve (AUC/ROC)**

ROC curves are a comparison of a model's true positive rate (tpr) versus a model false positive rate (fpr)

* True Positive -> Model predict 1 when truth is 1
* False Positive -> Model predicts 1 when truth is 0
* True negative -> Model predict 0 when truth is 0
* False negative -> model predicts 0 whrn truth is 1

In [84]:
# Create X_test..
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [89]:
from sklearn.metrics import roc_curve

# Fit the classifier
model.fit(X_train, y_train)

# Make predictions with probabilities
y_probs = model.predict_proba(X_test)

y_probs[:10], len(y_probs)

(array([[0.57, 0.43],
        [0.23, 0.77],
        [0.52, 0.48],
        [0.72, 0.28],
        [0.58, 0.42],
        [0.12, 0.88],
        [0.32, 0.68],
        [0.97, 0.03],
        [0.07, 0.93],
        [0.41, 0.59]]),
 61)

In [91]:
y_probs_positive = y_probs[:,1]
y_probs_positive[:10]

array([0.43, 0.77, 0.48, 0.28, 0.42, 0.88, 0.68, 0.03, 0.93, 0.59])

In [92]:
# Calculate fpr,tpr,threshholds
fpr,tpr,thresholds = roc_curve(y_test,y_probs_positive)

# Check the false positive rates
fpr

array([0.        , 0.        , 0.03448276, 0.03448276, 0.03448276,
       0.03448276, 0.03448276, 0.03448276, 0.10344828, 0.10344828,
       0.10344828, 0.13793103, 0.17241379, 0.17241379, 0.27586207,
       0.27586207, 0.31034483, 0.34482759, 0.34482759, 0.37931034,
       0.37931034, 0.4137931 , 0.4137931 , 0.44827586, 0.44827586,
       0.65517241, 0.65517241, 0.82758621, 0.96551724, 1.        ])