# 0. An End to End Scikit-Learn Workflow

In [1]:
# 1. Get the data ready
%matplotlib inline
import pandas as pd
import numpy as np
heart_disease = pd. read_csv("data/heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [2]:
# Create x (feature matrix)
x = heart_disease.drop("target", axis = 1)

# Create y (labels)
y = heart_disease["target"]
x

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [3]:
#2. Choose the right model and hyperparameters

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf. get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [4]:
#3. Fit the model to the tarining data

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [5]:
clf. fit(x_train, y_train)

RandomForestClassifier()

In [6]:
#4. Make a prediction

y_preds = clf.predict(x_test)
y_preds

array([1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0], dtype=int64)

In [7]:
y_test

296    0
47     1
204    0
191    0
249    0
      ..
170    0
209    0
291    0
111    1
34     1
Name: target, Length: 61, dtype: int64

In [8]:
clf. score(x_train, y_train)

1.0

In [9]:
clf. score(x_test, y_test)

0.8032786885245902

In [10]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.81      0.76      0.79        29
           1       0.79      0.84      0.82        32

    accuracy                           0.80        61
   macro avg       0.80      0.80      0.80        61
weighted avg       0.80      0.80      0.80        61



In [11]:
confusion_matrix(y_test, y_preds)

array([[22,  7],
       [ 5, 27]], dtype=int64)

In [12]:
accuracy_score(y_test, y_preds)

0.8032786885245902

In [13]:
#5. Improve a model
# Trying different numbers of n_estimators

np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i). fit(x_train, y_train)
    print(f"Model accuracy on test set: {clf. score(x_test, y_test) * 100: .2f}%")
    print('')

Trying model with 10 estimators...
Model accuracy on test set:  73.77%

Trying model with 20 estimators...
Model accuracy on test set:  83.61%

Trying model with 30 estimators...
Model accuracy on test set:  80.33%

Trying model with 40 estimators...
Model accuracy on test set:  78.69%

Trying model with 50 estimators...
Model accuracy on test set:  81.97%

Trying model with 60 estimators...
Model accuracy on test set:  80.33%

Trying model with 70 estimators...
Model accuracy on test set:  83.61%

Trying model with 80 estimators...
Model accuracy on test set:  83.61%

Trying model with 90 estimators...
Model accuracy on test set:  81.97%



In [14]:
#6. Save a model and load it

import pickle
pickle. dump(clf, open("random_forest_classifier_model_1.pkl", "wb"))

In [15]:
# if we want to read the model

loaded_model = pickle.load(open("random_forest_classifier_model_1.pkl", "rb"))
loaded_model.score(x_test, y_test)

0.819672131147541

# Getting our data ready to be used in machine learning:
Three main steps to do:
    1. Split the data into features and labels (usually 'x' & 'y')
    2. Filling (also called inputting) or disregarding missing values
    3. Converting non-numerical values into numerical values (also called feature encoding)

In [16]:
heart_disease. head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [17]:
# Split the data into train and test sets
from sklearn. model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size =0.2)

## Making sure its all numerical

In [18]:


car_sales = pd.read_csv("data/car-sales-extended.csv")
car_sales. head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [19]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [20]:
# Split into x,y
x = car_sales.drop("Price", axis = 1)
y = car_sales["Price"]

#Split into training and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)


In [21]:
### Build Machine Learning model

from sklearn. ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(x_train, y_train)
model.score(x_test, y_test)

ValueError: could not convert string to float: 'Toyota'

In [22]:
# Turn the categories into numbers

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder = "passthrough")

transformed_x = transformer.fit_transform(x)
transformed_x

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [23]:
pd. DataFrame(transformed_x). head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0


In [24]:
dummies = pd.get_dummies(car_sales[['Make', 'Colour', 'Doors']])
dummies.head()

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0


In [25]:
## Refit the model

np. random.seed(42)
x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size = 0.2)
model.fit(x_train, y_train)

RandomForestRegressor()

In [26]:
model.score(x_test, y_test)

0.3235867221569877

# What If there were missing values..?

In [27]:
# 1. Fill thm with some values (Imputation)

car_sales_missing = pd. read_csv("data/car-sales-extended-missing-data.csv")
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [28]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

# Option 1: Fill missing data with pandas

In [29]:
# Fill the make column
car_sales_missing["Make"].fillna("missing", inplace = True)

# Fill the colour column
car_sales_missing["Colour"].fillna("missing", inplace = True)

# Fill the Odometer (km) column
car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"]. mean(), inplace = True)

# Fill the Doors column
car_sales_missing["Doors"].fillna(4, inplace = True)

car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [30]:
# Remove rows with missing price value
x = car_sales_missing.dropna(inplace = True)
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [31]:
x = car_sales_missing.drop("Price", axis = 1)
y = car_sales_missing["Price"]

In [32]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)],
                               remainder = "passthrough")

transformed_x = transformer.fit_transform(car_sales_missing)
transformed_x


array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

# Option 2: Fill missing values with scikit-learn

In [33]:
car_sales_missing = pd. read_csv("data/car-sales-extended-missing-data.csv")
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [34]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [35]:
car_sales_missing.dropna(subset = ["Price"], inplace = True)
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [36]:
x = car_sales_missing.drop("Price", axis = 1)
y = car_sales_missing["Price"]

In [37]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill cat-egorical values with 'missing' & numerical values with mean
cat_imputer = SimpleImputer(strategy = "constant", fill_value = "missing")
door_imputer = SimpleImputer(strategy= "constant", fill_value = 4)
num_imputer = SimpleImputer(strategy = "mean") 

# Define columns 
cat_features = ["Make", "Colour"]
door_features = ["Doors"]
num_features = ["Odometer (KM)"]

# Create an imputer(somethong that fills missing data)
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_features),
    ("num_imputer", num_imputer, num_features)
])

# Transform the data
filled_x = imputer.fit_transform(x)
filled_x

array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

In [38]:
car_sales_filled = pd. DataFrame(filled_x, columns = ["Make", "Colour", "Doors", "Odometer (KM)"])

In [39]:
car_sales_filled.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [40]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)],
                               remainder = "passthrough")

transformed_x = transformer.fit_transform(car_sales_filled)
transformed_x


<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [41]:
# Fit a model

np.random.seed(42)

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(transformed_x,
                                                   y,
                                                   test_size = 0.2)
model = RandomForestRegressor()
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.21990196728583944

In [42]:
len(car_sales_filled), len(car_sales)

(950, 1000)

# 2. Choosing the right estimator/algorithm/ML model for our problem

# Classification - Predicting whether a sample is one thing or another

# Regression - Predictng a number

# Picking a ML model for a regression problem

In [43]:
# Import Boston housing dataset
from sklearn.datasets import load_boston
boston = load_boston()
boston

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 3

In [44]:
boston_df = pd.DataFrame(boston["data"], columns = boston["feature_names"])
boston_df["target"] = pd.Series(boston["target"])
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [45]:
# Trying the ridge regression model

from sklearn.linear_model import Ridge

#set up random seed
np.random.seed(42)

#Create the data
x = boston_df.drop("target", axis = 1)
y = boston_df["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2)

# Initiate Ridge
model = Ridge()
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.6662221670168519

In [46]:
# Trying Random fores regressor

from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)
x = boston_df.drop(["target"], axis =1)
y = boston_df["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

rf = RandomForestRegressor()
rf. fit(x_train, y_train)
rf. score(x_test, y_test)

0.8654448653350507

# Choosing an esimator for classification problem

In [None]:
heart_disease = pd. read_csv("data/heart-disease.csv")
heart_disease. head()

# From the estimator map...
...
https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
It says to try LinearSVC.
...

In [None]:
# Import LinearSVC estimator class
from sklearn.svm import LinearSVC 
# Set up random seed
np.random.seed(42)

x = heart_disease.drop(["target"], axis = 1)
y = heart_disease["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

# Initiate LinearSVC
clf = LinearSVC()
clf. fit(x_train, y_train)
clf. score(x_test, y_test)

In [None]:
# Trying Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier 
# Set up random seed
np.random.seed(42)

x = heart_disease.drop(["target"], axis = 1)
y = heart_disease["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

# Initiate RandomForestClassifier
clf = RandomForestClassifier()
clf. fit(x_train, y_train)
clf. score(x_test, y_test)

# 3. Fit the model/algorithm on our data and use it to make prediction
#  3.1 Fitting the model to the data

# Different names for :
     # x = features, features variable, data
     # y = labels, target, target variable

In [53]:
# Trying Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier 
# Set up random seed
np.random.seed(42)

x = heart_disease.drop(["target"], axis = 1)
y = heart_disease["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

# Initiate RandomForestClassifier
clf = RandomForestClassifier()
clf. fit(x_train, y_train)
clf. score(x_test, y_test)

0.8524590163934426

# 3.2 Make prediction using a machine learning model
# 2 Ways to make prediction:
 1. predict()
 2. predict_proba()

In [None]:
# Using a trained model to make predictions

In [47]:
x_test. head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
173,0.09178,0.0,4.05,0.0,0.51,6.416,84.1,2.6463,5.0,296.0,16.6,395.5,9.04
274,0.05644,40.0,6.41,1.0,0.447,6.758,32.9,4.0776,4.0,254.0,17.6,396.9,3.53
491,0.10574,0.0,27.74,0.0,0.609,5.983,98.8,1.8681,4.0,711.0,20.1,390.11,18.07
72,0.09164,0.0,10.81,0.0,0.413,6.065,7.8,5.2873,4.0,305.0,19.2,390.91,5.52
452,5.09017,0.0,18.1,0.0,0.713,6.297,91.8,2.3682,24.0,666.0,20.2,385.09,17.27


In [48]:
clf. predict(x_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [49]:
np. array(y_test)

array([23.6, 32.4, 13.6, 22.8, 16.1, 20. , 17.8, 14. , 19.6, 16.8, 21.5,
       18.9,  7. , 21.2, 18.5, 29.8, 18.8, 10.2, 50. , 14.1, 25.2, 29.1,
       12.7, 22.4, 14.2, 13.8, 20.3, 14.9, 21.7, 18.3, 23.1, 23.8, 15. ,
       20.8, 19.1, 19.4, 34.7, 19.5, 24.4, 23.4, 19.7, 28.2, 50. , 17.4,
       22.6, 15.1, 13.1, 24.2, 19.9, 24. , 18.9, 35.4, 15.2, 26.5, 43.5,
       21.2, 18.4, 28.5, 23.9, 18.5, 25. , 35.4, 31.5, 20.2, 24.1, 20. ,
       13.1, 24.8, 30.8, 12.7, 20. , 23.7, 10.8, 20.6, 20.8,  5. , 20.1,
       48.5, 10.9,  7. , 20.9, 17.2, 20.9,  9.7, 19.4, 29. , 16.4, 25. ,
       25. , 17.1, 23.2, 10.4, 19.6, 17.2, 27.5, 23. , 50. , 17.9,  9.6,
       17.2, 22.5, 21.4])

In [54]:
# Compare predictions to truth labels to evaluate the model
y_preds = clf. predict(x_test)
np. mean(y_preds == y_test)


0.8524590163934426

In [55]:
clf.score(x_test, y_test)

0.8524590163934426

In [56]:
# Another way:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)

0.8524590163934426

In [57]:
# make prediction with predict_proba(); It returns probabilities of classification label
clf.predict_proba(x_test[:5])


array([[0.89, 0.11],
       [0.49, 0.51],
       [0.43, 0.57],
       [0.84, 0.16],
       [0.18, 0.82]])

In [None]:
clf.predict(x_test[:5])

In [60]:
# For boston Data
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)
x = boston_df.drop(["target"], axis =1)
y = boston_df["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

rf = RandomForestRegressor()
rf. fit(x_train, y_train)
rf. score(x_test, y_test)

0.8654448653350507

In [61]:
y_preds = rf. predict(x_test)

In [62]:
y_preds[:10]

array([23.081, 30.574, 16.759, 23.46 , 16.893, 21.644, 19.113, 15.334,
       21.14 , 20.639])

In [63]:
np.array(y_test[:10])

array([23.6, 32.4, 13.6, 22.8, 16.1, 20. , 17.8, 14. , 19.6, 16.8])

In [64]:
# Compare the prediction to the truth
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_preds)

2.136382352941176

# 4. Evaluating a Machine learning model 
    # 3 ways to evaluate Scikit-Learn models/estimators
    1. Estimator 'score' method
    2. The scoring parameter
    3. Problem specific metric functions
  4.1: Evaluating the model with the score method  

In [85]:
from sklearn.ensemble import RandomForestClassifier

np.random. seed(42)

x = heart_disease. drop("target", axis = 1)
y = heart_disease["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

clf = RandomForestClassifier()
clf. fit(x_train, y_train)

RandomForestClassifier()

In [86]:
clf. score(x_test, y_test)

In [90]:
# For boston Data
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)
x = boston_df.drop(["target"], axis =1)
y = boston_df["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

rf = RandomForestRegressor()
rf. fit(x_train, y_train)
rf. score(x_test, y_test)

0.8654448653350507

# 4.2 Evaluating the model with the scoring parameter

In [95]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
np.random. seed(42)

x = heart_disease. drop("target", axis = 1)
y = heart_disease["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

clf = RandomForestClassifier()
clf. fit(x_train, y_train)

RandomForestClassifier()

In [96]:
clf. score(x_test, y_test)

0.8524590163934426

In [97]:
cross_val_score(clf, x, y)

array([0.81967213, 0.86885246, 0.81967213, 0.78333333, 0.76666667])