## Sciket-learn intro.
Topics:

0. End-to-End sklearn workflow
1. Getting data ready
2. Choosing right estimator/algorithm
3. Fit the model/algorithm and use it to make predictions on our data
4. Evaluating model
5. Improve a model
6. Save and Load a trained model
7. Putting it all together.

In [35]:
topics = [
    "0. End-to-End sklearn workflow",
    "1. Getting data ready",
    "2. Choosing right estimator/algorithm",
    "3. Fit the model/algorithm and use it to make predictions on our data",
    "4. Evaluating model",
    "5. Improve a model",
    "6. Save and Load a trained model",
    "7. Putting it all together."]

In [1]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



## 0.sklearn.model_selection sklearn workflow

In [29]:
import numpy as np

In [2]:
import pandas as pd
data = pd.read_csv("../data/heart-disease.csv")
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


# Getting data ready

In [5]:
X = data.drop("target" , axis=1) # Create X (Features matrix)
y = data["target"]

# Choosing right estimator/algorithm

In [14]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

# Fit the model/algorithm and use it to make predictions on our data

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [16]:
clf.fit(X_train, y_train)

RandomForestClassifier()

In [17]:
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
104,50,1,2,129,196,0,1,163,0,0.0,2,0,2
90,48,1,2,124,255,1,1,175,0,0.0,2,2,2
228,59,1,3,170,288,0,0,159,0,0.2,1,0,3
227,35,1,0,120,198,0,1,130,1,1.6,1,0,3
10,54,1,0,140,239,0,1,160,0,1.2,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,54,0,2,160,201,0,1,163,0,0.0,2,1,2
96,62,0,0,140,394,0,0,157,0,1.2,1,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
284,61,1,0,140,207,0,0,138,1,1.9,2,1,3


In [18]:
y_pred = clf.predict(X_test)
y_pred

array([1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0], dtype=int64)

# Evaluate the model

In [19]:
clf.score(X_train, y_train)

1.0

In [20]:
clf.score(X_test, y_test)

0.819672131147541

In [22]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [24]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.65      0.76        26
           1       0.79      0.94      0.86        35

    accuracy                           0.82        61
   macro avg       0.84      0.80      0.81        61
weighted avg       0.83      0.82      0.81        61



In [25]:
confusion_matrix(y_test, y_pred)

array([[17,  9],
       [ 2, 33]], dtype=int64)

In [26]:
accuracy_score(y_test, y_pred)

0.819672131147541

# Improve a model

In [30]:
#Trying different estimators
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimator...")
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model Accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}%")
    print("")

Trying model with 10 estimator...
Model Accuracy on test set: 80.33%

Trying model with 20 estimator...
Model Accuracy on test set: 80.33%

Trying model with 30 estimator...
Model Accuracy on test set: 77.05%

Trying model with 40 estimator...
Model Accuracy on test set: 80.33%

Trying model with 50 estimator...
Model Accuracy on test set: 78.69%

Trying model with 60 estimator...
Model Accuracy on test set: 78.69%

Trying model with 70 estimator...
Model Accuracy on test set: 78.69%

Trying model with 80 estimator...
Model Accuracy on test set: 80.33%

Trying model with 90 estimator...
Model Accuracy on test set: 83.61%



# Save and Load a trained model

In [31]:
import pickle
pickle.dump(clf, open("random_forest_model_1.pkl", "wb"))

In [33]:
load_model = pickle.load(open("random_forest_model_1.pkl", "rb"))
load_model.score(X_test, y_test)

0.8360655737704918

# 1. Getting data ready to be used with machine learning

Three main things to do:

    1. Split the data into features and labels (usually `X` and `y`)
    2. Filling (also called imputing) or disregarding missing values
    3. Converting non-numerical values to numerical values (Feature encoding)

In [38]:
data = pd.read_csv("../data/heart-disease.csv")
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [39]:
X = data.drop("target" ,axis=1)
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [41]:
y = data["target"]
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [42]:
# split the data into train and test sets
from sklearn.model_selection import train_test_split
Xtrain, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [43]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

# working with non numerical dataset

In [44]:
data = pd.read_csv("../data/car-sales-extended.csv")
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
...,...,...,...,...,...
995,Toyota,Black,35820,4,32042
996,Nissan,White,155144,3,5716
997,Nissan,Blue,66604,4,31570
998,Honda,White,215883,4,4001


In [45]:
# split into X/y
X = data.drop("Price", axis = 1)
y = data["Price"]

In [46]:
# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [47]:
#build ml model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

ValueError: could not convert string to float: 'Toyota'

In [48]:
data["Doors"].value_counts() # doors are also categorical data coz many cars falls in this category

4    856
5     79
3     65
Name: Doors, dtype: int64

In [49]:
X.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [51]:
# turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)],
                                 remainder="passthrough")

transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [52]:
pd.DataFrame(transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [53]:
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)
model.fit(X_train, y_train)

RandomForestRegressor()

In [54]:
model.score(X_test, y_test)

0.3235867221569877

## missing values
1. Fill them with some value(imputation)
2. remove the samples with missing data

In [55]:
#import car sales missing
data = pd.read_csv("../data/car-sales-extended-missing-data.csv")
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [56]:
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [57]:
# split into X/y
X = data.drop("Price", axis = 1)
y = data["Price"]

In [58]:
# non numerical to numerical
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)],
                                 remainder="passthrough")

transformed_X = transformer.fit_transform(X)
transformed_X

ValueError: Input contains NaN

#### fill missing data with pandas (option 1)

In [59]:
data["Doors"].value_counts()

4.0    811
5.0     75
3.0     64
Name: Doors, dtype: int64

In [61]:
# fill the make column
data["Make"].fillna("missing", inplace=True)

#fill colour column
data["Colour"].fillna("missing", inplace=True)

# fill the odometer
data["Odometer (KM)"].fillna(data["Odometer (KM)"].mean(), inplace=True)

#fill the dorrs
data["Doors"].fillna(4, inplace=True)

In [62]:
#check out df again
data.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [63]:
#remove rows with missing price values
data.dropna(inplace=True)

In [64]:
data.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [65]:
len(data)

950

In [66]:
# split into X/y
X = data.drop("Price", axis = 1)
y = data["Price"]

In [68]:
# non numerical to numerical
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)],
                                 remainder="passthrough")

transformed_X = transformer.fit_transform(data)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

### fill missing values with sklearn

In [69]:
#import car sales missing
data = pd.read_csv("../data/car-sales-extended-missing-data.csv")
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [70]:
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [71]:
data.dropna(subset=["Price"], inplace=True)
data.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [74]:
# split into X/y
X = data.drop("Price", axis=1)
X.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
dtype: int64

In [73]:
y = data["Price"]

In [75]:
# filling missing values with sklearn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

#fill categorical values with missing and numerical with mean values
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

#define columns
cat_features = ["Make", "Colour"]
door_feature = ["Doors"]
num_features = ["Odometer (KM)"]

#create an imputer (something that fills missing data)
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, num_features)
])

#transformed data
filled_X = imputer.fit_transform(X)
filled_X

array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

In [77]:
df = pd.DataFrame(filled_X,
                  columns=["Make", "Colours", "Doors", "Odometer (KM)"])
df

Unnamed: 0,Make,Colours,Doors,Odometer (KM)
0,Honda,White,4,35431
1,BMW,Blue,5,192714
2,Honda,White,4,84714
3,Toyota,White,4,154365
4,Nissan,Blue,3,181577
...,...,...,...,...
945,Toyota,Black,4,35820
946,missing,White,3,155144
947,Nissan,Blue,4,66604
948,Honda,White,4,215883


In [78]:
df.isna().sum()

Make             0
Colours          0
Doors            0
Odometer (KM)    0
dtype: int64

In [None]:
# # split into X/y
# X = df.drop("Price", axis = 1)
# y = df["Price"]

In [80]:
# non numerical to numerical
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colours", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)],
                                 remainder="passthrough")

transformed_X = transformer.fit_transform(df)
transformed_X

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [81]:
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.21990196728583944

In [82]:
#Trying different estimators
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimator...")
    model = RandomForestRegressor(n_estimators=i).fit(X_train, y_train)
    print(f"Model Accuracy on test set: {model.score(X_test, y_test) * 100:.2f}%")
    print("")

Trying model with 10 estimator...
Model Accuracy on test set: 14.44%

Trying model with 20 estimator...
Model Accuracy on test set: 21.57%

Trying model with 30 estimator...
Model Accuracy on test set: 19.75%

Trying model with 40 estimator...
Model Accuracy on test set: 18.15%

Trying model with 50 estimator...
Model Accuracy on test set: 22.63%

Trying model with 60 estimator...
Model Accuracy on test set: 19.60%

Trying model with 70 estimator...
Model Accuracy on test set: 21.74%

Trying model with 80 estimator...
Model Accuracy on test set: 19.02%

Trying model with 90 estimator...
Model Accuracy on test set: 18.81%



# Choosing right estimator/algorithm

There are different ml models choosing one is important to start with
* Classification - predicting weather a sample is one thing or another
* Regression - predicting a number

![](estimator.png)

### 2.1 picking a ml model for our classification problem.

In [7]:
data = pd.read_csv("../data/heart-disease.csv")
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [10]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

In [8]:
np.random.seed(29)

In [9]:
X = data.drop("target", axis=1)
y = data["target"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = LinearSVC(max_iter=100000)
clf.fit(X_train, y_train)

clf.score(X_test, y_test)



0.7377049180327869

In [15]:
data["target"].value_counts()

1    165
0    138
Name: target, dtype: int64

In [17]:
from sklearn.ensemble import RandomForestClassifier
np.random.seed(29)
X = data.drop("target", axis=1)
y = data["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# clf = LinearSVC(max_iter=100000)
clf = RandomForestClassifier(n_estimators=500)

clf.fit(X_train, y_train)

clf.score(X_test, y_test)


0.9016393442622951

In [None]:
for i in range(10, 600, 10):
    print(f"Trying {i} estimators")
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accuracy {clf.score(X_test, y_test) * 100:.2f}%")

# Fit the model/algorithm and use it to make predictions on our data

In [22]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(29)

X = data.drop("target", axis=1)
y = data["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# clf = LinearSVC(max_iter=100000)
clf = RandomForestClassifier(n_estimators=500)

# fitting or trainging model
clf.fit(X_train, y_train)

clf.score(X_test, y_test)

0.9016393442622951

## Makeing predictions using our ml model
Two ways to make predictions:

1. `predict()`
2. `predict_proba()`

In [24]:
X_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
192,54,1,0,120,188,0,1,113,0,1.4,1,1,3
40,51,0,2,140,308,0,0,142,0,1.5,2,1,2
175,40,1,0,110,167,0,0,114,1,2.0,1,0,3
275,52,1,0,125,212,0,1,168,0,1.0,2,2,3
127,67,0,2,152,277,0,1,172,0,0.0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,35,1,1,122,192,0,1,174,0,0.0,2,0,2
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
137,62,1,1,128,208,1,0,140,0,0.0,2,0,2
98,43,1,2,130,315,0,1,162,0,1.9,2,1,2


In [25]:
#use a trained model to make predictions
clf.predict(X_test)

array([0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0], dtype=int64)

In [27]:
np.array(y_test)

array([0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [28]:
y_pred = clf.predict(X_test)
np.mean(y_pred == y_test)

0.9016393442622951

#### `predict_proba()`

In [29]:
clf.predict_proba(X_test[:5])

array([[0.876, 0.124],
       [0.268, 0.732],
       [0.814, 0.186],
       [0.694, 0.306],
       [0.164, 0.836]])

In [31]:
clf.predict(X_test[:5])

array([0, 1, 0, 0, 1], dtype=int64)

# Evaluating model
Three ways to evaluate sklearn estimators/models:
1. Estimator `score` method
2. The `scoring` parameter
3. Problem-specific metric functions.
### Evaluating a model with the `score` method

In [32]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(29)

X = data.drop("target", axis=1)
y = data["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# clf = LinearSVC(max_iter=100000)
clf = RandomForestClassifier(n_estimators=500)

# fitting or trainging model
clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=500)

In [33]:
clf.score(X_test, y_test)

0.9016393442622951

#### Evaluating with `scoring` parameter

In [34]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(29)

X = data.drop("target", axis=1)
y = data["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# clf = LinearSVC(max_iter=100000)
clf = RandomForestClassifier(n_estimators=500)

# fitting or trainging model
clf.fit(X_train, y_train);

In [35]:
clf.score(X_test, y_test)

0.9016393442622951

In [37]:
cross_val_score(clf, X, y,cv = 10)

array([0.90322581, 0.83870968, 0.83870968, 0.9       , 0.9       ,
       0.86666667, 0.76666667, 0.83333333, 0.73333333, 0.8       ])

In [39]:
cross_val_score(clf, X, y,cv = 10, scoring=None)

array([0.90322581, 0.80645161, 0.83870968, 0.9       , 0.9       ,
       0.8       , 0.73333333, 0.83333333, 0.73333333, 0.76666667])

### Classification model evaluation metrics
1. Accuracy
2. Area under ROC curve
3. confusion matrix
4. classification report

**Accuracy**

In [40]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(29)

X = data.drop("target", axis=1)
y = data["target"]

# No nedd to split data coz v r using cross val score.

clf = RandomForestClassifier()
cross_val_score(clf, X, y, cv=5)

array([0.81967213, 0.90163934, 0.83606557, 0.78333333, 0.75      ])

#### 2. Area under ROC(reciver operating characteristic curve)
* Area Under Curve (AUC)
* ROC

In [42]:
from sklearn.metrics import roc_curve

clf.fit(X_train, y_train)
y_prob = clf.predict_proba(X_test)
y_prob[:10]

array([[0.89, 0.11],
       [0.25, 0.75],
       [0.8 , 0.2 ],
       [0.74, 0.26],
       [0.16, 0.84],
       [0.96, 0.04],
       [0.31, 0.69],
       [0.01, 0.99],
       [0.77, 0.23],
       [0.22, 0.78]])

In [43]:
y_prob_pos = y_prob[:, 1]
y_prob_pos

array([0.11, 0.75, 0.2 , 0.26, 0.84, 0.04, 0.69, 0.99, 0.23, 0.78, 0.62,
       0.46, 0.2 , 0.21, 0.75, 0.3 , 0.52, 0.82, 0.35, 0.93, 0.7 , 0.95,
       0.17, 0.8 , 0.19, 0.58, 0.63, 0.09, 0.51, 0.67, 0.65, 0.69, 0.99,
       0.7 , 0.75, 0.88, 0.94, 0.46, 0.93, 0.07, 0.13, 0.85, 0.05, 0.66,
       0.83, 0.07, 0.96, 0.19, 0.93, 0.27, 0.14, 0.77, 0.99, 0.93, 0.12,
       0.03, 0.98, 0.61, 0.86, 0.72, 0.22])

In [44]:
# fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_prob_pos)
fpr

array([0.        , 0.        , 0.        , 0.03448276, 0.03448276,
       0.06896552, 0.06896552, 0.06896552, 0.10344828, 0.10344828,
       0.17241379, 0.17241379, 0.20689655, 0.27586207, 0.44827586,
       0.44827586, 0.48275862, 0.62068966, 0.82758621, 0.89655172,
       1.        ])

## Improve a model

In [9]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

In [3]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## Tuning hyper-parameter by hand
* Making three sets training for training model, validation for tuning hyperparameters, test fot evaluating/testing the model accuracy. 

![](sklearn-train-valid-test-annotated.png)

In [4]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [11]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
grid = {"n_estimators" : [10, 100, 200, 500, 1000, 1200],
        "max_depth" : [None, 5, 10, 20, 30],
        "max_features" : ["auto", "sqrt"],
        "min_samples_split" : [2, 4, 6],
        "min_samples_leaf" : [1, 2, 4]}

np.random.seed(29)

X = data.drop("target", axis=1)
y = data["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier(n_jobs=1)

rs_clf = RandomizedSearchCV(estimator=clf,
                            param_distributions=grid,
                            n_iter=10, #No. of models to try
                            cv=5,
                            verbose = 2)

rs_clf.fit(X_train, y_train);

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=5 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=5, total=   1.3s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=5, total=   0.9s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=5 
[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=5, total=   1.1s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=5 
[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=5, total=   1.0s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=5 
[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=5, total=   0.9s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, total=   0.5s
[CV] n_estimators=100, min_samples_split=2, min_samples_lea

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.2min finished


In [14]:
rs_clf.best_params_

{'n_estimators': 200,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 5}

In [16]:
rs_clf.score(X_test, y_test)

0.8688524590163934

In [19]:
grid

{'n_estimators': [10, 100, 200, 500, 1000, 1200],
 'max_depth': [None, 5, 10, 20, 30],
 'max_features': ['auto', 'sqrt'],
 'min_samples_split': [2, 4, 6],
 'min_samples_leaf': [1, 2, 4]}

In [20]:
grid_2 = {'n_estimators': [100, 200, 500],
         'max_depth': [None, 5, 10,],
         'max_features': ['auto', 'sqrt'],
         'min_samples_split': [2, 4, 6],
         'min_samples_leaf': [2, 4]}

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
np.random.seed(29)

X = data.drop("target", axis=1)
y = data["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier(n_jobs=1)

gs_clf = GridSearchCV(estimator=clf,
                      param_grid=grid_2,
                      cv=5,
                      verbose = 2)

gs_clf.fit(X_train, y_train);

In [22]:
gs_clf.best_params_

{'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 4,
 'n_estimators': 200}

In [23]:
gs_clf.score(X_test, y_test)

0.8852459016393442