In [52]:
# it's machine learning library in python
# built on numpy and matplotlib
# has in-built machine learning models
# very well designed API

In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# content
0. An end-to-end scikit learn workflow
1. Getting the data ready
2. Choose the right algorithm/estimator for our problem
3. Fit the model/algorithm and use it to make predictions on our data
4. Evaluating the model
5. Improve the model
6. save and load a trained model
7. putting it all together!

In [54]:
# let's listify the content
content = ["0. An end-to-end scikit learn workflow",
          "1. Getting the data ready",
          "2. Choose the right algorithm/estimator for our problem",
          "3. Fit the model/algorithm and use it to make predictions on our data Evaluating the model",
          "4. Improve the model",
          "5. save and load a trained model",
          "6. putting it all together!"]
content

['0. An end-to-end scikit learn workflow',
 '1. Getting the data ready',
 '2. Choose the right algorithm/estimator for our problem',
 '3. Fit the model/algorithm and use it to make predictions on our data Evaluating the model',
 '4. Improve the model',
 '5. save and load a trained model',
 '6. putting it all together!']

### 0. End to end scikit learn workflow
[just for complete overview before going step by step]

In [55]:
# 1. get the data ready
disease = pd.read_csv("heart-disease.csv")
disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [56]:
x =  disease.drop('target',axis =1) # feature matrix/data/variable
y = disease['target']   # labels

In [57]:
#2. choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)

# we will keep the default hyperparameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [58]:
# 3. Fit the model to the trainning data
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)


In [59]:
clf.fit(x_train, y_train)

RandomForestClassifier()

In [60]:
x_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
70,54,1,2,120,258,0,0,147,0,0.4,1,0,3
177,64,1,2,140,335,0,1,158,0,0.0,2,0,2
284,61,1,0,140,207,0,0,138,1,1.9,2,1,3
29,53,1,2,130,197,1,0,152,0,1.2,0,0,2
71,51,1,2,94,227,0,1,154,1,0.0,2,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50,51,0,2,130,256,0,0,149,0,0.5,2,0,2
229,64,1,2,125,309,0,1,131,1,1.8,1,0,3
37,54,1,2,150,232,0,0,165,0,1.6,2,0,3
157,35,1,1,122,192,0,1,174,0,0.0,2,0,2


In [61]:
y_preds = clf.predict(x_test)
y_preds

array([0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0], dtype=int64)

In [62]:
y_test

186    0
101    1
131    1
129    1
106    1
      ..
198    0
242    0
24     1
275    0
235    0
Name: target, Length: 61, dtype: int64

In [63]:
# 4. evaluate the model on the training data and test data
clf.score(x_train, y_train)

1.0

In [64]:
clf.score(x_test, y_test)

0.8852459016393442

In [65]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.90      0.87      0.88        30
           1       0.88      0.90      0.89        31

    accuracy                           0.89        61
   macro avg       0.89      0.88      0.89        61
weighted avg       0.89      0.89      0.89        61



In [66]:
confusion_matrix(y_test, y_preds)

array([[26,  4],
       [ 3, 28]], dtype=int64)

In [67]:
accuracy_score(y_test,y_preds)

0.8852459016393442

In [68]:
# improve the model
# try different amount of n_estimators
np.random.seed(42)
for i in range(10,100,10):
    print(f"trying modelwith {i} esimators...")
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(x_train,y_train)
    print(f"model accuracy on test set: {clf.score(x_test,y_test)*100:.2f}%")
    print(" ")
    

trying modelwith 10 esimators...
model accuracy on test set: 78.69%
 
trying modelwith 20 esimators...
model accuracy on test set: 85.25%
 
trying modelwith 30 esimators...
model accuracy on test set: 81.97%
 
trying modelwith 40 esimators...
model accuracy on test set: 86.89%
 
trying modelwith 50 esimators...
model accuracy on test set: 83.61%
 
trying modelwith 60 esimators...
model accuracy on test set: 83.61%
 
trying modelwith 70 esimators...
model accuracy on test set: 81.97%
 
trying modelwith 80 esimators...
model accuracy on test set: 86.89%
 
trying modelwith 90 esimators...
model accuracy on test set: 80.33%
 


In [73]:
# 6. save a model and load it
import pickle
pickle.dump(clf, open("random_forest_model1.pkl",'wb'))

  pickle.dump(clf, open("random_forest_model1.pkl",'wb'))


In [74]:
loaded_model = pickle.load(open("random_forest_model1.pkl", "rb"))
loaded_model.score(x_test, y_test)

  loaded_model = pickle.load(open("random_forest_model1.pkl", "rb"))


0.8032786885245902

### warnings in jupyter notebook

In [71]:
import warnings
#warnings.filterwarnings('ignore')  # ignores the all warnings
warnings.filterwarnings('default') # displays warnings

In [72]:
# checking which scikit learn version you are using
import sklearn
sklearn.show_versions()


System:
    python: 3.8.3 (default, Jul  2 2020, 17:30:36) [MSC v.1916 64 bit (AMD64)]
executable: C:\Users\HP\anaconda3\python.exe
   machine: Windows-10-10.0.19041-SP0

Python dependencies:
          pip: 20.1.1
   setuptools: 49.2.0.post20200714
      sklearn: 0.23.1
        numpy: 1.18.5
        scipy: 1.5.0
       Cython: 0.29.21
       pandas: 1.0.5
   matplotlib: 3.2.2
       joblib: 0.16.0
threadpoolctl: 2.1.0

Built with OpenMP: True


## 1.0 Getting the data ready
1. split the data into features and labels(ususally x and y)
2. filling (also called imputing) or disregrding the missing values
3. converting non-numerical values to numerical values(a.k.a. feature encoding)

In [75]:
disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [77]:
x = disease.drop('target',axis=1)
x.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [78]:
y = disease['target']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [96]:
# split data into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3)

In [92]:
disease.shape,x_train.shape,x_test.shape,y_train.shape,y_test.shape

((303, 14), (212, 13), (91, 13), (212,), (91,))

In [93]:
x.shape,y.shape

((303, 13), (303,))

In [95]:
print(303 * 0.7) # 70%   training set
print(303 * 0.3) # 30%  test set

212.1
90.89999999999999


### 1.1 converting data to numbers

In [99]:
sales = pd.read_csv('car-sales-extended.csv')
sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [100]:
sales.shape

(1000, 5)

In [101]:
sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [103]:
x = sales.drop('Price',axis=1)
x.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [104]:
y = sales['Price']
y.head()

0    15323
1    19943
2    28343
3    13434
4    14043
Name: Price, dtype: int64

In [105]:
x.shape,y.shape

((1000, 4), (1000,))

In [107]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((800, 4), (200, 4), (800,), (200,))

In [110]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(x_train,y_train)
model.score(x_test,y_test)

ValueError: could not convert string to float: 'Toyota'

In [113]:
sales["Doors"].value_counts() # can be converted to a category type

4    856
5     79
3     65
Name: Doors, dtype: int64

In [129]:
sales["Make"].value_counts()

Toyota    398
Honda     304
Nissan    198
BMW       100
Name: Make, dtype: int64

In [130]:
sales["Colour"].value_counts()

White    407
Blue     321
Black     99
Red       94
Green     79
Name: Colour, dtype: int64

In [124]:
# turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make","Colour","Doors"]

one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                  one_hot,categorical_features)],
                               remainder = 'passthrough')
transformed_x = transformer.fit_transform(x)
transformed_x

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [125]:
transformed_x.shape

(1000, 13)

In [127]:
x.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [128]:
pd.DataFrame(transformed_x).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0


In [134]:
dummies = pd.get_dummies(sales[['Make','Colour','Doors']])
dummies.head()

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0


In [135]:
np.random.seed(42)
x_train,x_test,y_train,y_test = train_test_split(transformed_x,y,test_size=0.2)
model.fit(x_train,y_train)

RandomForestRegressor()

In [136]:
model.score(x_test,y_test)

0.3235867221569877