# Introduction to Scikit-learn (sklearn)

Things to cover:

    0) End to End scikit workflow
    1) Getting the data ready
    2) Learn to choose the right estimator for our problems
    3) Fit the model/estimator to the data and use it to make predictions
    4) Evaluate models
    5) Improve models
    6) Save and load a trained model
    7) Assemble!

# 0) END TO END SCIKIT WORKFLOW

In [17]:
# End-to-end scikit-learn workflow, step 0: get the data ready.
import numpy as np
import pandas as pd

# Read the heart-disease dataset from the local data folder.
heart_disease = pd.read_csv(filepath_or_buffer="data/heart-disease.csv")

# Leave the frame as the last expression so the notebook renders it.
heart_disease


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


# 1) GETTING THE DATA READY

In [18]:
# Labels (y): the "target" column on its own.
y = heart_disease["target"]

# Features matrix (X): every column except the label.
X = heart_disease.drop(columns=["target"])


# 2) CHOOSING THE RIGHT MODEL

In [19]:
from sklearn.ensemble import RandomForestClassifier

# Instantiate the classifier without arguments, i.e. with its default
# hyperparameters.
clf = RandomForestClassifier()

# Show the hyperparameters the estimator was created with.
clf.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [20]:
# Split the data into training and test sets (20% of the rows held out for testing).
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG first (same convention as the later cells in this
# notebook) so the split, and the scores computed from it, are reproducible.
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 3) Fit the model

In [25]:
# Train on the training split, then predict labels for the held-out test rows.
# fit() returns the estimator itself, so the two calls can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)
y_pred

array([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1])

# 4) Evaluate the Model

In [32]:
# 4) Evaluate the model
# Only a cell's last expression is displayed, so print the training score
# explicitly; otherwise it is computed and silently thrown away.
print(f"Train accuracy: {clf.score(X_train, y_train):.4f}")

# Mean accuracy on the held-out test set (shown as the cell output).
clf.score(X_test, y_test)

0.8852459016393442

In [37]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Per-class precision / recall / F1 for the test-set predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.93      0.84      0.88        31
           1       0.85      0.93      0.89        30

    accuracy                           0.89        61
   macro avg       0.89      0.89      0.89        61
weighted avg       0.89      0.89      0.89        61



In [42]:
# Counts of true vs. predicted classes: rows are the true labels, columns the
# predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[26,  5],
       [ 2, 28]])

In [44]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8852459016393442

# 5) Improve a model

In [49]:
# Compare forests of increasing size: 10, 20, ..., 90 trees.
np.random.seed(42)

for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators... ")
    # Build and fit in two steps. `clf` is rebound on every pass, so after the
    # loop it refers to the last (90-tree) forest.
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(X_train, y_train)
    print(f"Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}")

Trying model with 10 estimators... 
Model accuracy on test set: 85.25
Trying model with 20 estimators... 
Model accuracy on test set: 83.61
Trying model with 30 estimators... 
Model accuracy on test set: 85.25
Trying model with 40 estimators... 
Model accuracy on test set: 88.52
Trying model with 50 estimators... 
Model accuracy on test set: 85.25
Trying model with 60 estimators... 
Model accuracy on test set: 85.25
Trying model with 70 estimators... 
Model accuracy on test set: 86.89
Trying model with 80 estimators... 
Model accuracy on test set: 86.89
Trying model with 90 estimators... 
Model accuracy on test set: 86.89


# 6) Save and load a trained model

In [56]:
import pickle

# Serialise the trained classifier to a .pkl file. The `with` block closes the
# file handle (flushing the write) even if pickling raises, instead of leaving
# an unclosed file object behind.
with open("random_forest_model_1.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

# Load the model back from disk.
# NOTE: only unpickle files you created yourself; pickle.load can execute
# arbitrary code contained in an untrusted file.
with open("random_forest_model_1.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Score the reloaded model on the test set to confirm the round trip worked.
loaded_model.score(X_test, y_test)

0.8688524590163934

# Now, a more concrete example

In [66]:
#standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

## 1) Getting our data ready to be used by ML
### The three main things we have to do:
    -Split the data into features and labels (usually 'X' for features and 'y' for labels)
    -Filling (also called imputing) or disregarding missing values
    -Converting non numerical values to numerical values (also called feature encoding)
    

In [60]:
# Preview the first five rows of the heart-disease data.
heart_disease.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [65]:
# Labels are the "target" column; features are all the remaining columns.
y = heart_disease["target"]
X = heart_disease.drop(columns=["target"])

In [72]:
# Hold out 20% of the rows as a test set; the rest is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

# Display the test-set features.
X_test


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
153,66,0,2,146,278,0,0,152,0,0.0,1,1,2
80,41,1,2,112,250,0,1,179,0,0.0,2,0,2
50,51,0,2,130,256,0,0,149,0,0.5,2,0,2
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2
198,62,1,0,120,267,0,1,99,1,1.8,1,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1
235,51,1,0,140,299,0,1,173,1,1.6,2,0,3
96,62,0,0,140,394,0,0,157,0,1.2,1,0,2
189,41,1,0,110,172,0,0,158,0,0.0,2,0,3


In [80]:
# 1.1 Make sure every column is numerical.
# Load the extended car-sales data and preview its first five rows.
car_sales = pd.read_csv(filepath_or_buffer="data/car-sales-extended.csv")
car_sales.head(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [81]:
# Inspect each column's dtype to see which columns are not numeric yet.
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [84]:
# Labels (y) are the sale prices; features (X) are all the remaining columns.
y = car_sales["Price"]
X = car_sales.drop(columns=["Price"])

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

In [86]:
# Build a first regression model on the raw (un-encoded) features.
from sklearn.ensemble import RandomForestRegressor

# A regressor predicts a number (here: the price) rather than a class label.
model = RandomForestRegressor()

# X still contains string columns ("Make", "Colour"), so fitting is expected
# to fail. Catch that specific error and show its message, so the notebook
# keeps running under "Restart & Run All" instead of stopping at this cell.
try:
    model.fit(X_train, y_train)
except ValueError as err:
    print(f"ValueError: {err}")
else:
    print(model.score(X_test, y_test))


ValueError: could not convert string to float: 'Honda'

In [89]:
# The model cannot handle text columns such as the car manufacturer, so
# one-hot encode the categorical columns into numbers.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# "Doors" is numeric, but it is encoded as a category here as well.
categorical_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",  # columns not listed above are kept unchanged
)

transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [93]:
# Wrap the encoded array in a DataFrame to make it easier to read.
pd.DataFrame(data=transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [98]:
# pandas alternative to the encoder: dummy/indicator columns.
# Note that the result keeps "Doors" as a single numeric column; only the
# string columns ("Make", "Colour") are expanded into indicator columns.
dummies = pd.get_dummies(data=car_sales[["Make", "Colour", "Doors"]])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
995,4,0,0,0,1,1,0,0,0,0
996,3,0,0,1,0,0,0,0,0,1
997,4,0,0,1,0,0,1,0,0,0
998,4,0,1,0,0,0,0,0,0,1


In [101]:
# Retry the fit, this time on the one-hot encoded features.
np.random.seed(42)  # seed NumPy's global RNG before splitting and fitting
X_train, X_test, y_train, y_test = train_test_split(
    transformed_X,
    y,
    test_size=0.2,
)
model.fit(X_train, y_train)


RandomForestRegressor()

In [103]:
# Score the refitted regressor on the held-out test rows
# (for a regressor, score() returns the R^2 coefficient).
model.score(X=X_test, y=y_test)

0.3235867221569877

### 1.2) What to do with missing values?

    -Fill the missing values with some value (known as imputation)
    -Remove the samples with missing values altogether
    

In [111]:
# Load the version of the car-sales data that contains missing values.
car_sales_missing = pd.read_csv(
    filepath_or_buffer="data/car-sales-extended-missing-data.csv"
)

# Count the missing entries in each column (isna is the same check as isnull).
car_sales_missing.isna().sum()



Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [112]:
# Labels (y) are the prices; features (X) are every other column.
y = car_sales_missing["Price"]
X = car_sales_missing.drop(columns=["Price"])

In [115]:
# Try the same one-hot encoding on the features that still contain missing values.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",  # columns not listed above are kept unchanged
)

transformed_X = transformer.fit_transform(X)
transformed_X

<1000x16 sparse matrix of type '<class 'numpy.float64'>'
	with 4000 stored elements in Compressed Sparse Row format>

#### to get the above to work, we need to deal with missing data with Pandas

In [119]:
# Option 1: fill the missing data using pandas.
#
# Each filled column is assigned back to the frame. Calling
# `frame[col].fillna(..., inplace=True)` operates on an intermediate Series:
# recent pandas versions warn about it, and under Copy-on-Write it no longer
# updates the frame at all.

# Fill the text columns with an explicit "missing" category.
car_sales_missing["Make"] = car_sales_missing["Make"].fillna("missing")
car_sales_missing["Colour"] = car_sales_missing["Colour"].fillna("missing")

# Fill the "Odometer (KM)" column with the column mean.
car_sales_missing["Odometer (KM)"] = car_sales_missing["Odometer (KM)"].fillna(
    car_sales_missing["Odometer (KM)"].mean()
)

# Fill the "Doors" column with 4, chosen because it is the mode of the data.
car_sales_missing["Doors"] = car_sales_missing["Doors"].fillna(4)

# Re-count the missing values; only "Price" should still have any.
car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [120]:
# Drop, in place, every row that still has a missing value. After the fills in
# the previous cell only "Price" has gaps, so this removes the rows without a price.
car_sales_missing.dropna(axis=0, how="any", inplace=True)

In [121]:
# Number of rows left after dropping the rows with missing values.
car_sales_missing.shape[0]

950

In [129]:
# Encode the cleaned data (missing values filled, rows without a price dropped).
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Rebuild features and labels from the cleaned frame.
y = car_sales_missing["Price"]
X = car_sales_missing.drop(columns=["Price"])

categorical_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",  # columns not listed above are kept unchanged
)

transformed_X = transformer.fit_transform(X)
transformed_X

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

## Option 2: Filling missing data with Scikit-learn