In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline

In [2]:
!pip install scikit-learn



# An end-to-end Scikit-Learn workflow
Heart disease

In [3]:
# 1. Get the data ready: load the heart-disease CSV into a DataFrame and preview the first rows
heart_disease = pd.read_csv('heart-disease.csv')
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
# Labels (y): the column the model will learn to predict
y = heart_disease['target']

# Features matrix (X): every column except the label
X = heart_disease.drop(columns='target')

In [5]:
# Preview the frame with the 'target' column removed (the features)
heart_disease.drop('target', axis=1)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [6]:
# Preview the 'target' column (the labels)
heart_disease['target']

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [7]:
# 2. Choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier

In [8]:
# Instantiate the classifier with n_estimators set to 60
clf = RandomForestClassifier(n_estimators=60)

# Only n_estimators is set explicitly; every other hyperparameter keeps its default
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 60,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [9]:
# 3. Fit the model to the data
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# NOTE(review): no random_state is passed, so the split is not pinned by this cell — confirm this is intended
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [10]:
# Check the shapes of the train/test features and labels
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

In [11]:
# Print the docstring of train_test_split (reference only; the result is not used by later cells)
help(train_test_split)

Help on function train_test_split in module sklearn.model_selection._split:

train_test_split(*arrays, **options)
    Split arrays or matrices into random train and test subsets
    
    Quick utility that wraps input validation and
    ``next(ShuffleSplit().split(X, y))`` and application to input data
    into a single call for splitting (and optionally subsampling) data in a
    oneliner.
    
    Read more in the :ref:`User Guide <cross_validation>`.
    
    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    
    test_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also Non

In [12]:
# Train the classifier on the training split
clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=60)

In [13]:
# Make predictions on the held-out test features
y_preds = clf.predict(X_test)
y_preds

array([1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1], dtype=int64)

In [14]:
# Evaluate the model: score it on the training data it was fitted on
clf.score(X_train, y_train)

1.0

In [15]:
# Score the model on the held-out test data
clf.score(X_test, y_test)

0.8032786885245902

In [16]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Per-class precision, recall and F1 for the test-set predictions
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.75      0.75      0.75        24
           1       0.84      0.84      0.84        37

    accuracy                           0.80        61
   macro avg       0.79      0.79      0.79        61
weighted avg       0.80      0.80      0.80        61



In [17]:
# Confusion matrix comparing the true test labels (y_test) with the predictions (y_preds)
confusion_matrix(y_test, y_preds)

array([[18,  6],
       [ 6, 31]], dtype=int64)

In [18]:
# Accuracy of the test-set predictions against y_test
accuracy_score(y_test, y_preds)

0.8032786885245902

In [19]:
# Improve a model
# Try different numbers of n_estimators
# Seed NumPy's global RNG before the sweep (the forests below are built without random_state)
np.random.seed(42)
# Sweep n_estimators over 10, 20, ..., 90
for i in range(10, 100, 10):
    print(f'Trying model with {i} estimators...')
    # clf is rebound on every pass, so after the loop it refers to the last model trained (n_estimators=90)
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f'Model accuracy on test set: {clf.score(X_test, y_test)}')

Trying model with 10 estimators...
Model accuracy on test set: 0.7704918032786885
Trying model with 20 estimators...
Model accuracy on test set: 0.7704918032786885
Trying model with 30 estimators...
Model accuracy on test set: 0.7704918032786885
Trying model with 40 estimators...
Model accuracy on test set: 0.8032786885245902
Trying model with 50 estimators...
Model accuracy on test set: 0.7868852459016393
Trying model with 60 estimators...
Model accuracy on test set: 0.7704918032786885
Trying model with 70 estimators...
Model accuracy on test set: 0.7868852459016393
Trying model with 80 estimators...
Model accuracy on test set: 0.7868852459016393
Trying model with 90 estimators...
Model accuracy on test set: 0.8032786885245902


In [20]:
# Save a model and load it
import pickle

# Write the current clf to disk. The context manager guarantees the file
# handle is flushed and closed, instead of leaving it open until garbage collection.
with open('random_forst_model_1.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [21]:
# Load the saved model back from disk and score it on the test split.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created or trust.
# The context manager closes the file handle as soon as the model is read.
with open('random_forst_model_1.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
loaded_model.score(X_test, y_test)

0.8032786885245902

In [22]:
# Getting our data ready to be used with machine learning

# Re-display the first rows of the heart-disease frame
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


# Car sales

1.1 Make sure it's all numerical

In [23]:
# Load the extended car-sales CSV into a DataFrame and preview the first rows
car_sales = pd.read_csv('car-sales-extended.csv')
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [24]:
import os

# Collect the names of all CSV files in the current working directory
items = os.listdir(".")
newlist = [names for names in items if names.endswith(".csv")]
print(newlist)

['car-sales-extended-missing-data.csv', 'car-sales-extended.csv', 'car-sales-missing-data.csv', 'car-sales.csv', 'dog-vision-full-model-predictions-with-mobilenetV2.csv', 'dog-vision-prediction-probabilites-array.csv', 'exported-car-sales.csv', 'exported-patient-data.csv', 'heart-disease.csv']


In [25]:
# Number of rows in the car-sales frame
len(car_sales)

1000

In [26]:
# Inspect the column dtypes to see which columns are not numeric yet
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [27]:
# My version: label-encode the Make column.
# Codes are assigned in order of first appearance (the first make seen gets 0, the next 1, ...).
car_makes = car_sales['Make'].unique().tolist()
dict_key_makes = {car_make: make_numeric for make_numeric, car_make in enumerate(car_makes)}

# Apply the whole mapping in one vectorised pass. Replacing one value at a time
# could overwrite codes that were already assigned (when the column holds values
# equal to earlier codes) and relied on .replace() downcasting the column to int.
car_sales['Make'] = car_sales['Make'].map(dict_key_makes)

print(car_sales['Make'].unique().tolist())
print(car_sales.head())
print(dict_key_makes)

[0, 1, 2, 3]
   Make Colour  Odometer (KM)  Doors  Price
0     0  White          35431      4  15323
1     1   Blue         192714      5  19943
2     0  White          84714      4  28343
3     2  White         154365      4  13434
4     3   Blue         181577      3  14043
{'Honda': 0, 'BMW': 1, 'Toyota': 2, 'Nissan': 3}


In [29]:
# My version: label-encode the Colour column.
# Codes are assigned in order of first appearance (the first colour seen gets 0, the next 1, ...).
car_colors = car_sales['Colour'].unique().tolist()
dict_key_colors = {car_color: color_numeric for color_numeric, car_color in enumerate(car_colors)}

# Apply the whole mapping in one vectorised pass. Replacing one value at a time
# could overwrite codes that were already assigned (when the column holds values
# equal to earlier codes) and relied on .replace() downcasting the column to int.
car_sales['Colour'] = car_sales['Colour'].map(dict_key_colors)

print(car_sales['Colour'].unique().tolist())
print(car_sales.head())


[0, 1, 2, 3, 4]
   Make  Colour  Odometer (KM)  Doors  Price
0     0       0          35431      4  15323
1     1       1         192714      5  19943
2     0       0          84714      4  28343
3     2       0         154365      4  13434
4     3       1         181577      3  14043


In [30]:
# Split into X/y: features are every column except Price, the label is Price
X = car_sales.drop('Price', axis=1)
y = car_sales['Price']

# Split into training and test sets, holding out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [31]:
# Build machine learning model
from sklearn.ensemble import  RandomForestRegressor

# Fit a regressor created with no arguments, then score it on the test split
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.17916328340089804

# Preprocessing from Sklearn
UDEMY version
# Turn categories into numbers

In [32]:
# Turn the categorical columns into one-hot encoded numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to one-hot encode (Doors is included alongside Make and Colour)
categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
# remainder='passthrough' keeps the columns not listed above in the output unchanged
transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)], remainder='passthrough')

# NOTE(review): X is whatever an earlier cell last assigned to X — confirm it is the car-sales features
transformed_X = transformer.fit_transform(X)
transformed_X

array([[1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [33]:
# Load the car-sales CSV that contains missing values and preview the first rows
car_missing = pd.read_csv('car-sales-extended-missing-data.csv')
car_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [34]:
# Count the missing values in each column
car_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [35]:
# Split into X/y.
# NOTE(review): this runs before the missing values are filled; X and y are reassigned after the cleaning cells
X = car_missing.drop('Price', axis=1)
y = car_missing['Price']

# Fill missing data with Pandas

In [36]:
# Each fill is assigned back to its column. Calling fillna(..., inplace=True) on
# a column selected from the frame is chained assignment: it warns in recent
# pandas versions and does not update the frame when Copy-on-Write is enabled.

# Fill the Make column
car_missing['Make'] = car_missing['Make'].fillna('missing')

# Fill the Colour column
car_missing['Colour'] = car_missing['Colour'].fillna('missing')

# Fill the Odometer (KM) column with the column mean
car_missing['Odometer (KM)'] = car_missing['Odometer (KM)'].fillna(car_missing['Odometer (KM)'].mean())

# Fill the Doors column
car_missing['Doors'] = car_missing['Doors'].fillna(4)

In [37]:
# Remove rows with missing Price value
# (dropna with no arguments drops every row that still contains a NaN in any column)
car_missing.dropna(inplace=True)

In [38]:
# Rebuild X/y from car_missing: features are every column except Price, the label is Price
X = car_missing.drop('Price', axis=1)
y = car_missing['Price']

In [39]:
# One-hot encode Make, Colour and Doors; remainder='passthrough' keeps the other columns unchanged
categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)], remainder='passthrough')

# Fit the transformer on X and display the transformed result
transformed_X = transformer.fit_transform(X)
transformed_X

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>