## ScikitLearn Notebook

In [44]:
what_were_learning = '''1. Getting data ready
                        2. Choosing a machine learning model
                        3. Fitting a model to the data and making predictions
                        4. Evaluating model predictions
                        5. Improving model predictions
                        6. Saving & Loading models'''

In [45]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

#### 1. Getting our data ready to be used with machine learning

Three main things we have to do:<br>
    1. Split the data into features and labels (usually 'X' and 'y')<br>
    2. Filling (also called imputing) or disregarding missing values<br>
    3. Converting non-numerical values to numeric values (also known as encoding)<br>

In [46]:
heart_disease = pd.read_csv("/home/hp1/Documents/College/Coding/Machine Learning/zero_to_mastery_course/csv/heart-disease.csv")
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [47]:
X = heart_disease.drop("target", axis=1)
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [48]:
y = heart_disease["target"]
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [49]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [50]:
X_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
121,59,1,0,138,271,0,0,182,0,0.0,2,0,2
111,57,1,2,150,126,1,1,173,0,0.2,2,1,3
33,54,1,2,125,273,0,0,152,0,0.5,0,1,2
227,35,1,0,120,198,0,1,130,1,1.6,1,0,3
11,48,0,2,130,275,0,1,139,0,0.2,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,43,1,0,110,211,0,1,161,0,0.0,2,0,3
15,50,0,2,120,219,0,1,158,0,1.6,1,0,2
23,61,1,2,150,243,1,1,137,1,1.0,1,0,2
37,54,1,2,150,232,0,0,165,0,1.6,2,0,3


In [51]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

#### 1.1 Making the data numerical

In [52]:
car_sales = pd.read_csv("/home/hp1/Documents/College/Coding/Machine Learning/zero_to_mastery_course/csv/car-sales-extended.csv")

In [53]:
car_sales.shape

(1000, 5)

In [54]:
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [55]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [56]:
car_sales["Doors"].value_counts()
#we will considering Doors as a categorical attribute as it only has 3 types of values, that are '3,4&5'

4    856
5     79
3     65
Name: Doors, dtype: int64

In [57]:
#Split into X & y
X= car_sales.drop("Price",axis=1 )
y = car_sales["Price"]

#split into training and test data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [58]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 4), (200, 4), (800,), (200,))

In [59]:
#turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categories_feature = ["Make", "Colour","Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot,categories_feature)], remainder="passthrough")

transformed_X= transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [60]:
transformed_X = pd.DataFrame(transformed_X)
type(transformed_X)

pandas.core.frame.DataFrame

In [61]:
transformed_X.value_counts()

0    1    2    3    4    5    6    7    8    9    10   11   12      
0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  10217.0     1
     1.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  136279.0    1
                                                            116986.0    1
                                                            117907.0    1
                                                            120283.0    1
                                                                       ..
     0.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0  230314.0    1
                                                            230908.0    1
                                                            232912.0    1
                                                            234051.0    1
1.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  201190.0    1
Length: 1000, dtype: int64

In [62]:
X.head(10)

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3
5,Honda,Red,42652,4
6,Toyota,Blue,163453,4
7,Honda,White,43120,4
8,Nissan,White,130538,4
9,Honda,Blue,51029,4


In [63]:
transformed_X.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,42652.0
6,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,163453.0
7,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,43120.0
8,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,130538.0
9,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,51029.0


In [64]:
#another way of doing the same thing
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]])
dummies.head()

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0


In [109]:
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor()

In [110]:
#lets refit the model
# np.random.seed(30)
X_train,X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

In [111]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 13), (200, 13), (800,), (200,))

In [112]:

clf.fit(X_train,y_train)
clf.score(X_train, y_train)

0.8930616394876832

In [113]:
y_predicted = clf.predict(X_test)
y_predicted


array([15626.59, 22137.06, 17305.78, 11840.47, 22780.91, 16161.54,
       19934.88,  8793.64, 22159.3 , 19065.68, 23183.44, 12655.69,
       12750.44, 13306.51, 13077.79, 11356.22, 22333.45, 24977.5 ,
       13123.64,  9466.73, 14572.29, 11392.2 , 19218.73, 21029.72,
        9461.27, 13740.24,  6801.41, 10217.39, 12481.17, 13150.2 ,
       25315.16, 14721.64, 13260.17, 28195.  , 45577.02, 19806.4 ,
       16870.23, 25980.79, 34510.6 ,  8984.19,  9362.01, 27504.43,
       12512.54,  7888.7 , 26221.43, 13703.13, 10239.86,  7152.21,
       25078.31, 13724.56, 13266.74, 13460.65, 22086.37, 20019.54,
       10126.96, 13586.58, 16738.5 , 22152.94, 11053.48, 13843.03,
       10605.9 , 10726.64,  7304.52, 11028.55, 19648.7 , 23063.07,
       20226.42, 18517.77, 10780.56, 16817.77, 23442.81, 14114.7 ,
       14528.71, 38549.63, 20085.72, 15759.23, 15995.81, 21825.81,
       12839.74,  9652.83, 10637.37, 12166.66,  9936.63, 17473.56,
       23264.8 , 19157.01, 12302.29, 27584.8 , 12309.65, 16016

In [114]:
clf.score(X_test, y_test)

0.3127127677507633

In [117]:
# np.random.seed(14)
for i in range(10,100,10):
    print(f"Trying model with {i} estimators")
    clf = RandomForestRegressor(n_estimators=i).fit(X_train, y_train)
    print(f"Model accuracy on test set : {clf.score(X_test, y_test)}\n")

Trying model with 10 estimators
Model accuracy on test set : 0.2988183425212363

Trying model with 20 estimators
Model accuracy on test set : 0.3072779643470469

Trying model with 30 estimators
Model accuracy on test set : 0.3039761934707256

Trying model with 40 estimators
Model accuracy on test set : 0.32163778905334617

Trying model with 50 estimators
Model accuracy on test set : 0.32408735875391637

Trying model with 60 estimators
Model accuracy on test set : 0.3287418700237871

Trying model with 70 estimators
Model accuracy on test set : 0.32670294493577845

Trying model with 80 estimators
Model accuracy on test set : 0.31259868598914486

Trying model with 90 estimators
Model accuracy on test set : 0.3340699813832705

