## ScikitLearn Notebook

In [1]:
what_were_learning = '''1. Getting data ready
                        2. Choosing a machine learning model
                        3. Fitting a model to the data and making predictions
                        4. Evaluating model predictions
                        5. Improving model predictions
                        6. Saving & Loading models'''

In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

#### 1. Getting our data ready to be used with machine learning

Three main things we have to do:<br>
    1. Split the data into features and labels (usually 'X' and 'y')<br>
    2. Filling (also called imputing) or disregarding missing values<br>
    3. Converting non-numerical values to numeric values (also known as encoding)<br>

In [3]:
heart_disease = pd.read_csv("/home/hp1/Documents/College/Coding/Machine Learning/zero_to_mastery_course/csv/heart-disease.csv")
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
X = heart_disease.drop("target", axis=1)
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [5]:
y = heart_disease["target"]
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [7]:
X_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
53,44,0,2,108,141,0,1,175,0,0.6,1,0,2
212,39,1,0,118,219,0,1,140,0,1.2,1,0,3
45,52,1,1,120,325,0,1,172,0,0.2,2,0,2
65,35,0,0,138,183,0,1,182,0,1.4,2,0,2
260,66,0,0,178,228,1,1,165,1,1.0,1,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,40,1,0,152,223,0,1,181,0,0.0,2,0,3
46,44,1,2,140,235,0,0,180,0,0.0,2,0,2
116,41,1,2,130,214,0,0,168,0,2.0,1,0,2
43,53,0,0,130,264,0,0,143,0,0.4,1,0,2


In [8]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

#### 1.1 Making the data numerical

In [9]:
car_sales = pd.read_csv("/home/hp1/Documents/College/Coding/Machine Learning/zero_to_mastery_course/csv/car-sales-extended.csv")

In [10]:
car_sales.shape

(1000, 5)

In [11]:
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [12]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [13]:
car_sales["Doors"].value_counts()
#we will considering Doors as a categorical attribute as it only has 3 types of values, that are '3,4&5'

4    856
5     79
3     65
Name: Doors, dtype: int64

In [14]:
#Split into X & y
X= car_sales.drop("Price",axis=1 )
y = car_sales["Price"]

#split into training and test data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [15]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 4), (200, 4), (800,), (200,))

In [16]:
#turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categories_feature = ["Make", "Colour","Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot,categories_feature)], remainder="passthrough")

transformed_X= transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [17]:
transformed_X = pd.DataFrame(transformed_X)
type(transformed_X)

pandas.core.frame.DataFrame

In [18]:
transformed_X.value_counts()

0    1    2    3    4    5    6    7    8    9    10   11   12      
0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  10217.0     1
     1.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  136279.0    1
                                                            116986.0    1
                                                            117907.0    1
                                                            120283.0    1
                                                                       ..
     0.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0  230314.0    1
                                                            230908.0    1
                                                            232912.0    1
                                                            234051.0    1
1.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  201190.0    1
Length: 1000, dtype: int64

In [19]:
X.head(10)

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3
5,Honda,Red,42652,4
6,Toyota,Blue,163453,4
7,Honda,White,43120,4
8,Nissan,White,130538,4
9,Honda,Blue,51029,4


In [20]:
transformed_X.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,42652.0
6,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,163453.0
7,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,43120.0
8,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,130538.0
9,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,51029.0


In [21]:
#another way of doing the same thing
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]])
dummies.head()

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0


In [22]:
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor()

In [23]:
#lets refit the model
# np.random.seed(30)
X_train,X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

In [24]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 13), (200, 13), (800,), (200,))

In [25]:

clf.fit(X_train,y_train)
clf.score(X_train, y_train)

0.8966382872102512

In [26]:
y_predicted = clf.predict(X_test)
y_predicted


array([15643.11, 15404.19, 11055.43,  7610.41, 15358.14, 15351.85,
        9090.05, 15578.  , 12177.82, 12828.36, 13901.32, 10842.14,
       13803.92, 17150.14, 10424.57,  8750.65, 13730.38, 10435.46,
       11355.46, 10139.8 , 21963.52, 11935.35, 19893.74, 13384.33,
       10653.51, 12054.8 , 43519.56, 20699.38, 14453.08, 21909.55,
       37919.22, 19143.49, 12418.24,  8884.62, 11533.74, 10837.17,
       23140.52, 19399.63, 11111.6 , 25386.27, 19765.06, 33614.16,
       10982.7 , 14286.75, 14577.94,  9535.25, 21166.85, 10820.1 ,
       23348.62, 10382.82, 15881.93, 10388.4 , 26442.78, 20384.73,
        9698.6 , 24482.6 , 18040.39, 22874.84, 11229.41, 13468.41,
        8279.31,  9595.4 ,  9959.4 , 11066.83, 22552.11, 12001.02,
       11384.49, 19453.78,  9894.95, 16300.47, 21488.33,  7969.42,
       11566.88, 15431.97, 12349.84, 12791.11, 22465.62, 11835.09,
       12069.49, 14495.05, 21447.35, 17660.97, 24855.85, 10560.84,
       15243.2 , 14685.61, 17998.16, 38792.98, 11037.61, 24126

In [27]:
clf.score(X_test, y_test)

0.31202494110829293

In [28]:
# np.random.seed(14)
for i in range(10,100,10):
    print(f"Trying model with {i} estimators")
    clf = RandomForestRegressor(n_estimators=i).fit(X_train, y_train)
    print(f"Model accuracy on test set : {clf.score(X_test, y_test)}\n")

Trying model with 10 estimators
Model accuracy on test set : 0.29784695886242274

Trying model with 20 estimators
Model accuracy on test set : 0.2626714897780399

Trying model with 30 estimators
Model accuracy on test set : 0.3137471325908544

Trying model with 40 estimators
Model accuracy on test set : 0.3010850927873333

Trying model with 50 estimators
Model accuracy on test set : 0.30478212679641514

Trying model with 60 estimators
Model accuracy on test set : 0.2819213517426661

Trying model with 70 estimators
Model accuracy on test set : 0.3180484807598385

Trying model with 80 estimators
Model accuracy on test set : 0.29439091194368794

Trying model with 90 estimators
Model accuracy on test set : 0.3014774915816587



#### 1.2 Dealing with missing values
Two ways:<br>
    1. Fill them with some value (also known as Imputation) <br>
    2. Remove the samples with missing data altogether

In [29]:
car_sales_missing = pd.read_csv("/home/hp1/Documents/College/Coding/Machine Learning/zero_to_mastery_course/csv/car-sales-extended-missing-data.csv")

In [30]:
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [31]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [32]:
#Create X & y
X = car_sales_missing.drop("Price", axis=1)
X.head()


Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0


In [33]:
y = car_sales_missing["Price"]
y.head()

0    15323.0
1    19943.0
2    28343.0
3    13434.0
4    14043.0
Name: Price, dtype: float64

In [34]:
#turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categories_feature = ["Make", "Colour","Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot,categories_feature)], remainder="passthrough")

transformed_X= transformer.fit_transform(X)
transformed_X



##### OneHotEncoder couldnt handle missing values in versions below (0.23). i have this piece of code here to explain the same.
##### Here, it won't give us any error. we're good to go

<1000x16 sparse matrix of type '<class 'numpy.float64'>'
	with 4000 stored elements in Compressed Sparse Row format>

##### Option 1: Fill missing data with pandas

In [35]:
car_sales_missing["Doors"].value_counts()

4.0    811
5.0     75
3.0     64
Name: Doors, dtype: int64

In [36]:
#fill the "Make" column
car_sales_missing["Make"].fillna("missing", inplace=True)

#fill the "Colour" column
car_sales_missing["Colour"].fillna("missing", inplace=True)

#fill the "Odometer (KM)" column
car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean , inplace=True)

#fill the "Doors" column
car_sales_missing["Doors"].fillna(4, inplace=True)



In [37]:
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,missing,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [38]:
car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [39]:
#remove rows with missing price value
car_sales_missing.dropna(inplace=True)

In [40]:
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [41]:
len(car_sales_missing)

950

In [42]:
#get new X & y for the missing values
X= car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [43]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categories_feature = ["Make", "Colour","Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot,categories_feature)], remainder="passthrough")

transformed_X= transformer.fit_transform(car_sales_missing)
transformed_X

array([[0.0, 1.0, 0.0, ..., 0.0, 35431.0, 15323.0],
       [1.0, 0.0, 0.0, ..., 1.0, 192714.0, 19943.0],
       [0.0, 1.0, 0.0, ..., 0.0, 84714.0, 28343.0],
       ...,
       [0.0, 0.0, 1.0, ..., 0.0, 66604.0, 31570.0],
       [0.0, 1.0, 0.0, ..., 0.0, 215883.0, 4001.0],
       [0.0, 0.0, 0.0, ..., 0.0, 248360.0, 12732.0]], dtype=object)

##### Option 2 : Filling the missing values with Scikit-Learn

In [45]:
car_sales_missing = pd.read_csv("/home/hp1/Documents/College/Coding/Machine Learning/zero_to_mastery_course/csv/car-sales-extended-missing-data.csv")
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [47]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [48]:
#drop the rows which have the missing price values
car_sales_missing.dropna(subset=["Price"],inplace=True)
len(car_sales_missing)

950

In [50]:
#we've lost some of the missing Doors, colours values because they might be overlapping with the missing Price column
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [51]:
#Split into X & y
X=car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [61]:
X.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
dtype: int64

In [54]:
#fill missing values with Scikit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

#fill the categorical values with 'missing' and numerical values with mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
numerical_imputer = SimpleImputer(strategy="mean")

In [58]:
#define columns
cat_features = ["Make", "Colour"]
door_features = ["Doors"]
numerical_features = ["Odometer (KM)"]

In [59]:
#create an imputer (something that fills the missing data)
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_features),
    ("numerical_imputer",numerical_imputer ,numerical_features)

])

In [60]:
filled_X = imputer.fit_transform(X)
filled_X

array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

In [63]:
car_sales_filled = pd.DataFrame(filled_X,columns=["Make", "Colour", "Doors", "Odometer (KM)"])
car_sales_filled

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4.0,35431.0
1,BMW,Blue,5.0,192714.0
2,Honda,White,4.0,84714.0
3,Toyota,White,4.0,154365.0
4,Nissan,Blue,3.0,181577.0
...,...,...,...,...
945,Toyota,Black,4.0,35820.0
946,missing,White,3.0,155144.0
947,Nissan,Blue,4.0,66604.0
948,Honda,White,4.0,215883.0


In [64]:
car_sales_filled.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [70]:
#Now, that we dont have any missing values. Let's convert our dataframe into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categories_feature = ["Make", "Colour","Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot,categories_feature)], remainder="passthrough")

transformed_X= transformer.fit_transform(car_sales_filled)
transformed_X

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [72]:
#Now that we've got our data as numbers and it has no missing values. Let's fit the model
np.random.seed(13)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train,y_test = train_test_split(transformed_X,y,test_size=0.2)

model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test,y_test)

0.24708595018025115