## SICKIT-LEARN

## end-end workflow of scikit learn

In [1]:
# 1. getting data ready
import pandas as pd
import numpy as np
data = pd.read_csv('heart.csv', on_bad_lines='skip')
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [13]:
# create X (feature matrix)
X=data.drop('target',axis=1)

In [14]:
# create Y (lable matrix)
Y=data['target']


In [15]:
# 2. choose the right model and hyperparameters 
# hyperparameters are used to control the learning process by user
# to find out yes or no in this case we are going to use random forest
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [16]:
# 3. fit the model to training data
# here we are splitting the data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3)


In [17]:
clf.fit(X_train,Y_train)

RandomForestClassifier()

In [26]:
# making a predection
y_pred=clf.predict(X_test)

In [22]:
# 4. Evaluating the mdoel
clf.score(X_train,Y_train)

1.0

In [25]:
clf.score(X_test,Y_test)

0.7912087912087912

In [27]:
# other ways to evaluate
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
classification_report(Y_test,y_pred)


'              precision    recall  f1-score   support\n\n           0       0.84      0.71      0.77        45\n           1       0.75      0.87      0.81        46\n\n    accuracy                           0.79        91\n   macro avg       0.80      0.79      0.79        91\nweighted avg       0.80      0.79      0.79        91\n'

In [28]:
confusion_matrix(Y_test,y_pred)

array([[32, 13],
       [ 6, 40]], dtype=int64)

In [29]:
accuracy_score(Y_test,y_pred)

0.7912087912087912

In [31]:
# 5. improving the model
# try different amount of n_estimators
np.random.seed(42)
for i in range(10,100,10):
    print(f'trying with model {i}')
    clf = RandomForestClassifier()
    clf.fit(X_train,Y_train)
    print(clf.score(X_test,Y_test))

trying with model 10
0.7692307692307693
trying with model 20
0.7582417582417582
trying with model 30
0.7692307692307693
trying with model 40
0.7582417582417582
trying with model 50
0.7582417582417582
trying with model 60
0.7912087912087912
trying with model 70
0.7912087912087912
trying with model 80
0.7692307692307693
trying with model 90
0.7802197802197802


In [2]:
# save a model and load it 
# this can be done by using pickle
import pickle
pickle.dump(clf,open("my model.pk1","wb"))

NameError: name 'clf' is not defined

In [36]:
loaded_modele = pickle.load(open("my model.pk1","rb"))
loaded_modele.score(X_test,Y_test)

0.7802197802197802

## GETTING OUR DATA READY

In [38]:
# T get the data ready we need to follow :
   # 1.split the data into features and labels(X,Y)
    #2. Filling (imputing)or disregarding missing data (cleaning data)
    #3. converting non-numercial values to numericla (feture encoding)
    

In [39]:
# converting data into numericals
car = pd.read_csv("scikit-learn-data/car-sales-extended.csv")

In [40]:
car

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
...,...,...,...,...,...
995,Toyota,Black,35820,4,32042
996,Nissan,White,155144,3,5716
997,Nissan,Blue,66604,4,31570
998,Honda,White,215883,4,4001


In [44]:
#splitting data
X = car.drop("Price",axis=1)
y=car['Price']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2) # returns error as the data is not converted into numericals
# here we need to convert the data into numerics


In [45]:
from sklearn.ensemble import RandomForestRegressor # this predicts the number
model = RandomForestRegressor()
model.fit(X_train,y_train)
model.score(X_test,y_test)


ValueError: could not convert string to float: 'Honda'

In [48]:
# converting into numerical
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
cat = ['Make','Colour','Doors']
one_hot = OneHotEncoder()
transformer =ColumnTransformer([("one_hot",one_hot,cat)],remainder = 'passthrough')
t_X = transformer.fit_transform(X)
t_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [67]:
# this can be done using pandas
car["Doors"] = car["Doors"].astype(object)
dummy = pd.get_dummies(car[['Make',"Colour",'Doors']])
dummy

  uniques = Index(uniques)


Unnamed: 0,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White,Doors_3,Doors_4,Doors_5
0,0,1,0,0,0,0,0,0,1,0,1,0
1,1,0,0,0,0,1,0,0,0,0,0,1
2,0,1,0,0,0,0,0,0,1,0,1,0
3,0,0,0,1,0,0,0,0,1,0,1,0
4,0,0,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,1,1,0,0,0,0,0,1,0
996,0,0,1,0,0,0,0,0,1,1,0,0
997,0,0,1,0,0,1,0,0,0,0,1,0
998,0,1,0,0,0,0,0,0,1,0,1,0


In [70]:
# refitting the model
np.random.seed(444)
X_train,X_test,y_trian,y_test=train_test_split(t_X,y,test_size=0.2)
model.fit(X_train,y_train)


RandomForestRegressor()

In [71]:
model.score(X_test,y_test)

-0.21458355367124327

## FOR MISSONG VALUES

In [75]:
# 1.fill them with values(imputation)
#2. to remove that data
missing_data=pd.read_csv("scikit-learn-data/car-sales-extended-missing-data.csv")

In [76]:
missing_data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [77]:
# NaN indicates not a number or undefined data
missing_data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [78]:
x = missing_data.drop('Price',axis=1)
y = missing_data['Price']
x,y


(       Make Colour  Odometer (KM)  Doors
 0     Honda  White        35431.0    4.0
 1       BMW   Blue       192714.0    5.0
 2     Honda  White        84714.0    4.0
 3    Toyota  White       154365.0    4.0
 4    Nissan   Blue       181577.0    3.0
 ..      ...    ...            ...    ...
 995  Toyota  Black        35820.0    4.0
 996     NaN  White       155144.0    3.0
 997  Nissan   Blue        66604.0    4.0
 998   Honda  White       215883.0    4.0
 999  Toyota   Blue       248360.0    4.0
 
 [1000 rows x 4 columns],
 0      15323.0
 1      19943.0
 2      28343.0
 3      13434.0
 4      14043.0
         ...   
 995    32042.0
 996     5716.0
 997    31570.0
 998     4001.0
 999    12732.0
 Name: Price, Length: 1000, dtype: float64)

In [82]:
# fill missing data with pandas
missing_data['Make'].fillna("missing",inplace=True)
missing_data['Colour'].fillna("missing",inplace=True)
missing_data['Odometer (KM)'].fillna(missing_data['Odometer (KM)'].mean(),inplace=True)
missing_data['Doors'].fillna(4,inplace=True)

In [84]:
missing_data[200:210]

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
200,missing,White,66461.0,3.0,25873.0
201,Honda,Red,95243.0,4.0,13559.0
202,Honda,Blue,84719.0,4.0,26402.0
203,Toyota,Blue,99761.0,4.0,10547.0
204,Nissan,Red,150563.0,4.0,16115.0
205,Toyota,Blue,154576.0,4.0,12553.0
206,Honda,White,211862.0,4.0,12476.0
207,Nissan,Black,125251.0,4.0,18306.0
208,Toyota,Blue,90856.0,4.0,33077.0
209,Honda,White,131253.237895,4.0,22746.0


In [85]:
missing_data.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [86]:
# remove rows with missing Price values
missing_data.dropna(inplace=True)

In [87]:
missing_data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,missing,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [89]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
cat = ['Make','Colour','Doors']
one_hot = OneHotEncoder()
transformer =ColumnTransformer([("one_hot",one_hot,cat)],remainder = 'passthrough')
t_x = transformer.fit_transform(missing_data)
t_x

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

In [16]:
n_data = pd.read_csv('scikit-learn-data/car-sales-extended-missing-data.csv')
n_data.dropna()


Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
994,BMW,Blue,163322.0,3.0,31666.0
995,Toyota,Black,35820.0,4.0,32042.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [17]:
n_data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [44]:
n_data.dropna(subset=['Price'],inplace=True)
n_data.dropna(subset=['Colour'],inplace=True)# removing the nan labeled prices
n_data.dropna(subset=['Make'],inplace=True)
n_data['Odometer (KM)'].fillna(n_data['Odometer (KM)'].mean(),inplace=True)
n_data['Doors'].fillna(4,inplace=True)
n_data.isna().sum()


Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [45]:
#splitting into x,y
X= n_data.drop(['Price'],axis=1)
y = n_data['Price']

In [46]:
X

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0
...,...,...,...,...
994,BMW,Blue,163322.0,3.0
995,Toyota,Black,35820.0,4.0
997,Nissan,Blue,66604.0,4.0
998,Honda,White,215883.0,4.0


In [47]:
y
y.isna().sum()

0

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)
model.fit(X_train,y_train)

ValueError: could not convert string to float: 'Toyota'

In [49]:
# covert
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
cat = ['Make','Colour','Doors']
one_hot = OneHotEncoder()
transformer =ColumnTransformer([("one_hot",one_hot,cat)],remainder = 'passthrough')
t_X = transformer.fit_transform(X)
t_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [50]:
X_train,X_test,y_train,y_test = train_test_split(t_X,y,test_size=0.3)
model.fit(X_train,y_train)

RandomForestRegressor()

In [25]:
t_x

<950x16 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [52]:
model.score(X_test,y_test)

0.19204764024867582