# Introduction to Scikit learn

In [30]:
# 1. GET THE DATA READY
import pandas as pd
import numpy as np
heart_disease=pd.read_csv("heart-disease.csv")
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [31]:
# Create X (Features Matrix)
X=heart_disease.drop("target",axis=1)
# Create y(labels)
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [32]:
# Create y l(Labels)
y = heart_disease["target"]
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [33]:
# 2. Choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier()
# We'll keep the default hyperparameters
# clf.get_params()


In [34]:
# 3. Fit the model to the training data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2)


In [35]:
clf.fit(X_train,y_train)

In [36]:
y_preds=clf.predict(X_test)
y_preds

array([0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0], dtype=int64)

In [37]:
# Evaluation the model on the training data and test data
clf.score(X_train, y_train)

1.0

In [38]:
clf.score(X_test, y_test)

0.8852459016393442

In [39]:
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.90      0.87      0.88        30
           1       0.88      0.90      0.89        31

    accuracy                           0.89        61
   macro avg       0.89      0.88      0.89        61
weighted avg       0.89      0.89      0.89        61



In [40]:
confusion_matrix(y_test,y_preds)

array([[26,  4],
       [ 3, 28]], dtype=int64)

In [41]:
accuracy_score(y_test,y_preds)

0.8852459016393442

In [42]:
# 5 Improve a model
#Try different amout of n_estimators
np.random.seed(42)
for i in range(10,100,10):
    print(f"Trying model with {i} estimators...")
    clf=RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accuracy on test set : {clf.score(X_test,y_test)*100}%")
    print("")

Trying model with 10 estimators...
Model accuracy on test set : 78.68852459016394%

Trying model with 20 estimators...
Model accuracy on test set : 85.24590163934425%

Trying model with 30 estimators...
Model accuracy on test set : 81.9672131147541%

Trying model with 40 estimators...
Model accuracy on test set : 86.88524590163934%

Trying model with 50 estimators...
Model accuracy on test set : 83.60655737704919%

Trying model with 60 estimators...
Model accuracy on test set : 83.60655737704919%

Trying model with 70 estimators...
Model accuracy on test set : 81.9672131147541%

Trying model with 80 estimators...
Model accuracy on test set : 86.88524590163934%

Trying model with 90 estimators...
Model accuracy on test set : 80.32786885245902%



In [43]:
# 6. Save a model and load it
import pickle
pickle.dump(clf,"

SyntaxError: unterminated string literal (detected at line 3) (1852837335.py, line 3)

In [None]:
## 1. Getting our data ready to be used with machine learning


In [None]:
heart_disease.head()

In [None]:
X=heart_disease.drop("target",axis=1)
X

In [None]:
y=heart_disease["target"]
y.head()

In [None]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# 1.1 Make Sure it's all numerical

In [None]:
car_sales=pd.read_csv("car-sales-extended.csv")
car_sales

In [None]:
car_sales.isna()

In [None]:
car_sales.head()

In [None]:
# Split into X/y
X=car_sales.drop("Price",axis=1)
y=car_sales["Price"]

# Split into training and test
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2)

In [None]:
# Build machine learning model
from sklearn.ensemble import RandomForestRegressor
model=RandomForestRegressor()
model.fit(X_train,y_train)
model.score(X_test, y_test)

# Turn the categories into numbers


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_features=["Make", "Colour", "Doors"]
one_hot=OneHotEncoder()
transformer=ColumnTransformer([("one_hot",one_hot,categorical_features)],remainder="passthrough")
transformer


In [None]:
transformed_x=transformer.fit_transform(X)
transformed_x

In [None]:
pd.DataFrame(transformed_x)

In [None]:
# Let's refit the model
np.random.seed(42)
X_train, X_test, y_train, y_test=train_test_split(transformed_x,y,test_size=0.2)
model.fit(X_train, y_train)

In [None]:
model.score(X_test,y_test)
        

### 1.2 What if the were missing values ?
1. Fill them with some value
2 Remove the samples with missing data altogether

In [45]:
# Import car sales missing data
car_sales_missing=pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [49]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [50]:
# Create X & y
X=car_sales_missing.drop("Price",axis=1)
y=car_sales_missing["Price"]

In [54]:
car_sales_missing.columns

Index(['Make', 'Colour', 'Odometer (KM)', 'Doors', 'Price'], dtype='object')

### Option 1: Fill missing data with pandas

In [55]:
# Fill the "Make" Column
car_sales_missing["Make"].fillna("missing",inplace=True)

# Fill the "Colour" column
car_sales_missing["Colour"].fillna("missing",inplace=True)

#Fill the "Odometer" Column
car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean(),inplace=True)

# Fill the door
car_sales_missing["Doors"].fillna(4,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  car_sales_missing["Doors"].fillna(4,inplace=True)


In [56]:
car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [57]:
# Remove rows with missing Price Values
car_sales_missing.dropna(inplace=True)


In [59]:
# Split into X/y
X=car_sales_missing.drop("Price",axis=1)
y=car_sales_missing["Price"]


In [60]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_features=["Make", "Colour", "Doors"]
one_hot=OneHotEncoder()
transformer=ColumnTransformer([("one_hot",one_hot,categorical_features)],remainder="passthrough")
transformer


In [61]:
transformed_x=transformer.fit_transform(X)
transformed_x

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [63]:
## Option 2: Fill Missing values with scikit-learn
car_sales_missing=pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [64]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [66]:
car_sales_missing.dropna(subset=["Price"],inplace=True)
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [67]:
# Split into Xx & Y
X = car_sales_missing.drop("Price",axis=1)
y=car_sales_missing["Price"]

In [72]:
# Fill missing values scikit-learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill Categorical valuess with "missing" & "numerical value with mean"
cat_imputer=SimpleImputer(strategy="constant",fill_value="missing")
door_imputer=SimpleImputer(strategy="constant",fill_value=4)
num_imputer=SimpleImputer(strategy="mean")

# Define columns
cat_features=["Make","Colour"]
door_feature=["Doors"]
num_features=["Odometer (KM)"]

#Create an imputer (Something that fills missing data)
imputer=ColumnTransformer([
    ("cat_imputer",cat_imputer,cat_features),
    ("door_imputer",door_imputer,door_feature),
    ("num_imputer",num_imputer,num_features)
])
imputer
# Transform the data
#filled_X=imputer.fit_transform(X)
#filled_X


In [70]:
 car_sales_filled=pd.DataFrame(filled_X,columns=["Make","Colour","Doors","Odometer (KM)"])
car_sales_filled.head()

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4.0,35431.0
1,BMW,Blue,5.0,192714.0
2,Honda,White,4.0,84714.0
3,Toyota,White,4.0,154365.0
4,Nissan,Blue,3.0,181577.0


In [71]:
car_sales_filled.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

# 2.1 Picking a machine learning model for a regression problem

In [80]:
# Import Boston housing dataset
from sklearn.linear_model import Ridge

# Setup random seed
np.random.seed(42)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [88]:
len(heart_disease)

303

In [103]:
heart_disease=pd.read_csv("heart-disease.csv")
# Imporrt the LinearSVC estimator class
from sklearn.svm import LinearSVC
# Setup random seed
np.random.seed(42)

# Make the data
X=heart_disease.drop("target",axis=1)
y=heart_disease["target"]

#Split the data


In [101]:
# Import the LinearSvc estimator class
from sklearn.svm import LinearSVC

# Setup random seed
np.random.seed(42)

# Make the data
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2)

# Instantiate LinearSvc  
clf=LinearSVC(max_iter=10000)
clf.fit(X_train,y_train)

#Evalueate The LinearSVC
clf.score(X_test, y_test)

0.8688524590163934

In [107]:
from sklearn.ensemble import RandomForestClassifier

# Setup random seed
np.random.seed(42)
X=heart_disease.drop("target",axis=1)
y=heart_disease["target"]
# Split the data
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2)

clf=RandomForestClassifier()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)


0.8524590163934426

# Fit the model

In [108]:
clf.predict(X_test)

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [109]:
np.array(y_test)

array([0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [115]:
# Compare predictions to truth labels to evaluate the model

0.8524590163934426

In [114]:
clf.score(X_test, y_test)

0.8524590163934426

In [116]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)

0.8524590163934426

In [117]:
# Predict_proba()
clf.predict_proba(X_test[:5])

array([[0.89, 0.11],
       [0.49, 0.51],
       [0.43, 0.57],
       [0.84, 0.16],
       [0.18, 0.82]])

In [122]:
heart_disease.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

# 4. Evaluating a machine learning model

In [125]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
np.random.seed(42)
X=heart_disease.drop("target",axis=1)
y=heart_disease["target"]
# Split the data
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2)

clf=RandomForestClassifier()
clf.fit(X_train, y_train)
# clf.score(X_test, y_test)
cross_val_score(clf,X,y,cv=5)

array([0.81967213, 0.86885246, 0.81967213, 0.78333333, 0.76666667])

In [None]:
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)
X=heart_disease.drop("target",axis=1)
y=heart_disease["target"]
# Split the data
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2)
model=RandomForestReg