# Never test your model on data that it has learned from

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease

Supervised learning: both the input data and the expected output are provided, and the model is trained on them so it can make predictions on new data. E.g. detecting fake (spam) emails.

In [None]:
# Labels (Y): the column the model should learn to predict
Y = heart_disease["target"]

# Feature matrix (X): every column except the label
X = heart_disease.drop(columns=["target"])

Y

In [None]:
# Features are the input variables or attributes that the machine learning model uses to make predictions.
# Each row in the dataset corresponds to an observation or data point, and the columns represent different features.
# Features are the characteristics or properties of the data that the model analyzes to learn patterns.
# For example, in a dataset of houses, the features might include square footage, number of bedrooms, and neighborhood.

X   # Feature   ----> Data on which the model will train
Y   # Label     ----> Result

# Labels, also known as the target variable or output variable, are the values that the machine learning model is trying to predict.
# The goal of the model is to learn a mapping from the features to the labels.
# In a supervised learning scenario, the dataset is typically labeled, meaning that it includes both the features and the corresponding correct labels.
# Using the house example, the label might be the price of the house.

In [None]:
# 2.  Choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier   # RandomForest  ---->   Reduce Overfitting
clf = RandomForestClassifier()

# Hyperparameters are external configuration settings that influence the learning process but are not learned from the data.

# We'll keep the default hyperparameters
clf.get_params()

In [None]:
# Split the data into training and test sets before fitting the model
from sklearn.model_selection import train_test_split

# test_size=0.2 means that 20% of the data will be used for testing, and 80% for training.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# X_train: This subset of the features is used for training the machine learning model.
# Y_train: The corresponding subset of labels/targets for the training set. Each row in X_train has a corresponding label in Y_train.
#
# X_test: This subset of the features is reserved for testing the trained model.
# Y_test: The corresponding subset of labels/targets for the testing set. Each row in X_test has a corresponding label in Y_test.

In [None]:
# Find the patterns in the training data
clf.fit(X_train, Y_train);

In [None]:
X_train

In [None]:
# Now test the model: it makes predictions on data it was not trained on (X_test).
# The predicted labels are stored in Y_preds so they can be compared with the true labels in Y_test.

Y_preds = clf.predict(X_test)
Y_preds

In [None]:
Y_test

In [None]:
# 4. Evaluate the model on the training data and test data

clf.score(X_train, Y_train)     # accuracy lies in the range [0, 1]; 1 is the maximum score

# score is a method of the RandomForestClassifier class that calculates the accuracy of the model on a given dataset

In [None]:
# Test between the original data sets
clf.score(X_test, Y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(Y_test, Y_preds))

In [None]:
confusion_matrix(Y_test, Y_preds)

In [None]:
# Test between the predicted datasets
accuracy_score(Y_test, Y_preds)

In [None]:
# 5. Improve the model
# Compare test-set accuracy for forests built with different numbers of trees
np.random.seed(42)

for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    # Build the forest first, then fit it (fit returns the same estimator)
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(X_train, Y_train)
    test_accuracy = clf.score(X_test, Y_test)
    print(f"Model accuracy on test set:{test_accuracy * 100:.2f}")
    print("")

In [None]:
# 6. Save a model and load it
import pickle

# A context manager guarantees the file is flushed and closed once the model is written.
# The file name is kept as-is because the loading cell reads this exact name.
with open("random_forst_model_1.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Load the saved model back; the context manager closes the file handle after reading.
# NOTE: only unpickle files you created yourself - pickle can execute arbitrary code on load.
with open("random_forst_model_1.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)
loaded_model.score(X_test, Y_test)

In [None]:
X = heart_disease.drop("target", axis = 1)  # feature
y = heart_disease["target"]                 # label

In [None]:
# Split the data into training and test set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Make sure the values are all numerical

In [None]:
car_sales = pd.read_csv("car-sales-extended.csv")
len(car_sales), car_sales.dtypes

In [None]:
car_sales.head()

In [None]:
# Split into X/y
X = car_sales.drop("Price", axis = 1)
y = car_sales["Price"]

# Split into training and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder # convert categorical variables into a numerical format
from sklearn.compose import ColumnTransformer #  used to apply transformations selectively to different columns

categorical_features = ["Make", "Colour", "Doors"]        # Doors is c_f coz each door types has specific values
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder = "passthrough") # remainder="passthrough"' argument indicates that all other columns should be left unchanged
# "one_hot": This is a string label or name that you can use to refer to this specific transformation later in your code.
# one_hot: This is the actual instance of the OneHotEncoder class that you created earlier. This is the transformer that will be applied to the specified columns.
transformed_X = transformer.fit_transform(X) # transformed_X will work on the dataframe to change it into numbers
transformed_X

In [None]:
pd.DataFrame(transformed_X)

In [None]:
# Viewing the data
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]]) # [[]] When you want to select multiple columns from a DataFrame 
dummies.head()
# dummies  contains the original columns ("Make", "Colour", "Doors") replaced with their one-hot encoded representations.

In [None]:
# Let's refit the model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
np.random.seed(42) # seed NumPy's global random generator so the split and the fitted model give the same result on every run
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

model.fit(X_train, y_train)
model.score(X_test, y_test)

## What if there are missing values?

In [None]:
# import car sales missing data
car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing.isna().sum() # effectively counting the number of True values along each column

In [None]:
# Fill missing data with pandas
# Assign each filled column back to the DataFrame. Calling fillna(inplace=True)
# on a selected column is chained assignment: pandas 2.x warns about it and,
# with Copy-on-Write enabled, the original DataFrame is left unchanged.

car_sales_missing["Make"] = car_sales_missing["Make"].fillna("Missing")
car_sales_missing["Colour"] = car_sales_missing["Colour"].fillna("Missing")
# Numeric column: use the column mean as the replacement value
car_sales_missing["Odometer (KM)"] = car_sales_missing["Odometer (KM)"].fillna(
    car_sales_missing["Odometer (KM)"].mean()
)
# Missing door counts are filled with 4
car_sales_missing["Doors"] = car_sales_missing["Doors"].fillna(4)

In [None]:
# Checking the dataframe again
car_sales_missing.isna().sum()

In [None]:
car_sales_missing.head()

In [None]:
# Removing rows with missing price values
car_sales_missing.dropna(axis=0, inplace = True)
car_sales_missing.isna().sum()

In [None]:
# Create X and y
X = car_sales_missing.drop("Price", axis = 1)  # rows with missing values were already dropped above
y = car_sales_missing["Price"]

In [None]:
# Convert the data to numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Doors is treated as categorical because it only takes a few distinct values
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
# ("one_hot", one_hot, categorical_features):
#   "one_hot"            - a name for this step, usable to refer to it later
#   one_hot              - the OneHotEncoder instance applied to the listed columns
#   categorical_features - the columns to encode
# remainder="passthrough" keeps all other columns unchanged.
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")
# Encode the feature matrix X only. Encoding car_sales_missing would pass the
# "Price" label through into the features and leak the answer to the model.
transformed_X = transformer.fit_transform(X)
transformed_X

# Option 2: Fill missing values with Scikit-Learn instead of pandas

In [None]:
car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")

In [None]:
car_sales_missing.isna().sum()

In [None]:
# Drop the Price label with no values in it
car_sales_missing.dropna(subset=["Price"], inplace = True)
car_sales_missing.isna().sum()  # Other columns values get changed coz some missing values were overlapping with the Price missing values

In [None]:
# Split the data into X and y
X = car_sales_missing.drop("Price", axis = 1)
y = car_sales_missing["Price"]

In [None]:
X.isna().sum()

In [None]:
# Fill missing values (imputation) with Scikit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Categorical columns are filled with the string 'missing', Doors with 4,
# and numerical columns with the column mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")   
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")
# With strategy="constant", missing values are replaced with fill_value.
# fill_value : str or numerical value, default=None


# Define which columns each imputer is applied to
cat_features = ["Make", "Colour"]
door_features = ["Doors"]
num_features = ["Odometer (KM)"]


# Create an imputer (something that fills missing data) from the pieces above
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),       # "cat_imputer" is just a name for this step; it can be used to refer to the step later
    ("door_imputer", door_imputer, door_features),
    ("num_imputer", num_imputer, num_features)
])


# Fit the imputers on X and fill the missing values
filled_X = imputer.fit_transform(X)
filled_X

In [None]:
car_sales_filled = pd.DataFrame(filled_X, columns = ["Make", "Colour","Doors", "Odometer (KM)"])
car_sales_filled.head()

In [None]:
car_sales_filled.isna().sum()

In [None]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]        # Doors is c_f coz each door types has specific values
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder = "passthrough")
# "one_hot": This is a string label or name that you can use to refer to this specific transformation later in your code.
# one_hot: This is the actual instance of the OneHotEncoder class that you created earlier. This is the transformer that will be applied to the specified columns.
transformed_X = transformer.fit_transform(car_sales_filled) # transformed_X will work on the dataframe to change it into numbers
transformed_X

In [None]:
# Now we've got our data in nos and filled the missing values
# Let's fit the model

np.random.seed(42)  # CHANGING SEED VALUES AFFECT THE MODEL
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

## CHOOSING THE RIGHT ESTIMATOR / ALGORITHM

If you're working on a machine learning problem with Scikit-Learn and you're not sure which model to use, consult the estimator map:
https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html#choosing-the-right-estimator

In [None]:
### Picking a ml model for a regression problem

In [3]:
# Get California Housing dataset   (already in sklearn just need to import)
import pandas as pd
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [4]:
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [None]:
housing_df["target"] = housing["target"]
# housing_df = housing_df.drop("MedHouseVal", axis = 1)
housing_df

In [None]:
# Import algorithm/estimator
from sklearn.ensemble import RandomForestRegressor  # preferred here: an ensemble of decision trees
# from sklearn.linear_model import Ridge            # alternative estimator; also OK

# Setup random seed
np.random.seed(42)

# Create the data
X = housing_df.drop("target", axis = 1)
y = housing_df["target"] # median house value in units of $100,000

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate and fit the model (on the training data)
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Check the score of the model on the test data
model.score(X_test, y_test)

In [None]:
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease.head()

### Consulting the map, it says to try `LinearSVC` (a linear support vector classifier; `LinearSVR` is its regression counterpart)

In [None]:
# Import the LinearSVC estimator class
from sklearn.svm import LinearSVC                        

# Setup random seed
np.random.seed(42)

# Make the data
X = heart_disease.drop("target", axis = 1)
y = heart_disease["target"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate linearSVC
model = LinearSVC(max_iter=10000)
model.fit(X_train, y_train)

# Evaluate the model 
model.score(X_test, y_test)

In [None]:
heart_disease["target"].value_counts()

In [None]:
# Import the RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# Setup random seed
np.random.seed(42)

# Make the data
X = heart_disease.drop("target", axis = 1)
y = heart_disease["target"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate the model 
model.score(X_test, y_test)

# Make predictions using a ML model

2 ways to make predictions:

    1.predict()
    2.predict_proba()

In [None]:
X_test.head()

In [None]:
model.predict(X_test)

In [None]:
np.array(y_test)

In [None]:
# Compare predictions to the true labels to evaluate the model
y_preds = model.predict(X_test)    # given X_test, y_preds holds the model's predictions for the y_test labels
np.mean(y_preds == y_test)

In [None]:
model.score(X_test, y_test)

In [None]:
# One more way
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)

In [None]:
# predict_proba() returns probabilities of a classification label
model.predict_proba(X_test[:5])
# It returns the probabilities whether it's a 1 or 0 
# col 1 represent 0 and col 2 rep 1
# 0.89 means first is 0 check below it is 0

In [None]:
# Let's predict on the same data
model.predict(X_test[:5])

`predict()` can also be used for regression models

In [None]:
housing_df.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor
X =  housing_df.drop("target", axis =1)
y =  housing_df["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = RandomForestRegressor()
model.fit(X_train, y_train)
y_preds = model.predict(X_test)  # given X_test, y_preds holds the model's predictions for the y_test values

In [None]:
y_preds[:10]

In [None]:
np.array(y_test[:10])

In [None]:
# There should be exactly one prediction per test label
len(y_preds), len(y_test)

In [None]:
model.score(X_test, y_test)

In [None]:
# Compare the predictions to the truth
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_preds)

In [None]:
housing_df["target"]

In [None]:
# Import the RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# Setup random seed
np.random.seed(42)

# Make the data
X = heart_disease.drop("target", axis = 1)
y = heart_disease["target"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate RandomForestClassifier
model = RandomForestClassifier(n_estimators=50)
model.fit(X_train, y_train)

In [None]:
model.score(X_train, y_train)

In [None]:
model.score(X_test, y_test)

# Evaluating a model using scoring parameter

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Setup random seed
np.random.seed(42)

X = heart_disease.drop("target", axis = 1)
y = heart_disease["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train);

In [None]:
model.score(X_test, y_test)

In [None]:
cross_val_score(model, X, y, cv=5)    #  Cross validation  5 = default

In [None]:
cross_val_score(model, X, y, cv=10)  

In [None]:
np.random.seed(42)

# Single training and test split score
model_single_score = model.score(X_test, y_test)

# Take the mean of 5-fold-cross-validation score
model_cross_val_score = np.mean(cross_val_score(model, X, y, cv = 5))

# Compare the data
model_single_score,  model_cross_val_score

In [None]:
# Default scoring parameter of classifier = mean accuracy
# score() needs the evaluation features and their true labels
model.score(X_test, y_test)

In [None]:
# Scoring parameter set to None by default
cross_val_score(model, X, y, scoring = None)  # with scoring=None, the estimator's default score method is used

### Classification model evaluation metrics

1. Accuracy
2. Area under ROC curve
3. Confusion Matrix
4. Classification report
 
  **Accuracy**

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)
X = heart_disease.drop("target", axis = 1)
y = heart_disease["target"]
model = RandomForestClassifier(n_estimators=100)
cross_val_score=cross_val_score(model, X, y, cv=5)

In [None]:
np.mean(cross_val_score)

In [None]:
print(f"Heart Disease Classifier Cross-Validated Accuracy: {np.mean(cross_val_score) * 100:.2f}%")

**Area Under Curve (AUC) / ROC**

ROC curves are a comparison of a model's true positive rate (tpr) versus a model's false positive rate (fpr).

* True positive = model predicts 1 when truth is 1 (target =1 )
* False positive = model predicts 1 when truth is 0 (target = 0)
* True negative = model predicts 0 when truth is 0
* False negative = model predicts 0 when truth is 1

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
from sklearn.metrics import roc_curve

# Fit the classifier
model.fit(X_train, y_train)

# Make predictions with probabilities
y_probs = model.predict_proba(X_test)

y_probs[:10], len(y_probs)

In [None]:
y_probs_positive = y_probs[:,1]
y_probs_positive[:10]

In [None]:
# Calculate fpr, tpr, and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_probs_positive)

# Check the false positive rates
fpr

In [None]:
# Create a function for plotting ROC curve
import matplotlib.pyplot as plt


def plot_roc_curve(fpr, tpr):
    """
    Plot a ROC curve given the false positive rate (fpr)
    and true positive rate (tpr) of a model.

    Parameters
    ----------
    fpr : array-like
        False positive rates, plotted on the x-axis.
    tpr : array-like
        True positive rates, plotted on the y-axis; paired with `fpr`
        element by element.

    Returns
    -------
    None
        The curve is drawn on the current figure and displayed with plt.show().
    """
    # Plot the ROC curve
    plt.plot(fpr, tpr, color="orange", label="ROC")
    # Plot the diagonal: a line with no predictive power (baseline)
    plt.plot([0, 1], [0, 1], color="darkblue", linestyle="--", label="Guessing")

    # Customize the plot
    plt.xlabel("False positive rate (fpr)")
    plt.ylabel("True positive rate (tpr)")
    plt.title("Receiver Operating Characteristic (ROC) Curve")
    plt.legend()
    plt.show()


plot_roc_curve(fpr, tpr)

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_probs_positive)

In [None]:
# Plot perfect ROC curve and AUC score
fpr, tpr, thresholds = roc_curve(y_test, y_test)
plot_roc_curve(fpr, tpr)

In [None]:
# Perfect AUC score
roc_auc_score(y_test, y_test)

**Confusion Matrix**

  A confusion matrix is a quick way to compare the labels a model predicts and the actual labels it was supposed to predict.
  
  In essence, it gives you an idea of where the model is getting confused.

In [None]:
from sklearn.metrics import confusion_matrix

y_preds = model.predict(X_test)

confusion_matrix(y_test, y_preds)

In [None]:
# Visualize confusion matrix with pd.crosstab
pd.crosstab(y_test, y_preds, rownames = ["Actual Labels"], colnames = ["Predicted Labels"])

In [None]:
22+7+8+24

In [None]:
len(X_test)

In [None]:
# Make our confusion matrix more visual with Seaborn's heatmap()
import seaborn as sns

# Set the font scale
sns.set(font_scale = 1.5)

# Create a confusion matrix
conf_mat = confusion_matrix(y_test, y_preds)

# Plot it using Seaborn
sns.heatmap(conf_mat)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Evaluate on the held-out test split only: the full X, y also contains the
# rows the model was fitted on, which would make the matrix look better than it is
ConfusionMatrixDisplay.from_estimator(estimator=model, X=X_test, y=y_test)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred = y_preds);

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_preds))

In [None]:
# Where precision and recall become valuable
disease_true = np.zeros(10000)
disease_true[0] = 1 # only one positive case among the 10,000 true labels

disease_preds = np.zeros(10000) # model predicts every case as 0

pd.DataFrame(classification_report(disease_true, disease_preds, output_dict=True))

### Regression model evaluation metrics

Model Evaluation metrics -  https://scikit-learn.org/stable/modules/model_evaluation.html

The ones we're going to cover are :
1. R^2  or  coefficient of determination
2. Mean absolute error (MAE)
3. Mean squared error (MSE)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])
housing_df["target"] = housing["target"]
# housing_df = housing_df.drop("MedHouseVal", axis = 1)
housing_df
np.random.seed(42)

X = housing_df.drop("target", axis = 1)
y = housing_df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

model = RandomForestRegressor(n_estimators = 100)
model.fit(X_train,y_train)

In [None]:
housing_df.head()

In [None]:
from sklearn.metrics import r2_score

# Fill the array with y_test mean
y_test_mean = np.full(len(y_test), y_test.mean())

In [None]:
y_test_mean[:10]

In [None]:
r2_score(y_true=y_test, y_pred = y_test_mean)

In [None]:
r2_score(y_true=y_test, y_pred = y_test)

**Mean Absolute Error (MAE)**

MAE is the average of the absolute differences between predictions and actual values.

It gives you an idea of how wrong your models predictions are.

In [None]:
# MAE
from sklearn.metrics import mean_absolute_error

y_preds = model.predict(X_test)
mae = mean_absolute_error(y_test, y_preds)
mae

In [None]:
y_preds

### Finally using the scoring Parameter

In [None]:
heart_disease = pd.read_csv("heart-disease.csv")

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop("target", axis =1)
y = heart_disease["target"]

clf = RandomForestClassifier(n_estimators=1000)

In [None]:
np.random.seed(42)

# Cross-validation score
cv_acc = cross_val_score(clf, X, y, cv=5, scoring=None)
# If scoring=None, the estimator's default scoring metric is used, which is accuracy for classification models
cv_acc

In [None]:
# Cross validated accuracy
print(f"The cross-validated accuracy is: {np.mean(cv_acc)*100:.2f}%")

In [None]:
# ACCURACY
np.random.seed(42)
# Cross-validation score
cv_acc = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
cv_acc

In [None]:
# Cross validated accuracy
print(f"The cross-validated accuracy is: {np.mean(cv_acc)*100:.2f}%")

In [None]:
# PRECISION
np.random.seed(42)
# Cross-validation score
cv_acc = cross_val_score(clf, X, y, cv=5, scoring="precision")
cv_acc

In [None]:
# Cross validated accuracy
print(f"The cross-validated accuracy is: {np.mean(cv_acc)*100:.2f}%")

In [None]:
# RECALL
np.random.seed(42)
cv_recall = cross_val_score(clf, X, y, cv=5, scoring="recall")
cv_recall

In [None]:
# Cross validated recall (report the recall scores computed in the previous cell)
print(f"The cross-validated recall is: {np.mean(cv_recall)*100:.2f}%")

Let's see the `scoring` parameter for regression problem

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = housing_df.drop("target", axis =1)
y = housing_df["target"]

model = RandomForestRegressor(n_estimators=100)

In [None]:
np.random.seed(42)
cv_r2 = cross_val_score(model, X, y, cv=3, scoring=None)
np.mean(cv_r2)

In [None]:
cv_r2

In [None]:
# Mean squared error
cv_mse = cross_val_score(model, X, y, cv=3, scoring = "neg_mean_squared_error")
np.mean(cv_mse)

In [None]:
# Mean absolute error
# Scikit-Learn scorers follow "higher is better", so the MAE scorer is named
# "neg_mean_absolute_error" and returns negated errors
cv_mae = cross_val_score(model, X, y, cv=3, scoring="neg_mean_absolute_error")
np.mean(cv_mae)

## Using different evaluation metrics as Scikit-Learn functions

In [None]:
heart_disease = pd.read_csv("heart-disease.csv")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop("target", axis =1)
y = heart_disease["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate model using evaluation functions
# Predict once and reuse the result instead of re-running predict for every metric
y_preds = model.predict(X_test)
print("classifier metrics on the test set")
print(f"Accuracy: {accuracy_score(y_test, y_preds)*100:.2f}%")
print(f"Precision: {precision_score(y_test, y_preds)*100:.2f}%")
print(f"Recall: {recall_score(y_test, y_preds)*100:.2f}%")
print(f"F1: {f1_score(y_test, y_preds)*100:.2f}%")

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

np.random.seed(42)
X = housing_df.drop("target", axis =1)
y = housing_df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestRegressor()
model.fit(X_train, y_train)
y_preds = model.predict(X_test)

print("Regression metrics on the test set")
print(f"R2 score: {r2_score(y_test, y_preds)}")
print(f"MAE: {mean_absolute_error(y_test, y_preds)}")
print(f"MSE: {mean_squared_error(y_test, y_preds)}")

## Improving a model

First predictions = baseline predictions.
First model = baseline model.

From a data perspective:
* Could we collect more data? (generally, the more data more better)
* Could we improve our data?

  From a model perspective:
* Is there a better model we could use?
* Could we improve the current model?

Parameters = patterns the model finds in the data.
Hyperparameters = settings on a model you can adjust to (potentially) improve its ability to find patterns.

3 ways to adjust hyperparameters
1. By hand
2. Randomly with RandomizedSearchCV
3. Exhaustively with GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.get_params()

The parameters we are going to adjust

* `max_depth`
* `max_features`
* `min_samples_leaf`
* `min_samples_split`
* `n_estimators`

In [None]:
def evaluate_preds(y_true, y_preds):
    """
    Compare true labels with predicted labels of a classification model.

    Prints accuracy, precision, recall and F1 as percentages and returns
    a dict with the same four metrics rounded to two decimal places
    (keys: "accuracy", "precision", "recall", "f1").
    """
    # Compute every metric once, in reporting order
    scores = {
        "accuracy": accuracy_score(y_true, y_preds),
        "precision": precision_score(y_true, y_preds),
        "recall": recall_score(y_true, y_preds),
        "f1": f1_score(y_true, y_preds),
    }

    print(f"Acc:{scores['accuracy'] * 100:.2f}%")
    print(f"Precision:{scores['precision'] * 100:.2f}%")
    print(f"Recall:{scores['recall'] * 100:.2f}%")
    print(f"F1:{scores['f1'] * 100:.2f}%")

    # Callers receive the rounded values, not the raw scores
    return {metric: round(value, 2) for metric, value in scores.items()}

In [None]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

# Shuffle the rows (frac=1 samples every row)
heart_disease_shuffled = heart_disease.sample(frac=1)

# Separate the features from the labels
X = heart_disease_shuffled.drop(columns=["target"])
y = heart_disease_shuffled["target"]

# Positional cut points for the train, validation & test sets:
# first 70% of rows -> train, next 15% -> validation, remainder -> test
n_samples = len(heart_disease_shuffled)
train_split = round(0.7 * n_samples)
valid_split = round(train_split + 0.15 * n_samples)

X_train, X_valid, X_test = X.iloc[:train_split], X.iloc[train_split:valid_split], X.iloc[valid_split:]
y_train, y_valid, y_test = y.iloc[:train_split], y.iloc[train_split:valid_split], y.iloc[valid_split:]

# Fit a classifier with default hyperparameters as the baseline
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Baseline predictions on the validation set
y_preds = clf.predict(X_valid)

# Score the baseline on the validation set
baseline_metrics = evaluate_preds(y_valid, y_preds)
baseline_metrics

In [None]:
np.random.seed(42)

# Create a second classifier with different hyperparameters
clf_2 = RandomForestClassifier(n_estimators=100)
clf_2.fit(X_train, y_train)

# Make predictions 
y_preds_2 = clf_2.predict(X_valid)

# Evaluate the second classifier
clf_2_metrics = evaluate_preds(y_valid, y_preds_2)


In [None]:
clf_3 = RandomForestClassifier(n_estimators=100, max_depth=10)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter values RandomizedSearchCV samples from.
# "auto" is no longer accepted for max_features (removed in scikit-learn 1.3;
# for classifiers it meant the same as "sqrt"), so "sqrt" and "log2" are searched.
grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
        "max_depth": [None, 5, 10, 20, 30],
        "max_features": ["sqrt", "log2"],
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1, 2, 4]}

np.random.seed(42)

# Split into X and y
X = heart_disease_shuffled.drop("target", axis = 1)
y = heart_disease_shuffled['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier(n_jobs=1) # -1 = all processors

# setup RandomizedSearchCV
rs_clf = RandomizedSearchCV(estimator=clf,
                            param_distributions = grid,
                            n_iter = 10,
                            cv=5,
                            verbose=2)
# Fit the RandomizedSearchCV version of clf
rs_clf.fit(X_train, y_train);

In [None]:
rs_clf.best_params_

In [None]:
# Make predictions with the best hyperparameters
rs_y_preds = rs_clf.predict(X_test)

# Evaluate the predictions
rs_metrics = evaluate_preds(y_test, rs_y_preds)

## Hyperparameter tuning with GridSearchCV

In [None]:
grid

In [None]:
3*1*2*1*2*5     # parameters

In [None]:
# Reduced grid for the exhaustive search: 3 * 1 * 2 * 1 * 2 = 12 combinations.
# "auto" is no longer accepted for max_features (removed in scikit-learn 1.3;
# for classifiers it meant the same as "sqrt"), so "sqrt" and "log2" are searched.
grid_2 = {'n_estimators': [100, 200, 500],
          'max_depth': [None],
          'max_features': ['sqrt', 'log2'],
          'min_samples_split': [6],
          'min_samples_leaf': [1, 2]}

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split

np.random.seed(42)

# Split into X and y
X = heart_disease_shuffled.drop("target", axis = 1)
y = heart_disease_shuffled['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier(n_jobs=1) # -1 = all processors

# setup GridSearchCV
gs_clf = GridSearchCV(estimator=clf,
                            param_grid = grid_2,
                            cv=5,
                            verbose=2)
# Fit the GridSearchCV version of clf
gs_clf.fit(X_train, y_train);

In [None]:
gs_y_preds = gs_clf.predict(X_test)

# Evaluate the predictions
gs_metrics = evaluate_preds(y_test, gs_y_preds)

Let's compare our different models' metrics.


In [None]:
compare_metrics = pd.DataFrame({"baseline": baseline_metrics,
                               "clf_2": clf_2_metrics,
                                "random search": rs_metrics,
                                "grid search": gs_metrics})

compare_metrics.plot.bar(figsize=(10, 8));

## Saving and loading machine learning models

Two ways:
* With Python's `pickle` module
* With the `joblib` module
  

In [None]:
import pickle

# Save an existing model to file
# The context manager closes the file once the model has been written
with open("gs_random_forest_model_1.pkl", "wb") as model_file:
    pickle.dump(gs_clf, model_file)

In [None]:
# Load a saved model
# The file name must match the one written by pickle.dump, and the variable
# name must match the one the prediction cell uses (loaded_pickle_model)
with open("gs_random_forest_model_1.pkl", "rb") as model_file:
    loaded_pickle_model = pickle.load(model_file)


In [None]:
# Make some predictions
pickle_y_preds = loaded_pickle_model.predict(X_test)
evaluate_preds(y_test, pickle_y_preds)

## Joblib

In [None]:
from joblib import dump, load

# Save model to file (the file name has to be passed as a string)
dump(gs_clf, filename="gs_random_forest_model_1.joblib")

In [None]:
# Import a saved joblib model
# Named loaded_joblib_model because that is the name the prediction cell uses
loaded_joblib_model = load(filename="gs_random_forest_model_1.joblib")

In [None]:
# Make and evaluate joblib model
joblib_y_preds = loaded_joblib_model.predict(X_test)
evaluate_preds(y_test, joblib_y_preds)