In [51]:
# Standard Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### NOTE: 
Always separate your training and test sets before making any changes to the dataset 
(such as filling or removing missing values). Learn fill values/encodings from the training set ONLY, then apply the same fitted transformation to the test set.

# 1. Ready Your Data

### Four main things: 
* Split the data into features and labels 
* Split data into training and test sets
* Converting non numerical values to numerical (like Price)
* Filling (imputing) or disregarding missing values.

In [52]:
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


## Split the Data into FEATURES and LABELS

In [53]:
# Features (X): every column except "target".
# "target" is the value we want to predict; all the other columns
# are the inputs that will be USED to predict it,
# so separate them out.

X = heart_disease.drop("target", axis=1)
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [54]:
y = heart_disease["target"]
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

## Split the data into TEST and TRAINING SETS

In [55]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Converting Non Numerical to Numerical

In [56]:
car_sales = pd.read_csv("https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/car-sales-extended.csv")
car_sales

In [57]:
# Split X/y
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [58]:
# Categories into Numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Which columns need converting? The categorical ones.
# "Doors" is treated as categorical as well: 856 cars have 4 doors, 79 have 5 and 65 have 3,
# so it can be handled as a small set of categories.
categorical_features = ["Make", "Colour", "Doors"]

# Instantiate OneHotEncoder (copy paste if you dont get it)
one_hot = OneHotEncoder()

# Create a transformer that one-hot encodes the categorical columns and passes
# every other column through unchanged (copy paste, just change categorical features)
transformer = ColumnTransformer([("one_hot",
                                  one_hot,
                                  categorical_features)],
                                  remainder="passthrough")

# Convert into numbers.
# NOTE(review): this fits the encoder on the full X rather than on X_train only.
transformed_X = transformer.fit_transform(X)
transformed_X

In [59]:
pd.DataFrame(transformed_X)

In [60]:
# ANOTHER WAY: one-hot encode with pandas instead of scikit-learn.
# get_dummies() only converts object/string/category columns by default, so the
# numeric "Doors" column must be named in `columns` to be encoded like the others.
dummy_columns = ["Make", "Colour", "Doors"]
dummies = pd.get_dummies(car_sales[dummy_columns], columns=dummy_columns)
dummies

## Missing Values in Dataset
* Fill them with some value (imputation)
* Remove samples with missing data

In [61]:
car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing.isna().sum()
car_sales_missing

In [62]:
# Create X/y

X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [63]:
## Convert data to numbers
# Thus, categorise and convert them to numbers just like before

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                   one_hot,
                                   categorical_features)],
                                   remainder="passthrough")

transformed_X = transformer.fit_transform(X)
transformed_X

### Fill Missing Data using pd

In [64]:
# Fill the missing feature values column by column.
# Each result is assigned back to its column: calling fillna(..., inplace=True) on
# car_sales_missing["col"] is chained assignment, which pandas warns about and which
# does not update the parent DataFrame when Copy-on-Write is active.

# Fill the "Make" column with a placeholder category
car_sales_missing["Make"] = car_sales_missing["Make"].fillna("missing")

# Fill the "Colour" column with a placeholder category
car_sales_missing["Colour"] = car_sales_missing["Colour"].fillna("missing")

# Fill the "Odometer (KM)" column with the column mean
car_sales_missing["Odometer (KM)"] = car_sales_missing["Odometer (KM)"].fillna(
    car_sales_missing["Odometer (KM)"].mean()
)

# Fill the "Doors" column with 4
car_sales_missing["Doors"] = car_sales_missing["Doors"].fillna(4)

In [65]:
car_sales_missing.isna().sum()
# The NaN values in the feature columns (X) that were filled above are gone,
# but some labels (y, the "Price" column) are still missing. A sample without
# a label gives the model nothing to learn from or be scored against.

# So we will remove the ROWS that have no label (not a column).

### Remove rows with missing labels using pandas

In [66]:
car_sales_missing.dropna(inplace=True)

In [67]:
car_sales_missing.isna().sum() , len(car_sales_missing)
# Now no missing values !
# But we lost 50 samples

In [68]:
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [69]:
## Convert data to numbers

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                   one_hot,
                                   categorical_features)],
                                   remainder="passthrough")

transformed_X = transformer.fit_transform(X)
print(transformed_X)

## EXTRA: Check out Feature Scaling: 
In other words, making sure all of your numerical data is on the same scale.

For example, say you were trying to predict the sale price of cars and the number of kilometres on their odometers varies from 6,000 to 345,000 but the median previous repair cost varies from 100 to 1,700. A machine learning algorithm may have trouble finding patterns in these wide-ranging variables.

To fix this, there are two main types of feature scaling.

Normalization (also called min-max scaling) - This rescales all the numerical values to between 0 and 1, with the lowest value mapped to 0 and the highest previous value mapped to 1. Scikit-Learn provides functionality for this in the `MinMaxScaler` class.

Standardization - This subtracts the mean value from all of the features (so the resulting features have 0 mean). It then scales the features to unit variance (by dividing the feature by the standard deviation). Scikit-Learn provides functionality for this in the `StandardScaler` class.

### Filling and transforming using sklearn

In [70]:
car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing.isna().sum()

In [71]:
# Drop the rows with no labels
car_sales_missing.dropna(subset=["Price"], inplace=True)
car_sales_missing.isna().sum()

In [72]:
# Split into X & y
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [73]:
# Split data into train and test
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2)
X.isna().sum()

In [74]:
# Fill missing values with Scikit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with 'missing' & numerical values with mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing") # Categorical Imputer
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean") # Numerical Imputer

# Define columns
cat_features = ["Make", "Colour"]
door_feature = ["Doors"]
num_features = ["Odometer (KM)"]

# Create an imputer (something that fills missing data)
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, num_features)
])

# Fill train and test values separately
filled_X_train = imputer.fit_transform(X_train) # since training set: using fit_transform
filled_X_test = imputer.transform(X_test) # since test set: only transform

# Check filled X_train
filled_X_train

In [75]:
# Get our transformed data array's back into DataFrame's
car_sales_filled_train = pd.DataFrame(filled_X_train, 
                                      columns=["Make", "Colour", "Doors", "Odometer (KM)"])

car_sales_filled_test = pd.DataFrame(filled_X_test, 
                                     columns=["Make", "Colour", "Doors", "Odometer (KM)"])

# Check missing data in training set
car_sales_filled_train.isna().sum()

In [76]:
# Convert the dataframes into numbers as before:
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", 
                                 one_hot, 
                                 categorical_features)],
                                 remainder="passthrough")

# Fill train and test values separately

# fit_transform on training data
transformed_X_train = transformer.fit_transform(car_sales_filled_train)
# only transform on test data
transformed_X_test = transformer.transform(car_sales_filled_test)

# Check transformed and filled X_train
transformed_X_train.toarray()

# 2. Choosing an Estimator

Some things to note:

* Sklearn refers to machine learning models, algorithms as estimators.
* Classification problem - predicting a category (heart disease or not)
    * Sometimes you'll see clf (short for classifier) used as a classification estimator
* Regression problem - predicting a number (selling price of a car)

If you're working on a machine learning problem and looking to use Sklearn and not sure what model you should use, refer to the sklearn machine learning map: https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

1. If you have structured data: like Tables or Dataframes, prefer Ensemble Methods Like RandomForestClassifier/Regressor
  
2. If you have unstructured data, prefer Deep Learning or Transfer Learning Methods

### Regression Problem: Picking an estimator
Let's use the California Housing Dataset

In [77]:
 # Getting it, since its inbuilt: 
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing

In [78]:
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])
housing_df

In [79]:
# what we hqave to predict ? target: so make like an output column

housing_df["target"] = housing["target"]
housing_df

So what do we have to do? Predict the value of target by using the other feature columns

#### Step 1: Ready the Data: 

In [80]:
# 1. Split in features and labels

X = housing_df.drop("target", axis=1)
y = housing_df["target"] # which is: median houseprice in $100,000 (read docs)

# Setup Random Seed: 
np.random.seed(42)

# 2. Split the dataset into training and test datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 4. Fill missing data: But no missing data is there: housing_df.isna().sum()

#### Step 2: Choose an Estimator

##### Refer to the chart: 
Start -> Samples>50: Yes -> Categorical? No -> Predicting a quantity: Yes -> Samples>100k? No -> few features should be important: DONT KNOW: SO DO BOTH ALGOS  
    
RidgeRegression, if not work then Ensemble Regression  
OR   
Elastic Net

In [81]:
# 1. Split in features and labels

X = housing_df.drop("target", axis=1)
y = housing_df["target"] # which is: median houseprice in $100,000 (read docs)

# Setup Random Seed: 
np.random.seed(42)

# 2. Split the dataset into training and test datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 4. Fill missing data: But no missing data is there: housing_df.isna().sum()
# ---------------------------------------------------------------------------------

# import estimator: RidgeRegression
from sklearn.linear_model import Ridge

# Setup Random Seed: 
np.random.seed(42)

# Instantiate and fit the model
model = Ridge()
model.fit(X_train, y_train)

# Check the score of the model (TEST SET)
model.score(X_test, y_test)

In [82]:
# Regression estimator: Lasso on the housing data.

# 1. Split in features and labels

X = housing_df.drop("target", axis=1)
y = housing_df["target"] # which is: median houseprice in $100,000 (read docs)

# Setup Random Seed: 
np.random.seed(42)

# 2. Split the dataset into training and test datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 4. Fill missing data: But no missing data is there: housing_df.isna().sum()
# ---------------------------------------------------------------------------------

# import Estimator: Lasso
from sklearn import linear_model

# Setup Random Seed: 
np.random.seed(42)

# Instantiate and fit the model
model = linear_model.Lasso()
model.fit(X_train, y_train)

# Check the score of the model (TEST SET)
model.score(X_test, y_test)

In [None]:
# 1. Split in features and labels

X = housing_df.drop("target", axis=1)
y = housing_df["target"] # which is: median houseprice in $100,000 (read docs)

# Setup Random Seed: 
np.random.seed(42)

# 2. Split the dataset into training and test datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 4. Fill missing data: But no missing data is there: housing_df.isna().sum()
# ---------------------------------------------------------------------------------

# Support Vector Regression (SVR): a support-vector estimator, NOT an ensemble method.
# The features are standardised first, so the scaler and the SVR are chained in a pipeline.

from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Setup Random Seed: 
np.random.seed(42)

# Pipeline: StandardScaler -> SVR
model = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
model.fit(X_train, y_train)

# Check the score of the model (TEST SET)
model.score(X_test, y_test)

In [None]:
# 1. Split in features and labels

X = housing_df.drop("target", axis=1)
y = housing_df["target"] # which is: median houseprice in $100,000 (read docs)

# Setup Random Seed: 
np.random.seed(42)

# 2. Split the dataset into training and test datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 4. Fill missing data: But no missing data is there: housing_df.isna().sum()
# ---------------------------------------------------------------------------------

# ENSEMBLE Algorithm: RandomForestRegressor: Ensemble Regressor Algorithm

from sklearn.ensemble import RandomForestRegressor

# Setup Random Seed: 
np.random.seed(42)

model = RandomForestRegressor()
model.fit(X_train, y_train)

model.score(X_test, y_test)

# This works the best !

### Classification Problem: Choosing an Estimator

In [None]:
# Heart Disease was a classification problem 
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease.head()

Refer the cheatsheet, it leads us to: LinearSVC


In [None]:
# Setup random Seed
np.random.seed(42)

# Make the data: 
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split into training and test datasets:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Import: 
from sklearn.svm import LinearSVC

model = LinearSVC()
model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Setup random Seed
np.random.seed(42)

# Make the data: 
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split into training and test datasets:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Import: 
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train, y_train)

model.score(X_test, y_test)

# 3. Fit the estimator to make predictions

2 ways to make predictions: 
1. `predict()` 
2. `predict_proba()`

In [None]:
# Setup random Seed
np.random.seed(42)

# Make the data: 
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split into training and test datasets:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Import: 
from sklearn.svm import LinearSVC

# Instantiate The model
model = LinearSVC()

# Fit the model
model.fit(X_train, y_train)

# Test accuracy
model.score(X_test, y_test)

# Make predictions using the model
model.predict(X_test)

In [None]:
# we will compare the predictions with the truth: y_test: 
np.array(y_test)

In [None]:
# Compare predictions to truth labels to evaluate the model
y_preds = model.predict(X_test)
np.mean(y_preds == y_test)

In [None]:
model.score(X_test, y_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)

`predict_proba()`   
predict_proba() returns probabilities of a classification label 

In [None]:
# The SVC model needs a different call here: probabilities are requested through
# `_predict_proba_lr` rather than `predict_proba`.
# NOTE(review): the leading underscore marks this as a private helper, so it may
# change between scikit-learn versions -- confirm against the installed version.
model._predict_proba_lr(X_test[:5])

In [None]:
model.predict(X_test[:5])

It shows how certain the model is about each prediction; we want it to be very confident.  
Probability of getting 0: 0.61 (61%)  
Probability of getting 1: 68%

### `predict()` for our regression problem 

In [83]:
housing_df

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Create X y
X = housing_df.drop("target", axis=1)
y = housing_df["target"]

# Split into training and test datasets:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the model
model = RandomForestRegressor()

# Fit the model
model.fit(X_train, y_train)

# Test its accuracy
model.score(X_test, y_test)

# prediction
model.predict(X_test)

# 4. Evaluating a model
<br>

### Classification Model:
<br>
        
1. Estimator's built in: score() method  
2. The scoring parameter  
3. Problem specific evaluation metrics: Classification Report
<br>
 <br>
### Regression Model: 
<br>
1. R^2 or coefficient of determination  <br>
2. Mean Absolute Error (MAE)   <br>
3. Mean Squared Error (MSE)   <br>
4. Scoring Parameter here also  <br> <br>

## Classification Model

### Score() Method

In [None]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test)

### scoring parameter

In [None]:
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestClassifier()
model.fit(X_train, y_train);

In [None]:
np.random.seed(42)
model.score(X_test,y_test)

#### Accuracy, Precision, Recall, F1

In [None]:
np.random.seed(42)
cross_val_acc = cross_val_score(model, X,y, scoring=None)
cross_val_acc
# If scoring=None, the estimator's default evaluation metric is used
# (accuracy for classification models).
# In other words: for a classification model this is the same as scoring="accuracy".

In [None]:
# Finding out the mean of the 5 scores found
np.random.seed(42)
model_cross_val_score = np.mean(cross_val_acc)
model_cross_val_score

#### Precision

In [None]:
np.random.seed(42)
cross_val_prec = cross_val_score(model, X,y, scoring="precision")
cross_val_prec
print(f"The cross-validated precision is: {np.mean(cross_val_prec)}")

#### Recall

In [None]:
# Recall
np.random.seed(42)
cv_recall = cross_val_score(model, X, y, cv=5, scoring="recall")
print(f"The cross-validated recall is: {np.mean(cv_recall)}")

### Evaluation Metrics

1. Accuracy, Precision, Recall, F1
2. Area under ROC curve
3. Confusion Matrix
4. Classification Report

#### Accuracy 

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(42)

# Create X & y
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create model
model = RandomForestClassifier()

# Fit model
model.fit(X_train, y_train)

# Make predictions
y_preds = model.predict(X_test)

# Evaluate model using evaluation functions
print("Classifier metrics on the test set")
print(f"Accurracy: {accuracy_score(y_test, y_preds)*100:.2f}%")
print(f"Precision: {precision_score(y_test, y_preds)}")
print(f"Recall: {recall_score(y_test, y_preds)}")
print(f"F1: {f1_score(y_test, y_preds)}")

In [None]:
print(f"Heart Disease Classifier has a Cross-Validated Accuracy of {model_cross_val_score*100:.2f}%")

#### Area under the Receiver Operating Characteristic Curve (AUC/ROC Curve)
#### BINARY CLASSIFICATION PROBLEMS ONLY !


ROC curves are a comparison of a model's true positive rate (tpr) versus a model's false positive rate (fpr).  

* True positive = model predicts 1 when truth is 1  
* False positive = model predicts 1 when truth is 0 
* True negative = model predicts 0 when truth is 0  
* False negative = model predicts 0 when truth is 1


In [None]:
from sklearn.metrics import roc_curve

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestClassifier()
model.fit(X_train, y_train);

y_probs = model.predict_proba(X_test)
y_probs[:10]

In [None]:
# Keep only the probability of the positive class (column 1) as a 1-D array.
# `[:, 1]` gives shape (n_samples,), the shape roc_curve() and roc_auc_score()
# document for y_score; `[:, 1:]` would give an (n_samples, 1) column instead.
y_probs_positive = y_probs[:, 1]
y_probs_positive[:10]

In [None]:
# Calculate fpr, tpr, thresholds: 
fpr, tpr, thresholds = roc_curve(y_test, y_probs_positive)

# Check positive rates: 
fpr

In [None]:
# Create a function to plot roc_curve so it makes more sense
%matplotlib inline
import matplotlib.pyplot as plt
# Create a Figure and Axes object
fig, ax = plt.subplots()

# Plot roc curve
ax.plot(fpr, tpr, color="orange", label="ROC")

# Plot line with no predictive power (baseline)
ax.plot([0, 1], [0, 1], color="darkblue", linestyle="--", label="Guessing")

# Customize the plot
ax.set(xlabel="False positive rate (fpr)", 
       ylabel="True positive rate (tpr)", 
       title="Receiver Operating Characteristic (ROC) Curve")
ax.legend()

In [None]:
# Get the auc score: 
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_probs_positive)

#### Confusion Matrix
  
A confusion matrix is a quick way to compare the labels a model predicts and the actual labels it was supposed to predict. In essence, giving you an idea of where the model is getting confused.

In [None]:
from sklearn.metrics import confusion_matrix

y_preds = model.predict(X_test)
confusion_matrix(y_test, y_preds)

In [None]:
# Visualize confusion matrix: 
pd.crosstab(y_test, 
            y_preds, 
            rownames=["Actual Label"], 
            colnames=["Predicted Label"])

So we can see that our model is predicting 0 as 0 (correct) 24 times BUT it is also sometimes predicting 1 when the actual value was 0 (wrong)  
The off-diagonal entries 5 and 4 are the times our model was wrong (was getting confused)  
<br>
    
SO 24 and 28 (the diagonal) are the true negatives and true positives,  
WHILE 5 and 4 (off the diagonal) are the misclassifications: the false positives and false negatives


In [None]:
# Another Visualisation of Confusion Matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Evaluate on the held-out test split only: the model was fitted on X_train, so
# plotting against the full X/y would include samples it has already seen.
ConfusionMatrixDisplay.from_estimator(estimator=model, X=X_test, y=y_test);

# OR YOU CAN DO: build the same plot from predictions that were already made
y_preds = model.predict(X_test)
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_preds);

#### Classification Report

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_preds))

<img src="./Images/classification-report.png" alt="Alternative text" />

## Regression Model

### R^2 or coefficient of determination

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Create X y
X = housing_df.drop("target", axis=1)
y = housing_df["target"]

# Split into training and test datasets:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate the model
model = RandomForestRegressor()

# Fit (train) the model on the training split
model.fit(X_train, y_train)

# For a regressor, score() itself uses R^2 (the coefficient of determination)
model.score(X_test, y_test)

### Mean Absolute Error (MAE)
<br>
MAE is the average of the absolute differences between predictions and actual values.

It gives you an idea of how wrong your models predictions are.

In [None]:
# MAE
from sklearn.metrics import mean_absolute_error
np.random.seed(42)

y_preds = model.predict(X_test)
mae = mean_absolute_error(y_test, y_preds)
mae

# This means OUR MODELS values are +/-0.32 from the actual value. It is OFF by 0.32
# You can say its the mean of the absolute differences btw actual and predicted values

In [None]:
df = pd.DataFrame(data={"actual values": y_test,
                        "predicted values": y_preds})
df["differences"] = df["predicted values"] - df["actual values"]
df.head(10)

In [None]:
# MAE using formulas and differences
np.random.seed(42)
np.abs(df["differences"]).mean()
# Absolute of the differences: and its mean

### Mean Squared Error: MSE
<br>
MSE is the mean of the square of the errors between actual and predicted values.

In [None]:
# Mean squared error
from sklearn.metrics import mean_squared_error

np.random.seed(42)
y_preds = model.predict(X_test)
mse = mean_squared_error(y_test, y_preds)
mse

In [None]:
df["squared_differences"] = np.square(df["differences"])
df.head()

In [None]:
# Calculate MSE by hand/formula
np.random.seed(42)
squared = np.square(df["differences"])
squared.mean()

# This is the mean of the SQUARED differences between actual and predicted values
# (the author's run gave roughly 0.25).
# Because the errors are squared, the value is in squared target units and cannot be
# read as "+/- this much" the way MAE can; take its square root (RMSE) for that.

### Scoring Parameter for Regression

In [None]:
np.random.seed(42)
cv_r2 = cross_val_score(model, X, y, cv=3, scoring=None)
np.mean(cv_r2)

In [None]:
# Mean squared error
np.random.seed(42)
cv_mse = cross_val_score(model, X, y, cv=5, scoring="neg_mean_squared_error")
np.mean(cv_mse)

In [None]:
# Mean absolute error
cv_mae = cross_val_score(model, X, y, cv=5, scoring="neg_mean_absolute_error")
np.mean(cv_mae)

<img src="./Images/regression-evaluation.png" alt="Alternative text" />

# Choosing Of all: 

<img src="./Images/Evaluation/1.png" alt="Alternative text" />

<img src="./Images/Evaluation/2.png" alt="Alternative text" />

<img src="./Images/Evaluation/3.png" alt="Alternative text" />

<img src="./Images/Evaluation/4.png" alt="Alternative text" />

# 5. Improving a model 

First predictions = baseline predictions.
First model = baseline model.

From a data perspective:
* Could we collect more data? (generally, the more data, the better) 
* Could we improve our data? 

From a model perspective:
* Is there a better model we could use?
* Could we improve the current model? 

Hyperparameters vs. Parameters
* Parameters = the patterns in the data that the model finds (learns) by itself
* Hyperparameters = settings on a model you can adjust to (potentially) improve its ability to find patterns

Three ways to adjust hyperparameters:
1. By hand
2. Randomly with `RandomizedSearchCV`
3. Exhaustively with GridSearchCV  
 <br>  
 #### When comparing models, you should be careful to make sure they're compared on the same splits of data.

## Tuning hyperparameters BY HAND
<br>

Let's make 3 sets, training, validation and test.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fresh classifier with default settings, created only to inspect its hyperparameters
model = RandomForestClassifier()
model.get_params()

# These are the hyperparameters we can change

 We're going to try and adjust:

* `max_depth`
* `max_features`
* `min_samples_leaf`
* `min_samples_split`
* `n_estimators`

In [None]:
def evaluate_preds(y_true, y_preds):
    """
    Score classification predictions against the true labels.

    Computes accuracy, precision, recall and F1 (each metric function is
    called with its default arguments), prints them, and returns them.

    Parameters
    ----------
    y_true : the ground-truth labels.
    y_preds : the labels predicted by a model, in the same order as y_true.

    Returns
    -------
    dict with keys "accuracy", "precision", "recall" and "f1", each value
    rounded to two decimal places.
    """
    # Compute every metric up front so nothing is printed if one of them fails.
    scores = {
        "accuracy": accuracy_score(y_true, y_preds),
        "precision": precision_score(y_true, y_preds),
        "recall": recall_score(y_true, y_preds),
        "f1": f1_score(y_true, y_preds),
    }

    # Human-readable report; accuracy is shown as a percentage.
    print(f"Acc: {scores['accuracy'] * 100:.2f}%")
    print(f"Precision: {scores['precision']:.2f}")
    print(f"Recall: {scores['recall']:.2f}")
    print(f"F1 score: {scores['f1']:.2f}")

    # Rounded copy for later comparison between models.
    return {metric: round(value, 2) for metric, value in scores.items()}

In [None]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

# Shuffle the data 
heart_disease_shuffled = heart_disease.sample(frac=1) # shuffle 100% of the data

# Split into X and y
X = heart_disease_shuffled.drop("target", axis=1)
y = heart_disease_shuffled["target"]

# Split into TRAIN, VALIDATION and TEST sets:
# 70% for training, the next 15% for validation and the remaining 15% for test
train_split = round(0.7 * len(heart_disease_shuffled))
valid_split = round(train_split + 0.15 * len(heart_disease_shuffled)) # end of the NEXT 15% of data

# .iloc slices by row position, which is what we want because the index is shuffled.
# X and y must always be cut at the SAME boundaries so rows and labels stay aligned.
X_train, y_train = X.iloc[:train_split], y.iloc[:train_split]
X_valid, y_valid = X.iloc[train_split:valid_split], y.iloc[train_split:valid_split]
X_test, y_test = X.iloc[valid_split:], y.iloc[valid_split:]

# Sanity checks: the three splits cover every row once, and features match labels
assert len(X_train) + len(X_valid) + len(X_test) == len(X)
assert len(X_train) == len(y_train)
assert len(X_valid) == len(y_valid)
assert len(X_test) == len(y_test)

# Baseline model with default hyperparameters
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Making predictions on the valid set
y_preds = model.predict(X_valid)

# Evaluate on valid set
baseline_metrics = evaluate_preds(y_valid, y_preds)
baseline_metrics

In [None]:
np.random.seed(42)

# Create a second classifier with different hyperparameters
model_2 = RandomForestClassifier(max_depth=10)
model_2.fit(X_train, y_train)

y_preds_2 = model_2.predict(X_valid)

model_2_metrics = evaluate_preds(y_valid, y_preds_2)

# Very difficult to change these by hand so will use inbuilt scikit learn function

## RandomizedSearchCV: Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
        "max_depth": [None, 5, 10, 20, 30],
#         "max_features": ["auto", "sqrt"],
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1, 2, 4]}

np.random.seed(42)

# Split into X & y
X = heart_disease_shuffled.drop("target", axis=1)
y = heart_disease_shuffled["target"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_jobs=-1)

# Setup RandomizedSearchCV
rs_clf = RandomizedSearchCV(estimator=clf,
                            param_distributions=grid, 
                            n_iter=20, # number of models to try
                            cv=5,
                            verbose=2)

# Fit the RandomizedSearchCV version of clf
rs_clf.fit(X_train, y_train);

In [None]:
rs_clf.best_params_

In [None]:
# Predicting with the best found model
rs_clf_preds = rs_clf.predict(X_test)

# Evaluating this model with the function we created: 
rs_metrics = evaluate_preds(y_test, rs_clf_preds)

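 ## GridSearchCV: Hyperparameter Tuning 
 
 GridSearchCV is the brute-force method: it tries EVERY combination in the grid.
 With the full grid used above that would be:   
 6 * 5 * 3 * 3 * 5(cross valid=5) => 1350 model fits  
 That's a LOT for your computer, and consider a large dataset!
 
 RandomizedSearchCV avoided this by sampling only `n_iter=20` combinations (20 * 5 = 100 fits), so for GridSearchCV we shrink the grid to a few promising values, which reduces these numbers drastically!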

In [None]:
grid
# we must reduce the parameters we are passing here: but how to choose which to keep?

In [None]:
# Make a new grid based on the results of randomsearch: 
grid_2 = {'n_estimators': [100, 200, 500],
          'max_depth': [None],
          'max_features': ['sqrt'],
          'min_samples_split': [6],
          'min_samples_leaf': [1, 2]}

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split

# Re-seed NumPy's global RNG (same seed as the random-search cell)
np.random.seed(42)

# Features: every column except "target"; labels: the "target" column
X = heart_disease_shuffled.drop(columns="target")
y = heart_disease_shuffled["target"]

# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Base estimator handed to the search
# NOTE(review): n_jobs=1 here, whereas the random-search cell used n_jobs=-1 — confirm this is intentional
model = RandomForestClassifier(n_jobs=1)

# Exhaustive search: every combination in grid_2 is cross-validated
gs_model = GridSearchCV(
    estimator=model,
    param_grid=grid_2,
    cv=5,
    verbose=2,
)

# Fit the GridSearchCV wrapper around `model` (the trailing ';' hides the estimator repr)
gs_model.fit(X_train, y_train);

In [None]:
# Best hyperparameter combination found by the grid search
gs_model.best_params_

In [None]:
# Predict on the held-out test set with the grid-search model
gs_y_preds = gs_model.predict(X_test)

# Evaluate the predictions with the same helper used for the other models
gs_metrics = evaluate_preds(y_test, gs_y_preds)

In [None]:
# Collect the metrics of every model side by side, one column per model.
# Assumes each *_metrics object maps metric name -> value, as returned by
# evaluate_preds — TODO confirm.
compare_metrics = pd.DataFrame(
    {
        "baseline": baseline_metrics,
        "model_2": model_2_metrics,
        "random search": rs_metrics,
        "grid search": gs_metrics,
    }
)

# Grouped bar chart of the comparison (the trailing ';' hides the Axes repr)
compare_metrics.plot(kind="bar", figsize=(10, 8));

# 6. Saving and loading trained ML models

Two ways to save and load machine learning models:
1. With Python's `pickle` module
2. With the `joblib` module

**Pickle**

In [None]:
import pickle

# Save an existing model to file.
# The context manager closes the file as soon as the dump finishes, so the
# pickle is completely written to disk before a later cell reads it back.
with open("gs.pkl", "wb") as model_file:  # "wb" = write binary
    pickle.dump(gs_model, model_file)

In [None]:
# Load a saved model.
# NOTE: unpickling can execute code embedded in the file, so only load pickle
# files that you created yourself or otherwise trust.
with open("gs.pkl", "rb") as model_file:  # "rb" = read binary
    loaded_pickle_model = pickle.load(model_file)

In [None]:
# Make some predictions with the model reloaded from the pickle file and evaluate them.
# If the save/load round trip worked, these should match the grid-search model's results.
pickle_y_preds = loaded_pickle_model.predict(X_test)
evaluate_preds(y_test, pickle_y_preds)

**Joblib**

In [None]:
from joblib import dump, load

# Save the fitted grid-search model to file with joblib
dump(gs_model, filename="gs_joblib.joblib")

In [None]:
# Load the model back from the joblib file written in the previous cell
loaded_joblib_model = load(filename="gs_joblib.joblib")

In [None]:
# Make predictions with the joblib-loaded model and evaluate them
joblib_y_preds = loaded_joblib_model.predict(X_test)
evaluate_preds(y_test, joblib_y_preds)

# 7. Putting it all together

In [None]:
# Load the car-sales CSV and display it
data = pd.read_csv("car-sales-extended-missing-data.csv")
data

In [None]:
# Inspect the dtype of each column
data.dtypes

In [None]:
# Count the missing values in each column
data.isna().sum()

In [None]:
# Getting data ready: 
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# Seed NumPy's global RNG before the randomised steps below
np.random.seed(42)

# Load the raw data
data = pd.read_csv("car-sales-extended-missing-data.csv")

# Drop the rows whose target value ("Price") is missing
data = data.dropna(subset=["Price"])

# Categorical columns: impute first, THEN one-hot encode. Both steps have to be
# applied to the same columns in sequence, which is why they live in a Pipeline.
categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Door count: fill missing values with the constant 4
door_feature = ["Doors"]
door_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=4)),
])

# Numeric column: fill missing values with the column mean
numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
])

# Apply each transformer to its own group of columns in one go
preprocessor = ColumnTransformer([
    ("cat", categorical_transformer, categorical_features),
    ("door", door_transformer, door_feature),
    ("num", numeric_transformer, numeric_features),
])

# Full pipeline: preprocessing followed by the regressor
model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor()),
])

# Separate features and target, then hold out 20% of the rows as the test set
X = data.drop(columns="Price")
y = data["Price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit on the training data; the last expression displays the test-set score
model.fit(X_train, y_train)
model.score(X_test, y_test)

It's also possible to use `GridSearchCV` or `RandomizedSearchCV` with our `Pipeline`.

In [None]:
# Use GridSearchCV with our regression Pipeline
from sklearn.model_selection import GridSearchCV

# Parameter names use double underscores to reach into nested estimators.
# "preprocessor__num__imputer__strategy" reads, from left to right:
#   "preprocessor" step of `model`
#   -> "num" transformer inside the ColumnTransformer
#   -> its "imputer" step
#   -> that imputer's `strategy` parameter (try "mean" and "median").
# Keys that start with "model__" set parameters on the "model" step.
pipe_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "model__n_estimators": [100, 1000],
    "model__max_depth": [None, 5],
    "model__min_samples_split": [2, 4],
}

# 2 * 2 * 2 * 2 = 16 combinations, each cross-validated over 5 folds
gs_model = GridSearchCV(estimator=model, param_grid=pipe_grid, cv=5, verbose=2)
gs_model.fit(X_train, y_train)

In [None]:
# Score the pipeline found by the grid search on the held-out test set
gs_model.score(X_test, y_test)