![](logo1.jpg)

# **shAI Training 2023 | Level 1**

## Task #8 (End-to-End ML Project {part_2})

## Welcome to the exercises for reviewing the second part of the end-to-end ML project.
**Make sure that you read and understand ch2 from the hands-on ML book (page 72 to the end of the chapter) before starting this notebook.**

**If you get stuck on anything, reread that part of the book and feel free to ask about it in the messenger group as you go along.**

 ## Good Luck : )

## First, run the following cells from the first part of the project to continue your work

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
import os
import tarfile
import urllib
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives both the downloaded archive
            and its extracted contents. Created if it does not exist.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` found under ``housing_path`` with pandas.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        The result of ``pd.read_csv`` on that file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Download the archive, then read the CSV it contains.
fetch_housing_data()
housing = load_housing_data()

# Positional indices of the four count columns, looked up by name in the
# column list of the freshly loaded frame.
rooms_ix, bedrooms_ix, population_ix, household_ix = map(
    list(housing.columns).index,
    ("total_rooms", "total_bedrooms", "population", "households"),
)

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    The ratios are built from the columns at the module-level indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``household_ix``.

    Args:
        add_bedrooms_per_room: When true, a third ratio column
            (bedrooms / rooms) is appended after the other two.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; this transformer learns nothing."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [
            rooms / households,                  # rooms per household
            X[:, population_ix] / households,    # population per household
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        return np.c_[(X, *extra_columns)]

# Hold out 20% of the rows as a test set (fixed seed for repeatability).
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Separate the target column from the training predictors.
housing_labels = train_set["median_house_value"].copy()
housing = train_set.drop(columns="median_house_value")

# Numeric columns are everything except the single categorical column.
cat_attribs = ["ocean_proximity"]
housing_num = housing.drop(columns="ocean_proximity")
num_attribs = list(housing_num)

# Numeric branch: median imputation -> ratio features -> standardisation.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Route numeric columns through the numeric branch and one-hot encode
# the categorical column.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

# 1- Select and Train a Model

# Let’s first train a LinearRegression model

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a LinearRegression model on the prepared training features and labels.
# The fit call is kept as the last expression so the notebook displays the
# fitted estimator.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)


In [None]:
# Inspect the dimensions of the prepared training data.
housing_prepared.shape

(16512, 16)

# First try it out on a few instances from the training set:


In [None]:
# First five training instances and their matching target values.
# The labels come from `housing_labels`; `housing` holds only predictors.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]


In [None]:
# Predict on the prepared training data and compare the first five
# predictions with the first five true labels. A notebook cell only displays
# its last expression, so the predictions are printed explicitly.
housing_prediction = lin_reg.predict(housing_prepared)
print("Predictions:", housing_prediction[:5].round(-2))
housing_labels[:5]

14196    103000.0
8267     382100.0
17445    172600.0
14265     93400.0
2271      96500.0
Name: median_house_value, dtype: float64

# measure this regression model’s RMSE on the whole training set
* using Scikit-Learn’s mean_squared_error() function:

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Training-set RMSE of the linear model: predict, take the mean squared
# error against the true labels, then its square root.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(
    y_true=housing_labels,
    y_pred=housing_predictions,
)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

67593.20745775253


# judge on the RMSE result for this model
The error is too large, which means the model is underfitting the training data; this happens when the model isn't powerful enough.

your answer goes here

# Let’s train a Decision Tree Regressor model
## more powerful model

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Train a decision tree (fixed seed) on the prepared training data.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

# Score it on the same data it was trained on.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(
    y_true=housing_labels,
    y_pred=housing_predictions,
)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

# Now evaluate the model on the training set
* using Scikit-Learn’s mean_squared_error() function:

In [None]:
# Training-set RMSE of the decision tree: predictions vs. true labels.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(
    y_true=housing_labels,
    y_pred=housing_predictions,
)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

# Explain this result
An RMSE of 0.0 on the training set is a sign of overfitting, and this shows up clearly under cross-validation.

your answer goes here

# Evaluation Using Cross-Validation

1-split the training set into 10 distinct subsets then train and evaluate the Decision Tree model

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 10-fold cross-validation of the decision tree. The scorer returns
# negated MSE values, so flip the sign before taking the square root.
scores = cross_val_score(
    tree_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_rmse_scores = np.sqrt(-scores)

2- display the resultant scores and calculate its Mean and Standard deviation

In [None]:
# Summarise the per-fold RMSE values of the decision tree.
tree_rmse_mean = tree_rmse_scores.mean()
tree_rmse_std = tree_rmse_scores.std()

print("Scores: ", tree_rmse_scores)
print("Mean: ", tree_rmse_mean)
print("Standard Deviation: ", tree_rmse_std)

Scores:  [65312.86044031 70581.69865676 67849.75809965 71460.33789358
 74035.29744574 65562.42978503 67964.10942543 69102.89388457
 66876.66473025 69735.84760006]
Mean:  68848.18979613911
Standard Deviation:  2579.6785558576307


3-repeat the same steps to compute the same scores for the Linear Regression model

*notice the difference between the results of the two models*

In [None]:
# 10-fold cross-validation of the linear model. The scorer returns negated
# MSE values, so flip the sign before taking the square root.
scores = cross_val_score(
    lin_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
lin_rmse_scores = np.sqrt(-scores)

# Summarise the per-fold RMSE values.
lin_rmse_mean = lin_rmse_scores.mean()
lin_rmse_std = lin_rmse_scores.std()

print("Scores: ", lin_rmse_scores)
print("Mean: ", lin_rmse_mean)
print("Standard Deviation: ", lin_rmse_std)

Scores:  [65000.67382615 70960.56056304 67122.63935124 66089.63153865
 68402.54686442 65266.34735288 65218.78174481 68525.46981754
 72739.87555996 68957.34111906]
Mean:  67828.38677377408
Standard Deviation:  2468.0913950652275


## Let’s train one last model the RandomForestRegressor.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so the scores are repeatable, matching the other models here.
forest_reg = RandomForestRegressor(random_state=42)

# cross_val_score fits clones of the estimator and leaves `forest_reg`
# itself untouched, so fit it explicitly; otherwise any later use of
# `forest_reg` (e.g. saving it) would operate on an unfitted model.
forest_reg.fit(housing_prepared, housing_labels)

# 10-fold cross-validation. The scorer returns negated MSE values, so flip
# the sign before taking the square root.
forest_scores = cross_val_score(
    forest_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
forest_rmse_scores = np.sqrt(-forest_scores)

# Display the same scores, with their mean and standard deviation, for the Random Forest model

In [None]:
# Summarise the per-fold RMSE values of the random forest.
forest_rmse_mean = forest_rmse_scores.mean()
forest_rmse_std = forest_rmse_scores.std()

print("Scores: ", forest_rmse_scores)
print("Mean: ", forest_rmse_mean)
print("Standard Deviation: ", forest_rmse_std)

Scores:  [46846.98837087 51739.7024396  49990.35604603 51890.65222759
 52806.57876113 47232.26690776 47855.78909951 50352.36489744
 49497.08767604 50139.87035109]
Mean:  49835.16567770626
Standard Deviation:  1918.3982632661302


# Save every model you experiment with
*using the joblib library*

In [None]:
import joblib
# Persist each model under a file name that matches its variable name.
# joblib.dump writes the object as it currently is, so every estimator
# should already be fitted at this point.
joblib.dump(lin_reg, 'lin_reg_model.pkl')
joblib.dump(tree_reg, 'tree_reg_model.pkl')
joblib.dump(forest_reg, 'forest_reg_model.pkl')

['forest_reg_model.pkl']

## now you have a shortlist of promising models. You now need to
## fine-tune them!
# Fine-Tune Your Model

## 1- Grid Search
## evaluate all the possible combinations of hyperparameter values for the RandomForestRegressor
*It may take a long time*

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Two sub-grids: the first varies tree count and feature count with the
# default bootstrap setting, the second repeats a smaller sweep with
# bootstrap switched off.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'max_features': [2, 3, 4], 'n_estimators': [3, 10]},
]
forest_reg = RandomForestRegressor(random_state=42)

# 5-fold search scored by negated MSE; training scores are recorded too.
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search.fit(housing_prepared, housing_labels)

with the evaluation scores

In [None]:
# Show the hyperparameter combination selected by the grid search.
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

# Analyze the Best Models and Their Errors
1-indicate the relative importance of each attribute

In [None]:
# Relative importance of each prepared feature according to the best
# random forest found by the grid search.
grid_search.best_estimator_.feature_importances_

2-display these importance scores next to their corresponding attribute names:

In [None]:
# RMSE of every hyperparameter combination tried by the grid search
# (mean test scores are negated MSE values).
cvres = grid_search.cv_results_

for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

# Pair each importance score with its attribute name. The prepared columns
# are ordered as: the numeric attributes, the ratio columns appended by
# CombinedAttributesAdder, then the one-hot categories of the "cat" branch.
feature_importances = grid_search.best_estimator_.feature_importances_
extra_attribs = ["rooms_per_household", "population_per_household",
                 "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

64878.27480854276 {'max_features': 2, 'n_estimators': 3}
55391.003575336406 {'max_features': 2, 'n_estimators': 10}
52721.66494842234 {'max_features': 2, 'n_estimators': 30}
58541.12715494087 {'max_features': 4, 'n_estimators': 3}
51623.59366665994 {'max_features': 4, 'n_estimators': 10}
49787.65951361993 {'max_features': 4, 'n_estimators': 30}
58620.88234614251 {'max_features': 6, 'n_estimators': 3}
51645.862673140065 {'max_features': 6, 'n_estimators': 10}
49917.66994061786 {'max_features': 6, 'n_estimators': 30}
58640.96129790229 {'max_features': 8, 'n_estimators': 3}
51650.365581628095 {'max_features': 8, 'n_estimators': 10}
49672.50940389753 {'max_features': 8, 'n_estimators': 30}
61580.24110015614 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
53889.80996032937 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
58667.89389226964 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52764.2630869393 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}

## Now is the time to evaluate the final model on the test set.
# Evaluate Your System on the Test Set

1-get the predictors and the labels from your test set

In [None]:
# Split the held-out set into the target column and the predictors.
Y_test = test_set["median_house_value"].copy()
X_test = test_set.drop(columns=["median_house_value"])

2-run your full_pipeline to transform the data

In [None]:
# Apply the pipeline already fitted on the training data. Use transform(),
# not fit_transform(): refitting here would learn the imputation, scaling
# and encoding from the test set, so the model would see features on a
# different scale from the one it was trained on.
X_test = full_pipeline.transform(X_test)

3-evaluate the final model on the test set

In [None]:
# Score the best grid-search estimator on the prepared test data.
final_model = grid_search.best_estimator_
final_predictions = final_model.predict(X_test)
final_mse = mean_squared_error(
    y_true=Y_test.values,
    y_pred=final_predictions,
)
final_rmse = np.sqrt(final_mse)
final_rmse

68502.71773483063

# compute a 95% confidence interval for the generalization error
*using scipy.stats.t.interval():*

In [None]:
from scipy import stats

In [None]:
import numpy as np
from scipy import stats

# 95% confidence interval for the generalization RMSE.
#
# The interval is built on the per-sample squared errors: their mean is the
# test-set MSE and their standard error gives the scale of the t-interval.
# The square root of the MSE interval bounds is the interval for the RMSE.
squared_errors = (final_predictions - Y_test.values) ** 2

# Number of test samples and the matching degrees of freedom.
sample_size = len(squared_errors)
degrees_of_freedom = sample_size - 1

# Confidence level of the interval.
confidence_level = 0.95

# Two-sided t-interval around the mean squared error.
lower_mse, upper_mse = stats.t.interval(
    confidence_level,
    degrees_of_freedom,
    loc=squared_errors.mean(),
    scale=stats.sem(squared_errors),
)

# Convert the MSE bounds back to the RMSE scale.
lower_bound = np.sqrt(lower_mse)
upper_bound = np.sqrt(upper_mse)

# Print the confidence interval
print("95% Confidence Interval for Generalization Error:")
print("Lower Bound:", lower_bound)
print("Upper Bound:", upper_bound)


95% Confidence Interval for Generalization Error:
Lower Bound: 68494.73117893835
Upper Bound: 68510.7042907229


# Great Job!
# #shAI_Club