<a href="https://colab.research.google.com/github/Osman-Dawood/Assinment-2/blob/main/Copy_of_02_end_to_end_machine_learning_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

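# End-to-End Machine Learning Project

This notebook loads a housing dataset, creates a stratified train/test split, builds a preprocessing pipeline, and compares several regression models on the training set.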

# Setup

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_squared_error

## Download the Data

In [3]:
# Location of the raw housing CSV; kept in one named constant so it is easy to
# spot and swap.
HOUSING_CSV_URL = (
    "https://raw.githubusercontent.com/Osman-Dawood/IAI600-Lab2/"
    "refs/heads/main/housing.csv"
)
# Read the whole CSV into a DataFrame (requires network access).
housing = pd.read_csv(HOUSING_CSV_URL)

In [4]:
# Summarise column names, non-null counts and dtypes; a non-null count below
# the row count reveals which columns contain missing values.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


## Create a Test Set

In [5]:
 # set income category based on median income
# Bucket median_income into five strata (labelled 1-5) so that the train/test
# split can be stratified on income level. The last bin is open-ended.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_stratum_labels = [1, 2, 3, 4, 5]
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=income_bin_edges,
    labels=income_stratum_labels,
)

In [6]:
from sklearn.model_selection import StratifiedShuffleSplit
    # stratified sampling
# Stratified shuffle splitter: 20% of rows go to the test set, and the
# proportions of each income_cat stratum are preserved in both parts.
split = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

# The previous loop rebuilt and overwrote both DataFrames on every one of the
# ten iterations, so only the final split was ever kept. Unpack that final
# pair of index arrays directly and materialise the frames once.
# NOTE(review): if a single split is all that is wanted, n_splits=1 is the
# conventional setting, but it selects a different partition and would change
# every number reported further down -- confirm before changing it.
*_, (train_index, test_index) = split.split(housing, housing["income_cat"])

# split() yields positional row indices, so select by position (.iloc) rather
# than by label (.loc); the two only coincide for a default RangeIndex.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

# Prepare the Data for Machine Learning Algorithms:

1. Dropping the Target Variable

2. Creating Labels

Feature Set (housing_tr): This DataFrame will be used as the input features for training the machine learning model.

Labels (housing_labels): This Series will be used as the target variable that the model will learn to predict.

In [7]:
# Separate the stratified training set into model inputs and the regression target.
target_column = "median_house_value"
# Features: every training column except the target (drop returns a new frame,
# strat_train_set itself is left untouched).
housing_tr = strat_train_set.drop(columns=target_column)
# Labels: an independent copy of the target column, so later edits to the
# labels cannot write through to strat_train_set.
housing_labels = strat_train_set[target_column].copy()

## Data Cleaning
Most machine learning algorithms cannot work with missing features. For example, we noticed earlier that the total_bedrooms
attribute has some missing values.

The book lists three options for handling these missing values:
```python
housing.dropna(subset=["total_bedrooms"])    # option 1: Get rid of the corresponding districts.
housing.drop("total_bedrooms", axis=1)       # option 2: Get rid of the whole attribute.

median = housing["total_bedrooms"].median()  # option 3: Set the missing values to some value (zero, the mean, the median, etc.). This is called imputation
housing["total_bedrooms"].fillna(median, inplace=True)
```

We go for option 3 since it is the least destructive. Instead of the preceding code, however, we will use Scikit-Learn's `SimpleImputer` with the median strategy: it learns the median of every numerical attribute (including total_bedrooms) and fills in missing values automatically as the first step of the transformation pipeline below.

Remove the text attribute (ocean_proximity) because median can only be calculated on numerical attributes:

In [8]:
# Keep only the numeric columns for the numerical pipeline. Selecting by dtype
# (instead of dropping "ocean_proximity" by name) also leaves out the
# categorical "income_cat" helper column, which exists only to stratify the
# train/test split and should not be fed to the models as a feature.
# Column order is preserved, so the positional indices used by the custom
# attribute transformer (columns 3-6) still point at the same attributes.
housing_num = housing_tr.select_dtypes(include=[np.number])

Let's create a custom transformer to add extra attributes:



In [9]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional indices of the columns that the attribute-adding transformer
# reads from the feature array.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends ratio features to a 2-D array.

    Source columns are located by position through the module-level
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, ``bedrooms / rooms`` is appended as a final extra column.
    """

    def __init__(self, add_bedrooms_per_room=True):  # explicit keyword only: no *args / **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` must support ``X[:, i]`` indexing (e.g. a NumPy array). The
        appended columns are, in order: rooms per household, population per
        household and, if enabled, bedrooms per room.
        """
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # np.c_ stacks the original matrix and each 1-D ratio as columns.
        return np.c_[(X, *extra_columns)]

# Try the transformer on its own, without the bedrooms-per-room column.
# It is applied to the training features (housing_tr); the previous code passed
# the full raw `housing` frame, which also contains the test rows and the
# target column.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing_tr.values)

## Transformation Pipelines



Now let's build a pipeline for preprocessing the numerical attributes: impute then scale the input features.

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Numerical preprocessing, applied in this order:
#   1. fill missing values with each column's median,
#   2. append the ratio attributes,
#   3. standardise every column.
num_pipeline_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_pipeline_steps)

# Fit the steps on the numeric training features and transform them in one go.
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [12]:
# Show the transformed numeric features (NumPy abbreviates the large array).
housing_num_tr

array([[-1.04148597e+00,  1.65121567e+00, -1.07430161e+00, ...,
         4.16033851e-01, -1.91016683e-02, -1.04023120e+00],
       [ 6.60426439e-01, -7.80722331e-01,  1.15609077e+00, ...,
        -7.77138503e-01,  1.41077378e-01,  1.53642540e+00],
       [ 9.24946550e-01, -9.04969476e-03, -9.94644741e-01, ...,
         2.13246095e-02, -1.22271870e-03, -5.34213113e-01],
       ...,
       [-7.37038299e-01,  2.21243214e+00,  1.87300261e+00, ...,
         6.99488383e-01, -4.18980414e-02, -3.74116035e-01],
       [ 6.85381167e-01,  4.72661102e-01, -4.37046645e-01, ...,
         2.19502752e-01, -3.87026165e-02, -4.52204351e-02],
       [-1.18123245e+00,  4.35246671e-01,  1.23574764e+00, ...,
         1.13588186e+00, -1.38087995e-01, -5.11968991e-01]])

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column names routed to each branch of the combined preprocessor.
num_attribs = housing_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Numeric columns go through num_pipeline; the categorical column is one-hot
# encoded. The outputs of both branches are concatenated side by side.
column_transformers = [
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
]
full_pipeline = ColumnTransformer(transformers=column_transformers)

# Fit the whole preprocessor on the training features and transform them.
housing_prepared = full_pipeline.fit_transform(housing_tr)

In [None]:
# Show the fully preprocessed training matrix.
housing_prepared

In [None]:
# Wrap the prepared matrix in a DataFrame to preview its first five rows as a table.
housing_prepared_df = pd.DataFrame(housing_prepared)
housing_prepared_df.head()

In [203]:
# (rows, columns) of the prepared training matrix.
housing_prepared.shape

(16512, 16)

# Select and Train a Model

## Training and Evaluating on the Training Set

In [16]:
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression trained on the prepared training features.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

In [17]:
# Spot-check the linear model: push the first five training rows through the
# already-fitted preprocessing pipeline and compare predictions with labels.
some_data = housing_tr.head(5)
some_labels = housing_labels.head(5)
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))
print("Compare against the actual values:")
print("Labels:", some_labels.tolist())

Predictions: [180216.89833975 156687.86492216 107457.51770809 351247.29587033
 166939.28606723]
Compare against the actual values:
Labels: [124900.0, 152800.0, 70200.0, 404300.0, 133500.0]


Compute the root mean squared error (RMSE) of the linear model on the training set:

In [18]:
from sklearn.metrics import mean_squared_error

# Training-set error of the linear model: predict on the same data it was
# fitted on, then take the square root of the MSE to get an error in the
# target's own units.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

67815.61655105946

In [19]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree regressor; random_state is fixed so the fit is reproducible.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

In [20]:
# Spot-check the decision tree on the first five training rows, using the
# already-fitted preprocessing pipeline.
some_data = housing_tr.head(5)
some_labels = housing_labels.head(5)
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", tree_reg.predict(some_data_prepared))
print("Compare against the actual values:")
print("Labels:", some_labels.tolist())

Predictions: [124900. 152800.  70200. 404300. 133500.]
Compare against the actual values:
Labels: [124900.0, 152800.0, 70200.0, 404300.0, 133500.0]


In [21]:
# Training-set RMSE of the decision tree. This is measured on the data the
# tree was fitted on, so it says nothing about generalisation.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [22]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 100 trees; random_state is fixed so the fit is reproducible.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

In [23]:
# Spot-check the random forest on the first five training rows, using the
# already-fitted preprocessing pipeline.
some_data = housing_tr.head(5)
some_labels = housing_labels.head(5)
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", forest_reg.predict(some_data_prepared))
print("Compare against the actual values:")
print("Labels:", some_labels.tolist())

Predictions: [144584.   147590.    77853.   391366.03 140300.  ]
Compare against the actual values:
Labels: [124900.0, 152800.0, 70200.0, 404300.0, 133500.0]


In [25]:
# Training-set RMSE of the random forest (same data it was fitted on).
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

18685.28707998014

In [26]:
from sklearn.svm import SVR

# Support vector regressor with a linear kernel, trained on the prepared features.
svm_reg = SVR(kernel="linear")
svm_reg.fit(housing_prepared, housing_labels)

In [27]:
# Spot-check the support vector regressor on the first five training rows,
# using the already-fitted preprocessing pipeline.
some_data = housing_tr.head(5)
some_labels = housing_labels.head(5)
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", svm_reg.predict(some_data_prepared))
print("Compare against the actual values:")
print("Labels:", some_labels.tolist())

Predictions: [187640.79192513 166865.68629075 175653.4426676  209767.31069935
 169248.97108527]
Compare against the actual values:
Labels: [124900.0, 152800.0, 70200.0, 404300.0, 133500.0]


In [28]:
# Training-set RMSE of the support vector regressor (same data it was fitted on).
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

106385.05123428142