# End to End Data Science Project Exercise

## Import Libraries

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

%matplotlib inline

## Load the Data

In [29]:
HOUSING_PATH = os.path.join("datasets","housing")

def load_housing_data(housing_path = HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [30]:
housing_df = load_housing_data()
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [31]:
housing_df['income_cat'] = pd.cut(housing_df['median_income'], bins = [0., 1.5, 3.0, 4.5, 6., np.inf], labels = [1,2,3,4,5])

In [32]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing_df, housing_df['income_cat']):
    strat_train_set = housing_df.loc[train_index]
    strat_test_set =housing_df.loc[test_index]

In [33]:
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis = 1, inplace = True)

In [34]:
housing = strat_train_set.drop('median_house_value', axis = 1)
housing_labels = strat_train_set['median_house_value'].copy()

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [36]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or *kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    
    def fit(self, X, y = None):
        return self # Nothing to do here
    
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix]/X[:, households_ix]
        population_per_household = X[:, population_ix]/X[:, rooms_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix]/X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [37]:
num_pipeline = Pipeline(
    [
        ('impute_layer', SimpleImputer(strategy='median')),
        ('compound_attribute_layer', CombinedAttributesAdder()),
        ('std_scaling_layer', StandardScaler())
    ]
)

1. Try a Support Vector Machine regressor (`sklearn.svm.SVR`) with various hyperparameters such as kernel = "linear" (with various values for the C hyperparameter) or the kernel="rbf" (with various values for the C and the gamma hyperparameters). How does the best `SVR` predictor perform?

In [38]:
from sklearn.compose import ColumnTransformer

num_attribs = list(housing.columns)
num_attribs.remove('ocean_proximity')

cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer(
    [
        ('num_transform_layers', num_pipeline, num_attribs),
        ('cat_attribute_layer', OneHotEncoder(), cat_attribs) 
    ]
)

housing_prepared = full_pipeline.fit_transform(housing)

In [24]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

svm_reg = SVR()

param_grid = [
    {'kernel':['linear'], 'C':[1e-1, 1, 1e1]},
    {'kernel':['rbf'], 'gamma':['scale', 'auto'], 'C':[1e-1, 1, 1e1]}
]

grid_search = GridSearchCV(svm_reg, param_grid, cv = 10, scoring='neg_mean_squared_error', return_train_score=True)

grid_search.fit(housing_prepared, housing_labels)

In [25]:
grid_search.best_params_

{'C': 10.0, 'kernel': 'linear'}

In [26]:
from sklearn.metrics import mean_squared_error

final_model = grid_search.best_estimator_

X_test = strat_test_set.drop('median_house_value', axis = 1)
y_test = strat_test_set['median_house_value'].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

80655.54259546511

2. Try replacing `GridSearchCV` with `RandomizedSearchCV`

In [39]:
from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(svm_reg, param_grid, cv = 5, scoring = 'neg_mean_squared_error', return_train_score = True)

random_search.fit(housing_prepared, housing_labels)



In [41]:
random_search.best_params_

{'kernel': 'linear', 'C': 10.0}

In [42]:
final_model = random_search.best_estimator_

X_test = strat_test_set.drop('median_house_value', axis = 1)
y_test = strat_test_set['median_house_value'].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

80655.54259546511

3. Try adding a transformer in the preparation pipeline to select only the most important attributes

In [51]:
median_house_value = 8

class SelectRelevantAttributes(BaseEstimator, TransformerMixin):
    def __init__(self, num_features = None, median_house_value = 8):
        self.num_features = num_features
        self.median_house_value = median_house_value

    def fit(self, X, y = None):
        return self

    def transform(self, X):
        if self.num_features != None and X.shape[1] < self.num_features:
            return X
        
        median_house_arr = X[:,median_house_value]
        median_house_mf = median_house_arr - median_house_arr.mean()
        sum_of_squares_median = (median_house_mf ** 2).sum()
        corr_array = []
        for col in range(X.shape[1]):
            feature_arr = X[:,col]
            feature_mf = feature_arr - feature_arr.mean()
            sum_of_squares_features = (feature_mf**2).sum()
            if sum_of_squares_features * sum_of_squares_median < 1e-15:
                corr_array.append(0)
            else:
                corr_array.append(np.dot(median_house_mf, feature_mf.T)/np.sqrt(sum_of_squares_median*sum_of_squares_features))
        
        corr_mapping_sorted = sorted(zip(range(X.shape[1]), corr_array), key = lambda x:x[1], reverse = True)
        if self.num_features != None:
            columns_to_drop = [mapping[0] for mapping in corr_mapping_sorted[self.num_features:]]
        else:
            prev_corr = (corr_mapping_sorted[0])[1]
            for index in range(1,len(corr_mapping_sorted)):
                if corr_mapping_sorted[index][1]/prev_corr < 1e-4:
                    break
                prev_corr = corr_mapping_sorted[index][1]
            
            columns_to_drop = [mapping[0] for mapping in corr_mapping_sorted[index:]]
        
        X_fil = np.delete(X, columns_to_drop, axis = 1)
        return X_fil

In [55]:
final_prep_pipeline = Pipeline(
    [
        ('data_prep_layer', full_pipeline),
        ('feature_selection_layer', SelectRelevantAttributes(num_features=10))
    ]
)

housing_prepared = final_prep_pipeline.fit_transform(housing)
housing_prepared.shape

(16512, 10)

4. Try creating a single pipeline that does the full data preparation plus the final prediction.

In [56]:
final_pipeline = Pipeline(
    [
        ('preparation_end_to_end_layer', final_prep_pipeline),
        ('svm_reg',SVR(**random_search.best_params_))
    ]
)

In [57]:
final_pipeline.fit(housing, housing_labels)

In [60]:
some_data = housing.iloc[:4]
some_labels = housing_labels.iloc[:4]

print("Predictions: \t", final_pipeline.predict(some_data))
print("Labels: \t", list(some_labels))

Predictions: 	 [250386.79080679 188269.42066871 165461.39259181 199476.62079398]
Labels: 	 [72100.0, 279600.0, 82700.0, 112500.0]


  corr_array.append(np.dot(median_house_mf, feature_mf.T)/np.sqrt(sum_of_squares_median*sum_of_squares_features))


5. Automatically explore some preparation options using GridSearchCV

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_ #Should have taken from previous notebook. Now just assume we have this.
final_pipeline.named_transformers_['cat'].handle_unknown = 'ignore'
param_grid = [
    {'preparation_end_to_end_layer__data_prep_layer__num_transform_layers__impute_layer__strategy': ['mean', 'median','most_frequent'], 'feature_selection__k': list(range(1, len(feature_importances) + 1))}
]

grid_search_prep = GridSearchCV(final_pipeline, param_grid, cv = 5, scoring='neg_mean_squared_error', verbose = 2)
grid_search_prep.fit(housing, housing_labels)