<a href="https://colab.research.google.com/github/robitussin/CCADMACL_EXERCISES/blob/main/Exercise1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 1

Use all feature selection methods to find the best features

## Dataset Information

## Features

Number of Instances: 20640

Number of Attributes: 8 numeric, predictive attributes and the target

Attribute Information:

MedInc - median income in block group

HouseAge - median house age in block group

AveRooms - average number of rooms per household

AveBedrms - average number of bedrooms per household

Population - block group population

AveOccup - average number of household members

Latitude - block group latitude

Longitude - block group longitude

## Target
The target variable is the median house value for California districts, expressed in hundreds of thousands of dollars ($100,000).

### Imports and Variables

In [281]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

toDrop = "MedHouseVal"
"""
The `y` column.
"""

rmse = dict()
mse = dict()

In [282]:
def heatmap(correlation):
	sns.heatmap(correlation, annot=True, vmin=-1, vmax=1, center=0)
	plt.show()

In [283]:
housing = fetch_california_housing(as_frame=True)
df = pd.concat([housing.data, housing.target], axis=1)
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


### 1. Train a regression model using all features

In [284]:
y = df[toDrop].values
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [285]:
# put your answer here
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.stats import pearsonr

# Drop y
xData = df.drop(columns=toDrop)
# Split Data 
xTrain, xTest, yTrain, yTest = train_test_split(xData, y, test_size=0.2, random_state=42)
df.corr().head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.688075
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,0.105623
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.151948
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.046701
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,-0.02465


In [286]:
scaler = MinMaxScaler()
scaler.fit(xTrain.values)
xTrainScaled = scaler.transform(xTrain.values)
xTestScaled = scaler.transform(xTest.values)

# Train train data
modelLr = LinearRegression()
modelLr.fit(xTrainScaled, yTrain)

# Predict test data
yTestPred = modelLr.predict(xTestScaled)

rmse["All Features"] = root_mean_squared_error(yTest, yTestPred)
mse["All Features"] = mean_squared_error(yTest, yTestPred)

yTestPred

array([0.71912284, 1.76401657, 2.70965883, ..., 4.46877017, 1.18751119,
       2.00940251])

In [287]:
# Put your answer here
import statsmodels.api as sm

olsmod = sm.OLS(yTest, xTestScaled).fit()
olsmod.rsquared

np.float64(0.8881749304345696)

In [288]:
df1 = pd.DataFrame({'Actual MedHouseVal Values': yTest.flatten(), 'Predicted MedHouseVal Values': yTestPred.flatten()})
df1

Unnamed: 0,Actual MedHouseVal Values,Predicted MedHouseVal Values
0,0.47700,0.719123
1,0.45800,1.764017
2,5.00001,2.709659
3,2.18600,2.838926
4,2.78000,2.604657
...,...,...
4123,2.63300,1.991746
4124,2.66800,2.249839
4125,5.00001,4.468770
4126,0.72300,1.187511


In [289]:
sse = str(np.sum(np.square(df1["Actual MedHouseVal Values"] - df1["Predicted MedHouseVal Values"])))
print(f"SSE: {sse}")

SSE: 2294.720519413967


In [290]:
coef = modelLr.coef_[0]
coef

np.float64(6.505875925134843)

### 2. Use any filter method to select the best features and train a regression model

In [291]:
# put your answer here
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(threshold=0.5) # Variance threshold
sel = selector.fit(df)
sel_index = sel.get_support()
sel_index

array([ True,  True,  True, False,  True,  True,  True,  True,  True])

In [292]:
df2 = df.iloc[:, sel_index]
df2

Unnamed: 0,MedInc,HouseAge,AveRooms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,741.0,2.123209,39.43,-121.32,0.847


In [293]:
y = df2[toDrop].values
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [294]:
xData = df2.drop(columns=toDrop)
xTrain, xTest, yTrain, yTest = train_test_split(xData, y, test_size=0.2, random_state=42)

In [295]:
scaler = MinMaxScaler()
scaler.fit(xTrain.values)
xTrainScaled = scaler.transform(xTrain.values)
xTestScaled = scaler.transform(xTest.values)

# Train train data
modelLr2 = LinearRegression()
modelLr2.fit(xTrainScaled, yTrain)

# Predict test data
yTestPred = modelLr2.predict(xTestScaled)

rmse["Filter"] = root_mean_squared_error(yTest, yTestPred)
mse["Filter"] = mean_squared_error(yTest, yTestPred)

yTestPred

array([0.73558943, 1.75060633, 2.47012592, ..., 4.44434289, 1.30281163,
       1.89404905])

In [296]:
olsmod = sm.OLS(yTest, xTestScaled).fit()
olsmod.rsquared

np.float64(0.8835514901591278)

### 3. Use any wrapper method to select the best features and train a regression model

In [297]:
# put your answer here
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

In [298]:
df3Raw = fetch_california_housing()
df3RawFeatures = pd.DataFrame(df3Raw.data, columns=df3Raw.feature_names)
df3RawTarget = pd.DataFrame(df3Raw.target, columns=['target'])

In [299]:
threshold = (int) (len(df3Raw.feature_names) * (3 / 4)) # the number of most relevant features (75%)
maxDepth = (int) (len(df3Raw.feature_names) * (2 / 4)) # (50%)
modelRfc = RandomForestRegressor(n_estimators=500, random_state=0, max_depth = maxDepth)
selector = RFE(modelRfc, n_features_to_select=threshold, step=1)

In [300]:
selector = selector.fit(df3RawFeatures, df3RawTarget.values.ravel())
selector_ind = selector.get_support()
df3 = df3RawFeatures.iloc[:, selector_ind]
df3.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'Population', 'AveOccup', 'Latitude'], dtype='object')

In [301]:
# Drop y
xData = df3
# Split Data 
xTrain, xTest, yTrain, yTest = train_test_split(xData, y, test_size=0.2, random_state=42)

In [302]:
scaler = MinMaxScaler()
scaler.fit(xTrain.values)
xTrainScaled = scaler.transform(xTrain.values)
xTestScaled = scaler.transform(xTest.values)

# Train train data
modelLr3 = LinearRegression()
modelLr3.fit(xTrainScaled, yTrain)

# Predict test data
yTestPred = modelLr3.predict(xTestScaled)

rmse["Wrapper"] = root_mean_squared_error(yTest, yTestPred)
mse["Wrapper"] = mean_squared_error(yTest, yTestPred)

yTestPred

array([1.04675541, 1.53877466, 2.24690965, ..., 4.24983889, 1.67218672,
       1.81933077])

In [303]:
olsmod = sm.OLS(yTest, xTestScaled).fit()
olsmod.rsquared

np.float64(0.8835118024808146)

### 4. Use any embedded method to select the best features by training a regression model with built-in feature selection

In [304]:
# put your answer here
from sklearn.feature_selection import SelectFromModel

model_rf = RandomForestRegressor(n_estimators=500, random_state=0, max_depth = 3)
model_rf.fit(df3RawFeatures, df3RawTarget.values.ravel())

sel_sfm = SelectFromModel(model_rf, prefit=True)
sel_sfm_index = sel_sfm.get_support()
df4 = df3RawFeatures.iloc[:, sel_sfm_index]
df3.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'Population', 'AveOccup', 'Latitude'], dtype='object')

In [305]:
# Drop y
xData = df4
# Split Data 
xTrain, xTest, yTrain, yTest = train_test_split(xData, y, test_size=0.2, random_state=42)

In [306]:
scaler = MinMaxScaler()
scaler.fit(xTrain.values)
xTrainScaled = scaler.transform(xTrain.values)
xTestScaled = scaler.transform(xTest.values)

# Train train data
modelLr3 = LinearRegression()
modelLr3.fit(xTrainScaled, yTrain)

# Predict test data
yTestPred = modelLr3.predict(xTestScaled)

rmse["Embedded"] = root_mean_squared_error(yTest, yTestPred)
mse["Embedded"] = mean_squared_error(yTest, yTestPred)

yTestPred

array([1.14558484, 1.50694805, 1.91020281, ..., 4.31882017, 1.61380043,
       1.93160886])

### 5. Evaluate the performance of all four models using RMSE or MSE

In [308]:
# put your answer here
for col in rmse:
    print(f"====== {col} =====")
    print(f"RMSE: {rmse[col]}")
    print(f"MSE: {mse[col]}")
    print(f"====== END =====\n")

RMSE: 0.7455813830127761
MSE: 0.555891598695244

RMSE: 0.7398151789804082
MSE: 0.5473264990498133

RMSE: 0.8045916260704176
MSE: 0.6473676847426387

RMSE: 0.8412318369691884
MSE: 0.7076710035305551

