In [1]:
import os

import pandas as pd

In [2]:
# Location of the housing CSV. Set the HOUSING_CSV environment variable to
# point at your own copy; the default is the path this notebook was
# originally written against.
HOUSING_CSV = os.environ.get('HOUSING_CSV',
                             '/home/briggsc1-erau.edu/Downloads/housing.csv')
df = pd.read_csv(HOUSING_CSV)

FileNotFoundError: [Errno 2] No such file or directory: '/home/briggsc1-erau.edu/Downloads/housing.csv'

In [None]:
# preview the first three rows of the loaded frame
df.head(3)

In [None]:
# list the column labels available in the frame
df.columns

In [None]:
# keep exactly these ten columns, in this order
column_order = [
    'latitude', 'longitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income',
    'median_house_value', 'ocean_proximity',
]
df = df[column_order]

In [None]:
# summary statistics for the frame's columns
df.describe()

In [None]:
# Modelling setup: median_house_value is our target variable.
# Six numeric columns serve as features. I don't think lat and long will be
# useful for a linear model, so they are left out, and the string column
# ocean_proximity is omitted as well.
target = ['median_house_value']
features = [
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]

In [None]:
# restrict the frame to the modelling columns (features first, target last)
model_columns = features + target
df = df[model_columns]

In [None]:
# discard every row that has a missing value in any of the remaining columns
df = df.dropna(axis=0, how='any')

In [None]:
# First stage of the train-val-test split: 60% of the rows go to training and
# 40% are held out (the held-out part is split again afterwards).
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.4,
    random_state=0,  # fixed seed so the split is reproducible
)

In [None]:
# Second stage: divide the held-out rows evenly into validation and test sets.
# Note: this overwrites x_test/y_test, so re-running this cell on its own
# halves the test set again.
x_val, x_test, y_val, y_test = train_test_split(
    x_test,
    y_test,
    test_size=0.5,
    random_state=0,
)

In [None]:
# sanity check: ratio of training rows to test rows
# (a 60% / 20% split should give roughly 3)
len(x_train)/len(x_test)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# linear regression model with scikit-learn's default settings
lr = LinearRegression()

In [None]:
# fit on the training split, then show the score on the validation split
# (fit() returns the estimator itself, so the calls can be chained)
lr.fit(x_train, y_train).score(x_val, y_val)

In [None]:
# let's look at the coefficients
# y_train is a one-column DataFrame, so coef_ is 2-D; row 0 holds the
# coefficients for the single target.
# Named loop variables are used rather than `_`, which IPython uses for the
# previous cell's output.
for feature_name, coefficient in zip(features, lr.coef_[0]):
    print(feature_name, coefficient)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# pairwise correlations of the modelling columns, drawn on explicitly created axes
fig, ax = plt.subplots()
corr_matrix = df.corr()
ax = sns.heatmap(corr_matrix, vmin=-1, vmax=1, cmap="vlag", annot=True, ax=ax)
# x labels vertical, y labels horizontal
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
ax.set_title('bay area housing price heat map');

In [None]:
from itertools import combinations

In [None]:
# one empty list per feature; each list collects that feature's fitted
# coefficient from every model that includes it
coef_dct = {feature: [] for feature in features}

In [None]:
# Fit a linear model on every non-empty subset of the features and record the
# coefficient each feature receives, to see how much a feature's coefficient
# depends on which other features are in the model.
#
# The coefficient lists are reset here so that re-running this cell does not
# append a second copy of every coefficient.
# The existing train/validation split is reused: df has no missing values
# left and random_state is fixed, so re-splitting per subset would select the
# same rows each time while overwriting the global split variables.
coef_dct = {feature: [] for feature in features}
subset_scores = {}  # feature subset (as a tuple) -> validation score
for subset_size in range(1, len(features) + 1):
    for feature_subset in combinations(features, subset_size):
        feature_subset = list(feature_subset)
        lr = LinearRegression()
        lr.fit(x_train[feature_subset], y_train)
        subset_scores[tuple(feature_subset)] = lr.score(x_val[feature_subset], y_val)
        # coef_ is 2-D because y_train is a one-column DataFrame
        for feature_name, coefficient in zip(feature_subset, lr.coef_[0]):
            coef_dct[feature_name].append(coefficient)

In [None]:
# tabulate the collected coefficients: one column per feature
df_coef = pd.DataFrame.from_dict(coef_dct)

In [None]:
# show the feature names again for reference
features

In [None]:
# box plot of the coefficient spread, leaving out the first and last features
inner_features = features[1:-1]
df_coef[inner_features].boxplot()

In [None]:
# the wild fluctuation in coefficients per feature shows that the features are
# correlated with one another (they are not independent).
# the boxplot shows the impact of feature dependence on our ability to interpret a linear
# model.
# we will do feature engineering to produce a set of six uncorrelated features from the
# given feature set.
# specifically, we'll use Principal Component Analysis (PCA).
# principal components are always uncorrelated; they are fully independent only if
# the variables are jointly normally distributed.


In [None]:
from sklearn.decomposition import PCA

In [None]:
# ask for six components, one per input feature
pca = PCA(n_components=6) # controls how many principal components are returned
# this gives us an object we can fit to some data

In [None]:
# fit the PCA to the training features as they are (not yet scaled)
pca.fit(x_train)

In [None]:
# fraction of the total variance captured by each principal component
print(pca.explained_variance_ratio_)

In [None]:
# this looks too good to be true because it is. we didn't normalize our data.
# without scaling, the variables with the largest scales dominate, which
# trivializes the variation found in variables with relatively smaller scales.

In [None]:
# look at the scale (mean, std, min/max) of each column
df.describe()

In [None]:
# let's normalize the data first
from sklearn import preprocessing

In [None]:
# StandardScaler rescales each feature to zero mean and unit variance
scaler = preprocessing.StandardScaler()

In [None]:
# I fit the scaler to my training data
# (only the training split, so the validation and test sets do not influence the scaling)
scaler.fit(x_train)

In [None]:
# peek at the training features before scaling
x_train.head()

In [None]:
# summary statistics of the training features after scaling
x_train_scaled_preview = pd.DataFrame(scaler.transform(x_train))
x_train_scaled_preview.describe()

In [None]:
# keep the scaled training features for the PCA fit
x_train_sc = scaler.transform(x_train)

In [None]:
# fits the PCA to the scaled data
# (no n_components is given, so scikit-learn's default is used)
pca = PCA()
pca.fit(x_train_sc)

In [None]:
# fraction of the total variance captured by each principal component (scaled data)
print(pca.explained_variance_ratio_)

In [None]:
# exercise: see how the coefficients of the PCs vary in linear models
# trained on all subsets of PCs.
# name the components pc1..pcN from the fitted PCA instead of hard-coding six
pca_features = ['pc' + str(k) for k in range(1, pca.n_components_ + 1)]
# principal-component scores of the scaled training data, one column per PC
df_pca = pd.DataFrame(pca.transform(x_train_sc), columns=pca_features)
# one empty coefficient list per PC
coef_dct = {pc_name: [] for pc_name in pca_features}

In [None]:
# Fit a linear model on every non-empty subset of the principal components and
# record the coefficient each component receives.
#
# The coefficient lists are reset here so that re-running this cell does not
# append a second copy of every coefficient.
coef_dct = {pc_name: [] for pc_name in pca_features}
# The validation set's PC scores do not depend on the subset, so compute them
# once up front instead of inside the loop.
df_pca_val = pd.DataFrame(pca.transform(scaler.transform(x_val)),
                          columns=pca_features)
pc_subset_scores = {}  # PC subset (as a tuple) -> validation score
for subset_size in range(1, len(pca_features) + 1):
    for feature_subset in combinations(pca_features, subset_size):
        feature_subset = list(feature_subset)
        # No dropna() on the features: dropping rows from X alone would leave
        # X and y_train with different lengths.
        lr = LinearRegression()
        lr.fit(df_pca[feature_subset], y_train)
        pc_subset_scores[tuple(feature_subset)] = lr.score(df_pca_val[feature_subset], y_val)
        # coef_ is 2-D because y_train is a one-column DataFrame
        for pc_name, coefficient in zip(feature_subset, lr.coef_[0]):
            coef_dct[pc_name].append(coefficient)

In [None]:
# table of the collected coefficients, one column per principal component
pd.DataFrame(coef_dct)