## Importing the necessary libraries and reading in the CSV

In [27]:
import itertools
import numpy as np
import pandas as pd 
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pickle

from sklearn.metrics import mean_squared_error, roc_curve, roc_auc_score, accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Load the training data; column 0 of the CSV is used as the row index.
df = pd.read_csv(
    filepath_or_buffer='kc_house_data_train.csv',
    index_col=0,
)
# Print the column labels, then let the notebook render the first rows.
print(df.columns)
df.head()

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,2591820310,20141006T000000,365000.0,4,2.25,2070,8893,2.0,0,0,...,8,2070,0,1986,0,98058,47.4388,-122.162,2390,7700
1,7974200820,20140821T000000,865000.0,5,3.0,2900,6730,1.0,0,0,...,8,1830,1070,1977,0,98115,47.6784,-122.285,2370,6283
2,7701450110,20140815T000000,1038000.0,4,2.5,3770,10893,2.0,0,2,...,11,3770,0,1997,0,98006,47.5646,-122.129,3710,9685
3,9522300010,20150331T000000,1490000.0,3,3.5,4560,14608,2.0,0,2,...,12,4560,0,1990,0,98034,47.6995,-122.228,4050,14226
4,9510861140,20140714T000000,711000.0,3,2.5,2550,5376,2.0,0,0,...,9,2550,0,2004,0,98052,47.6647,-122.083,2250,4050


In [36]:
# Column names used to split the DataFrame: `control` is the column being
# predicted, `features` lists every other column in its original order.
control = ['price']
features = [
    'id',
    'date',
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
    'zipcode',
    'lat',
    'long',
    'sqft_living15',
    'sqft_lot15',
]

In [76]:
# Partition the rows into training and test sets: 33% of the rows are held out
# for testing, and the fixed random_state makes the shuffle repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df[features],
    df[control],
    test_size=0.33,
    random_state=42,
)

Getting a general picture of the correlations between features

In [79]:
# Pairwise correlations between the numeric feature columns of the training set.
# Non-numeric columns are dropped explicitly first: since pandas 2.0,
# DataFrame.corr() no longer discards them silently and raises instead.
# NOTE(review): `date` looks like a string column in the df.head() output — confirm.
x_train.select_dtypes(include='number').corr()

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
id,1.0,0.003491,0.004651,-0.010623,-0.131186,0.018977,0.001158,0.018505,-0.01937,0.006432,-0.011088,-0.001382,0.017431,-0.010054,-0.014391,-0.004202,0.02405,-0.002136,-0.136295
bedrooms,0.003491,1.0,0.50041,0.567651,0.013755,0.16914,-0.013132,0.079205,0.034819,0.352471,0.467327,0.305468,0.157084,0.017996,-0.147437,0.00377,0.13336,0.386121,0.018853
bathrooms,0.004651,0.50041,1.0,0.754987,0.080948,0.502512,0.060162,0.184248,-0.118913,0.665747,0.687031,0.285138,0.512901,0.042966,-0.205735,0.030958,0.230581,0.569915,0.086993
sqft_living,-0.010623,0.567651,0.754987,1.0,0.16874,0.354907,0.103145,0.281996,-0.049325,0.765143,0.874738,0.442893,0.320253,0.04932,-0.199242,0.059472,0.238832,0.760276,0.178258
sqft_lot,-0.131186,0.013755,0.080948,0.16874,1.0,-0.002346,0.021413,0.046814,-0.022434,0.116174,0.190848,-0.005275,0.067293,-0.000475,-0.130346,-0.067614,0.224439,0.139485,0.741002
floors,0.018977,0.16914,0.502512,0.354907,-0.002346,1.0,0.022227,0.026172,-0.256053,0.457141,0.529149,-0.247438,0.486032,0.005488,-0.063541,0.055565,0.13604,0.28361,-0.006742
waterfront,0.001158,-0.013132,0.060162,0.103145,0.021413,0.022227,1.0,0.408778,0.007985,0.087556,0.065078,0.092208,-0.025108,0.106909,0.032429,-0.016856,-0.050259,0.087434,0.030699
view,0.018505,0.079205,0.184248,0.281996,0.046814,0.026172,0.408778,1.0,0.053768,0.247202,0.160583,0.284174,-0.056867,0.099962,0.088569,0.011706,-0.087936,0.277916,0.04493
condition,-0.01937,0.034819,-0.118913,-0.049325,-0.022434,-0.256053,0.007985,0.053768,1.0,-0.129338,-0.150125,0.17608,-0.35177,-0.062953,0.01139,-0.014965,-0.11473,-0.085318,-0.013023
grade,0.006432,0.352471,0.665747,0.765143,0.116174,0.457141,0.087556,0.247202,-0.129338,1.0,0.760529,0.170089,0.446649,0.000303,-0.191068,0.124168,0.206839,0.720725,0.122414


### Level 1: fitting a linear model for one feature at a time and storing the p-values in a DataFrame

In [89]:
# Level-1 screen: fit one single-predictor OLS model (price ~ feature) per entry
# in `features`, using the training rows only, and collect the p-values of the
# first two fitted terms into `level_1`, sorted ascending by `p_val`.

# The training frame is the same for every model, so build it once outside the loop.
level_1_data = pd.concat([x_train, y_train], axis=1)

list_of_features_and_p_val = []
for feature in features:
    formula = 'price~' + feature
    lm = ols(formula, level_1_data).fit()
    # lm.pvalues is indexed by term name, so select by position with .iloc;
    # plain [0] / [1] on a label-indexed Series is deprecated in pandas.
    intercept = lm.pvalues.iloc[0]
    p_val = lm.pvalues.iloc[1]
    # NOTE(review): `date` looks like a string column in the df.head() output; if so
    # the formula treats it as categorical and position 1 is only the first
    # dummy level, not a p-value for the feature as a whole — confirm.
    list_of_features_and_p_val.append([feature, intercept, p_val])

level_1 = pd.DataFrame(
    list_of_features_and_p_val,
    columns=['feature', 'intercept', 'p_val'],
).sort_values(by='p_val')

In [73]:
# Single-predictor OLS of price on latitude; the summary table is the cell output.
# NOTE(review): this is fitted on the full `df`, whereas the level-1 loop fits on
# the x_train / y_train split — confirm that is intended.
lat_only_fit = ols('price~ lat', df).fit()
lat_only_fit.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.091
Model:,OLS,Adj. R-squared:,0.091
Method:,Least Squares,F-statistic:,1740.0
Date:,"Thu, 19 Mar 2020",Prob (F-statistic):,0.0
Time:,21:31:05,Log-Likelihood:,-245540.0
No. Observations:,17290,AIC:,491100.0
Df Residuals:,17288,BIC:,491100.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-3.825e+07,9.3e+05,-41.131,0.000,-4.01e+07,-3.64e+07
lat,8.156e+05,1.96e+04,41.713,0.000,7.77e+05,8.54e+05

0,1,2,3
Omnibus:,16991.39,Durbin-Watson:,2.015
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1483053.622
Skew:,4.642,Prob(JB):,0.0
Kurtosis:,47.412,Cond. No.,16400.0


In [90]:
# Display the per-feature p-value table built by the level-1 loop (sorted ascending by p_val).
level_1

Unnamed: 0,feature,intercept,p_val
3,bathrooms,0.8471342,0.0
4,sqft_living,6.831009e-21,0.0
8,view,0.0,0.0
18,sqft_living15,3.391152e-29,0.0
10,grade,0.0,0.0
11,sqft_above,7.47456e-13,0.0
12,sqft_basement,0.0,3.4048790000000003e-289
2,bedrooms,4.3228499999999995e-30,4.0650980000000004e-247
16,lat,1.251363e-236,5.408479000000001e-243
7,waterfront,0.0,1.914104e-184
