In [1]:
# house_hunters: prepare housing sale-price features (numeric selection, scaling, VIF screening)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the cleaned housing data and discard every row that has a missing value
df_hh = pd.read_csv('Resources/housing_df_clean.csv').dropna(how='any')

# Summarise the columns, non-null counts and dtypes
df_hh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 56 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   LotShape       1460 non-null   object 
 5   LotConfig      1460 non-null   object 
 6   LandSlope      1460 non-null   object 
 7   Neighborhood   1460 non-null   object 
 8   Condition1     1460 non-null   object 
 9   BldgType       1460 non-null   object 
 10  HouseStyle     1460 non-null   object 
 11  OverallQual    1460 non-null   int64  
 12  OverallCond    1460 non-null   int64  
 13  YearBuilt      1460 non-null   int64  
 14  YearRemodAdd   1460 non-null   int64  
 15  RoofStyle      1460 non-null   object 
 16  Exterior1st    1460 non-null   object 
 17  Exterior2nd    1460 non-null   object 
 18  ExterQua

In [4]:
# Features: every numeric column except the "SalePrice" target.
# `drop` and `select_dtypes` return new DataFrames rather than modifying
# `df_hh`, so no explicit `.copy()` is needed here.
X = df_hh.drop(columns="SalePrice").select_dtypes(include='number')

# Summarise the retained feature columns
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 27 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   LotFrontage    1460 non-null   float64
 2   LotArea        1460 non-null   int64  
 3   OverallQual    1460 non-null   int64  
 4   OverallCond    1460 non-null   int64  
 5   YearBuilt      1460 non-null   int64  
 6   YearRemodAdd   1460 non-null   int64  
 7   BsmtFinSF1     1460 non-null   int64  
 8   BsmtUnfSF      1460 non-null   int64  
 9   TotalBsmtSF    1460 non-null   int64  
 10  1stFlrSF       1460 non-null   int64  
 11  2ndFlrSF       1460 non-null   int64  
 12  GrLivArea      1460 non-null   int64  
 13  BsmtFullBath   1460 non-null   int64  
 14  FullBath       1460 non-null   int64  
 15  HalfBath       1460 non-null   int64  
 16  BedroomAbvGr   1460 non-null   int64  
 17  TotRmsAbvGrd   1460 non-null   int64  
 18  Fireplac

In [5]:
# Target: the "SalePrice" column reshaped into a single-column 2-D array
y = df_hh["SalePrice"].to_numpy().reshape(-1, 1)

# Peek at the first five target values
y[:5]

array([[208500],
       [181500],
       [223500],
       [140000],
       [250000]], dtype=int64)

In [6]:
# Partition features and target into train/test subsets. No test_size is given,
# so sklearn's default proportions apply; the fixed seed makes the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

In [7]:
# Create the scaler; it is not fitted to (or applied on) any data in this cell
scaler = StandardScaler()

In [8]:
# Columns to standardise: every column of the training features
cols_scaled = X_train.columns

# Work on copies so the unscaled splits stay available
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()

In [9]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to the test features
train_scaled_values = scaler.fit_transform(X_train[cols_scaled])
test_scaled_values = scaler.transform(X_test[cols_scaled])

# Write the scaled values back into the copies made earlier
X_train_scaled[cols_scaled] = train_scaled_values
X_test_scaled[cols_scaled] = test_scaled_values

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [10]:
# Create a function to calculate VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):
    """Compute the variance inflation factor (VIF) of every column in ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; each column is scored in turn.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X`` (in the original column order) with two
        columns: ``"variables"`` (the column label) and ``"VIF"`` (its score).

    Notes
    -----
    The values of ``X`` are passed to ``variance_inflation_factor`` exactly as
    given; no intercept/constant column is added here.
    NOTE(review): confirm callers pass centred (e.g. standardised) features
    if an intercept-adjusted VIF is expected.
    """
    # Materialise the underlying array once instead of once per column
    design = X.values
    scores = [variance_inflation_factor(design, i) for i in range(X.shape[1])]

    return pd.DataFrame({"variables": X.columns, "VIF": scores})

In [11]:
# Score every scaled training feature and list them from lowest to highest VIF
calc_vif(X_train_scaled).sort_values(by="VIF", ascending=True)

Unnamed: 0,variables,VIF
25,MoSold,1.041276
26,YrSold,1.045525
23,OpenPorchSF,1.201807
22,WoodDeckSF,1.204576
24,EnclosedPorch,1.254037
2,LotArea,1.254714
0,MSSubClass,1.415551
18,Fireplaces,1.504911
1,LotFrontage,1.517576
4,OverallCond,1.523915


In [12]:
# Columns judged to have the highest VIF scores; listed once so the train and
# test frames always drop exactly the same features
high_vif_cols = ['1stFlrSF', '2ndFlrSF', 'GrLivArea', 'TotalBsmtSF', 'BsmtFinSF1']

# Create another X variable by dropping those columns from both splits
X_train_vif = X_train_scaled.drop(columns=high_vif_cols)
X_test_vif = X_test_scaled.drop(columns=high_vif_cols)

# Recalculate the VIF scores
calc_vif(X_train_vif).sort_values('VIF')

Unnamed: 0,variables,VIF
20,MoSold,1.036329
21,YrSold,1.042044
18,OpenPorchSF,1.1775
17,WoodDeckSF,1.179747
2,LotArea,1.225889
19,EnclosedPorch,1.243184
0,MSSubClass,1.322542
13,Fireplaces,1.399435
10,HalfBath,1.422446
1,LotFrontage,1.467158
