In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

from scipy import stats

In [2]:
# Read the Redfin CSV export into a DataFrame and show its (rows, columns) shape.
df = pd.read_csv('redfin_data.csv')
df.shape

(1656, 23)

In [3]:
# Column names, non-null counts and dtypes of the frame as loaded, before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1656 entries, 0 to 1655
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Sold Price         1649 non-null   float64
 1   Beds               1649 non-null   float64
 2   Baths              1649 non-null   float64
 3   Floors             1632 non-null   float64
 4   Garage Spaces      1656 non-null   int64  
 5   Lot Size (sq ft)   1653 non-null   float64
 6   Home Size (sq ft)  1655 non-null   float64
 7   Year Built         1653 non-null   float64
 8   School Score Avg   1515 non-null   float64
 9   Walk Score         1538 non-null   float64
 10  Transit Score      1538 non-null   float64
 11  Bike Score         1538 non-null   float64
 12  Laundry            1656 non-null   bool   
 13  Heating            1656 non-null   bool   
 14  Air Conditioning   1656 non-null   bool   
 15  Pool               1656 non-null   bool   
 16  Address            1656 

In [4]:
## clean data

# Remove the Address, Sold Status and URL columns, keep only rows without
# missing values, and renumber the remaining rows 0..n-1.
df = (
    df.drop(columns=['Address', 'Sold Status', 'URL'])
      .dropna()
      .reset_index(drop=True)
)

# Lower-case every column name and shorten the two square-footage headers.
df.columns = df.columns.str.lower()
df = df.rename(columns={'lot size (sq ft)': 'lot size',
                        'home size (sq ft)': 'home size'})

# Drop listings in the excluded cities / county, then the location and
# property-type columns that are no longer needed.
excluded_rows = (
    df['city'].isin(['SAN JOSE', 'EAST PALO ALTO'])
    | (df['county'] == 'SAN MATEO COUNTY')
)
df = df.loc[~excluded_rows].drop(columns=['county', 'zip code', 'property type'])

# Age is measured relative to one year past the newest build year still present.
newest_build_year = df['year built'].max()
df = df.assign(**{'age of house': (newest_build_year + 1) - df['year built']})

# Remove four specific rows by index label (labels come from the reset_index
# above; why these rows are removed is not recorded in this cell), then the
# now-redundant 'year built' column.
df = df.drop(index=[126, 951, 952, 1410], columns=['year built'])

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1294 entries, 0 to 1439
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   sold price        1294 non-null   float64
 1   beds              1294 non-null   float64
 2   baths             1294 non-null   float64
 3   floors            1294 non-null   float64
 4   garage spaces     1294 non-null   int64  
 5   lot size          1294 non-null   float64
 6   home size         1294 non-null   float64
 7   school score avg  1294 non-null   float64
 8   walk score        1294 non-null   float64
 9   transit score     1294 non-null   float64
 10  bike score        1294 non-null   float64
 11  laundry           1294 non-null   bool   
 12  heating           1294 non-null   bool   
 13  air conditioning  1294 non-null   bool   
 14  pool              1294 non-null   bool   
 15  city              1294 non-null   object 
 16  age of house      1294 non-null   float64


In [None]:
## select target and features

# Features are every column except the target and the ones listed below.
# 'year built' is intentionally NOT listed: the cleaning cell already replaced
# it with 'age of house' and dropped it, so naming it here makes
# DataFrame.drop raise a KeyError.
x = df.drop(columns=['sold price', 'garage spaces',
                     'walk score', 'transit score', 'bike score',
                     'laundry', 'heating', 'air conditioning', 'pool', 'city'])

# The target is the sale price on a log10 scale.
y = np.log10(df['sold price'])

In [None]:
## train-test split 60-20-20
## NOTE: this cell rebinds `x` and `y` to the 80% train+validation portion, so
## re-run the feature-selection cell before executing this cell a second time.

# hold out 20% of the rows as the test set
x, x_test, y, y_test = train_test_split(x, y,
                                        test_size = 0.2,
                                        random_state = 42)

# 25% of the remaining 80% (= 20% of all rows) becomes the validation set
x_train, x_valid, y_train, y_valid = train_test_split(x, y,
                                                  test_size = 0.25,
                                                  random_state = 42)

## instantiate StandardScaler() and fit it on the train set only
std = StandardScaler()
std.fit(x_train.values)

## apply the train-fitted scaler to the train and validation sets
x_tr = std.transform(x_train.values)
x_val = std.transform(x_valid.values)

## instantiate and fit Linear Regression to train set
lr = LinearRegression()
lr.fit(x_tr, y_train)


def _adjusted_r2(r2, n_samples, n_features):
    """Return r2 adjusted for `n_features` predictors evaluated on `n_samples` rows."""
    return 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)


## r2 and adjusted r2 (log10 scale) of train and validation sets
r2_train = lr.score(x_tr, y_train)
adj_r2_train = _adjusted_r2(r2_train, x_tr.shape[0], x_tr.shape[1])

r2_val = lr.score(x_val, y_valid)
adj_r2_val = _adjusted_r2(r2_val, x_val.shape[0], x_val.shape[1])

## predict once per data set and reuse the result for every metric
lr_predict_train = lr.predict(x_tr)
lr_predict = lr.predict(x_val)
lr_resid = y_valid - lr_predict

## error metrics are reported in the original price units, so undo the log10
price_train, price_train_pred = 10**y_train, 10**lr_predict_train
price_valid, price_valid_pred = 10**y_valid, 10**lr_predict

mae_train = round(mean_absolute_error(price_train, price_train_pred), 2)
mae_val = round(mean_absolute_error(price_valid, price_valid_pred), 2)

## RMSE is taken as sqrt(MSE) instead of passing `squared=False`: that keyword
## was deprecated in scikit-learn 1.4 and removed in later releases.
rmse_train = round(np.sqrt(mean_squared_error(price_train, price_train_pred)), 2)
rmse_val = round(np.sqrt(mean_squared_error(price_valid, price_valid_pred)), 2)

print('SIMPLE LINEAR REGRESSION SUMMARY: ')

print('\nr2 (Train Data): ', r2_train)
print('r2 (Validation Data): ', r2_val)

print('\nAdjusted r2 (Train Data): ', adj_r2_train)
print('Adjusted r2 (Validation Data): ', adj_r2_val)

print('\nMean Absolute Error (Train Data): ', mae_train)
print('Mean Absolute Error (Validation Data): ', mae_val)

print('\nRoot Mean Squared Error (Train Data): ', rmse_train)
print('Root Mean Squared Error (Validation Data): ', rmse_val)


In [None]:
# Validation-set predictions (log10 of price, like the target) and their
# residuals; the same two values are also computed in the model-fitting cell.
lr_predict = lr.predict(x_val)
lr_resid = y_valid - lr_predict

In [None]:
def diagnostic_plots(x, y, y_pred):
    """Draw three side-by-side regression diagnostics.

    Panels, left to right: actual vs. predicted values with a y = x reference
    line, residuals (actual - predicted) against predicted values, and a
    normal Q-Q plot of the residuals.

    Parameters
    ----------
    x : any
        Not used by any panel; kept so existing calls keep working.
    y : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order and units as ``y``.

    Returns
    -------
    None
        The figure is left on the active matplotlib backend for display.

    Raises
    ------
    ValueError
        If ``y`` and ``y_pred`` are empty or differ in length.
    """
    # Work on flat float arrays so a pandas index on `y` can never be used to
    # align (and silently reorder) the values against `y_pred`.
    actual = np.asarray(y, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.size == 0 or actual.size != predicted.size:
        raise ValueError('y and y_pred must be non-empty and of equal length')
    res = actual - predicted

    fig, (ax_fit, ax_res, ax_qq) = plt.subplots(1, 3, figsize=(20, 6))

    # Panel 1: actual vs. predicted, with the perfect-prediction line spanning
    # the plotted data rather than a fixed range.
    ax_fit.scatter(predicted, actual)
    low = min(actual.min(), predicted.min())
    high = max(actual.max(), predicted.max())
    ax_fit.plot([low, high], [low, high], color='r', linestyle='-')
    ax_fit.set_title('Actual vs. Predicted Sale Price', size=18)
    ax_fit.set_xlabel('Predicted Sale Price', size=16)
    ax_fit.set_ylabel('Actual Sale Price', size=16)

    # Panel 2: residuals against predictions, with a zero line for reference.
    ax_res.scatter(predicted, res)
    ax_res.axhline(y=0.0, color='r', linestyle='-')
    ax_res.set_title('Residuals Plot', size=18)
    ax_res.set_xlabel('Predicted', size=16)
    ax_res.set_ylabel('Residual', size=16)

    # Panel 3: ordered residuals against the quantiles of a normal distribution.
    stats.probplot(res, dist='norm', plot=ax_qq)
    ax_qq.set_title('Normal Q-Q plot', size=18)
    ax_qq.set_xlabel('Theoretical Quantiles', size=16)
    ax_qq.set_ylabel('Ordered Values', size=16)

    for ax in (ax_fit, ax_res, ax_qq):
        ax.tick_params(axis='both', labelsize=12)

    # Lay out only after every axes exists so the spacing actually applies.
    fig.tight_layout(w_pad=3.0)

In [None]:
# Plot the validation diagnostics in price units: the target and predictions
# are log10 values, so both are raised to the power of 10 first.
diagnostic_plots(x_val, 10**y_valid, 10**lr_predict)

In [None]:
## Variance inflation factors (multicollinearity check) of the train features.
## `variance_inflation_factor` is never imported in this notebook, so each VIF
## is computed directly: feature i is regressed (with an intercept) on the
## other features and VIF_i = 1 / (1 - R^2_i) = SS_total / SS_residual.
print('Variance Inflation Factors (Train Data) of Feature Variables: ')

train_matrix = np.asarray(x_train.values, dtype=float)
train_vifs = []
for i in range(train_matrix.shape[1]):
    target_col = train_matrix[:, i]
    design = np.column_stack([np.ones(train_matrix.shape[0]),
                              np.delete(train_matrix, i, axis=1)])
    coef = np.linalg.lstsq(design, target_col, rcond=None)[0]
    ss_res = np.sum((target_col - design @ coef) ** 2)
    ss_tot = np.sum((target_col - target_col.mean()) ** 2)
    # a perfectly explained feature has an unbounded VIF
    train_vifs.append(ss_tot / ss_res if ss_res > 0 else np.inf)

lr_vif_train = pd.DataFrame({'Features': list(x_train.columns), 'VIF': train_vifs})
lr_vif_train


In [None]:

## Variance inflation factors (multicollinearity check) of the validation features.
## `variance_inflation_factor` is never imported in this notebook, so each VIF
## is computed directly: feature i is regressed (with an intercept) on the
## other features and VIF_i = 1 / (1 - R^2_i) = SS_total / SS_residual.
print('Variance Inflation Factors (Validation Data) of Feature Variables: ')

val_matrix = np.asarray(x_valid.values, dtype=float)
val_vifs = []
for j in range(val_matrix.shape[1]):
    val_target_col = val_matrix[:, j]
    val_design = np.column_stack([np.ones(val_matrix.shape[0]),
                                  np.delete(val_matrix, j, axis=1)])
    val_coef = np.linalg.lstsq(val_design, val_target_col, rcond=None)[0]
    val_ss_res = np.sum((val_target_col - val_design @ val_coef) ** 2)
    val_ss_tot = np.sum((val_target_col - val_target_col.mean()) ** 2)
    # a perfectly explained feature has an unbounded VIF
    val_vifs.append(val_ss_tot / val_ss_res if val_ss_res > 0 else np.inf)

lr_vif_val = pd.DataFrame({'Features': list(x_valid.columns), 'VIF': val_vifs})
lr_vif_val

In [None]:
# Pair each feature name with its fitted coefficient. The model was fit on
# StandardScaler-transformed features and a log10 target, so each value is the
# change in log10(price) per one standard deviation of that feature.
list(zip(x_train.columns, lr.coef_))