## Predicting Sale Price for Houses in Ames, IA

I am using the Ames, Iowa dataset containing 2930 observations and 81 features related to house sale prices in Ames, Iowa. If you'd like to browse the various features, take a look at the features [here](http://jse.amstat.org/v19n3/decock/DataDocumentation.txt).
The plan is to split the housing data into training and test sets. Afterwards, linear regression and regularization (Lasso and Ridge) will be used to build and compare models that predict house prices for Ames, IA.

## 1) Load relevant packages

In [None]:
import math
import numpy as np
import pandas as pd
from matplotlib import cm
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter as FF
from matplotlib.ticker import StrMethodFormatter as SMF
import seaborn as sns
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression as lr
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler

## 2) Load Data

In [None]:
#Show every column whenever a frame is displayed.
pd.options.display.max_columns = None

#Read the raw Ames housing CSV and preview the first rows.
df = pd.read_csv(
    '../input/ames-housing-dataset/AmesHousing.csv'
)
df.head()

In [None]:
#Summary statistics for the columns of the raw frame.
df.describe()

In [None]:
#Column dtypes and non-null counts for the raw frame.
df.info()

## 3) Data Cleaning and Feature Engineering
After browsing the various features, the initial plan is to use the columns with little to no missing values. The features with a lot of missing values don't look like variables that have a huge impact on sale price. Various variables like bedroom count, lot area, year built, overall quality, and neighborhood will be plotted against sale price.

In [None]:
#Missing-value count per column, ordered from fewest to most missing.
df_na = (
    df.isna()
    .sum()
    .to_frame(name = 'NA Count')
    .sort_values(by = 'NA Count')
)
df_na.T

In [None]:
#Keep only the columns that have no missing values at all.
clean_columns = df_na.index[df_na['NA Count'] == 0].to_list()
df_clean = df.loc[:, clean_columns]
df_clean.shape

## 4) Exploratory Data Analysis

In [None]:
sns.set()
#Graph 1: sale price against lot area, with one regression line per bedroom count.
#legend = False keeps seaborn from drawing its own figure-level legend, so the
#axes legend configured below is the only one and no private attribute is touched.
g1 = sns.lmplot(data = df_clean, x = 'Lot Area', y = 'SalePrice', hue = 'Bedroom AbvGr',
                  palette = 'viridis_r', ci = None, height = 9, aspect = 16 / 9, legend = False)

#Axes.
ax = plt.gca()

#Title setup.
ax.set_title('Price vs Lot Area / # of Bedrooms', fontsize = 32)

#X-axis setup (log scale with fixed, comma-formatted tick positions).
ax.set_xlabel("Lot Area (sq. ft.)", fontsize = 24)
ax.set_xscale('log')
xlabels = [2500, 5000, 10000, 20000, 40000, 80000, 160000, 320000]
ax.set_xticks(xlabels)
ax.set_xticklabels(xlabels, rotation = 45,ha = 'right')
ax.get_xaxis().set_major_formatter(FF(lambda x, p: format(int(x), ',')))

#Y-axis setup (dollar-formatted).
ax.set_ylabel("Price", fontsize = 24)
ax.set_ylim(0,800000)
ax.yaxis.set_major_formatter(SMF('${x:,.0f}'))
ax.tick_params(axis = 'both', which = 'major', labelsize = 16)

#Legend setup.
ax.legend(loc = 'upper left', title = 'Bedrooms', ncol = 2, title_fontsize = 18, fontsize = 16);

#Bedroom count
#Name the column via to_frame(name = ...) rather than rename(): the name pandas gives
#the value_counts() result differs between versions, which can turn a rename into a no-op.
bedroom_count = df_clean['Bedroom AbvGr'].value_counts().sort_index().to_frame(name = "# of Houses")
bedroom_count.index.name = "Number of Bedrooms"
bedroom_count.T

In [None]:
#Graph 2: sale price against year built, with one regression line per overall-quality score.
#legend = False keeps seaborn from drawing its own figure-level legend, so the
#axes legend configured below is the only one and no private attribute is touched.
g2 = sns.lmplot(data = df_clean, x = 'Year Built', y = 'SalePrice', hue = 'Overall Qual',
                  palette = 'viridis_r', ci=None, height = 9, aspect = 16 / 9, legend = False)

#Axes
ax = plt.gca()

#Title
ax.set_title('Price vs Year Built / Overall Quality', fontsize = 32)

#X-axis
ax.set_xlabel("Year Built", fontsize = 24)
ax.set_xlim(1870, 2015)
#Rotate the existing tick labels in place; re-setting them with set_xticklabels()
#on a numeric axis would freeze the label text to the current tick positions.
plt.setp(ax.get_xticklabels(), rotation = 45, ha = 'right')

#Y-axis
ax.set_ylabel("Price", fontsize = 24)
ax.set_ylim(0, 800000)
ax.yaxis.set_major_formatter(SMF('${x:,.0f}'))
ax.tick_params(axis = 'both', which = 'major', labelsize = 16)

#Legend
ax.legend(loc = 'upper left', title = 'Overall House Quality', ncol = 2, title_fontsize = 18, fontsize = 16)

#Overall house quality count
#Name the column via to_frame(name = ...) rather than rename(): the name pandas gives
#the value_counts() result differs between versions, which can turn a rename into a no-op.
quality_count = df_clean['Overall Qual'].value_counts().sort_index().to_frame(name = "# of Houses")
quality_count.index.name = "Overall Quality"
quality_count.T

In [None]:
#Graph 3: box plot of sale price per neighborhood, ordered by median sale price.
xlabels = df_clean.groupby(['Neighborhood'])['SalePrice'].median().sort_values().index
g3 = sns.boxplot(data = df_clean, x = 'Neighborhood', y = 'SalePrice', palette = 'viridis_r', order = xlabels)
plt.gcf().set_size_inches(16, 9)

#Axes
ax = plt.gca()

#Title
ax.set_title('Price vs Neighborhood', fontsize = 24)

#X-axis
ax.set_xlabel("Neighborhood", fontsize = 24)
ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, ha = 'right')

#Y-axis
ax.set_ylabel("Price", fontsize = 24)
ax.set_ylim(0, 800000)
ax.yaxis.set_major_formatter(SMF('${x:,.0f}'))
ax.tick_params(axis = 'both', which = 'major', labelsize = 16)

#Neighborhood count
#Name the column via to_frame(name = ...) rather than rename(): the name pandas gives
#the value_counts() result differs between versions, which can turn a rename into a no-op.
neighborhood_count = df_clean['Neighborhood'].value_counts().sort_index().to_frame(name = "# of Houses")
neighborhood_count.index.name = "Neighborhood"
neighborhood_count.T

In [None]:
#Graph 4: violin plot of sale price per neighborhood.
#Reuses the xlabels ordering computed for the box plot in the previous cell.
g4 = sns.violinplot(data = df_clean, x = 'Neighborhood', y = 'SalePrice', palette = 'viridis_r', scale = 'width', order = xlabels)
plt.gcf().set_size_inches(16, 9)

#Axes
ax = plt.gca()

#Title
ax.set_title('Price vs Neighborhood', fontsize = 24)

#X-axis
ax.set_xlabel("Neighborhood", fontsize = 24)
ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, ha = 'right')

#Y-axis
ax.set_ylabel("Price", fontsize = 24)
ax.set_ylim(0, 800000)
ax.yaxis.set_major_formatter(SMF('${x:,.0f}'))
ax.tick_params(axis = 'both', which = 'major', labelsize = 16)

#Neighborhood count
#Name the column via to_frame(name = ...) rather than rename(): the name pandas gives
#the value_counts() result differs between versions, which can turn a rename into a no-op.
neighborhood_count = df_clean['Neighborhood'].value_counts().sort_index().to_frame(name = "# of Houses")
neighborhood_count.index.name = "Neighborhood"
neighborhood_count.T

In [None]:
#Graph 5
#Heatmap of the pairwise correlations between a hand-picked set of numeric features and SalePrice.
g5 = sns.heatmap(
    df_clean.loc[:, [
        'Lot Area',
        'Overall Qual',
        'Bedroom AbvGr',
        'Overall Cond',
        'Full Bath',
        'Half Bath',
        '1st Flr SF',
        '2nd Flr SF',
        'Pool Area',
        'Open Porch SF',
        'TotRms AbvGrd',
        'Year Built',
        'SalePrice',
    ]].corr(),
    cmap = 'Blues',
)

In [None]:
#Graph 6
#Histogram of the raw sale prices.
g6 = sns.histplot(x = 'SalePrice', data = df_clean)
g6.figure.set_size_inches(16, 9)

In [None]:
#Graph 7
#Housing prices log normalized: histogram of log(1 + SalePrice).
#(The bare np.log1p(...) expression that used to precede the plot was removed;
#its result was discarded, so it had no effect.)
g7 = sns.histplot(data=df_clean,x=np.log1p(df_clean['SalePrice']))
plt.gcf().set_size_inches(16, 9)

## 5) Data Modeling

<b>Dropped the Order and PID features because they are ID variables, and log-transformed SalePrice.</b>

In [None]:
#Drop the two ID columns, replace SalePrice with log(1 + SalePrice),
#and move that transformed target into the index.
df_final = (
    df_clean
    .drop(columns = ['Order', 'PID'])
    .assign(SalePrice = lambda frame: np.log1p(frame['SalePrice']))
    .set_index('SalePrice')
)

<b>One-hot encoding the categorical variables</b>

In [None]:
#Columns stored with the object dtype are the ones to one-hot encode.
#select_dtypes replaces the comparison against np.object, an alias that was
#removed from NumPy and now raises AttributeError.
one_hot_encode_cols = df_final.select_dtypes(include = ['object']).columns.tolist()

#drop_first = True drops one dummy level per encoded column.
df_final = pd.get_dummies(df_final, columns = one_hot_encode_cols, drop_first=True)
df_final.shape

<b>Identifying skewed variables (candidates for a log transform)</b>

In [None]:
#np.float was an alias for the builtin float (i.e. float64) and has been removed
#from NumPy, so compare against np.float64 explicitly.
float_cols = df_final.columns[df_final.dtypes == np.float64]
skew_limit = 0.75
skew_vals = df_final[float_cols].skew()
#Keep only the columns whose absolute skew exceeds the limit, most skewed first.
skew_cols = (skew_vals
             .sort_values(ascending = False)
             .to_frame()
             .rename(columns = {0: 'Skew'})
             .query('abs(Skew) > {}'.format(skew_limit)))
#NOTE(review): this cell only identifies the skewed columns; it does not apply a transform to them.
#skew_cols

<b>Train Test Split</b>

In [None]:
#The (log-transformed) target lives in the index of df_final: take it as y
#and discard it from the feature matrix.
y = df_final.index
X = df_final.reset_index(drop = True)

#70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 42,
)

In [None]:
#Private copies of the split for the plain (unregularized, unscaled) linear model.
X_train1 = X_train.copy()
y_train1 = y_train.copy()
X_test1 = X_test.copy()
y_test1 = y_test.copy()

#Fit on this cell's own copies (previously mixed X_train1 with y_train).
lm = lr().fit(X_train1, y_train1)
y_pred = lm.predict(X_test1)
#R2 on the held-out test set.
lm.score(X_test1,y_test1)

### Regularized Lasso Regression
#### K fold 

In [None]:
#5-fold splitter, shuffled with a fixed seed.
kf = KFold(shuffle = True, random_state = 42, n_splits = 5)

#### Optimization Function

In [None]:
def optimize_alpha(alphas, x, y, model, kf):
    """Return the cross-validated R2 obtained for each candidate alpha.

    Parameters
    ----------
    alphas : iterable of float
        Regularization strengths to evaluate.
    x : array-like
        Feature matrix.
    y : array-like
        Target values.
    model : callable
        Estimator class; it is instantiated as
        ``model(alpha=alpha, max_iter=500000)``.
    kf : cross-validation splitter
        Passed as ``cv`` to ``cross_val_predict``.

    Returns
    -------
    list of float
        One R2 score per entry of ``alphas``, in the same order. Each score
        compares ``y`` with the out-of-fold predictions for all rows.
    """
    #Local import so this function does not depend on the notebook's import cell.
    from sklearn.pipeline import make_pipeline

    #List of R2.
    r2_scores = []

    for alpha in alphas:

        #The scaler is part of the cross-validated estimator, so it is re-fit on the
        #training portion of every fold. Scaling the whole matrix up front would let
        #statistics from the held-out fold leak into training.
        reg = make_pipeline(StandardScaler(), model(alpha = alpha, max_iter = 500000))
        y_pred = cross_val_predict(reg, x, y, cv = kf)
        score = r2_score(y, y_pred)
        r2_scores.append(score)

    return(r2_scores)

#### Alpha Graph

In [None]:
def alpha_r2_graph(alphas, R2s, xlabels, model):
    """Plot R2 against alpha on a log-scaled x-axis.

    Parameters
    ----------
    alphas : sequence of float
        Regularization strengths (x values).
    R2s : sequence of float
        R2 score for each alpha (y values), same length as ``alphas``.
    xlabels : sequence of float
        Positions (and labels) of the x-axis ticks.
    model : str
        Model name inserted into the title. When it equals ``'Ridge'`` the
        x tick labels are rendered as comma-separated integers.

    Returns
    -------
    None
        Draws on the current matplotlib figure.
    """
    scores = pd.DataFrame(data = {'alpha': alphas,'R2': R2s})
    sns.set()
    #Line plot with a marker at every evaluated alpha.
    sns.lineplot(data = scores, x = 'alpha', y = 'R2', marker = 'o')

    #Size
    plt.gcf().set_size_inches(15, 6.92)

    #Axes
    ax = plt.gca()

    #Title
    ax.set_title("Hyperparameter Optimization for {} Regression".format(model), fontsize = 24)

    #X-axis
    ax.set_xlabel("α", fontsize = 22)
    ax.set_xscale('log')
    ax.set_xticks(xlabels)
    ax.set_xticklabels(xlabels, rotation = 45, ha = 'right')
    if (model == 'Ridge') :
        ax.get_xaxis().set_major_formatter(FF(lambda x, p: format(int(x), ',')))

    #Y-axis
    ax.set_ylabel("R2", fontsize = 22)
    ylabels = [0, 0.2, 0.4, 0.6, 0.8, 1]
    #Apply the y ticks (this line used to repeat set_xticks, leaving ylabels unused).
    ax.set_yticks(ylabels)

    ax.tick_params(axis = 'both', which = 'major', labelsize = 16)

#### Lasso with L1 Regularization

In [None]:
#Candidate alphas: seven log-spaced decades from 1e-5 to 1e1, each with its half and its double.
decades = np.geomspace(1e-5, 1e1, 7)
alphas = [value for a in decades for value in (a / 2, a, 2 * a)]
xlabels = list(decades)

#Fit the scaler on the training data only and reuse those statistics for the test
#data. Calling fit_transform on the test set would re-fit the scaler on it, so the
#test features would be standardized differently from what the model was trained on.
s = StandardScaler()
X_train2 = s.fit_transform(X_train)
y_train2 = y_train.copy()
X_test2 = s.transform(X_test)
y_test2 = y_test.copy()

#R2s and graph
r2_lasso = optimize_alpha(alphas, X_train2, y_train2, Lasso, kf)
alpha_r2_graph(alphas, r2_lasso, xlabels, 'Lasso')

#Lasso Regression
#NOTE(review): alpha is hard-coded, presumably read off the graph above; confirm it still matches r2_lasso.
lm_lasso = Lasso(alpha=0.005).fit(X_train2,y_train2)

#### Ridge with L2 Regularization

In [None]:
#Candidate alphas: seven log-spaced decades from 1 to 1e6, each with its half and its double.
decades = np.geomspace(1, 1e6, 7)
alphas = [value for a in decades for value in (a / 2, a, 2 * a)]
xlabels = list(decades)

#Fit the scaler on the training data only and reuse those statistics for the test
#data. Calling fit_transform on the test set would re-fit the scaler on it, so the
#test features would be standardized differently from what the model was trained on.
s = StandardScaler()
X_train3 = s.fit_transform(X_train)
y_train3 = y_train.copy()
X_test3 = s.transform(X_test)
y_test3 = y_test.copy()

#Determine R2s and graph.
r2_ridge = optimize_alpha(alphas, X_train3, y_train3, Ridge, kf)
alpha_r2_graph(alphas, r2_ridge, xlabels, 'Ridge')

#Ridge Regression
#NOTE(review): alpha is hard-coded, presumably read off the graph above; confirm it still matches r2_ridge.
lm_ridge = Ridge(alpha=500).fit(X_train3,y_train3)

#### Summary Function

In [None]:
def summary_table(models, Xs, Y) :
    """Tabulate R2, adjusted R2, RMSE and MAE for three fitted models.

    Parameters
    ----------
    models : sequence
        Fitted estimators exposing ``predict``; the first three are used and
        labelled 'Linear', 'Lasso' and 'Ridge' in that order.
    Xs : sequence
        Feature matrices, one per model, matched by position. Each must
        expose ``shape``.
    Y : array-like
        True target values shared by all three models.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns 'R2', 'Adjusted R2', 'RMSE', 'MAE'.
    """
    n_obs = len(Y)
    rows = []

    for position in range(3):
        features = Xs[position]
        predicted = models[position].predict(features)

        #R2.
        r2 = r2_score(Y, predicted)

        #Adj R2: penalizes R2 by the number of columns in the feature matrix.
        n_features = features.shape[1]
        adjusted_r2 = 1.0 - (1.0 - r2) * (n_obs - 1.0) / (n_obs - n_features - 1.0)

        #RMSE
        rmse = math.sqrt(mean_squared_error(Y, predicted))

        #MAE
        mae = mean_absolute_error(Y, predicted)

        rows.append((r2, adjusted_r2, rmse, mae))

    df_summary = pd.DataFrame(
        rows,
        columns = ['R2', 'Adjusted R2', 'RMSE', 'MAE'],
        index = ['Linear', 'Lasso', 'Ridge'],
    )

    return df_summary

#### Train Data

In [None]:
#The three fitted models, paired by position with the training matrix each one was fit on.
linear_models = [lm, lm_lasso, lm_ridge]
X_trains = [X_train1, X_train2, X_train3]
#Metrics on the training data.
summary_table(linear_models, X_trains, y_train)

#### Test Data

In [None]:
#Test matrices in the same order as linear_models (defined in the training-summary cell).
X_tests = [X_test1, X_test2, X_test3]
#Metrics on the held-out test data.
summary_table(linear_models, X_tests, y_test)

#### Lasso Regression maintains accuracy while reducing complexity by more than half.

In [None]:
#Print, for each fitted model, how many of its coefficients are non-zero.
for fitted_model in linear_models:
    print(np.count_nonzero(fitted_model.coef_))