### Let's start with only the numeric columns in the training dataset.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [None]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


## Import the processed train dataset

In [None]:
train = pd.read_csv('/gdrive/MyDrive/Coding Temple/7 - Machine Learning Regression/Coding-Temple-M7-Project-Home-Price-Predictions/data/train_num_cleaned_v2.csv')
train.head()

Unnamed: 0.1,Unnamed: 0,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,...,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,SalePrice
0,0,20,141.0,31770,6,5,1960,1960,112.0,639.0,...,210,62,0,0,0,0,0,5,2010,215000
1,1,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,140,0,0,0,120,0,0,6,2010,105000
2,2,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,393,36,0,0,0,0,12500,6,2010,172000
3,3,20,93.0,11160,7,5,1968,1968,0.0,1065.0,...,0,0,0,0,0,0,0,4,2010,244000
4,4,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,212,34,0,0,0,0,0,3,2010,189900


In [None]:
train = train.drop(columns=['Unnamed: 0'])
train = train.dropna()  # remove rows where numeric data is Null
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2927 entries, 0 to 2926
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MS SubClass      2927 non-null   int64  
 1   Lot Frontage     2927 non-null   float64
 2   Lot Area         2927 non-null   int64  
 3   Overall Qual     2927 non-null   int64  
 4   Overall Cond     2927 non-null   int64  
 5   Year Built       2927 non-null   int64  
 6   Year Remod/Add   2927 non-null   int64  
 7   Mas Vnr Area     2927 non-null   float64
 8   BsmtFin SF 1     2927 non-null   float64
 9   BsmtFin SF 2     2927 non-null   float64
 10  Bsmt Unf SF      2927 non-null   float64
 11  Total Bsmt SF    2927 non-null   float64
 12  1st Flr SF       2927 non-null   int64  
 13  2nd Flr SF       2927 non-null   int64  
 14  Low Qual Fin SF  2927 non-null   int64  
 15  Gr Liv Area      2927 non-null   int64  
 16  Bsmt Full Bath   2927 non-null   float64
 17  Bsmt Half Bath

### Let's establish a Baseline that our model needs to beat.  We determine the mean of the target, `SalePrice`. First, we need to create a train/test split of cleaned train data.

In [None]:
X = train.drop(columns=['SalePrice'])
y = train['SalePrice']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [None]:
y_test.mean()

np.float64(182109.30147895336)

Now we will set y_test to the mean.

In [None]:
baseline_preds = np.full_like(y_test, y_test.mean(), dtype=float)

Now, we can determine what the RMSE of y_test is if we only predict the mean of y_test. If we are going to build a regression model, then we need RMSE$(y_{test}) < $RMSE(baseline)

In [None]:
baseline_mse = mean_squared_error(y_test, baseline_preds)
baseline_rmse = np.sqrt(baseline_mse)
baseline_rmse  # this is the value that we have to beat

np.float64(82146.8273197021)

Instantiate the Linear Regression model

In [None]:
lr = LinearRegression()

Fit the model to the training data

In [None]:
lr.fit(X_train, y_train)

Make predictions on the X_train and X_test

In [None]:
predictions_train = lr.predict(X_train)
predictions_test = lr.predict(X_test)

Calculate $R^2$ score for X_train and X_test



In [None]:
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.8492543048905168, 0.7907883148041949)

Determine the RMSE of predictions vs. actuals


In [None]:
MSE = mean_squared_error(y_train, predictions_train)
RMSE_train = np.sqrt(MSE)
RMSE_train

np.float64(30618.72596512247)

In [None]:
MSE = mean_squared_error(y_test, predictions_test)
RMSE_test = np.sqrt(MSE)
RMSE_test

np.float64(37573.682705037754)

Calculate improvement over baseline

In [None]:
improvement = (baseline_rmse-RMSE_test) / baseline_rmse
improvement

np.float64(0.5426033611887762)

The model's predictions are a 54.3% improvement over the baseline.

### Now, let's include the categorical variables along with the numeric



In [None]:
train_num_cat = pd.read_csv('/gdrive/MyDrive/Coding Temple/7 - Machine Learning Regression/Coding-Temple-M7-Project-Home-Price-Predictions/data/train_num_cat_cleaned_v2.csv')
train_num_cat.head()

Unnamed: 0.1,Unnamed: 0,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,...,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD,Sale Condition_Abnorml,Sale Condition_AdjLand,Sale Condition_Alloca,Sale Condition_Family,Sale Condition_Normal,Sale Condition_Partial
0,0,20.0,141.0,31770.0,6.0,5.0,1960.0,1960.0,112.0,639.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,20.0,80.0,11622.0,5.0,6.0,1961.0,1961.0,0.0,468.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2,20.0,81.0,14267.0,6.0,6.0,1958.0,1958.0,108.0,923.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3,20.0,93.0,11160.0,7.0,5.0,1968.0,1968.0,0.0,1065.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4,60.0,74.0,13830.0,5.0,5.0,1997.0,1998.0,0.0,791.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
train_num_cat = train_num_cat.drop(columns=['Unnamed: 0'])
train_num_cat = train_num_cat.dropna()  # remove rows where numeric data is Null

In [None]:
X = train_num_cat.drop(columns=['SalePrice'])
y = train_num_cat['SalePrice']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

Instantiate a Linear Regression model

In [None]:
lr_num_cat = LinearRegression()

Fit the model to the training data


In [None]:
lr_num_cat.fit(X_train, y_train)

Make predictions on the train and test datasets

In [None]:
predictions_train = lr_num_cat.predict(X_train)
predictions_test = lr_num_cat.predict(X_test)

In [None]:
predictions_train

array([137000.00000001, 116065.31004137, 142518.21002726, ...,
       205891.22108657, 138883.49368881, 136169.65840947])

In [None]:
predictions_test

array([322749.91937897, 132950.94310076, 193725.26922256, 201404.22269769,
       293632.8222569 , 144296.04524068, 128348.65072744, 179331.22851117,
       131309.27035005, 186971.25345343, 164577.60474286, 120130.00778774,
       160447.42864707, 127454.49572599, 203227.91354392, 254704.52117241,
       120506.68184708, 134292.57833679, 257126.2743515 , 117017.27694748,
       248068.68140166, 159337.45656882, 327580.44814258, 149458.69343511,
       139332.45341383, 120897.14150321, 209299.1254997 , 173262.78684789,
       183479.67094534, 105449.40413894, 218887.30790552, 120017.07931814,
       145643.17827141, 193780.94653127, 119924.62435616, -23221.36117306,
        94324.65820064, 341856.27581297,  76794.22653974, 134347.31967224,
       189519.36570113, 205614.32987023, 184650.1794613 , 158886.48737746,
       229519.3637162 , 282176.9290079 , 125004.23994529, 196291.0281394 ,
       270812.01614315, 225630.43822093, 157288.22118212,  91574.76413841,
       105642.51721023, 2

Calculate $R^2$ score for X_train and X_test

In [None]:
lr_num_cat.score(X_train, y_train), lr_num_cat.score(X_test, y_test)

(0.9481506902737175, 0.8183527203779335)

Determine the RMSE of predictions vs. actuals

In [None]:
MSE = mean_squared_error(y_train, predictions_train)
RMSE_train = np.sqrt(MSE)
RMSE_train

np.float64(17957.097719765614)

In [None]:
MSE = mean_squared_error(y_test, predictions_test)
RMSE_test = np.sqrt(MSE)
RMSE_test

np.float64(35011.05871896582)

Calculate improvement over baseline

In [None]:
improvement = (baseline_rmse-RMSE_test) / baseline_rmse
improvement

np.float64(0.573799014991675)