### Let's start with only the numeric columns in both the training and test datasets.

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [5]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


## Import the processed train dataset

In [6]:
train = pd.read_csv('/gdrive/MyDrive/Coding Temple/7 - Machine Learning Regression/Coding-Temple-M7-Project-Home-Price-Predictions/data/train_num_cleaned.csv')
train.head()

Unnamed: 0.1,Unnamed: 0,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,...,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,SalePrice
0,0,60,106.7843,13517,6,8,1976,2005,289.0,533.0,...,0,44,0,0,0,0,0,3,2010,130500
1,1,60,43.0,11492,7,5,1996,1997,132.0,637.0,...,0,74,0,0,0,0,0,4,2009,220000
2,2,20,68.0,7922,5,7,1953,2007,0.0,731.0,...,0,52,0,0,0,0,0,1,2010,109000
3,3,60,73.0,9802,5,5,2006,2007,0.0,0.0,...,100,0,0,0,0,0,0,4,2010,174000
4,4,50,82.0,14235,6,8,1900,1993,0.0,0.0,...,0,59,0,0,0,0,0,3,2010,138500


In [7]:
test = pd.read_csv('/gdrive/MyDrive/Coding Temple/7 - Machine Learning Regression/Coding-Temple-M7-Project-Home-Price-Predictions/data/test_num_cleaned.csv')
test.head()

Unnamed: 0.1,Unnamed: 0,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,...,Garage Area,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold
0,0,190,69.0,9142,6,8,1910,1950,0.0,0,...,440,0,60,112,0,0,0,0,4,2006
1,1,90,76.3298,9662,5,4,1977,1977,0.0,0,...,580,170,0,0,0,0,0,0,8,2006
2,2,60,58.0,17104,7,5,2006,2006,0.0,554,...,426,100,24,0,0,0,0,0,9,2006
3,3,30,60.0,8520,5,6,1923,2006,0.0,0,...,480,0,0,184,0,0,0,0,7,2007
4,4,20,75.05,9500,6,5,1963,1963,247.0,609,...,514,0,76,0,0,185,0,0,7,2009


In [8]:
train = train.drop(columns=['Unnamed: 0'])
train = train.dropna()  # remove rows where numeric data is Null
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2048 entries, 0 to 2047
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MS SubClass      2048 non-null   int64  
 1   Lot Frontage     2048 non-null   float64
 2   Lot Area         2048 non-null   int64  
 3   Overall Qual     2048 non-null   int64  
 4   Overall Cond     2048 non-null   int64  
 5   Year Built       2048 non-null   int64  
 6   Year Remod/Add   2048 non-null   int64  
 7   Mas Vnr Area     2048 non-null   float64
 8   BsmtFin SF 1     2048 non-null   float64
 9   BsmtFin SF 2     2048 non-null   float64
 10  Bsmt Unf SF      2048 non-null   float64
 11  Total Bsmt SF    2048 non-null   float64
 12  1st Flr SF       2048 non-null   int64  
 13  2nd Flr SF       2048 non-null   int64  
 14  Low Qual Fin SF  2048 non-null   int64  
 15  Gr Liv Area      2048 non-null   int64  
 16  Bsmt Full Bath   2048 non-null   float64
 17  Bsmt Half Bath

In [9]:
test = test.drop(columns=['Unnamed: 0'])
test = test.dropna()  # remove rows where numeric data is Null
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 36 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MS SubClass      878 non-null    int64  
 1   Lot Frontage     878 non-null    float64
 2   Lot Area         878 non-null    int64  
 3   Overall Qual     878 non-null    int64  
 4   Overall Cond     878 non-null    int64  
 5   Year Built       878 non-null    int64  
 6   Year Remod/Add   878 non-null    int64  
 7   Mas Vnr Area     878 non-null    float64
 8   BsmtFin SF 1     878 non-null    int64  
 9   BsmtFin SF 2     878 non-null    int64  
 10  Bsmt Unf SF      878 non-null    int64  
 11  Total Bsmt SF    878 non-null    int64  
 12  1st Flr SF       878 non-null    int64  
 13  2nd Flr SF       878 non-null    int64  
 14  Low Qual Fin SF  878 non-null    int64  
 15  Gr Liv Area      878 non-null    int64  
 16  Bsmt Full Bath   878 non-null    int64  
 17  Bsmt Half Bath  

### Let's establish a Baseline that our model needs to beat.  We determine the mean of the target, `SalePrice`. First, we need to create a train/test split of cleaned train data.

In [10]:
X = train.drop(columns=['SalePrice'])
y = train['SalePrice']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

In [12]:
y_test.mean()

np.float64(180722.439453125)

Now we will set y_test to the mean.

In [13]:
baseline_preds = np.full_like(y_test, y_test.mean(), dtype=float)

Now, we can determine what the RMSE of y_test is if we only predict the mean of y_test. If we are going to build a regression model, then we need RMSE$(y_{test}) < $RMSE(baseline)

In [14]:
baseline_mse = mean_squared_error(y_test, baseline_preds)
baseline_rmse = np.sqrt(baseline_mse)
baseline_rmse  # this is the value that we have to beat

np.float64(77819.62846193345)

Instantiate the Linear Regression model

In [15]:
lr = LinearRegression()

Fit the model to the training data

In [16]:
lr.fit(X_train, y_train)

Make predictions on the X_train and X_test

In [17]:
predictions_train = lr.predict(X_train)
predictions_test = lr.predict(X_test)

Calculate $R^2$ score for X_train and X_test



In [18]:
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.827609081365776, 0.8661859270804956)

Determine the RMSE of predictions vs. actuals


In [19]:
MSE = mean_squared_error(y_train, predictions_train)
RMSE_train = np.sqrt(MSE)
RMSE_train

np.float64(33088.123966447376)

In [20]:
MSE = mean_squared_error(y_test, predictions_test)
RMSE_test = np.sqrt(MSE)
RMSE_test

np.float64(28466.891613097247)

Calculate improvement over baseline

In [21]:
improvement = (baseline_rmse-RMSE_test) / baseline_rmse
improvement

np.float64(0.6341939408381753)

The model's predictions are a 63.4% improvement over the baseline.

In [22]:
predictions = lr.predict(test)

# convert numpy array to Pandas dataframe
predictions = pd.DataFrame(predictions, columns=['saleprice'])
predictions.head()

Unnamed: 0,saleprice
0,121249.72816
1,156705.146681
2,227280.181586
3,117150.014455
4,201067.351437


In [23]:
# save predictions to csv file
predictions.to_csv('/gdrive/MyDrive/Coding Temple/7 - Machine Learning Regression/Coding-Temple-M7-Project-Home-Price-Predictions/data/predictions_numeric.csv')

### Now, let's include the categorical variables along with the numeric



In [24]:
train_num_cat = pd.read_csv('/gdrive/MyDrive/Coding Temple/7 - Machine Learning Regression/Coding-Temple-M7-Project-Home-Price-Predictions/data/train_num_cat_cleaned.csv')
train_num_cat.head()

Unnamed: 0.1,Unnamed: 0,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,...,Misc Feature_nan,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
0,0,60.0,106.7843,13517.0,6.0,8.0,1976.0,2005.0,289.0,533.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,60.0,43.0,11492.0,7.0,5.0,1996.0,1997.0,132.0,637.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,20.0,68.0,7922.0,5.0,7.0,1953.0,2007.0,0.0,731.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,60.0,73.0,9802.0,5.0,5.0,2006.0,2007.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,50.0,82.0,14235.0,6.0,8.0,1900.0,1993.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [25]:
test_num_cat = pd.read_csv('/gdrive/MyDrive/Coding Temple/7 - Machine Learning Regression/Coding-Temple-M7-Project-Home-Price-Predictions/data/test_num_cat_cleaned.csv')
test_num_cat.head()

Unnamed: 0.1,Unnamed: 0,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,...,Misc Feature_nan,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
0,0,190,69.0,9142,6,8,1910,1950,0.0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,90,76.3298,9662,5,4,1977,1977,0.0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,60,58.0,17104,7,5,2006,2006,0.0,554,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,3,30,60.0,8520,5,6,1923,2006,0.0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,20,75.05,9500,6,5,1963,1963,247.0,609,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [26]:
train_num_cat = train_num_cat.drop(columns=['Unnamed: 0'])
train_num_cat = train_num_cat.dropna()  # remove rows where numeric data is Null

In [27]:
test_num_cat = test_num_cat.drop(columns=['Unnamed: 0'])
test_num_cat = test_num_cat.dropna()  # remove rows where numeric data is Null

In [28]:
X = train_num_cat.drop(columns=['SalePrice'])
y = train_num_cat['SalePrice']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

Instantiate a Linear Regression model

In [30]:
lr_num_cat = LinearRegression()

Fit the model to the training data


In [31]:
lr_num_cat.fit(X_train, y_train)

Make predictions on the train and test datasets

In [32]:
predictions_train = lr_num_cat.predict(X_train)
predictions_test = lr_num_cat.predict(X_test)

In [33]:
predictions_train

array([220627.60205927, 131541.42960026, 178306.31132536, ...,
       138818.96999405, 146578.84446267, 178904.87474314])

In [34]:
predictions_test

array([163664.64235859, 319244.33620105, 420871.20782978, 127188.71442247,
       201534.34650994, 213015.66919003, 141321.28089111,  78714.19129246,
        86566.15338773, 303057.83685398, 175337.21948314, 212741.24359182,
       211194.73267061, 183133.90496742, 294332.592764  ,  95352.55679174,
       376208.73159803, 125868.17546229, 455169.46260791, 139211.24800841,
       184661.05780679, 122945.75704582, 236020.41892462, 194934.08942148,
       212884.32506519, 152801.95506441,  99010.6815161 , 281092.74421595,
       198883.08369896,  85509.61940772, 178823.37295805, 187717.55814029,
       152480.01357407, 246072.41626404, 228829.14375066, 192746.33334064,
       215848.27353956, 153974.80033151, 221823.89225992, 210010.86421905,
       236279.58416892, 306236.3912959 , 228616.78469071, 156118.72408577,
       204026.05429658, 100655.1133665 , 197995.09806249, 127053.42830483,
       210639.86091302, 114672.2011686 , 339795.16711591, 194160.18729902,
       183279.64909867, 1

Calculate $R^2$ score for X_train and X_test

In [35]:
lr_num_cat.score(X_train, y_train), lr_num_cat.score(X_test, y_test)

(0.9448438203994707, 0.9215957982735266)

Determine the RMSE of predictions vs. actuals

In [36]:
MSE = mean_squared_error(y_train, predictions_train)
RMSE_train = np.sqrt(MSE)
RMSE_train

np.float64(18715.957087838087)

In [37]:
MSE = mean_squared_error(y_test, predictions_test)
RMSE_test = np.sqrt(MSE)
RMSE_test

np.float64(21790.07984864892)

Calculate improvement over baseline

In [38]:
improvement = (baseline_rmse-RMSE_test) / baseline_rmse
improvement

np.float64(0.7199924970175379)

Generate predictions for the test_num_cat dataset

In [39]:
predictions = lr_num_cat.predict(test_num_cat)
predictions = pd.DataFrame(predictions, columns=['saleprice'])
predictions.to_csv('/gdrive/MyDrive/Coding Temple/7 - Machine Learning Regression/Coding-Temple-M7-Project-Home-Price-Predictions/data/predictions_numeric_categorical.csv')