## Sample Kaggle Submission

### Importing

In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## Loading raw data

In [14]:
# Read the labelled training set and the Kaggle test set in one pass.
# The test file has no SalePrice column: those are the values to predict for the submission.
df_train, df_test = map(pd.read_csv, ("./datasets/train.csv", "./datasets/test.csv"))

In [7]:
# Preview the first two rows of the training data
df_train.head(2)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000


## CLEANING THE DATA

In [15]:
# Move the "Id" column into the row index of both frames.
# Rebinding the name (instead of inplace=True) keeps the lineage of each frame explicit.
df_train = df_train.set_index('Id')
df_test = df_test.set_index('Id')

## Sloppy cleaning by filling NaNs with 0

In [16]:
# Placeholder imputation: every missing value in every column is replaced with 0
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

## Feature Engineering
## EDA

### Preprocessing and Modeling

HINT: log prices

Remember to create function preprocessing so that testing data can also be quckly preprocessed before feeding into model

In [18]:
# Extract desired features
features = ["Overall Qual","Lot Area","Street"]
X = df_train[features]
y = df_train["SalePrice"]

In [19]:
# Preview the first three rows of the selected features
X.head(3)

Unnamed: 0_level_0,Overall Qual,Lot Area,Street
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
109,6,13517,Pave
544,7,11492,Pave
153,5,7922,Pave


In [21]:
# Convert categorical data into processable numbers (dummie variables)
pd.get_dummies(X, columns = ["Street"]).head()

Unnamed: 0_level_0,Overall Qual,Lot Area,Street_Grvl,Street_Pave
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
109,6,13517,0,1
544,7,11492,0,1
153,5,7922,0,1
318,5,9802,0,1
255,6,14235,0,1


In [22]:
# One-hot encode "Street" into dummy columns.
# Built from df_train[features] rather than from X itself so the cell is idempotent:
# re-running `X = pd.get_dummies(X, columns=["Street"])` would raise a KeyError
# because "Street" has already been replaced by its dummy columns.
X = pd.get_dummies(df_train[features], columns = ["Street"])

In [None]:
# Preview the feature matrix after dummy encoding
X.head()

### Train test split

In [29]:
# Hold out 25% of the labelled rows (the library default, spelled out here) for validation.
# The fixed seed keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=666,
)

## Baseline model (null)
Assumes the average is the best guess

In [31]:
# Null baseline: a regressor that ignores the features and always predicts the
# mean of the training target. Its RMSE is the score a real model has to beat.
from sklearn.dummy import DummyRegressor

# Instantiate: creates a dummy regression that always predicts the mean of the target
base_mean = DummyRegressor(strategy='mean')

# Fit the "model"
base_mean = base_mean.fit(X_train, y_train)

# Get predictions for the training split and for our held-out split (not the Kaggle test set)
y_hat_base_train = base_mean.predict(X_train)
y_hat_base_test = base_mean.predict(X_test)

# Get RMSE. mean_squared_error is already imported in the first cell, so it is not
# re-imported here; its documented argument order is (y_true, y_pred).
print("Our Train RMSE Score for our Base Model is:", np.sqrt(mean_squared_error(y_train, y_hat_base_train)))
print("Our Test RMSE Score for our Base Model is:", np.sqrt(mean_squared_error(y_test, y_hat_base_test)))

Our Train RMSE Score for our Base Model is: 78752.32671859932
Our Test RMSE Score for our Base Model is: 80681.79566162577


### Our Model

In [33]:
# Fit an ordinary least-squares regression on the training split.
lr = LinearRegression()
fitted = lr.fit(X_train, y_train)

# `model` is the handle used by the prediction and submission cells below.
model = fitted

In [34]:
# Predict on both splits so training and validation error can be compared
y_hat_lr_train, y_hat_lr_test = (model.predict(split) for split in (X_train, X_test))

In [35]:
# RMSE of the linear model on each split.
# Arguments follow the documented (y_true, y_pred) order; MSE is symmetric so the
# numbers are unchanged, but the order matters if an asymmetric metric is swapped in later.
print("Our Train RMSE Score for our New Model is:", np.sqrt(mean_squared_error(y_train, y_hat_lr_train)))
print("Our Test RMSE Score for our New Model is:", np.sqrt(mean_squared_error(y_test, y_hat_lr_test)))

Our Train RMSE Score for our Base Model is: 44560.67847161991
Our Test RMSE Score for our Base Model is: 45071.78212351427


## Prepare in a nice format to submit to Kaggle

In [38]:
# Take the same feature columns from the Kaggle test set as were used for training
X_kaggle = df_test.loc[:, features]

In [39]:
# One-hot encode "Street" on the Kaggle set, then align the result to the columns
# the model was trained on. Encoding the two sets independently can produce a
# different column set (e.g. a category that is absent from one file), which would
# make model.predict fail or silently misalign features. Missing dummy columns are
# added as 0 and unseen ones are dropped.
# Built from df_test[features] so the cell can be re-run safely.
X_kaggle = (
    pd.get_dummies(df_test[features], columns = ["Street"])
    .reindex(columns = X_train.columns, fill_value = 0)
)

In [40]:
# Preview the encoded Kaggle feature matrix
X_kaggle.head(3)

Unnamed: 0_level_0,Overall Qual,Lot Area,Street_Grvl,Street_Pave
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2658,6,9142,0,1
2718,5,9662,0,1
2414,7,17104,0,1


In [41]:
# "Recreating" the "SalePrice" column 
X_kaggle["SalePrice"] = model.predict(X_kaggle)

In [43]:
# Preview the Kaggle features together with the predicted SalePrice
X_kaggle.head(3)

Unnamed: 0_level_0,Overall Qual,Lot Area,Street_Grvl,Street_Pave,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2658,6,9142,0,1,174100.955171
2718,5,9662,0,1,133748.869424
2414,7,17104,0,1,242389.032685


In [45]:
# Cleaning kaggle submission
X_kaggle = X_kaggle[["SalePrice"]]
X_kaggle.head(2)

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,174100.955171
2718,133748.869424


In [46]:
# Write the submission file; the index is written out as the first column of the CSV
X_kaggle.to_csv("./my_first_submission.csv", index=True)