# Imports

We will begin by setting up our imports and reading in our dataset.

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.linear_model import LinearRegression , LassoCV , RidgeCV

In [2]:
# Load the raw training data into the notebook-wide frame `df`.
train_csv_path = '../datasets/train.csv'
df = pd.read_csv(train_csv_path)

In [3]:
# Make the column names easier to use from Python: swap every space for an
# underscore (other characters, such as '/', are left as they are).
df.columns = [name.replace(" ", "_") for name in df.columns]

In [4]:
# Display the renamed column labels to confirm the spaces were replaced.
df.columns

Index(['Id', 'PID', 'MS_SubClass', 'MS_Zoning', 'Lot_Frontage', 'Lot_Area',
       'Street', 'Alley', 'Lot_Shape', 'Land_Contour', 'Utilities',
       'Lot_Config', 'Land_Slope', 'Neighborhood', 'Condition_1',
       'Condition_2', 'Bldg_Type', 'House_Style', 'Overall_Qual',
       'Overall_Cond', 'Year_Built', 'Year_Remod/Add', 'Roof_Style',
       'Roof_Matl', 'Exterior_1st', 'Exterior_2nd', 'Mas_Vnr_Type',
       'Mas_Vnr_Area', 'Exter_Qual', 'Exter_Cond', 'Foundation', 'Bsmt_Qual',
       'Bsmt_Cond', 'Bsmt_Exposure', 'BsmtFin_Type_1', 'BsmtFin_SF_1',
       'BsmtFin_Type_2', 'BsmtFin_SF_2', 'Bsmt_Unf_SF', 'Total_Bsmt_SF',
       'Heating', 'Heating_QC', 'Central_Air', 'Electrical', '1st_Flr_SF',
       '2nd_Flr_SF', 'Low_Qual_Fin_SF', 'Gr_Liv_Area', 'Bsmt_Full_Bath',
       'Bsmt_Half_Bath', 'Full_Bath', 'Half_Bath', 'Bedroom_AbvGr',
       'Kitchen_AbvGr', 'Kitchen_Qual', 'TotRms_AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace_Qu', 'Garage_Type', 'Garage_Yr_Blt',
       'G

# CLEANING

Looking at the size of our dataset, we elected to narrow the features our model incorporates down to those we feel will be most predictive. We can come back here later to add new features from our original `df` to our working copy, `new_df`.

In [5]:
# Columns carried forward into the working frame. The selection is copied so
# that later edits to `new_df` never touch the original `df`.
selected_columns = [
    'Lot_Area',
    '1st_Flr_SF',
    '2nd_Flr_SF',
    'Total_Bsmt_SF',
    'Central_Air',
    'SalePrice',
    'Garage_Area',
    'BsmtFin_SF_2',
    'BsmtFin_SF_1',
]
new_df = df[selected_columns].copy()

Now we need to create a `Sq_ft` column from the three columns we have pertaining to that value.

In [6]:
def look_unique(df):
    """Print the unique values of every column in ``df``, one column per print.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected. It is not modified.

    Returns
    -------
    None
        The unique values are written to standard output only.
    """
    # Iterate over (label, Series) pairs instead of looking columns up by a
    # stringified label: this keeps working for non-string labels (e.g. the
    # integer 0) and for duplicated labels, where a label lookup would fail
    # or return a DataFrame rather than a Series.
    for _, column in df.items():
        print(column.unique())
        
# look_unique(new_df)

In [7]:
# Build the total square-footage feature by adding the three floor-area
# columns. A missing value in any of them leaves Sq_ft missing for that row.
upper_floors_sqft = new_df['1st_Flr_SF'].add(new_df['2nd_Flr_SF'])
new_df['Sq_ft'] = upper_floors_sqft.add(new_df['Total_Bsmt_SF'])

In [8]:
# The three source columns are now summarised by Sq_ft, so remove them and
# preview the first rows of the result.
sqft_source_columns = ['1st_Flr_SF', '2nd_Flr_SF', 'Total_Bsmt_SF']
new_df = new_df.drop(columns=sqft_source_columns)
new_df.head()

Unnamed: 0,Lot_Area,Central_Air,SalePrice,Garage_Area,BsmtFin_SF_2,BsmtFin_SF_1,Sq_ft
0,13517,Y,130500,475.0,0.0,533.0,2204.0
1,11492,Y,220000,559.0,0.0,637.0,3035.0
2,7922,Y,109000,246.0,0.0,731.0,2114.0
3,9802,Y,174000,400.0,0.0,0.0,1828.0
4,14235,Y,138500,484.0,0.0,0.0,2121.0


In [9]:
# Safe to re-run: every step below gives the same result when applied again.

# Encode Central_Air as a binary flag (1 = 'Y', 0 = anything else).
# An existing 1 is accepted as well so the flag survives a second execution;
# comparing against 'Y' alone would turn the whole column to 0 on a re-run.
new_df['Central_Air'] = new_df['Central_Air'].isin(['Y', 1]).astype(int)

# Remove outliers from Lot Area.
# .copy() makes the filtered frame independent, so adding columns to it later
# does not raise SettingWithCopyWarning.
real_lot_size = new_df['Lot_Area'] < 100_000
new_df = new_df[real_lot_size].copy()

# Remove outliers from square footage.
# Rows whose Sq_ft is NaN also fail this comparison and are dropped with them.
drop_unreal_sqft = new_df['Sq_ft'] < 8000
new_df = new_df[drop_unreal_sqft].copy()

In [10]:
# ADD NEW DROPS OR FILTERS HERE 






# Feature Engineering

## Has_Garage

Here we create the `Has_Garage` feature: 1 (True) if the house has a garage, 0 (False) otherwise.

In [11]:
# Flag houses with a garage: 1 when Garage_Area is positive, otherwise 0
# (a missing Garage_Area compares as False and therefore becomes 0).
garage_present = new_df['Garage_Area'].gt(0.0)
new_df['Has_Garage'] = garage_present.astype(int)

# The raw garage area is not needed once the flag exists.
new_df = new_df.drop(columns='Garage_Area')

In [12]:
# Compare the column means of houses without (0) and with (1) a garage.
new_df.groupby('Has_Garage').mean()

Unnamed: 0_level_0,Lot_Area,Central_Air,SalePrice,BsmtFin_SF_2,BsmtFin_SF_1,Sq_ft
Has_Garage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,7677.166667,0.614035,106065.991228,12.684211,210.894737,1994.412281
1,10033.788302,0.949793,185870.075052,50.139752,450.881988,2575.278468


## Large_Lot

Now we will flag houses with a large lot. The intent is to treat a Lot Area above the 75th percentile as large; the code below uses a fixed cut-off of 12,000 for this.

In [13]:
# Flag houses with large lots: 1 when Lot_Area is above the cut-off, else 0.
large_lot_cutoff = 12_000
new_df['Large_Lot'] = new_df['Lot_Area'].gt(large_lot_cutoff).astype(int)

## Finished_Basement

In [14]:
# Flag houses that have a finished basement.

# First add the two finished-basement area measures together.
finished_area = new_df['BsmtFin_SF_2'].add(new_df['BsmtFin_SF_1'])
new_df['fin_base_sqft'] = finished_area

# Then mark the basement as finished (1) when that combined area is positive,
# and as not finished (0) otherwise.
new_df['Finished_Basement'] = new_df['fin_base_sqft'].gt(0.0).astype(int)



In [15]:
# Remove the two finished-basement measures and their combined total; only
# the Finished_Basement flag is kept.
basement_helper_columns = ['BsmtFin_SF_2', 'BsmtFin_SF_1', 'fin_base_sqft']
new_df = new_df.drop(columns=basement_helper_columns)

# Export

In [16]:
# Final look at the first five rows before exporting.
new_df.head(n=5)

Unnamed: 0,Lot_Area,Central_Air,SalePrice,Sq_ft,Has_Garage,Large_Lot,Finished_Basement
0,13517,1,130500,2204.0,1,1,1
1,11492,1,220000,3035.0,1,0,1
2,7922,1,109000,2114.0,1,0,1
3,9802,1,174000,1828.0,1,0,0
4,14235,1,138500,2121.0,1,1,0


In [17]:
# Write the cleaned training data to disk; the row index is not saved.
clean_train_path = '../datasets/Clean_Train.csv'
new_df.to_csv(clean_train_path, index=False)