In [1]:
import os

import pandas as pd

#### Data Description
LotFrontage: Linear feet of street connected to property

LotArea: Lot size in square feet

TotalBsmtSF: Total square feet of basement area

BedroomAbvGr: Bedrooms above grade (does NOT include basement bedrooms)

Fireplaces: Number of fireplaces

PoolArea: Pool area in square feet

GarageCars: Size of garage in car capacity

WoodDeckSF: Wood deck area in square feet

ScreenPorch: Screen porch area in square feet

MSZoning: Identifies the general zoning classification of the sale.
		
       A	Agriculture
       C	Commercial
       FV	Floating Village Residential
       I	Industrial
       RH	Residential High Density
       RL	Residential Low Density
       RP	Residential Low Density Park 
       RM	Residential Medium Density

Condition1: Proximity to various conditions
	
       Artery	Adjacent to arterial street
       Feedr	Adjacent to feeder street	
       Norm	Normal	
       RRNn	Within 200' of North-South Railroad
       RRAn	Adjacent to North-South Railroad
       PosN	Near positive off-site feature--park, greenbelt, etc.
       PosA	Adjacent to positive off-site feature
       RRNe	Within 200' of East-West Railroad
       RRAe	Adjacent to East-West Railroad

Heating: Type of heating
		
       Floor	Floor Furnace
       GasA	Gas forced warm air furnace
       GasW	Gas hot water or steam heat
       Grav	Gravity furnace	
       OthW	Hot water or steam heat other than gas
       Wall	Wall furnace

Street: Type of road access to property

       Grvl	Gravel	
       Pave	Paved

CentralAir: Central air conditioning

       N	No
       Y	Yes

Foundation: Type of foundation
		
       BrkTil	Brick & Tile
       CBlock	Cinder Block
       PConc	Poured Concrete	
       Slab	Slab
       Stone	Stone
       Wood	Wood

ExterQual: Evaluates the quality of the material on the exterior 
		
       Ex	Excellent
       Gd	Good
       TA	Average/Typical
       Fa	Fair
       Po	Poor
		
ExterCond: Evaluates the present condition of the material on the exterior
		
       Ex	Excellent
       Gd	Good
       TA	Average/Typical
       Fa	Fair
       Po	Poor

BsmtQual: Evaluates the height of the basement

       Ex	Excellent (100+ inches)	
       Gd	Good (90-99 inches)
       TA	Typical (80-89 inches)
       Fa	Fair (70-79 inches)
       Po	Poor (<70 inches)
       NA	No Basement
		
BsmtCond: Evaluates the general condition of the basement

       Ex	Excellent
       Gd	Good
       TA	Typical - slight dampness allowed
       Fa	Fair - dampness or some cracking or settling
       Po	Poor - Severe cracking, settling, or wetness
       NA	No Basement
	
BsmtExposure: Refers to walkout or garden level walls

       Gd	Good Exposure
       Av	Average Exposure (split levels or foyers typically score average or above)	
       Mn	Minimum Exposure
       No	No Exposure
       NA	No Basement
	
BsmtFinType1: Rating of basement finished area

       GLQ	Good Living Quarters
       ALQ	Average Living Quarters
       BLQ	Below Average Living Quarters	
       Rec	Average Rec Room
       LwQ	Low Quality
       Unf	Unfinished
       NA	No Basement

KitchenQual: Kitchen quality

       Ex	Excellent
       Gd	Good
       TA	Typical/Average
       Fa	Fair
       Po	Poor

FireplaceQu: Fireplace quality

       Ex	Excellent - Exceptional Masonry Fireplace
       Gd	Good - Masonry Fireplace in main level
       TA	Average - Prefabricated Fireplace in main living area or Masonry Fireplace in basement
       Fa	Fair - Prefabricated Fireplace in basement
       Po	Poor - Ben Franklin Stove
       NA	No Fireplace

In [2]:
# Location of the housing classification CSV.
# The default is the original author's absolute local path, which only exists
# on that machine; set the HOUSING_CSV_PATH environment variable to point the
# notebook at the file elsewhere without editing this cell.
url = os.environ.get(
    'HOUSING_CSV_PATH',
    '/Users/sadiakhanrupa/Bootcamp Main Phase/Chapter_7 Supervised_ML/Data/housing_iteration_4_classification/housing_iteration_4_classification.csv',
)

In [3]:
# Read the CSV located at `url` into a DataFrame using pandas' default parsing options.
data = pd.read_csv(filepath_or_buffer=url)

In [4]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,CentralAir,Foundation,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,KitchenQual,FireplaceQu
0,8450,65.0,856,3,0,0,2,0,0,0,...,Y,PConc,Gd,TA,Gd,TA,No,GLQ,Gd,
1,9600,80.0,1262,3,1,0,2,298,0,0,...,Y,CBlock,TA,TA,Gd,TA,Gd,ALQ,TA,TA
2,11250,68.0,920,3,1,0,2,0,0,0,...,Y,PConc,Gd,TA,Gd,TA,Mn,GLQ,Gd,TA
3,9550,60.0,756,3,1,0,3,0,0,0,...,Y,BrkTil,TA,TA,TA,Gd,No,ALQ,Gd,Gd
4,14260,84.0,1145,4,1,0,3,192,0,0,...,Y,PConc,Gd,TA,Gd,TA,Av,GLQ,Gd,TA


## Split X and y


In [5]:
# Extract the target column. NOTE: `pop` also removes 'Expensive' from `data`
# in place, so re-running this cell without reloading `data` raises a KeyError.
y = data.pop(item='Expensive')

In [6]:
# Preview the first five target values.
y.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: Expensive, dtype: int64

In [7]:
# Feature matrix: an independent (deep) copy of `data`, which no longer
# contains the 'Expensive' column after it was popped into `y`.
X = data.copy(deep=True)

In [8]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,CentralAir,Foundation,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,KitchenQual,FireplaceQu
0,8450,65.0,856,3,0,0,2,0,0,RL,...,Y,PConc,Gd,TA,Gd,TA,No,GLQ,Gd,
1,9600,80.0,1262,3,1,0,2,298,0,RL,...,Y,CBlock,TA,TA,Gd,TA,Gd,ALQ,TA,TA
2,11250,68.0,920,3,1,0,2,0,0,RL,...,Y,PConc,Gd,TA,Gd,TA,Mn,GLQ,Gd,TA
3,9550,60.0,756,3,1,0,3,0,0,RL,...,Y,BrkTil,TA,TA,TA,Gd,No,ALQ,Gd,Gd
4,14260,84.0,1145,4,1,0,3,192,0,RL,...,Y,PConc,Gd,TA,Gd,TA,Av,GLQ,Gd,TA


## Splitting into train and test datasets

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Display the training features; the rendered output abbreviates the frame
# (first and last rows/columns with "..." in between) rather than listing every row.
X_train

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,CentralAir,Foundation,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,KitchenQual,FireplaceQu
254,8400,70.0,1314,3,0,0,1,250,0,RL,...,Y,CBlock,TA,Gd,TA,TA,No,Rec,TA,
1066,7837,59.0,799,3,1,0,2,0,0,RL,...,Y,PConc,Gd,TA,Gd,TA,No,Unf,TA,TA
638,8777,67.0,796,2,0,0,0,328,0,RL,...,Y,CBlock,TA,TA,Fa,TA,No,Unf,TA,
799,7200,60.0,731,3,2,0,1,0,0,RL,...,Y,BrkTil,TA,TA,Gd,TA,No,ALQ,Gd,TA
380,5000,50.0,1026,3,1,0,1,0,0,RL,...,Y,BrkTil,TA,TA,TA,TA,No,LwQ,Gd,Gd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,9317,78.0,1314,3,1,0,2,0,0,RL,...,Y,PConc,Gd,TA,Gd,TA,No,GLQ,Gd,Gd
1130,7804,65.0,1122,4,2,0,2,431,0,RL,...,Y,BrkTil,TA,TA,TA,TA,No,BLQ,Gd,TA
1294,8172,60.0,864,2,0,0,2,0,0,RL,...,Y,CBlock,TA,TA,TA,TA,No,Rec,TA,
860,7642,55.0,912,3,1,0,1,0,0,RL,...,Y,BrkTil,Gd,TA,TA,TA,No,Unf,Gd,Gd


## Categorical Encoding: Manual Approach (without using pipelines)

### 2.1. Replacing NaNs

We will need two different strategies to deal with missing values in numerical and categorical features.

#### 2.1.1. Replacing NaNs in categorical features

In our preprocessing pipeline for numerical features, we replaced NaNs with the column mean. Categorical features pose a problem: they don’t have a “mean”. Here, we will instead replace NaNs with a string that marks them: “N_A”. It is not an elegant solution, but it will allow us to move forward.

In [12]:
from  sklearn.impute import SimpleImputer
# Keep only the non-numeric (categorical) columns of the training features.
X_train_cat = X_train.select_dtypes(exclude='number')

# Build an imputer that replaces every missing value with the constant
# marker 'N_A', then configure it so that `transform` returns pandas output
# instead of a NumPy array. `set_output` returns the estimator itself.
cat_imputer = SimpleImputer(strategy='constant', fill_value='N_A')
cat_imputer = cat_imputer.set_output(transform='pandas')


Unnamed: 0,MSZoning,Condition1,Heating,Street,CentralAir,Foundation,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,KitchenQual,FireplaceQu
254,RL,Norm,GasA,Pave,Y,CBlock,TA,Gd,TA,TA,No,Rec,TA,
1066,RL,Norm,GasA,Pave,Y,PConc,Gd,TA,Gd,TA,No,Unf,TA,TA
638,RL,Feedr,GasA,Pave,Y,CBlock,TA,TA,Fa,TA,No,Unf,TA,
799,RL,Feedr,GasA,Pave,Y,BrkTil,TA,TA,Gd,TA,No,ALQ,Gd,TA
380,RL,Norm,GasA,Pave,Y,BrkTil,TA,TA,TA,TA,No,LwQ,Gd,Gd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,RL,Norm,GasA,Pave,Y,PConc,Gd,TA,Gd,TA,No,GLQ,Gd,Gd
1130,RL,Norm,GasA,Pave,Y,BrkTil,TA,TA,TA,TA,No,BLQ,Gd,TA
1294,RL,Norm,GasA,Pave,Y,CBlock,TA,TA,TA,TA,No,Rec,TA,
860,RL,Norm,GasA,Pave,Y,BrkTil,Gd,TA,TA,TA,No,Unf,Gd,Gd
