# Data Understanding

## Import the modules

In [197]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
plt.style.use("ggplot")

## Import the dataset

In [198]:
# Load the training data.
df = pd.read_csv(
    "train.csv",
    keep_default_na=False,  # keep "NA" as a literal string; later cells test for == "NA"
    low_memory=False,       # parse the file in one pass instead of in chunks
)

## How many rows and columns?

In [199]:
df.shape

(1460, 81)

## How many elements?

In [200]:
df.size

118260

## The attributes

In [201]:
print(df.columns.tolist())

['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC'

## The first ten rows

In [202]:
df.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


## How many "NA" values are in columns where the meaning of NA is not specified?

In [203]:

# Columns for which "NA" has a specified meaning; they are excluded from the count below.
NA_specified = [
    "Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "PoolQC", "Fence", "MiscFeature",
    "GarageYrBlt",  # Not specified but likely has the same meaning as for GarageType (No garage)
]

# Number of literal "NA" strings in every remaining column
unexplained_na_counts = {
    name: (df[name] == "NA").sum()
    for name in df.columns.tolist()
    if name not in NA_specified
}

# Report only the columns that actually contain "NA"
for name, count in unexplained_na_counts.items():
    if count != 0:
        print(f"{name}: {count}")

LotFrontage: 259
MasVnrType: 8
MasVnrArea: 8
Electrical: 1


## Remove "NA" values where the meaning of NA is not specified

In [204]:
# LotFrontage holds "NA" in 259 of the 1460 rows, so the whole column is dropped
# instead of discarding that many records. errors="ignore" keeps the cell re-runnable.
df = df.drop(columns=["LotFrontage"], errors="ignore")

# The other columns with unexplained "NA" values affect only a handful of rows, so
# those rows are removed. All three columns are checked: the previous version of this
# cell filtered MasVnrArea twice and never filtered MasVnrType.
columns_with_unexplained_na = ["MasVnrType", "MasVnrArea", "Electrical"]
has_no_unexplained_na = (df[columns_with_unexplained_na] != "NA").all(axis=1)

# .copy() yields an independent frame so later column assignments do not act on a slice
df = df[has_no_unexplained_na].copy()

df.shape

(1451, 80)

## Discretize class label

In [205]:
# Bin SalePrice into three labelled classes. The boundary values 150000 and 300000
# belong to LOW and HIGH respectively; MEDIUM is the open interval between them.
sale_price = df["SalePrice"]
price_class_masks = {
    "LOW": sale_price <= 150000,
    "MEDIUM": sale_price.between(150000, 300000, inclusive="neither"),
    "HIGH": sale_price >= 300000,
}

for label, mask in price_class_masks.items():
    df.loc[mask, "SalePriceDisc"] = label

## Remove old class label

In [206]:
# SalePriceDisc now carries the class label, so the raw price column is removed
df = df.drop(columns=["SalePrice"])

## Get attribute types

In [207]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1451 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             1451 non-null   int64 
 1   MSSubClass     1451 non-null   int64 
 2   MSZoning       1451 non-null   object
 3   LotArea        1451 non-null   int64 
 4   Street         1451 non-null   object
 5   Alley          1451 non-null   object
 6   LotShape       1451 non-null   object
 7   LandContour    1451 non-null   object
 8   Utilities      1451 non-null   object
 9   LotConfig      1451 non-null   object
 10  LandSlope      1451 non-null   object
 11  Neighborhood   1451 non-null   object
 12  Condition1     1451 non-null   object
 13  Condition2     1451 non-null   object
 14  BldgType       1451 non-null   object
 15  HouseStyle     1451 non-null   object
 16  OverallQual    1451 non-null   int64 
 17  OverallCond    1451 non-null   int64 
 18  YearBuilt      1451 non-null   in

## Convert types

In [208]:
# Numeric columns that are to be treated as categories rather than quantities
to_category = ["MSSubClass", "OverallQual", "OverallCond", "MoSold"]

# Every object-dtype column, plus the columns listed above, becomes categorical
categorical_dtypes = {
    name: "category"
    for name in df.columns
    if df[name].dtype == object or name in to_category
}
df = df.astype(categorical_dtypes)

# MasVnrArea contained "NA" strings when loaded, so it is not numeric yet;
# with those rows removed it can be converted to int64
df = df.astype({"MasVnrArea": "int64"})

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1451 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Id             1451 non-null   int64   
 1   MSSubClass     1451 non-null   category
 2   MSZoning       1451 non-null   category
 3   LotArea        1451 non-null   int64   
 4   Street         1451 non-null   category
 5   Alley          1451 non-null   category
 6   LotShape       1451 non-null   category
 7   LandContour    1451 non-null   category
 8   Utilities      1451 non-null   category
 9   LotConfig      1451 non-null   category
 10  LandSlope      1451 non-null   category
 11  Neighborhood   1451 non-null   category
 12  Condition1     1451 non-null   category
 13  Condition2     1451 non-null   category
 14  BldgType       1451 non-null   category
 15  HouseStyle     1451 non-null   category
 16  OverallQual    1451 non-null   category
 17  OverallCond    1451 non-null   categor

## Should GarageYrBlt be discretized or removed?

In [209]:
# NOTE(review): the plot call below is commented out, so this cell draws nothing and the
# question in the heading is still open. TODO: re-enable the plot or remove the cell.
# sb.countplot(x = df["GarageYrBlt"], )

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePriceDisc
0,1,60,RL,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,MEDIUM
1,2,20,RL,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,MEDIUM
2,3,60,RL,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,MEDIUM
3,4,70,RL,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,LOW
4,5,60,RL,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,MEDIUM
5,6,50,RL,14115,Pave,,IR1,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,LOW
6,7,20,RL,10084,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,HIGH
7,8,60,RL,10382,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,Shed,350,11,2009,WD,Normal,MEDIUM
8,9,50,RM,6120,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2008,WD,Abnorml,LOW
9,10,190,RL,7420,Pave,,Reg,Lvl,AllPub,Corner,...,0,,,,0,1,2008,WD,Normal,LOW


# Data Preparation

# Modeling

# Evaluation