
# Importing Libraries

In [1]:
import numpy as np 
import pandas as pd 

# Loading Data

In [2]:
# Print the full text of the data description file.
with open('data_description.txt') as description_file:
    contents = description_file.read()
print(contents)

MSSubClass: Identifies the type of dwelling involved in the sale.	

        20	1-STORY 1946 & NEWER ALL STYLES
        30	1-STORY 1945 & OLDER
        40	1-STORY W/FINISHED ATTIC ALL AGES
        45	1-1/2 STORY - UNFINISHED ALL AGES
        50	1-1/2 STORY FINISHED ALL AGES
        60	2-STORY 1946 & NEWER
        70	2-STORY 1945 & OLDER
        75	2-1/2 STORY ALL AGES
        80	SPLIT OR MULTI-LEVEL
        85	SPLIT FOYER
        90	DUPLEX - ALL STYLES AND AGES
       120	1-STORY PUD (Planned Unit Development) - 1946 & NEWER
       150	1-1/2 STORY PUD - ALL AGES
       160	2-STORY PUD - 1946 & NEWER
       180	PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
       190	2 FAMILY CONVERSION - ALL STYLES AND AGES

MSZoning: Identifies the general zoning classification of the sale.
		
       A	Agriculture
       C	Commercial
       FV	Floating Village Residential
       I	Industrial
       RH	Residential High Density
       RL	Residential Low Density
       RP	Residential Low Density Park 
       RM

In [3]:
# Read the three CSV files from the working directory, in the order
# train / test / sample submission.
train_data, test_data, sample_submission = [
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
]

# Preview the first rows of the training set.
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Preview the first rows of the test set.
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [5]:
# Preview the first rows of the sample submission file.
sample_submission.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [6]:
# Report the (rows, columns) shape of both frames on a single line.
shape_report = 'Train:{}   Test:{}'.format(train_data.shape, test_data.shape)
print(shape_report)

Train:(1460, 81)   Test:(1459, 80)


# Analysis

In [7]:
# import pandas_profiling as pp
# pp.ProfileReport(train_data) 

<h2>Data fields</h2>
<p>Here's a brief version of what you'll find in the data description file.</p>
<ul>
<li><strong>SalePrice</strong> - the property's sale price in dollars. This is the target variable that you're trying to predict.</li>
<li><strong>MSSubClass</strong>: The building class</li>
<li><strong>MSZoning</strong>: The general zoning classification</li>
<li><strong>LotFrontage</strong>: Linear feet of street connected to property</li>
<li><strong>LotArea</strong>: Lot size in square feet</li>
<li><strong>Street</strong>: Type of road access</li>
<li><strong>Alley</strong>: Type of alley access</li>
<li><strong>LotShape</strong>: General shape of property</li>
<li><strong>LandContour</strong>: Flatness of the property</li>
<li><strong>Utilities</strong>: Type of utilities available</li>
<li><strong>LotConfig</strong>: Lot configuration</li>
<li><strong>LandSlope</strong>: Slope of property</li>
<li><strong>Neighborhood</strong>: Physical locations within Ames city limits</li>
<li><strong>Condition1</strong>: Proximity to main road or railroad</li>
<li><strong>Condition2</strong>: Proximity to main road or railroad (if a second is present)</li>
<li><strong>BldgType</strong>: Type of dwelling</li>
<li><strong>HouseStyle</strong>: Style of dwelling</li>
<li><strong>OverallQual</strong>: Overall material and finish quality</li>
<li><strong>OverallCond</strong>: Overall condition rating</li>
<li><strong>YearBuilt</strong>: Original construction date</li>
<li><strong>YearRemodAdd</strong>: Remodel date</li>
<li><strong>RoofStyle</strong>: Type of roof</li>
<li><strong>RoofMatl</strong>: Roof material</li>
<li><strong>Exterior1st</strong>: Exterior covering on house</li>
<li><strong>Exterior2nd</strong>: Exterior covering on house (if more than one material)</li>
<li><strong>MasVnrType</strong>: Masonry veneer type</li>
<li><strong>MasVnrArea</strong>: Masonry veneer area in square feet</li>
<li><strong>ExterQual</strong>: Exterior material quality</li>
<li><strong>ExterCond</strong>: Present condition of the material on the exterior</li>
<li><strong>Foundation</strong>: Type of foundation</li>
<li><strong>BsmtQual</strong>: Height of the basement</li>
<li><strong>BsmtCond</strong>: General condition of the basement</li>
<li><strong>BsmtExposure</strong>: Walkout or garden level basement walls</li>
<li><strong>BsmtFinType1</strong>: Quality of basement finished area</li>
<li><strong>BsmtFinSF1</strong>: Type 1 finished square feet</li>
<li><strong>BsmtFinType2</strong>: Quality of second finished area (if present)</li>
<li><strong>BsmtFinSF2</strong>: Type 2 finished square feet</li>
<li><strong>BsmtUnfSF</strong>: Unfinished square feet of basement area</li>
<li><strong>TotalBsmtSF</strong>: Total square feet of basement area</li>
<li><strong>Heating</strong>: Type of heating</li>
<li><strong>HeatingQC</strong>: Heating quality and condition</li>
<li><strong>CentralAir</strong>: Central air conditioning</li>
<li><strong>Electrical</strong>: Electrical system</li>
<li><strong>1stFlrSF</strong>: First Floor square feet</li>
<li><strong>2ndFlrSF</strong>: Second floor square feet</li>
<li><strong>LowQualFinSF</strong>: Low quality finished square feet (all floors)</li>
<li><strong>GrLivArea</strong>: Above grade (ground) living area square feet</li>
<li><strong>BsmtFullBath</strong>: Basement full bathrooms</li>
<li><strong>BsmtHalfBath</strong>: Basement half bathrooms</li>
<li><strong>FullBath</strong>: Full bathrooms above grade</li>
<li><strong>HalfBath</strong>: Half baths above grade</li>
<li><strong>Bedroom</strong>: Number of bedrooms above basement level</li>
<li><strong>Kitchen</strong>: Number of kitchens</li>
<li><strong>KitchenQual</strong>: Kitchen quality</li>
<li><strong>TotRmsAbvGrd</strong>: Total rooms above grade (does not include bathrooms)</li>
<li><strong>Functional</strong>: Home functionality rating</li>
<li><strong>Fireplaces</strong>: Number of fireplaces</li>
<li><strong>FireplaceQu</strong>: Fireplace quality</li>
<li><strong>GarageType</strong>: Garage location</li>
<li><strong>GarageYrBlt</strong>: Year garage was built</li>
<li><strong>GarageFinish</strong>: Interior finish of the garage</li>
<li><strong>GarageCars</strong>: Size of garage in car capacity</li>
<li><strong>GarageArea</strong>: Size of garage in square feet</li>
<li><strong>GarageQual</strong>: Garage quality</li>
<li><strong>GarageCond</strong>: Garage condition</li>
<li><strong>PavedDrive</strong>: Paved driveway</li>
<li><strong>WoodDeckSF</strong>: Wood deck area in square feet</li>
<li><strong>OpenPorchSF</strong>: Open porch area in square feet</li>
<li><strong>EnclosedPorch</strong>: Enclosed porch area in square feet</li>
<li><strong>3SsnPorch</strong>: Three season porch area in square feet</li>
<li><strong>ScreenPorch</strong>: Screen porch area in square feet</li>
<li><strong>PoolArea</strong>: Pool area in square feet</li>
<li><strong>PoolQC</strong>: Pool quality</li>
<li><strong>Fence</strong>: Fence quality</li>
<li><strong>MiscFeature</strong>: Miscellaneous feature not covered in other categories</li>
<li><strong>MiscVal</strong>: $Value of miscellaneous feature</li>
<li><strong>MoSold</strong>: Month Sold</li>
<li><strong>YrSold</strong>: Year Sold</li>
<li><strong>SaleType</strong>: Type of sale</li>
<li><strong>SaleCondition</strong>: Condition of sale</li>
</ul>

# Feature Selection

### Removing unnecessary columns

In [8]:
# Dropping the unnecessary columns.
# 'Id' is dropped from train_data only; test_data keeps it
# (presumably because the submission file is keyed by Id -- confirm).
removeCols = ['Id','PoolQC','MiscFeature','Alley','Fence']
testRemoveCols = [name for name in removeCols if name != 'Id']

# errors="ignore" makes this cell safe to re-run: columns that were already
# dropped are skipped instead of raising KeyError.
train_data = train_data.drop(columns=removeCols, errors="ignore")
test_data = test_data.drop(columns=testRemoveCols, errors="ignore")

for col in removeCols:
    print(col," removed")

Id  removed
PoolQC  removed
MiscFeature  removed
Alley  removed
Fence  removed


# Handling Null Values

In [9]:
def missing_data(data):
    """Summarise missing values and dtypes for every column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Transposed summary: one column per column of ``data`` and three rows,
        'Total' (number of nulls), 'Percent(%)' (nulls as a percentage of the
        row count) and 'Types' (dtype name as a string).
    """
    # Build the null mask once and derive both statistics from it.
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    null_percent = null_counts / null_mask.count() * 100

    summary = pd.concat(
        [null_counts, null_percent], axis=1, keys=['Total', 'Percent(%)']
    )

    # dtype name of each column, in column order.
    summary['Types'] = [str(data[name].dtype) for name in data.columns]

    return summary.T


In [10]:
# Per-column null count, null percentage and dtype for the training set
# (the summary is transposed: one column per feature).
missing_data(train_data)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Total,0,0,259,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Percent(%),0,0,17.7397,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Types,int64,object,float64,int64,object,object,object,object,object,object,...,int64,int64,int64,int64,int64,int64,int64,object,object,int64


In [11]:
# Per-column null count, null percentage and dtype for the test set
# (the summary is transposed: one column per feature).
missing_data(test_data)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Total,0,0,4,227,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,1,0
Percent(%),0,0,0.27416,15.5586,0,0,0,0,0.13708,0,...,0,0,0,0,0,0,0,0,0.0685401,0
Types,int64,int64,object,float64,int64,object,object,object,object,object,...,int64,int64,int64,int64,int64,int64,int64,int64,object,object


In [12]:
# https://www.kaggle.com/code/serigne/stacked-regressions-top-4-on-leaderboard
# Percentage of missing values per train_data column, limited to columns that
# have at least one missing value, highest ratio first.
all_data = train_data
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False)
# Stored under its own name: binding the result to `missing_data` would replace
# the missing_data() function defined earlier and break any later call to it.
train_missing_ratio = pd.DataFrame({'Missing Ratio': all_data_na})
print(train_missing_ratio.to_string())

              Missing Ratio
FireplaceQu       47.260274
LotFrontage       17.739726
GarageCond         5.547945
GarageQual         5.547945
GarageFinish       5.547945
GarageYrBlt        5.547945
GarageType         5.547945
BsmtFinType2       2.602740
BsmtExposure       2.602740
BsmtFinType1       2.534247
BsmtCond           2.534247
BsmtQual           2.534247
MasVnrArea         0.547945
MasVnrType         0.547945
Electrical         0.068493


In [13]:
# Percentage of missing values per test_data column, limited to columns that
# have at least one missing value, highest ratio first.
all_data = test_data
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False)
# Stored under its own name: binding the result to `missing_data` would replace
# the missing_data() function defined earlier and break any later call to it.
test_missing_ratio = pd.DataFrame({'Missing Ratio': all_data_na})
print(test_missing_ratio.to_string())

              Missing Ratio
FireplaceQu       50.034270
LotFrontage       15.558602
GarageCond         5.346127
GarageQual         5.346127
GarageFinish       5.346127
GarageYrBlt        5.346127
GarageType         5.209047
BsmtCond           3.084304
BsmtExposure       3.015764
BsmtQual           3.015764
BsmtFinType1       2.878684
BsmtFinType2       2.878684
MasVnrType         1.096642
MasVnrArea         1.028101
MSZoning           0.274160
BsmtFullBath       0.137080
BsmtHalfBath       0.137080
Functional         0.137080
Utilities          0.137080
Exterior2nd        0.068540
Exterior1st        0.068540
SaleType           0.068540
BsmtFinSF1         0.068540
BsmtFinSF2         0.068540
TotalBsmtSF        0.068540
KitchenQual        0.068540
GarageCars         0.068540
GarageArea         0.068540
BsmtUnfSF          0.068540


In [14]:
# Frequency of each BsmtExposure category in the training set.
# value_counts() skips NaN by default, so missing entries are not listed.
train_data['BsmtExposure'].value_counts()

No    953
Av    221
Gd    134
Mn    114
Name: BsmtExposure, dtype: int64

In [15]:
# Taking care of the missing data.
# NOTE: only train_data is imputed in this cell; test_data is not touched here.

# Categorical columns: a missing value becomes the literal category "None".
none_fill_cols = [
    "FireplaceQu", "GarageCond", "GarageQual", "GarageFinish", "GarageType",
    "BsmtCond", "BsmtExposure", "BsmtQual", "BsmtFinType1", "BsmtFinType2",
    "MasVnrType", "MSZoning", "Functional", "Utilities", "Exterior2nd",
    "Exterior1st", "SaleType", "KitchenQual",
]

# Numeric columns: a missing value becomes 0.
# GarageYrBlt is a year column, so it is filled with 0 rather than the string
# "None"; mixing strings into it would turn the whole column into object dtype.
zero_fill_cols = [
    "GarageYrBlt", "MasVnrArea", "BsmtFullBath", "BsmtHalfBath", "BsmtFinSF1",
    "BsmtFinSF2", "TotalBsmtSF", "GarageCars", "GarageArea", "BsmtUnfSF",
]

# Assign the filled columns back explicitly. `df[col].fillna(..., inplace=True)`
# mutates a temporary selection and is not guaranteed to update the frame
# (it is a no-op under pandas Copy-on-Write).
train_data[none_fill_cols] = train_data[none_fill_cols].fillna("None")
train_data[zero_fill_cols] = train_data[zero_fill_cols].fillna(0)

# LotFrontage: fill with the column mean; Electrical: fill with the most
# frequent value.
train_data["LotFrontage"] = train_data["LotFrontage"].fillna(train_data["LotFrontage"].mean())
train_data["Electrical"] = train_data["Electrical"].fillna(train_data["Electrical"].mode()[0])

In [16]:
# https://www.kaggle.com/code/serigne/stacked-regressions-top-4-on-leaderboard
# Re-check train_data after the imputation step: percentage of missing values
# per column, limited to columns that still have missing values.
all_data = train_data
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False)
# Stored under its own name: binding the result to `missing_data` would replace
# the missing_data() function defined earlier and break any later call to it.
train_missing_ratio = pd.DataFrame({'Missing Ratio': all_data_na})
print(train_missing_ratio.to_string())

Empty DataFrame
Columns: [Missing Ratio]
Index: []


In [17]:
# Print the value distribution of selected train_data columns, one block per
# column, each followed by a dashed divider.
inspect_cols = [
    "Electrical", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "MasVnrType",
    "MasVnrArea", "MSZoning", "BsmtFullBath", "BsmtHalfBath", "Functional",
    "Utilities", "Exterior2nd", "Exterior1st", "SaleType", "BsmtFinSF1",
    "BsmtFinSF2", "TotalBsmtSF", "KitchenQual", "GarageCars", "GarageArea",
    "BsmtUnfSF",
]
divider = "-"*25

for col in inspect_cols:
    counts = train_data[col].value_counts()
    print(col)
    print(counts)
    print(divider)

Electrical
SBrkr    1335
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: Electrical, dtype: int64
-------------------------
BsmtQual
TA      649
Gd      618
Ex      121
None     37
Fa       35
Name: BsmtQual, dtype: int64
-------------------------
BsmtFinType1
Unf     430
GLQ     418
ALQ     220
BLQ     148
Rec     133
LwQ      74
None     37
Name: BsmtFinType1, dtype: int64
-------------------------
BsmtFinType2
Unf     1256
Rec       54
LwQ       46
None      38
BLQ       33
ALQ       19
GLQ       14
Name: BsmtFinType2, dtype: int64
-------------------------
MasVnrType
None       872
BrkFace    445
Stone      128
BrkCmn      15
Name: MasVnrType, dtype: int64
-------------------------
MasVnrArea
0.0      869
72.0       8
180.0      8
108.0      8
120.0      7
        ... 
651.0      1
337.0      1
415.0      1
293.0      1
621.0      1
Name: MasVnrArea, Length: 327, dtype: int64
-------------------------
MSZoning
RL         1151
RM          218
FV           65
RH        

In [18]:
# Percentage of missing values per test_data column, limited to columns that
# have at least one missing value, highest ratio first.
all_data = test_data
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False)
# Stored under its own name: binding the result to `missing_data` would replace
# the missing_data() function defined earlier and break any later call to it.
test_missing_ratio = pd.DataFrame({'Missing Ratio': all_data_na})
print(test_missing_ratio.to_string())

              Missing Ratio
FireplaceQu       50.034270
LotFrontage       15.558602
GarageCond         5.346127
GarageQual         5.346127
GarageFinish       5.346127
GarageYrBlt        5.346127
GarageType         5.209047
BsmtCond           3.084304
BsmtExposure       3.015764
BsmtQual           3.015764
BsmtFinType1       2.878684
BsmtFinType2       2.878684
MasVnrType         1.096642
MasVnrArea         1.028101
MSZoning           0.274160
BsmtFullBath       0.137080
BsmtHalfBath       0.137080
Functional         0.137080
Utilities          0.137080
Exterior2nd        0.068540
Exterior1st        0.068540
SaleType           0.068540
BsmtFinSF1         0.068540
BsmtFinSF2         0.068540
TotalBsmtSF        0.068540
KitchenQual        0.068540
GarageCars         0.068540
GarageArea         0.068540
BsmtUnfSF          0.068540


# Feature Transformation

# X y Train Test Split

# Feature Scaling

# Model Training

# Score

# Ensemble Top 3 Models

# Submission

# Archive

https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/code?competitionId=5407&sortBy=voteCount
https://www.kaggle.com/code/joymukerjee010/joymukerjee-housingprediction
https://www.kaggle.com/code/pmarcelino/comprehensive-data-exploration-with-python
https://www.kaggle.com/code/serigne/stacked-regressions-top-4-on-leaderboard
https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data