In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [103]:
# read the data and store data in DataFrame titled house_data
house_data = pd.read_csv('/kaggle/input/home-data-for-ml-course/train.csv')
test_data = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')

#print a summary of the data in Melbourne data
print(house_data.shape)
print(house_data.info())

#does not need Id column either
house_data = house_data.drop('Id',axis=1)
test_id = test_data['Id']
test_data = test_data.drop('Id',axis=1)

print(house_data.shape, test_data.shape)
house_data.head()

(1460, 81)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 1

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


# Handling missing values

In [93]:
#handling missing values
#drop the columns with missing values exceeding 30%
missing_threshold = 0.3
house_data = house_data.dropna(thresh=missing_threshold*house_data.shape[0],axis=1)
print(house_data.shape)
#select the columns in the test data
test_data= test_data[house_data.columns.drop('SalePrice')]
print(test_data.shape)

(1460, 76)
(1459, 75)


In [94]:
#fill missing values with mean or mode
for col in house_data.columns:
    if col == 'SalePrice': #skip the target column
        continue
    if house_data[col].dtype == 'object':
        house_data[col].fillna(house_data[col].mode()[0],inplace=True)
        test_data[col].fillna(test_data[col].mode()[0],inplace=True)
    else:
        house_data[col].fillna(house_data[col].mean(),inplace=True)
        if col in test_data.columns: #only fill in when the column exist in the test data
            test_data[col].fillna(test_data[col].mean(),inplace=True)
print(house_data.shape,test_data.shape)

(1460, 76) (1459, 75)


# Encode the categorical columns

In [97]:
full_data = pd.concat([house_data,test_data],axis=0)
#encode
full_data = pd.get_dummies(full_data,drop_first=True)

print(full_data.shape)

house_data_new = full_data.iloc[:1460,]
test_data_new = full_data.iloc[1460:,].drop('SalePrice',axis=1)
print(house_data_new.shape,test_data_new.shape)

(2919, 237)
(1460, 237) (1459, 236)


In [98]:
#Selecting The Prediction Target
y=house_data_new.SalePrice
X=house_data_new.drop('SalePrice',axis=1) 
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,...,0,0,0,0,1,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,...,0,0,0,0,1,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,...,0,0,0,0,1,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,...,0,0,0,0,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,...,0,0,0,0,1,0,0,0,1,0


In [100]:
# Train-test split

from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Building the Model
## decision tree

In [101]:
###   Building the Model
# decision tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Define model. Specify a number for random_state to ensure same results each run
house_model = DecisionTreeRegressor(random_state=1)

# Fit model
house_model.fit(X_train, y_train)

# Predict valid data
predict_valid_price = house_model.predict(X_valid)

#
mean_absolute_error(y_valid, predict_valid_price)

26536.034246575342

In [102]:
#Predict test data
predict_test_house_price = house_model.predict(test_data_new)

In [106]:
#save the results
submission = pd.DataFrame({"Id": test_id, "SalePrice": predict_test_house_price})
submission.set_index("Id", inplace=True)
# Convert the results to a CSV file
submission.to_csv('HousingPricePred_submission.csv')