In [3]:
#Importing important libraries.
import os

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor

In [4]:
# Load the training CSV into a DataFrame.
# The location can be overridden with the TRAIN_CSV_PATH environment variable so
# the notebook is not tied to a single machine's directory layout; when the
# variable is unset, the original absolute path is used unchanged.
filename = os.environ.get(
    "TRAIN_CSV_PATH",
    "D:\\4th Computer Engineering\\Labs\\Machine Learning Lab\\train.csv",
)
data = pd.read_csv(filename)
data.head()   # Show first 5 rows of data.

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Summarise the raw data: column names, non-null counts and dtypes.
# The non-null counts show which columns need imputing or dropping.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

**Cleaning the data**

In [6]:
# Drop columns that are not used as features.
# errors="ignore" keeps this cell re-runnable: `data` is rebound here, so on a
# second run these columns are already gone and a plain drop would raise KeyError.
dropped_columns = ['Fence', 'MiscFeature', 'PoolQC', 'FireplaceQu', 'Alley']
data = data.drop(columns=dropped_columns, errors="ignore")

# NOTE(review): the means, modes and encoder categories below are computed on
# every row, i.e. before the train/test split done in the modelling cell, so
# held-out rows influence the imputed/encoded values. Consider fitting these on
# the training split only.

# Fill missing values in numeric columns with the column mean.
mean_columns = ['MasVnrArea', 'GarageYrBlt', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageArea', 'GarageCars',
                'LotFrontage']
for col in mean_columns:
    data[col] = data[col].fillna(data[col].mean())

# Fill missing values in categorical columns with the most frequent value.
# ('BsmtQual' was listed twice originally; the duplicate was a no-op.)
mode_columns = ['MasVnrType', 'BsmtQual', 'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd',
                'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical',
                'KitchenQual', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                'Functional', 'SaleType']
for col in mode_columns:
    data[col] = data[col].fillna(data[col].mode()[0])

# Split the features and the target.
# 'Id' is the first column and 'SalePrice' the last, so selecting by name gives
# the same columns as the positional slice [1:-1] / [-1] while not depending on
# column order. drop() returns a new frame, so the assignment into X below does
# not write into a slice of `data`.
X = data.drop(columns=['Id', 'SalePrice'])
y = data['SalePrice']

# Replace each categorical column with ordinal codes (one float code per category).
cat_col = ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
           'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
           'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
           'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
           'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
           'PavedDrive', 'SaleType', 'SaleCondition']
enc = OrdinalEncoder()
X[cat_col] = enc.fit_transform(data[cat_col])


In [7]:
# Preview the first rows of the feature matrix after cleaning and encoding.
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,3.0,65.0,8450,1.0,3.0,3.0,0.0,4.0,0.0,...,61,0,0,0,0,0,2,2008,8.0,4.0
1,20,3.0,80.0,9600,1.0,3.0,3.0,0.0,2.0,0.0,...,0,0,0,0,0,0,5,2007,8.0,4.0
2,60,3.0,68.0,11250,1.0,0.0,3.0,0.0,4.0,0.0,...,42,0,0,0,0,0,9,2008,8.0,4.0
3,70,3.0,60.0,9550,1.0,0.0,3.0,0.0,0.0,0.0,...,35,272,0,0,0,0,2,2006,8.0,0.0
4,60,3.0,84.0,14260,1.0,0.0,3.0,0.0,2.0,0.0,...,84,0,0,0,0,0,12,2008,8.0,4.0


In [8]:
# Sanity-check the cleaned features: every column should report the full
# non-null count and a numeric dtype.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   float64
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   float64
 5   LotShape       1460 non-null   float64
 6   LandContour    1460 non-null   float64
 7   Utilities      1460 non-null   float64
 8   LotConfig      1460 non-null   float64
 9   LandSlope      1460 non-null   float64
 10  Neighborhood   1460 non-null   float64
 11  Condition1     1460 non-null   float64
 12  Condition2     1460 non-null   float64
 13  BldgType       1460 non-null   float64
 14  HouseStyle     1460 non-null   float64
 15  OverallQual    1460 non-null   int64  
 16  OverallCond    1460 non-null   int64  
 17  YearBuilt      1460 non-null   int64  
 18  YearRemo

In [9]:
# Preview the first values of the target (SalePrice).
y.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

**Making the model**

In [10]:
# Hold out part of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Build the depth-limited decision tree and fit it on the training split.
regressor = DecisionTreeRegressor(max_depth=13, random_state=0).fit(X_train, y_train)

# Predict the held-out rows.
predictions = regressor.predict(X_test)

# Report the mean absolute error between held-out targets and predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f"Error = {mae}")

Error = 24878.112062055592


In [11]:
# Compare the first five predictions with the first five true targets.
# y_test keeps the shuffled integer row labels from the split, so a plain
# y_test[0:5] is ambiguous between positions and labels; .iloc states that
# the first five positions are wanted.
print(predictions[0:5])
print(y_test.iloc[0:5])

[237500.   146500.   131006.25 225500.   118750.  ]
529    200624
491    133000
459    110000
279    192000
655     88000
Name: SalePrice, dtype: int64


  print(y_test[0:5])
