# House prices prediction
Goal: Dự đoán giá từng căn nhà


### Step 1: Import libraries and read data

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [40]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [41]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

### Step 2: Xử lý các giá trị 'Null'

In [42]:
nulls = train_data.isnull().sum().sort_values(ascending=False)
nulls.head(20)

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
GarageQual        81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
MasVnrType         8
Electrical         1
Id                 0
dtype: int64

Những giá trị có hơn 60% giá trị null nên loại bỏ. Vì chúng thật sự không ảnh hưởng đến giá nhà. Đồng thời bỏ luôn cột 'Id'.

In [43]:
train_data = train_data.drop(['Id','Alley','PoolQC','Fence','MiscFeature'],axis=1)

Xử lý giá trị 'FireplaceQu'

In [44]:
train_data[['Fireplaces','FireplaceQu']].head(10)

Unnamed: 0,Fireplaces,FireplaceQu
0,0,
1,1,TA
2,1,TA
3,1,Gd
4,1,TA
5,0,
6,1,Gd
7,2,TA
8,2,TA
9,2,TA


In [45]:
train_data[['Fireplaces']].value_counts()

Fireplaces
0             690
1             650
2             115
3               5
dtype: int64

In [46]:
train_data[['FireplaceQu']].isnull().sum()

FireplaceQu    690
dtype: int64

Những giá trị bằng 0 ở cột 'Fireplaces' thì 'FireplaceQu' mới bằng null. vì thế ta thay thế giá trị null bằnh 'NF' : noFireplace

In [47]:
train_data['FireplaceQu']=train_data['FireplaceQu'].fillna('NF')

Xử lí giá trị 'LotFrontage'

In [48]:
train_data['LotFrontage'] = train_data['LotFrontage'].fillna(value= train_data['LotFrontage'].mean())

Xử lý các giá trị liên quan đên GARAGE

In [49]:
train_data['GarageYrBlt'].isnull().sum()

81

In [50]:
train_data['GarageCond'].isnull().sum()

81

In [51]:
train_data['GarageType'].isnull().sum()

81

In [52]:
train_data['GarageFinish'].isnull().sum()

81

In [53]:
train_data['GarageQual'].isnull().sum()

81

In [54]:
train_data['GarageArea'].value_counts().head()

0      81
440    49
576    47
240    38
484    34
Name: GarageArea, dtype: int64

Dễ thấy các giá trị bằng 0 trong 'GarageArea' tương ứng với các giá trị null trong các cột liên quan. vì thế có thể thay các null bằng 'NG' : noGarage    

In [55]:
train_data['GarageYrBlt']=train_data['GarageYrBlt'].fillna('NG')
train_data['GarageCond']=train_data['GarageCond'].fillna('NG')
train_data['GarageType']=train_data['GarageType'].fillna('NG')
train_data['GarageFinish']=train_data['GarageFinish'].fillna('NG')
train_data['GarageQual']=train_data['GarageQual'].fillna('NG')

Xử lý tương tự với Bmst, MasVnr và Electrical

In [56]:
train_data['BsmtExposure']=train_data['BsmtExposure'].fillna('NB')
train_data['BsmtFinType2']=train_data['BsmtFinType2'].fillna('NB')
train_data['BsmtQual']=train_data['BsmtQual'].fillna('NB')
train_data['BsmtCond']=train_data['BsmtCond'].fillna('NB')
train_data['BsmtFinType1']=train_data['BsmtFinType1'].fillna('NB')

In [57]:
train_data['MasVnrArea']=train_data['MasVnrArea'].fillna(value= train_data['MasVnrArea'].mean())
train_data['MasVnrType']=train_data['MasVnrType'].fillna('None')

In [58]:
train_data['Electrical'] = train_data.Electrical.fillna('Sbrkr')

In [59]:
train_data.isnull().sum().sum()

0

CÁC GIÁ TRỊ NULL ĐÃ ĐƯỢC XỬ LÝ!


### Step 3: Ngoại lệ

In [60]:
num_train = train_data._get_numeric_data()

In [61]:
num_train.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [62]:
train_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [63]:
train_data.columns[train_data.isna().any()]

Index([], dtype='object')

In [64]:
train_data.shape

(1460, 76)

In [65]:
from sklearn.preprocessing import StandardScaler

In [66]:
train_d = pd.get_dummies(train_data)
train_d1 = train_d.drop(['SalePrice'],axis = 1)
y = train_d.SalePrice

In [67]:
scaler = StandardScaler()
scaler.fit(train_d1)                
t_train = scaler.transform(train_d1)

In [68]:
from sklearn.decomposition import PCA

In [69]:
pca_hp = PCA(30)
x_fit = pca_hp.fit_transform(t_train)

In [70]:
x_fit.shape

(1460, 30)

In [71]:
np.exp(pca_hp.explained_variance_ratio_)

array([1.05153739, 1.02424121, 1.01989567, 1.0183023 , 1.01683723,
       1.01500221, 1.01245813, 1.012197  , 1.00985812, 1.00966757,
       1.00936888, 1.00864552, 1.00821282, 1.00807378, 1.00789143,
       1.00767939, 1.00745021, 1.00727362, 1.00721682, 1.00695803,
       1.00680092, 1.00677842, 1.00657153, 1.00646554, 1.00626333,
       1.00622673, 1.00620978, 1.00610118, 1.00594264, 1.00580408])

In [72]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_fit,y,test_size=0.30,random_state=100) 

In [73]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(1022, 30)
(1022,)
(438, 30)
(438,)


### Step 4: Huấn luyện bằng Random Forest 

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Define a random forest model
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(x_train, y_train)

In [None]:
rf_val_predictions = rf_model.predict(x_test)

In [None]:
result = pd.DataFrame({'y': y_test, 'y_predic': rf_val_predictions})

In [None]:
result.to_csv('submission.csv',index=False)