In [37]:
import numpy as np 
import pandas as pd 
import statsmodels.api as sm 
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [38]:
# Read the training split from CSV, then print a per-column summary
# (dtype and non-null count) to spot columns with missing values.
df_train = pd.read_csv(filepath_or_buffer='housing_train.csv')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14447 entries, 0 to 14446
Data columns (total 10 columns):
longitude             14447 non-null float64
latitude              14447 non-null float64
housing_median_age    14447 non-null float64
total_rooms           14447 non-null float64
total_bedrooms        14287 non-null float64
population            14447 non-null float64
households            14447 non-null float64
median_income         14447 non-null float64
median_house_value    14447 non-null float64
ocean_proximity       14447 non-null object
dtypes: float64(9), object(1)
memory usage: 1.1+ MB


In [39]:
# Preview the first five rows of the training split.
df_train.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-117.83,34.15,20.0,2421.0,306.0,1023.0,298.0,8.0683,451500.0,INLAND
1,-119.85,36.77,27.0,1510.0,344.0,847.0,295.0,2.9315,83200.0,INLAND
2,-122.13,37.67,40.0,1748.0,318.0,914.0,317.0,3.8676,184000.0,NEAR BAY
3,-119.81,34.46,22.0,3488.0,452.0,1479.0,458.0,7.1687,384400.0,NEAR OCEAN
4,-120.47,34.65,32.0,2193.0,430.0,1074.0,377.0,2.3333,130200.0,NEAR OCEAN


In [40]:
# Read the test split from CSV, then print a per-column summary
# (dtype and non-null count).
df_test = pd.read_csv(filepath_or_buffer='housing_test.csv')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4128 entries, 0 to 4127
Data columns (total 10 columns):
longitude             4128 non-null float64
latitude              4128 non-null float64
housing_median_age    4128 non-null float64
total_rooms           4128 non-null float64
total_bedrooms        4098 non-null float64
population            4128 non-null float64
households            4128 non-null float64
median_income         4128 non-null float64
median_house_value    4128 non-null float64
ocean_proximity       4128 non-null object
dtypes: float64(9), object(1)
memory usage: 322.6+ KB


In [41]:
# Preview the first five rows of the test split.
df_test.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.41,37.65,32.0,3436.0,868.0,2583.0,817.0,3.5039,232400.0,NEAR OCEAN
1,-117.91,33.84,25.0,1021.0,252.0,975.0,258.0,3.125,168100.0,<1H OCEAN
2,-118.35,34.08,52.0,1801.0,313.0,714.0,293.0,4.6838,479000.0,<1H OCEAN
3,-121.02,37.61,33.0,1469.0,370.0,1318.0,349.0,1.7104,59000.0,INLAND
4,-117.65,34.1,30.0,1461.0,341.0,1014.0,345.0,2.4667,106000.0,INLAND


In [42]:
# Read the dev (validation) split from CSV, then print a per-column summary
# (dtype and non-null count).
df_dev = pd.read_csv(filepath_or_buffer='housing_dev.csv')
df_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2065 entries, 0 to 2064
Data columns (total 10 columns):
longitude             2065 non-null float64
latitude              2065 non-null float64
housing_median_age    2065 non-null float64
total_rooms           2065 non-null float64
total_bedrooms        2048 non-null float64
population            2065 non-null float64
households            2065 non-null float64
median_income         2065 non-null float64
median_house_value    2065 non-null float64
ocean_proximity       2065 non-null object
dtypes: float64(9), object(1)
memory usage: 161.5+ KB


In [43]:
# Preview the first five rows of the dev split.
df_dev.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-118.36,33.92,26.0,3695.0,1144.0,2308.0,1009.0,2.6667,229300.0,<1H OCEAN
1,-118.41,34.19,45.0,1106.0,225.0,595.0,228.0,3.6625,190700.0,<1H OCEAN
2,-122.48,37.73,38.0,3195.0,828.0,2410.0,778.0,3.1359,350000.0,NEAR OCEAN
3,-117.18,32.76,8.0,3694.0,997.0,1297.0,807.0,3.6492,158900.0,NEAR OCEAN
4,-117.15,33.14,15.0,1070.0,208.0,470.0,217.0,2.3062,158900.0,<1H OCEAN


In [44]:
# Training features: every column of df_train except the target column.
# drop() returns a new frame, so df_train itself is left unchanged.
train_X = df_train.drop(columns=["median_house_value"])
train_X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-117.83,34.15,20.0,2421.0,306.0,1023.0,298.0,8.0683,INLAND
1,-119.85,36.77,27.0,1510.0,344.0,847.0,295.0,2.9315,INLAND
2,-122.13,37.67,40.0,1748.0,318.0,914.0,317.0,3.8676,NEAR BAY
3,-119.81,34.46,22.0,3488.0,452.0,1479.0,458.0,7.1687,NEAR OCEAN
4,-120.47,34.65,32.0,2193.0,430.0,1074.0,377.0,2.3333,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
14442,-118.73,34.27,23.0,4550.0,762.0,2301.0,744.0,4.5560,<1H OCEAN
14443,-117.11,32.77,50.0,1729.0,355.0,617.0,337.0,3.6705,NEAR OCEAN
14444,-121.73,36.93,29.0,2931.0,535.0,1954.0,506.0,3.2917,<1H OCEAN
14445,-117.25,32.83,17.0,2075.0,262.0,704.0,241.0,10.9529,NEAR OCEAN


In [45]:
# Training target: an independent copy of the median_house_value column.
train_Y = df_train.loc[:, "median_house_value"].copy()
train_Y

0        451500.0
1         83200.0
2        184000.0
3        384400.0
4        130200.0
           ...   
14442    205300.0
14443    167000.0
14444    224700.0
14445    500001.0
14446    294000.0
Name: median_house_value, Length: 14447, dtype: float64

In [46]:
# Dev features: every column of df_dev except the target column.
# drop() returns a new frame, so df_dev itself is left unchanged.
dev_X = df_dev.drop(columns=["median_house_value"])
dev_X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-118.36,33.92,26.0,3695.0,1144.0,2308.0,1009.0,2.6667,<1H OCEAN
1,-118.41,34.19,45.0,1106.0,225.0,595.0,228.0,3.6625,<1H OCEAN
2,-122.48,37.73,38.0,3195.0,828.0,2410.0,778.0,3.1359,NEAR OCEAN
3,-117.18,32.76,8.0,3694.0,997.0,1297.0,807.0,3.6492,NEAR OCEAN
4,-117.15,33.14,15.0,1070.0,208.0,470.0,217.0,2.3062,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
2060,-122.02,38.37,16.0,2495.0,331.0,1118.0,338.0,6.4894,INLAND
2061,-117.04,32.97,13.0,6711.0,1256.0,3683.0,1220.0,4.5746,<1H OCEAN
2062,-120.69,35.62,43.0,3044.0,652.0,1456.0,608.0,2.4567,<1H OCEAN
2063,-118.39,34.19,36.0,904.0,191.0,627.0,191.0,2.4167,<1H OCEAN


In [47]:
# Dev target: an independent copy of the median_house_value column.
dev_Y = df_dev.loc[:, "median_house_value"].copy()
dev_Y

0       229300.0
1       190700.0
2       350000.0
3       158900.0
4       158900.0
          ...   
2060    198000.0
2061    175700.0
2062    140000.0
2063    192900.0
2064    248900.0
Name: median_house_value, Length: 2065, dtype: float64

In [48]:
# Test features: every column of df_test except the target column.
# drop() returns a new frame, so df_test itself is left unchanged.
test_X = df_test.drop(columns=["median_house_value"])
test_X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.41,37.65,32.0,3436.0,868.0,2583.0,817.0,3.5039,NEAR OCEAN
1,-117.91,33.84,25.0,1021.0,252.0,975.0,258.0,3.1250,<1H OCEAN
2,-118.35,34.08,52.0,1801.0,313.0,714.0,293.0,4.6838,<1H OCEAN
3,-121.02,37.61,33.0,1469.0,370.0,1318.0,349.0,1.7104,INLAND
4,-117.65,34.10,30.0,1461.0,341.0,1014.0,345.0,2.4667,INLAND
...,...,...,...,...,...,...,...,...,...
4123,-117.22,33.17,6.0,1487.0,362.0,810.0,322.0,3.6250,<1H OCEAN
4124,-117.40,34.58,18.0,755.0,169.0,483.0,165.0,1.4196,INLAND
4125,-118.04,33.78,25.0,3715.0,575.0,1640.0,572.0,5.7705,<1H OCEAN
4126,-118.46,34.00,52.0,888.0,206.0,376.0,194.0,3.8750,<1H OCEAN


In [49]:
# Test target: an independent copy of the median_house_value column.
test_Y = df_test.loc[:, "median_house_value"].copy()
test_Y

0       232400.0
1       168100.0
2       479000.0
3        59000.0
4       106000.0
          ...   
4123    135700.0
4124     64700.0
4125    247100.0
4126    372000.0
4127    153100.0
Name: median_house_value, Length: 4128, dtype: float64

In [50]:
# Element-wise missing-value mask for the training frame (True where a value is missing).
df_train.isna()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
14442,False,False,False,False,False,False,False,False,False,False
14443,False,False,False,False,False,False,False,False,False,False
14444,False,False,False,False,False,False,False,False,False,False
14445,False,False,False,False,False,False,False,False,False,False


In [51]:
# One boolean per row: True when at least one feature in that row is missing.
train_X.isna().any(axis="columns")

0        False
1        False
2        False
3        False
4        False
         ...  
14442    False
14443    False
14444    False
14445    False
14446    False
Length: 14447, dtype: bool

In [52]:
# Show only the training rows that contain at least one missing feature.
train_X.loc[train_X.isna().any(axis="columns")]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
41,-124.00,40.92,29.0,1429.0,,672.0,266.0,2.9485,<1H OCEAN
204,-120.97,37.43,27.0,1380.0,,810.0,262.0,2.1875,INLAND
403,-118.27,34.07,27.0,1190.0,,1795.0,422.0,1.7016,<1H OCEAN
413,-119.45,36.61,24.0,1302.0,,693.0,243.0,3.7917,INLAND
586,-121.30,38.58,16.0,1537.0,,1125.0,375.0,2.6471,INLAND
...,...,...,...,...,...,...,...,...,...
13865,-121.75,37.11,18.0,3167.0,,1414.0,482.0,6.8773,<1H OCEAN
13918,-118.10,33.74,32.0,2035.0,,934.0,512.0,4.2287,NEAR OCEAN
14004,-122.08,37.37,29.0,1229.0,,707.0,194.0,7.1108,NEAR BAY
14168,-119.38,36.53,38.0,1281.0,,1423.0,293.0,1.9602,INLAND


In [53]:
# Number of missing values in each feature column (sum() reduces over rows by default).
train_X.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        160
population              0
households              0
median_income           0
ocean_proximity         0
dtype: int64

# Xử lý các cột bị khuyết dữ liệu

In [54]:
# Fill the missing `total_bedrooms` values with the column mean using SimpleImputer.
#
# The null mask is taken from `df_train` rather than `train_X`: `train_X` was built
# from `df_train` by dropping only the target column, so both share the same row
# index, and `df_train` is not overwritten by this cell. Reading the mask from the
# frame that is about to be imputed would make it all-False on a re-run, so the
# check that follows would show nothing.
idex_null = df_train["total_bedrooms"].isnull()
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
# SimpleImputer needs 2-D input, so the column is selected as an (n, 1) array;
# the imputed result is flattened back to 1-D before being stored in the column.
# The fitted `imputer_mean` stays available for reuse on other splits.
train_X["total_bedrooms"] = imputer_mean.fit_transform(
    train_X[["total_bedrooms"]].to_numpy()
).ravel()

In [55]:
# Show the imputed values at the rows that were originally missing.
# `idex_null` is already a boolean mask, so it is used directly (no `== True`),
# and rows and column are selected in a single .loc lookup.
print(train_X.loc[idex_null, "total_bedrooms"])

41       538.66578
204      538.66578
403      538.66578
413      538.66578
586      538.66578
           ...    
13865    538.66578
13918    538.66578
14004    538.66578
14168    538.66578
14371    538.66578
Name: total_bedrooms, Length: 160, dtype: float64
