In [74]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [75]:
df_raw = pd.read_csv('./csv/housing.csv')

In [76]:
df_raw.shape

(20640, 10)

In [77]:
df_raw.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [78]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [79]:
df_filled = df_raw.copy()
df_filled['total_bedrooms'] = df_filled['total_bedrooms'].fillna(df_filled['total_bedrooms'].mean())
df_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [80]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
scaler = StandardScaler()

In [81]:
df_num_only = df_filled.drop(columns=['ocean_proximity'])
df_num_only.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [82]:
df_scl = scaler.fit_transform(df_num_only)
df_scl.shape

(20640, 9)

In [83]:
categorical_col = df_filled[['ocean_proximity']]

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

encoded_array = encoder.fit_transform(categorical_col)

encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(['ocean_proximity']))

encoded_df.index = df_filled.index

df = pd.concat([df_filled.drop('ocean_proximity', axis=1), encoded_df], axis=1)

In [84]:
df.head(1)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0.0,0.0,0.0,1.0,0.0


In [85]:
X = df.drop(columns=['median_house_value'])
y = df['median_house_value']
print(X.shape, y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

(20640, 13) (20640,)


In [86]:
from sklearn.tree import DecisionTreeRegressor
model_dtr = DecisionTreeRegressor(max_depth=5, min_samples_split=20, random_state=20)

In [87]:
from sklearn.metrics import r2_score

In [88]:
model_dtr.fit(X_train, y_train)

y_pred = model_dtr.predict(X_test)

r2 = r2_score(y_test, y_pred)
print(f"R2: {r2:.2f}")

R2: 0.61


In [89]:
model_dtr = DecisionTreeRegressor(max_depth=10, min_samples_split=20, random_state=20)

model_dtr.fit(X_train, y_train)

y_pred = model_dtr.predict(X_test)

r2 = r2_score(y_test, y_pred)
print(f"R2: {r2:.2f}")

R2: 0.72


In [90]:
from sklearn.linear_model import LinearRegression
model_lr = LinearRegression()

In [91]:
model_lr.fit(X_train, y_train)

y_pred = model_lr.predict(X_test)

r2 = r2_score(y_test, y_pred)
print(f"R2: {r2:.2f}")

R2: 0.63


In [92]:
model_lr.intercept_

np.float64(-2256620.7988544893)

In [93]:
model_lr.coef_

array([-2.68382734e+04, -2.54683520e+04,  1.10218508e+03, -6.02150567e+00,
        1.02789395e+02, -3.81729064e+01,  4.82527528e+01,  3.94739752e+04,
       -1.89265829e+04, -5.87132390e+04,  1.17198490e+05, -2.40632251e+04,
       -1.54954428e+04])

In [94]:
from sklearn.svm import SVR
model_svr = SVR()

In [95]:
model_svr.fit(X_train, y_train)

y_pred = model_svr.predict(X_test)

r2 = r2_score(y_test, y_pred)
print(f"R2: {r2:.2f}")

R2: -0.05


In [96]:
from sklearn.ensemble import RandomForestRegressor
model_rfr = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=20)

In [97]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
model_rfr.fit(X_train, y_train)
y_pred = model_rfr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse:.2f}")
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae:.2f}")
R2 = r2_score(y_test, y_pred)
print(f"R2: {R2:.2f}")

MSE: 2458357347.29
MAE: 32330.47
R2: 0.81


In [98]:
X_sin_lat_y_alt = X.drop(columns=['latitude', 'longitude'])
X_train, X_test, y_train, y_test = train_test_split(X_sin_lat_y_alt, y, test_size=0.2, random_state=42)


In [99]:
model_rfr.fit(X_train, y_train)
y_pred = model_rfr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse:.2f}")
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae:.2f}")
R2 = r2_score(y_test, y_pred)
print(f"R2: {R2:.6f}")

MSE: 4018058715.22
MAE: 43753.56
R2: 0.693374
