In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Location of the housing CSV in the alexeygrigorev/datasets GitHub repository.
url = (
    "https://raw.githubusercontent.com/"
    "alexeygrigorev/datasets/master/housing.csv"
)
# Read the remote CSV straight into a DataFrame.
data = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Show the raw frame as the cell output (the recorded output below is truncated in the middle).
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


# **Preparing the dataset**

In [4]:
# Integer code for each ocean_proximity category that is kept:
# the position in this tuple is the code ('<1H OCEAN' -> 0, 'INLAND' -> 1).
category_mapping = {
    label: code for code, label in enumerate(('<1H OCEAN', 'INLAND'))
}

In [5]:
# Keep only the rows whose ocean_proximity is one of the mapped categories
# ('<1H OCEAN' / 'INLAND'). The explicit .copy() makes `df` an independent
# frame, so the column assignments below never target a slice of `data`.
keep_rows = data['ocean_proximity'].isin(list(category_mapping))
df = data.loc[keep_rows].copy()

# Replace the category labels with their integer codes. The whole column is
# replaced and cast to int so it ends up with a numeric dtype rather than
# an object column that merely holds integers.
df['ocean_proximity'] = df['ocean_proximity'].map(category_mapping).astype(int)

# Missing values are filled with 0 before the target is transformed.
df = df.fillna(0)

# Log-transform the target (vectorised; no per-row lambda needed).
df['median_house_value'] = np.log(df['median_house_value'])

# Renumber rows 0..n-1 after filtering.
df = df.reset_index(drop=True)

In [6]:
# Show the prepared frame: filtered rows, encoded ocean_proximity, log-transformed median_house_value.
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-121.97,37.64,32.0,1283.0,194.0,485.0,171.0,6.0574,12.973863,0
1,-121.99,37.61,9.0,3666.0,711.0,2341.0,703.0,4.6458,12.287653,0
2,-121.97,37.57,21.0,4342.0,783.0,2172.0,789.0,4.6146,12.419570,0
3,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,12.554967,0
4,-121.98,37.58,20.0,4126.0,1031.0,2079.0,975.0,3.6832,12.287192,0
...,...,...,...,...,...,...,...,...,...,...
15682,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,11.265745,1
15683,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,11.252859,1
15684,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,11.432799,1
15685,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,11.346871,1


In [7]:
# Separate the (log-transformed) target from the feature columns.
y = df['median_house_value']
X = df.drop(columns=['median_house_value'])

# First split: 60% train, 40% held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.4,
    random_state=1,
)
# Second split: the held-out 40% is halved into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=1,
)

In [8]:
# Report the feature-matrix and target shapes of each split.
for split_label, split_X, split_y in (
    ("Train", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(f"{split_label} set shape:", split_X.shape, split_y.shape)

Train set shape: (9412, 9) (9412,)
Validation set shape: (3137, 9) (3137,)
Test set shape: (3138, 9) (3138,)


# **Question 1**

In [9]:
# Depth-1 tree, i.e. a single split. random_state is fixed (1, like the other
# models in this notebook) because scikit-learn permutes the candidate
# features at each split, so a tie between equally good splits could
# otherwise resolve differently from run to run.
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train)

In [10]:
# tree_.feature holds, per node, the column position used for that node's split;
# entry 0 is read here as the root of the depth-1 tree fitted above.
ROOT_NODE = 0
splitting_feature_index = dt.tree_.feature[ROOT_NODE]

# Translate the column position back into its name in the training frame.
feature_name = X_train.columns[splitting_feature_index]

print(f"The feature used for splitting the data is: {feature_name}")

The feature used for splitting the data is: ocean_proximity


# **Question 2**

In [11]:
# Baseline forest: 10 trees, fixed seed, all CPU cores.
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1,
)

rf.fit(X_train, y_train)

In [12]:
# Score the baseline forest on the validation split.
y_pred = rf.predict(X_val)

# RMSE is the square root of the mean squared error.
val_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(val_mse)

print("RMSE on validation set:", round(rmse, 3))

RMSE on validation set: 0.236


# **Question 3**

In [13]:
# Sweep the forest size from 10 to 200 trees in steps of 10 and stop at the
# first size whose validation RMSE is worse than the best seen so far.
n_estimators_values = list(range(10, 201, 10))

best_rmse = float('inf')
best_n_estimators = None

for n_estimators in n_estimators_values:
    # n_jobs=-1 only parallelises training (as in the other forest cells);
    # the seed is what fixes the fitted model.
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=1, n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    # First deterioration ends the sweep; the previous best is the answer.
    if rmse > best_rmse:
        break

    # Strict improvement only, so on a tie the smaller forest is kept.
    if rmse < best_rmse:
        best_rmse = rmse
        best_n_estimators = n_estimators

# best_n_estimators is an int, so it is printed as-is (no rounding needed).
print("After which value of n_estimators does RMSE stop improving:", best_n_estimators)

After which value of n_estimators does RMSE stop improving: 50


# **Question 4**

In [14]:
# For each max_depth, train forests of 10..200 trees (step 10), average their
# validation RMSE, and select the depth with the lowest *mean* RMSE.
max_depth_values = [10, 15, 20, 25]
n_estimators_values = list(range(10, 201, 10))

# max_depth -> mean validation RMSE over all n_estimators values
mean_rmse_by_depth = {}

for max_depth in max_depth_values:
    rmse_scores = []
    for n_estimators in n_estimators_values:
        model = RandomForestRegressor(
            max_depth=max_depth,
            n_estimators=n_estimators,
            random_state=1,
            n_jobs=-1,  # parallel training, as in the other forest cells
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        rmse_scores.append(np.sqrt(mean_squared_error(y_val, y_pred)))

    mean_rmse_by_depth[max_depth] = np.mean(rmse_scores)

# min() returns the first minimal key, so a tie goes to the shallower depth.
best_max_depth = min(mean_rmse_by_depth, key=mean_rmse_by_depth.get)
best_mean_rmse = mean_rmse_by_depth[best_max_depth]

print("The best max_depth using mean RMSE:", best_max_depth)

The best max_depth using mean RMSE: 25


# **Question 5**

In [15]:
# Refit a 10-tree forest capped at depth 20 and rank the features by importance.
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

# One importance score per training column, in column order.
feature_importances = rf.feature_importances_
feature_importance_dict = {
    column: score
    for column, score in zip(X_train.columns, feature_importances)
}

# Highest importance first.
sorted_feature_importances = sorted(
    feature_importance_dict.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

for feature, importance in sorted_feature_importances:
    print(f"Feature: {feature}, Importance: {importance}")

Feature: ocean_proximity, Importance: 0.35942297023963965
Feature: median_income, Importance: 0.33771113831677546
Feature: latitude, Importance: 0.096856612085106
Feature: longitude, Importance: 0.0847295257027977
Feature: housing_median_age, Importance: 0.03273270952051309
Feature: population, Importance: 0.029635542270991723
Feature: total_rooms, Importance: 0.02329984139160675
Feature: households, Importance: 0.018283167956269268
Feature: total_bedrooms, Importance: 0.017328492516300434


# **Question 6**

In [16]:
!pip install xgboost



In [17]:
import xgboost as xgb

In [18]:
# Wrap the train and validation splits in DMatrix objects for xgb.train.
dtrain, dval = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)
# Datasets to evaluate during training, each paired with its display name.
watchlist = [(dtrain, 'train'), (dval, 'eval')]

In [19]:
# Booster settings shared by both runs; only eta is changed between them.
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

num_round = 100

# Train once per learning rate and record the validation RMSE of each run.
# eta=0.3 runs first and eta=0.1 second, so `xgb_params`, `bst` and `y_pred`
# end up holding the eta=0.1 state afterwards.
rmse_by_eta = {}
for eta in (0.3, 0.1):
    xgb_params['eta'] = eta
    bst = xgb.train(
        xgb_params,
        dtrain,
        num_round,
        watchlist,
        early_stopping_rounds=10,
        verbose_eval=False,
    )
    y_pred = bst.predict(dval)
    rmse_by_eta[eta] = np.sqrt(mean_squared_error(y_val, y_pred))

rmse_03 = rmse_by_eta[0.3]
rmse_01 = rmse_by_eta[0.1]

# Print the eta with the lower validation RMSE, or report a tie.
if rmse_03 < rmse_01:
    print("0.3")
elif rmse_03 > rmse_01:
    print("0.1")
else:
    print("Both give equal value")



0.3
