In [233]:
import pickle
import warnings

import pandas as pd
import xgboost as xgb
from sklearn.metrics import get_scorer
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
warnings.filterwarnings('ignore')


In [234]:
# Read the housing table from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='housing.csv')
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [235]:
# (rows, columns) of the loaded frame.
df.shape

(20640, 10)

In [236]:
# Per-column dtype and non-null count; a non-null count below the row total
# marks a column with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [237]:
from sklearn.impute import SimpleImputer

# Replace the missing total_bedrooms entries with that column's median.
# The imputer expects 2-D input, hence the double-bracket column selection.
bedroom_imputer = SimpleImputer(strategy='median')
df['total_bedrooms'] = bedroom_imputer.fit_transform(df[['total_bedrooms']])

# Remaining missing values per column (all zeros when imputation succeeded).
df.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [238]:
# One-hot encode the categorical ocean_proximity column. drop_first=True omits
# the indicator of the first category, which becomes the implicit baseline.
df = pd.get_dummies(data=df, columns=['ocean_proximity'], drop_first=True)


In [239]:
# Indicator columns produced by the one-hot encoding step.
cols = [
    'ocean_proximity_INLAND',
    'ocean_proximity_ISLAND',
    'ocean_proximity_NEAR BAY',
    'ocean_proximity_NEAR OCEAN',
]

# Store the indicators as plain integers (0/1) regardless of the dtype that
# get_dummies emitted for them.
dummy_ints = df[cols].astype(int)
df[cols] = dummy_ints

In [240]:

# Separate the regression target from the predictors; every other column of
# df is used as a feature.
X = df.drop('median_house_value', axis=1)
y = df['median_house_value']

# Hold out 20% of the rows. Nothing below is fitted on X_test / y_test, so the
# hold-out split stays an unbiased check of the tuned model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): this rebinds `xgb`, which the import cell bound to the xgboost
# module. The name is kept because a later cell also uses `xgb` for the
# estimator; consider renaming both together.
xgb = XGBRegressor(random_state=42)

# 3 x 2 x 3 = 18 candidate settings; with cv=5 that is 90 fits.
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10]
}

# Built-in negated-RMSE scorer (higher is better, as GridSearchCV expects).
# Used instead of make_scorer(mean_squared_error, ..., squared=False) because
# newer scikit-learn releases deprecated and then removed the `squared`
# keyword; with warnings silenced that failure would surface only as NaN
# scores for every candidate.
rmse_scorer = get_scorer('neg_root_mean_squared_error')
r2_scorer = make_scorer(r2_score)

# NOTE(review): a recorded run of this cell aborted with BrokenProcessPool
# under n_jobs=-1. If that recurs, rerun with n_jobs=1 to surface the
# underlying worker error.
grid = GridSearchCV(xgb, param_grid, scoring=rmse_scorer, cv=5, verbose=2, n_jobs=-1)

# Tune on the training split only: fitting on the full X / y would let the
# held-out rows influence hyperparameter selection and the refitted model.
grid.fit(X_train, y_train)
print('Best params:', grid.best_params_)


Fitting 5 folds for each of 18 candidates, totalling 90 fits


BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.

In [None]:
# Take the estimator refitted with the best grid-search parameters and score
# that configuration with 5-fold cross-validated R² over the full X / y.
# cross_val_score fits clones, so best_xgb itself is left as fitted by the grid.
best_xgb = grid.best_estimator_
r2_scores = cross_val_score(estimator=best_xgb, X=X, y=y, scoring='r2', cv=5)

# Report the mean R² across the five folds.
print(f'R²:{r2_scores.mean()}')

R²:0.441178901864118


In [None]:

# Fixed-configuration model: fitted on the training split and scored once on
# the held-out split. These hyperparameters are hard-coded here rather than
# taken from the grid search.
xgb = XGBRegressor(
    max_depth=3,
    learning_rate=0.01,
    n_estimators=100,
    random_state=42,
)
xgb.fit(X_train, y_train)

# Hold-out error: MSE in squared target units, RMSE in target units.
y_pred = xgb.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'XGBoost MSE: {mse:.2f}')
print(f'XGBoost RMSE: {rmse:.2f}')

# Persist the fitted model together with the feature column order it was
# trained on (presumably so a consumer can rebuild the same input layout).
# Only unpickle these files from a trusted location: pickle.load executes code.
feature_names = list(X.columns)
artifacts = (
    ('xgb_house_model.pkl', xgb),
    ('house_model_columns.pkl', feature_names),
)
for path, payload in artifacts:
    with open(path, 'wb') as f:
        pickle.dump(payload, f)


XGBoost MSE: 6458873792.42
XGBoost RMSE: 80367.12
