In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.pipeline import Pipeline
%matplotlib inline

In [2]:
# Load the California housing dataset. Despite the name, `df` is the dataset
# container returned by scikit-learn (not a DataFrame); the cells below read its
# `.data`, `.target`, `.feature_names` and `.target_names` attributes.
df = fetch_california_housing()

In [3]:
# Feature table: the raw feature matrix labelled with the dataset's feature names.
X = pd.DataFrame(
    data=df.data,
    columns=df.feature_names,
)

In [5]:
# Show the names of the feature columns.
X.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude'],
      dtype='object')

In [8]:
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [10]:
# Target table: a single-column DataFrame named after the dataset's target.
y = pd.DataFrame(
    data=df.target,
    columns=df.target_names,
)

In [11]:
# Preview the first rows of the target column.
y.head()

Unnamed: 0,MedHouseVal
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


In [211]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [212]:
# Feature scaler; it is fitted on the training split in the next cell.
ss = StandardScaler()

In [213]:
# Standardise the features using statistics learned from the training rows only.
#
# The raw rows are re-selected from `X` through the index that the split
# preserved on `y_train` / `y_test` (those frames are never overwritten), so
# re-running this cell always fits `ss` on unscaled data. Fitting on the
# previously scaled `X_train` would silently turn `ss` into a near-identity
# transform and corrupt the scaler that is pickled later.
X_train = ss.fit_transform(X.loc[y_train.index])
X_test = ss.transform(X.loc[y_test.index])

In [214]:
# Baseline model: ordinary least-squares linear regression with default settings.
lr = LinearRegression()

In [215]:
# Fit the baseline on the standardised training features.
lr.fit(X_train, y_train)

In [217]:
# Baseline predictions for the held-out test rows.
y_pred = lr.predict(X_test)

In [218]:
# Mean squared error of the baseline on the test set.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [219]:
# Coefficient of determination of the baseline on the test set.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [220]:
# Report both baseline metrics, one per line.
print(f'MSE: {mse}', f'R2: {r2}', sep='\n')

MSE: 0.5558915986952442
R2: 0.575787706032451


In [221]:
# First gradient-boosted model: squared-error objective with a small fixed
# ensemble of 10 trees. `random_state` is the documented seed parameter of the
# scikit-learn wrapper; the legacy `seed` keyword is only forwarded through
# **kwargs, so the documented name is used instead.
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=10, random_state=42)

In [222]:
# Train the boosted model on the standardised training features.
xgb_reg.fit(X_train, y_train)

In [223]:
# Test-set predictions from the boosted model; this overwrites the baseline's `y_pred`.
y_pred = xgb_reg.predict(X_test)

In [224]:
# Test-set error and goodness of fit for the untuned boosted model.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [225]:
# Report both metrics for the untuned boosted model, one per line.
print(f'MSE: {mse}', f'R2: {r2}', sep='\n')

MSE: 0.3007887797646296
R2: 0.770461905588887


In [228]:
# First tuning pass: sample 10 (n_estimators, max_depth) combinations and score
# each with 5-fold cross-validation on negative MSE, using all CPU cores.
random_search = RandomizedSearchCV(
    estimator=xgb_reg,
    param_distributions={
        'n_estimators': np.arange(10, 100, 10),  # 10, 20, ..., 90
        'max_depth': np.arange(3, 10),           # 3, 4, ..., 9
    },
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

In [229]:
# Run the search on the training split: 10 candidates x 5 folds = 50 fits.
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [230]:
# Test-set predictions from the fitted search object; overwrites the previous `y_pred`.
y_pred = random_search.predict(X_test)

In [231]:
# Test-set error and goodness of fit after the first tuning pass.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [232]:
# Report both metrics for the first tuned model, one per line.
print(f'MSE: {mse}', f'R2: {r2}', sep='\n')

MSE: 0.22377009808156403
R2: 0.8292364431278896


In [234]:
# Hyper-parameter combination selected by the first search.
random_search.best_params_

{'n_estimators': 80, 'max_depth': 5}

In [237]:
# Second tuning pass over a wider space that also includes the learning rate:
# 30 sampled combinations, 5-fold cross-validation, scored on negative MSE.
# This rebinds `random_search`, replacing the first search object.
random_search = RandomizedSearchCV(
    estimator=xgb_reg,
    param_distributions={
        'n_estimators': np.arange(1, 200, 5),         # 1, 6, ..., 196
        'max_depth': np.arange(5, 15),                # 5, 6, ..., 14
        'learning_rate': np.arange(0.001, 1, 0.1),    # ~0.001, 0.101, ..., 0.901
    },
    n_iter=30,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

In [238]:
# Run the wider search on the training split: 30 candidates x 5 folds = 150 fits.
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [239]:
# Test-set predictions from the second search; overwrites the previous `y_pred`.
y_pred = random_search.predict(X_test)

In [240]:
# Test-set error and goodness of fit after the second tuning pass.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [241]:
# Report both metrics for the final tuned model, one per line.
print(f'MSE: {mse}', f'R2: {r2}', sep='\n')

MSE: 0.21647689842996257
R2: 0.8348020335448538


In [243]:
# Persist the fitted RandomizedSearchCV object (the whole search, not only the
# winning estimator). It was trained on features already standardised by `ss`,
# so inputs must be scaled the same way before calling `predict` on the
# reloaded object.
joblib.dump(random_search, 'california_housing_xgb.pkl')

['california_housing_xgb.pkl']

In [244]:
# Persist the fitted scaler so the same standardisation can be applied at inference time.
joblib.dump(ss, 'california_housing_ss.pkl')

['california_housing_ss.pkl']

In [247]:
# Bundle the already-fitted scaler and the fitted search into one object so a
# single artifact carries both steps; nothing is refitted here.
# NOTE(review): this stores the full search object as the model step; consider
# whether only the selected estimator is needed in the saved artifact.
pipe = Pipeline(steps=[('scaler', ss), ('model', random_search)])
joblib.dump(pipe, 'california_housing_pipe.pkl')

['california_housing_pipe.pkl']