In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from xgboost import XGBClassifier , XGBRegressor

In [3]:
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.impute import SimpleImputer

In [4]:
from sklearn.pipeline import Pipeline

## Load Data

In [5]:
df = pd.read_excel('housing.xlsx')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  int64  
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB


In [7]:
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [8]:
df.duplicated().sum()

np.int64(0)

In [9]:
df.ocean_proximity.value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

## Binning continous Y variable('median_house_value') and using those bins we stratify split

In [10]:
y_labels = pd.cut(df['median_house_value'] , 5, labels=[1,2,3,4,5])
y_labels.value_counts()

median_house_value
2    7870
3    4568
1    4489
4    1991
5    1722
Name: count, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
train_df , test_df = train_test_split( df , test_size = 0.2, stratify=y_labels)

In [13]:
train_df.shape

(16512, 10)

In [14]:
test_df.shape

(4128, 10)

In [15]:
test_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
19695,-121.63,39.12,32,2574,425.0,1099,391,4.3864,117500,INLAND
7283,-118.25,33.98,37,1503,392.0,1886,401,2.5637,125000,<1H OCEAN
18264,-122.08,37.38,25,830,228.0,368,174,3.3917,342900,NEAR BAY
11492,-117.99,33.69,16,1476,294.0,886,270,5.3259,216400,<1H OCEAN
3735,-118.4,34.19,35,1631,356.0,862,368,3.6007,261800,<1H OCEAN


In [16]:
train_df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        169
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [17]:
df.select_dtypes(exclude='O').columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

In [18]:
num_vars = df.select_dtypes(exclude='O').drop(columns=['median_house_value']).columns
cat_vars = df.select_dtypes(include='O').columns

In [19]:
num_vars

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')

## Handling Missing vals & Scaling using Pipeline

In [20]:
num_pipe = Pipeline( [('imputer', SimpleImputer(strategy='median')) , ('stdsc', StandardScaler())] )

In [21]:
num_pipe.fit_transform(train_df[num_vars])

array([[ 0.65340738, -0.79376587,  0.5819438 , ...,  1.07003938,
         0.43137428, -1.55593552],
       [ 0.80847943, -0.85470871, -0.05178921, ...,  0.0628451 ,
        -0.00308947,  0.57498323],
       [-1.41255249,  1.07202256, -0.84395547, ..., -0.17597417,
         0.45464912,  0.63312671],
       ...,
       [ 0.87350965, -0.86408453, -0.13100584, ...,  0.0490005 ,
        -0.01343384,  1.02659082],
       [ 0.89351894, -0.83595707, -1.87377161, ...,  0.4202087 ,
         0.62274521,  1.52253613],
       [-0.95233867,  1.40486421,  0.50272717, ...,  0.93765044,
         0.38482459, -0.99451906]])

In [22]:
cat_pipe = Pipeline( [ ('encode' , OneHotEncoder(handle_unknown='ignore')) ] )

#### Checking if encoding pipe works...

In [23]:
cat_enc = cat_pipe.fit_transform( train_df[cat_vars])

In [24]:
cat_enc = cat_enc.toarray()

In [25]:
cat_enc.shape

(16512, 5)

### Preprocessing Parallel with ColumnTrasnformer

In [26]:
from sklearn.compose import ColumnTransformer

In [27]:
num_pipe # this is a serial operations...impute then cat_pipe...

0,1,2
,steps,"[('imputer', ...), ('stdsc', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


#### manual operation for category

In [28]:
preprocess_pipe = ColumnTransformer( [ ('cat_enc', OneHotEncoder(handle_unknown='ignore') , cat_vars) , ('num_pipe', num_pipe, num_vars) ] ) 

In [29]:
preprocess_pipe # parallel operations of both cat and num

0,1,2
,transformers,"[('cat_enc', ...), ('num_pipe', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


#### pipe operation for category

In [30]:
preprocessing_pipe = ColumnTransformer( [ ('cat_enc', cat_pipe , cat_vars) , ('num_pipe', num_pipe, num_vars) ] ) 

### Preprocessing Pipeline(col transformer) - numeric + categorical data togather/parallely

In [31]:
preprocessing_pipe

0,1,2
,transformers,"[('cat_enc', ...), ('num_pipe', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [32]:
pd.DataFrame(preprocessing_pipe.fit_transform(df))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,0.0,0.0,1.0,0.0,-1.327835,1.052548,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766
1,0.0,0.0,0.0,1.0,0.0,-1.322844,1.043185,-0.607019,2.045890,1.357143,0.861439,1.669961,2.332238
2,0.0,0.0,0.0,1.0,0.0,-1.332827,1.038503,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699
3,0.0,0.0,0.0,1.0,0.0,-1.337818,1.038503,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968
4,0.0,0.0,0.0,1.0,0.0,-1.337818,1.038503,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,0.0,1.0,0.0,0.0,0.0,-0.758826,1.801647,-0.289187,-0.444985,-0.388283,-0.512592,-0.443449,-1.216128
20636,0.0,1.0,0.0,0.0,0.0,-0.818722,1.806329,-0.845393,-0.888704,-0.922403,-0.944405,-1.008420,-0.691593
20637,0.0,1.0,0.0,0.0,0.0,-0.823713,1.778237,-0.924851,-0.174995,-0.123608,-0.369537,-0.174042,-1.142593
20638,0.0,1.0,0.0,0.0,0.0,-0.873626,1.778237,-0.845393,-0.355600,-0.304827,-0.604429,-0.393753,-1.054583


In [33]:
xgb_reg = XGBRegressor(n_jobs=-1)

### Model Building Pipeline - preprocess then model

In [34]:
final_pipe = Pipeline ( [ ('data_preprocess' , preprocessing_pipe ) , ('xgb_model' , xgb_reg) ] )

### HyperParameter Tuning

In [35]:
# as we give whole pipline in searchCV...
# we have to specify which param is of what operation/model inside the given pipeline in searchCV..
# here we only have params for xgb_model, we specify that before the param name with '__' 

xgb_param = {
    'xgb_model__n_estimators' : [50, 60 ,100],
    'xgb_model__max_depth' : [3, 5 , 7] ,
    'xgb_model__learning_rate' : [0.2 ,0.6 ,1 , 10] ,
    'xgb_model__colsample_bytree' : [0.3 ,0.6, 0.8]
}

In [36]:
from sklearn.model_selection import RandomizedSearchCV

In [37]:
rcv_pipe = RandomizedSearchCV(
    estimator = final_pipe , param_distributions = xgb_param , n_iter=10 
 
)

In [38]:
train_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
5226,-118.25,33.93,36,2452,734.0,2664,667,0.9298,100000,<1H OCEAN
11227,-117.94,33.8,28,2914,489.0,1500,499,4.9429,254800,<1H OCEAN
1792,-122.38,37.91,18,3507,711.0,1224,676,5.0524,269800,NEAR BAY
7813,-118.11,33.9,35,2604,495.0,1465,470,4.4896,184600,<1H OCEAN
4385,-118.26,34.08,52,984,276.0,994,260,2.3816,166700,<1H OCEAN


In [39]:
rcv_pipe.fit( train_df, train_df['median_house_value'])

Traceback (most recent call last):
  File "D:\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 942, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "D:\Anaconda\Lib\site-packages\sklearn\metrics\_scorer.py", line 492, in __call__
    return estimator.score(*args, **kwargs)
           ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "D:\Anaconda\Lib\site-packages\sklearn\pipeline.py", line 1189, in score
    return self.steps[-1][1].score(Xt, y, **score_params)
           ~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\Anaconda\Lib\site-packages\sklearn\base.py", line 639, in score
    return r2_score(y, y_pred, sample_weight=sample_weight)
  File "D:\Anaconda\Lib\site-packages\sklearn\utils\_param_validation.py", line 218, in wrapper
    return func(*args, **kwargs)
  File "D:\Anaconda\Lib\site-packages\sklearn\metrics\_regression.py", line 1276, in r2_score
    _check_reg_targets_with_floating_dtype(
    ~~~~~~~~~~~~~~~~~~~~~~~~

0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_distributions,"{'xgb_model__colsample_bytree': [0.3, 0.6, ...], 'xgb_model__learning_rate': [0.2, 0.6, ...], 'xgb_model__max_depth': [3, 5, ...], 'xgb_model__n_estimators': [50, 60, ...]}"
,n_iter,10
,scoring,
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,transformers,"[('cat_enc', ...), ('num_pipe', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [40]:
y_pred = rcv_pipe.predict(test_df)
y_pred

array([118450.555, 126495.95 , 282509.66 , ..., 199234.77 , 176851.98 ,
       406991.97 ], dtype=float32)

In [41]:
from sklearn.metrics import mean_absolute_error

In [42]:
y_test = test_df['median_house_value']
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error : {mae}')

Mean Absolute Error : 30629.197265625


## Extracting Final Model Pipeline

In [43]:
from joblib import dump , load

In [44]:
dump(rcv_pipe, 'Final_Model_pipeline.joblib')

['Final_Model_pipeline.joblib']