In [152]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing


In [153]:
train = pd.read_csv('/kaggle/input/playground-series-s3e1/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s3e1/test.csv')
sample_submission = pd.read_csv('/kaggle/input/playground-series-s3e1/sample_submission.csv')

In [192]:
test.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,euc
0,1.7062,35.0,4.966368,1.096539,-0.207062,2.844411,128.169751
1,1.3882,22.0,4.187035,1.098229,0.575661,3.180218,123.065538
2,7.7197,21.0,7.129436,0.959276,-0.03339,2.888889,122.510523
3,4.6806,49.0,4.769697,1.048485,-0.696064,1.74359,123.174447
4,3.1284,25.0,3.765306,1.081633,2.512462,2.003827,123.126411


In [155]:
print(train.shape)
print(test.shape)

(37137, 10)
(24759, 9)


In [176]:
train.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,euc
count,57777.0,57777.0,57777.0,57777.0,57777.0,57777.0,57777.0,57777.0,57777.0
mean,3.858046,26.979559,5.258104,1.074519,8.362654000000001e-17,2.91677,35.592118,-119.559822,124.754104
std,1.838287,12.374508,1.771502,0.294086,1.000009,6.575793,2.102374,1.984615,2.474247
min,0.4999,1.0,0.846154,0.333333,-1.259497,0.692308,32.54,-124.35,119.153441
25%,2.5882,17.0,4.381443,1.015306,-0.5560063,2.407625,33.93,-121.8,122.801064
50%,3.5214,26.0,5.126761,1.052941,-0.2086627,2.769231,34.21,-118.46,123.282906
75%,4.7109,36.0,5.936031,1.091837,0.1907023,3.179673,37.7,-118.02,127.468571
max,15.0001,52.0,141.909091,34.066667,27.2955,1243.333333,41.95,-114.31,131.188004


In [157]:
train.isna().sum()

id             0
MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

In [177]:
train.corr() * 100

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,euc
MedInc,100.0,-9.407492,46.943084,-6.292778,-0.446493,1.080269,-6.763485,-3.310633,0.843695
HouseAge,-9.407492,100.0,-13.793587,-5.170439,-26.157591,0.85898,1.871866,-9.6837,7.856627
AveRooms,46.943084,-13.793587,100.0,71.061435,-5.652541,0.172227,9.621253,-5.252711,6.403602
AveBedrms,-6.292778,-5.170439,71.061435,100.0,-4.11715,-0.544242,4.703082,1.073866,0.350966
Population,-0.446493,-26.157591,-5.652541,-4.11715,100.0,4.043588,-8.349032,8.031967,-8.223648
AveOccup,1.080269,0.85898,0.172227,-0.544242,4.043588,100.0,-0.682537,1.109793,-1.018314
Latitude,-6.763485,1.871866,9.621253,4.703082,-8.349032,-0.682537,100.0,-93.265614,96.125951
Longitude,-3.310633,-9.6837,-5.252711,1.073866,8.031967,1.109793,-93.265614,100.0,-99.596203
euc,0.843695,7.856627,6.403602,0.350966,-8.223648,-1.018314,96.125951,-99.596203,100.0


In [186]:
train = train.drop(['id', 'Longitude', 'Latitude'], axis = 1)
test = test.drop(['id', 'Longitude', 'Latitude'], axis = 1)

In [162]:
original_data = fetch_california_housing()
original_data = pd.DataFrame(data=np.hstack([original_data['data'], original_data['target'].reshape(-1, 1)]), columns=train.columns)

train = pd.concat([train, original_data]).reset_index(drop=True)

In [179]:
train['euc'] = (train['Longitude'] ** 2 + train['Latitude'] ** 2) ** 0.5
test['euc'] = (test['Longitude'] ** 2 + test['Latitude'] ** 2) ** 0.5

In [166]:
ss = StandardScaler()
train['Population'] = ss.fit_transform(train[['Population']])
test['Population'] = ss.transform(test[['Population']])

In [167]:
y = train.pop('MedHouseVal')
X = train

In [168]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57777 entries, 0 to 57776
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      57777 non-null  float64
 1   HouseAge    57777 non-null  float64
 2   AveRooms    57777 non-null  float64
 3   AveBedrms   57777 non-null  float64
 4   Population  57777 non-null  float64
 5   AveOccup    57777 non-null  float64
 6   Latitude    57777 non-null  float64
 7   Longitude   57777 non-null  float64
 8   euc         57777 non-null  float64
dtypes: float64(9)
memory usage: 4.0 MB


In [169]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [170]:
X_train.size, X_test.size

(415989, 104004)

In [171]:
from catboost import CatBoostRegressor

In [172]:
model = CatBoostRegressor()
parameters = { 'learning_rate': [0.01,0.05, 0.1],
              'depth': [6, 8, 10],
              'l2_leaf_reg': [10, 15, 30],
              'random_strength': [0.03, 0.1, 0.07],
             'loss_function': ['RMSE']}
Model = RandomizedSearchCV(estimator=Model,
                     param_distributions = parameters,
                     scoring='accuracy',
                     cv=5,
                     refit=True,
                     n_jobs=1)

In [187]:
model.fit(X, y)

Learning rate set to 0.077721
0:	learn: 1.1084352	total: 8.61ms	remaining: 8.6s
1:	learn: 1.0634023	total: 16.1ms	remaining: 8.03s
2:	learn: 1.0220475	total: 24.2ms	remaining: 8.05s
3:	learn: 0.9855638	total: 32.4ms	remaining: 8.07s
4:	learn: 0.9539203	total: 40.5ms	remaining: 8.06s
5:	learn: 0.9247654	total: 48.3ms	remaining: 8.01s
6:	learn: 0.8973791	total: 55.8ms	remaining: 7.92s
7:	learn: 0.8725612	total: 63.1ms	remaining: 7.83s
8:	learn: 0.8502724	total: 70.2ms	remaining: 7.73s
9:	learn: 0.8307950	total: 77.8ms	remaining: 7.71s
10:	learn: 0.8133415	total: 85.1ms	remaining: 7.65s
11:	learn: 0.7974198	total: 92.9ms	remaining: 7.65s
12:	learn: 0.7833690	total: 100ms	remaining: 7.62s
13:	learn: 0.7664842	total: 108ms	remaining: 7.62s
14:	learn: 0.7518593	total: 116ms	remaining: 7.59s
15:	learn: 0.7382933	total: 123ms	remaining: 7.58s
16:	learn: 0.7288718	total: 131ms	remaining: 7.55s
17:	learn: 0.7195368	total: 138ms	remaining: 7.5s
18:	learn: 0.7118085	total: 145ms	remaining: 7.48s
1

<catboost.core.CatBoostRegressor at 0x7fdef8405450>

In [188]:
prediction = model.predict(X_test)

In [189]:
mean_squared_error(prediction, y_test)

0.21933172027827957

In [193]:
pre = model.predict(test)

CatBoostError: catboost/libs/data/model_dataset_compatibility.cpp:81: At position 6 should be feature with name Latitude (found euc).

In [181]:
sample_submission['MedHouseVal'] = pre

In [183]:
sample_submission.to_csv('playground_submmission_file.csv', index = False)