#### Catboost sample

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

import os

In [3]:
data_root = r'C:\kaggle\playground_series_s3e1\playground-series-s3e1'
train_filepath = os.path.join(data_root, 'train.csv')
test_filepath = os.path.join(data_root, 'test.csv')
sample_path = os.path.join(data_root, 'sample_submission.csv')

train_df = pd.read_csv(train_filepath)
test_df = pd.read_csv(test_filepath)
sample = pd.read_csv(sample_path)

train_df = train_df.drop('id', axis=1)
train_df.head(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,2.3859,15.0,3.82716,1.1121,1280.0,2.486989,34.6,-120.12,0.98
1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,0.946
2,4.775,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,1.576
3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,1.336
4,3.75,52.0,4.284404,1.069246,1793.0,1.60479,37.8,-122.41,4.5


In [4]:
extra_data = fetch_california_housing()
train_data2 = pd.DataFrame(extra_data['data'])
train_data2['MedHouseVal'] = extra_data['target']
train_data2.columns = train_df.columns
train_df['generated'] = 1
test_df['generated'] = 1
train_data2['generated'] = 0
train_df = pd.concat([train_df, train_data2],axis=0).drop_duplicates()
print(train_df.shape)
train_df.head()

(57777, 10)


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,generated
0,2.3859,15.0,3.82716,1.1121,1280.0,2.486989,34.6,-120.12,0.98,1
1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,0.946,1
2,4.775,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,1.576,1
3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,1.336,1
4,3.75,52.0,4.284404,1.069246,1793.0,1.60479,37.8,-122.41,4.5,1


In [5]:
train_df['r'] = np.sqrt(train_df['Latitude']**2 + train_df['Longitude']**2)
train_df['theta'] = np.arctan2(train_df['Latitude'], train_df['Longitude'])

test_df['r'] = np.sqrt(test_df['Latitude']**2 + test_df['Longitude']**2)
test_df['theta'] = np.arctan2(test_df['Latitude'], test_df['Longitude'])

In [6]:
from sklearn.decomposition import PCA

def pca(data):
    '''
    input: dataframe containing Latitude(x) and Longitude(y)
    '''
    coordinates = data[['Latitude','Latitude']].values
    pca_obj = PCA().fit(coordinates)
    pca_x = pca_obj.transform(data[['Latitude', 'Longitude']].values)[:,0]
    pca_y = pca_obj.transform(data[['Latitude', 'Longitude']].values)[:,1]
    return pca_x, pca_y

train_df['pca_x'], train_df['pca_y'] = pca(train_df)
test_df['pca_x'], test_df['pca_y'] = pca(test_df)

In [7]:
def crt_crds(df): 
    df['rot_15_x'] = (np.cos(np.radians(15)) * df['Longitude']) + \
                      (np.sin(np.radians(15)) * df['Latitude'])
    
    df['rot_15_y'] = (np.cos(np.radians(15)) * df['Latitude']) + \
                      (np.sin(np.radians(15)) * df['Longitude'])
    
    df['rot_30_x'] = (np.cos(np.radians(30)) * df['Longitude']) + \
                      (np.sin(np.radians(30)) * df['Latitude'])
    
    df['rot_30_y'] = (np.cos(np.radians(30)) * df['Latitude']) + \
                      (np.sin(np.radians(30)) * df['Longitude'])
    
    df['rot_45_x'] = (np.cos(np.radians(45)) * df['Longitude']) + \
                      (np.sin(np.radians(45)) * df['Latitude'])
    return df

train = crt_crds(train_df)
test = crt_crds(test_df)

In [8]:
import reverse_geocoder as rg
from sklearn.preprocessing import LabelEncoder

def geocoder(df):
    coordinates = list(zip(df['Latitude'], df['Longitude']))
    results = rg.search(coordinates)
    return results

results = geocoder(train_df)
train_df['place'] = [x['admin2'] for x in results]
results = geocoder(test_df)
test_df['place'] = [x['admin2'] for x in results]

places = ['Los Angeles County', 'Orange County', 'Kern County',
          'Alameda County', 'San Francisco County', 'Ventura County',
          'Santa Clara County', 'Fresno County', 'Santa Barbara County',
          'Contra Costa County', 'Yolo County', 'Monterey County',
          'Riverside County', 'Napa County']

def replace(x):
    if x in places:
        return x
    else:
        return 'Other'
    
train_df['place'] = train_df['place'].apply(lambda x: replace(x))
test_df['place'] = test_df['place'].apply(lambda x: replace(x))

# le = LabelEncoder()
# train_df['place'] = le.fit_transform(train_df['place'])
# test_df['place'] = le.transform(test_df['place'])

test_df = pd.get_dummies(test_df)
train_df = pd.get_dummies(train_df)

Loading formatted geocoded file...


In [9]:
train_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,generated,...,place_Monterey County,place_Napa County,place_Orange County,place_Other,place_Riverside County,place_San Francisco County,place_Santa Barbara County,place_Santa Clara County,place_Ventura County,place_Yolo County
0,2.3859,15.0,3.827160,1.112100,1280.0,2.486989,34.60,-120.12,0.980,1,...,0,0,0,0,0,0,1,0,0,0
1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,0.946,1,...,0,0,0,1,0,0,0,0,0,0
2,4.7750,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,1.576,1,...,0,0,0,0,0,0,1,0,0,0
3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,1.336,1,...,0,0,0,1,0,0,0,0,0,0
4,3.7500,52.0,4.284404,1.069246,1793.0,1.604790,37.80,-122.41,4.500,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781,0,...,0,0,0,1,0,0,0,0,0,0
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771,0,...,0,0,0,1,0,0,0,0,0,0
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923,0,...,0,0,0,1,0,0,0,0,0,0
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847,0,...,0,0,0,1,0,0,0,0,0,0


In [10]:
train_df.isna().any()

MedInc                        False
HouseAge                      False
AveRooms                      False
AveBedrms                     False
Population                    False
AveOccup                      False
Latitude                      False
Longitude                     False
MedHouseVal                   False
generated                     False
r                             False
theta                         False
pca_x                         False
pca_y                         False
rot_15_x                      False
rot_15_y                      False
rot_30_x                      False
rot_30_y                      False
rot_45_x                      False
place_Alameda County          False
place_Contra Costa County     False
place_Fresno County           False
place_Kern County             False
place_Los Angeles County      False
place_Monterey County         False
place_Napa County             False
place_Orange County           False
place_Other                 

In [11]:
X = train_df.drop('MedHouseVal', axis=1)
y = train_df.MedHouseVal
X_test = test_df.drop('id', axis=1)

In [12]:
import catboost
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

n_folds = 12

MAX_ITER = 15000
PATIENCE = 1000
DISPLAY_FREQ = 100

eval_predsCB = []
predsCB = []

k_fold = KFold(n_splits=n_folds, random_state=42, shuffle=True)

MODEL_PARAMS = {
                'random_seed': 1234,    
#                 'learning_rate': 0.1,   # 0.15: 0.5678, 0.12: 0.5685, 0.1: 0.56757, 0.05: 0.57, 0.01, 0.57             
                'iterations': MAX_ITER,
                'early_stopping_rounds': PATIENCE,
#                 'metric_period': DISPLAY_FREQ,
                'use_best_model': True,
                'eval_metric': 'RMSE',
                'verbose': 1000,
#                 'task_type': 'GPU'
               }


for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostRegressor(**MODEL_PARAMS)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
#           metric_period = DISPLAY_FREQ
         )
    predsCB.append(model.predict(X_test))
#     eval_predsCB.append(model.predict(X))
#     print("RMSE valid = {}".format(mean_squared_error(y_valid, model.predict(X_valid))))
#     print("RMSE full = {}".format(mean_squared_error(y, model.predict(X))))

Learning rate set to 0.018203
0:	learn: 1.1454733	test: 1.1265549	best: 1.1265549 (0)	total: 150ms	remaining: 37m 30s
1000:	learn: 0.5190909	test: 0.5200197	best: 0.5200144 (999)	total: 11.2s	remaining: 2m 36s
2000:	learn: 0.4926820	test: 0.5094221	best: 0.5093823 (1996)	total: 22.2s	remaining: 2m 24s
3000:	learn: 0.4755944	test: 0.5052045	best: 0.5051921 (2997)	total: 33.2s	remaining: 2m 12s
4000:	learn: 0.4620314	test: 0.5034917	best: 0.5034891 (3995)	total: 43.7s	remaining: 2m
5000:	learn: 0.4503662	test: 0.5018580	best: 0.5018580 (5000)	total: 54s	remaining: 1m 47s
6000:	learn: 0.4401604	test: 0.5012489	best: 0.5012028 (5965)	total: 1m 3s	remaining: 1m 35s
7000:	learn: 0.4309418	test: 0.5007586	best: 0.5007491 (6996)	total: 1m 14s	remaining: 1m 25s
8000:	learn: 0.4226009	test: 0.5004527	best: 0.5004396 (7919)	total: 1m 25s	remaining: 1m 14s
9000:	learn: 0.4147475	test: 0.5004346	best: 0.5003591 (8881)	total: 1m 35s	remaining: 1m 3s
Stopped by overfitting detector  (1000 iterations 

In [13]:
from xgboost import XGBRegressor

# n_folds = 20
k_fold = KFold(n_splits=n_folds, random_state=42, shuffle=True)

eval_predsXB = []
predsXB = []

PATIENCE = 200

MODEL_PARAMS = {       'n_estimators': 500, #1000, 5000
#                        'learning_rate': 0.05,
                       'max_depth': 4, # 3
                       'colsample_bytree': 0.9, # 0.95
                       'subsample': 1,
                       'reg_lambda': 20,
                       'early_stopping_rounds': PATIENCE,
#                        'tree_method': 'gpu_hist',
                       'seed': 1
}

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = XGBRegressor(**MODEL_PARAMS)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
#           early_stopping_rounds = PATIENCE,
          verbose = 100
         )
    predsXB.append(model.predict(X_test))
#     eval_predsXB.append(model.predict(X))

[0]	validation_0-rmse:1.47099
[100]	validation_0-rmse:0.52004
[200]	validation_0-rmse:0.51384
[300]	validation_0-rmse:0.51310
[400]	validation_0-rmse:0.51227
[499]	validation_0-rmse:0.51193
[0]	validation_0-rmse:1.47559
[100]	validation_0-rmse:0.50936
[200]	validation_0-rmse:0.50409
[300]	validation_0-rmse:0.50257
[400]	validation_0-rmse:0.50265
[499]	validation_0-rmse:0.50303
[0]	validation_0-rmse:1.48731
[100]	validation_0-rmse:0.54070
[200]	validation_0-rmse:0.53281
[300]	validation_0-rmse:0.53027
[400]	validation_0-rmse:0.52893
[499]	validation_0-rmse:0.52942
[0]	validation_0-rmse:1.48492
[100]	validation_0-rmse:0.53379
[200]	validation_0-rmse:0.52619
[300]	validation_0-rmse:0.52202
[400]	validation_0-rmse:0.52111
[499]	validation_0-rmse:0.52068
[0]	validation_0-rmse:1.46493
[100]	validation_0-rmse:0.53779
[200]	validation_0-rmse:0.53084
[300]	validation_0-rmse:0.53008
[400]	validation_0-rmse:0.52938
[499]	validation_0-rmse:0.53010
[0]	validation_0-rmse:1.51208
[100]	validation_0-r

In [14]:
import lightgbm as lgbm
from lightgbm.sklearn import LGBMRegressor

# n_folds = 20
k_fold = KFold(n_splits=n_folds, random_state=42, shuffle=True)

eval_predsLB = []
predsLB = []

MODEL_PARAMS = {
                       'learning_rate': 0.01,
                       'max_depth': 9,
                       'num_leaves': 90,
                       'colsample_bytree': 0.8,
                       'subsample': 0.9,
                       'subsample_freq': 5,
                       'min_child_samples': 36,
                       'reg_lambda': 28,
                       'n_estimators': 20000,
                       'metric': 'rmse',
                       'random_state': 1
}

callbacks = [lgbm.early_stopping(30, verbose=1), lgbm.log_evaluation(period=0)]

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = lgbm.LGBMRegressor(**MODEL_PARAMS)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
#           early_stopping_rounds = PATIENCE,
          callbacks=callbacks
         )
    predsLB.append(model.predict(X_test))
#     eval_predsLB.append(model.predict(X))

Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[1604]	valid_0's rmse: 0.500394
Early stopping, best iteration is:
[1958]	valid_0's rmse: 0.489341
Early stopping, best iteration is:
[1958]	valid_0's rmse: 0.489341
Early stopping, best iteration is:
[1958]	valid_0's rmse: 0.489341
Early stopping, best iteration is:
[1958]	valid_0's rmse: 0.489341
Early stopping, best iteration is:
[1958]	valid_0's rmse: 0.489341
Early stopping, best iteration is:
[1810]	valid_0's rmse: 0.48345
Early stopping, best iteration is:
[1810]	valid_0's rmse: 0.48345
Early stopping, best iteration is:
[1810]	valid_0's rmse: 0.48345
Early stopping, best iteration is:
[1810]	valid_0's rmse: 0.48345
Early stopping, best iteration is:
[1810]	valid_0's rmse: 0.48345
Early stopping, best iteration is:
[1810]	valid_0's rmse: 0.48345


In [15]:
a = 0.4
b = 0.2
c = 0.4

In [16]:
predCB = np.average(np.array(predsCB),axis=0)
predXB = np.average(np.array(predsXB),axis=0)
predLB = np.average(np.array(predsLB),axis=0)
pred = predCB * a + predXB * b + predLB * c

In [18]:
sample['MedHouseVal'] = pred
sample

Unnamed: 0,id,MedHouseVal
0,37137,0.663369
1,37138,1.000313
2,37139,4.024241
3,37140,3.326043
4,37141,2.442406
...,...,...
24754,61891,2.586414
24755,61892,1.965501
24756,61893,1.226228
24757,61894,3.706760


In [19]:
sample.to_csv('boost_ensemble.csv', index=False)