# Import Dependencies

In [1]:
# Basics
import pandas as pd #DataFrame
import numpy as np #Linear Algebra

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

#Model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from xgboost import XGBRegressor
import catboost
from bayes_opt import BayesianOptimization
#metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

#pipeline
from sklearn.pipeline import Pipeline

#os
import os

# Load Data

In [2]:
train = pd.read_csv('new_data/full_train.csv')
test = pd.read_csv('new_data/full_test.csv')

In [3]:
train.head()

Unnamed: 0,index,lat,lon,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,nmme0-tmp2m-34w__gfdlflora0,...,Csa,Csb,Dfa,Dfb,Dfc,Dsb,Dsc,Dwa,Dwb,date
0,0,0.0,0.833333,237.0,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,0
1,1,0.0,0.833333,228.9,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,1
2,2,0.0,0.833333,220.69,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,2
3,3,0.0,0.833333,225.28,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,3
4,4,0.0,0.833333,237.24,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,4


In [4]:
test.head()

Unnamed: 0,index,lat,lon,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,nmme0-tmp2m-34w__gfdlflora0,...,Cfb,Csa,Csb,Dfa,Dfb,Dfc,Dsb,Dsc,Dwa,Dwb
0,375734,0.0,0.833333,339.88,30.88,30.92,29.17,31.02,29.47,30.93,...,0,0,0,0,0,0,0,0,0,0
1,375735,0.0,0.833333,334.63,30.88,30.92,29.17,31.02,29.47,30.93,...,0,0,0,0,0,0,0,0,0,0
2,375736,0.0,0.833333,337.83,30.88,30.92,29.17,31.02,29.47,30.93,...,0,0,0,0,0,0,0,0,0,0
3,375737,0.0,0.833333,345.81,30.88,30.92,29.17,31.02,29.47,30.93,...,0,0,0,0,0,0,0,0,0,0
4,375738,0.0,0.833333,357.39,30.88,30.92,29.17,31.02,29.47,30.93,...,0,0,0,0,0,0,0,0,0,0


# Split into Train and Validation Data

In [5]:
y = train['contest-tmp2m-14d__tmp2m']
X = train.drop(['contest-tmp2m-14d__tmp2m'],axis=1)

In [6]:
X.head()

Unnamed: 0,index,lat,lon,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,nmme0-tmp2m-34w__gfdlflora0,...,Csa,Csb,Dfa,Dfb,Dfc,Dsb,Dsc,Dwa,Dwb,date
0,0,0.0,0.833333,237.0,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,0
1,1,0.0,0.833333,228.9,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,1
2,2,0.0,0.833333,220.69,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,2
3,3,0.0,0.833333,225.28,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,3
4,4,0.0,0.833333,237.24,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,4


In [7]:
y.head()

0    28.744480
1    28.370585
2    28.133059
3    28.256798
4    28.372353
Name: contest-tmp2m-14d__tmp2m, dtype: float64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=3)

# Pipeline

In [22]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', catboost.CatBoostRegressor(verbose=0))
])


# Baseline Model

In [10]:
pipe.fit(X_train, y_train)

y_hat = pipe.predict(X_test)

Learning rate set to 0.100855
0:	learn: 8.9945100	total: 218ms	remaining: 3m 37s
1:	learn: 8.2152611	total: 281ms	remaining: 2m 20s
2:	learn: 7.5091480	total: 336ms	remaining: 1m 51s
3:	learn: 6.8912003	total: 397ms	remaining: 1m 38s
4:	learn: 6.3238281	total: 459ms	remaining: 1m 31s
5:	learn: 5.8154037	total: 528ms	remaining: 1m 27s
6:	learn: 5.3642906	total: 593ms	remaining: 1m 24s
7:	learn: 4.9617398	total: 659ms	remaining: 1m 21s
8:	learn: 4.6083087	total: 734ms	remaining: 1m 20s
9:	learn: 4.2866629	total: 805ms	remaining: 1m 19s
10:	learn: 4.0060357	total: 871ms	remaining: 1m 18s
11:	learn: 3.7526481	total: 939ms	remaining: 1m 17s
12:	learn: 3.5301977	total: 1s	remaining: 1m 15s
13:	learn: 3.3303115	total: 1.06s	remaining: 1m 14s
14:	learn: 3.1564490	total: 1.11s	remaining: 1m 13s
15:	learn: 2.9874899	total: 1.17s	remaining: 1m 11s
16:	learn: 2.8416539	total: 1.23s	remaining: 1m 11s
17:	learn: 2.7125994	total: 1.29s	remaining: 1m 10s
18:	learn: 2.6005451	total: 1.35s	remaining: 1m

### Accuracy

In [11]:
prediction_df = pd.concat([pd.Series(y_hat), X_test['index'].reset_index(drop=True)],axis=1)
index = ['contest-tmp2m-14d__tmp2m', 'index']
prediction_df.columns = index
prediction_df

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
0,20.026597,329308
1,12.749582,295719
2,4.294111,300581
3,3.352408,16207
4,12.855743,23915
...,...,...
75142,13.098018,129579
75143,14.062920,68410
75144,5.935581,224253
75145,26.705889,60229


In [12]:
def accuracy(y_test,y_hat):
    acc = r2_score(y_test, y_hat)
    print("R2 Accuracy:", acc)
    accMSE = mean_squared_error(y_test, y_hat)
    print("MSE Accuracy:", accMSE)
    accMAE = mean_absolute_error(y_test, y_hat)
    print("MAE Accuracy:", accMAE)
    accRMSE = mean_squared_error(y_test, y_hat, squared=False)
    print("RMSE Accuracy:", accRMSE)

In [13]:
accuracy = accuracy(y_test,y_hat)

R2 Accuracy: 0.9969988947225567
MSE Accuracy: 0.29259759643633687
MAE Accuracy: 0.422551270626546
RMSE Accuracy: 0.5409229117317336


# Tune Hyperparameters

In [39]:
param_grid = {
    'classifier__iterations': [200],
    'classifier__learning_rate': [0.1],
    'classifier__depth': [3, 5, 7],
    'classifier__l2_leaf_reg': [1, 5, 9],
    'classifier__bagging_temperature': [0, 0.4, 0.8, 1],
    'classifier__random_strength': [0.1, 0.5, 1, 2, 5],
    'classifier__border_count': [64, 128],
    'classifier__task_type': ['GPU'],
}




In [40]:
grid = GridSearchCV(pipe, param_grid, verbose=3, cv=3)

In [41]:
# grid.fit(X_train,y_train)

Fitting 3 folds for each of 360 candidates, totalling 1080 fits
[CV 1/3] END classifier__bagging_temperature=0, classifier__border_count=64, classifier__depth=3, classifier__iterations=200, classifier__l2_leaf_reg=1, classifier__learning_rate=0.1, classifier__random_strength=0.1, classifier__task_type=GPU;, score=0.982 total time=   2.8s
[CV 2/3] END classifier__bagging_temperature=0, classifier__border_count=64, classifier__depth=3, classifier__iterations=200, classifier__l2_leaf_reg=1, classifier__learning_rate=0.1, classifier__random_strength=0.1, classifier__task_type=GPU;, score=0.982 total time=   2.7s
[CV 3/3] END classifier__bagging_temperature=0, classifier__border_count=64, classifier__depth=3, classifier__iterations=200, classifier__l2_leaf_reg=1, classifier__learning_rate=0.1, classifier__random_strength=0.1, classifier__task_type=GPU;, score=0.982 total time=   2.5s
[CV 1/3] END classifier__bagging_temperature=0, classifier__border_count=64, classifier__depth=3, classifier

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('classifier',
                                        <catboost.core.CatBoostRegressor object at 0x00000285D26A9310>)]),
             param_grid={'classifier__bagging_temperature': [0, 0.4, 0.8, 1],
                         'classifier__border_count': [64, 128],
                         'classifier__depth': [3, 5, 7],
                         'classifier__iterations': [200],
                         'classifier__l2_leaf_reg': [1, 5, 9],
                         'classifier__learning_rate': [0.1],
                         'classifier__random_strength': [0.1, 0.5, 1, 2, 5],
                         'classifier__task_type': ['GPU']},
             verbose=3)

In [42]:
grid.best_params_

{'classifier__bagging_temperature': 0.4,
 'classifier__border_count': 128,
 'classifier__depth': 7,
 'classifier__iterations': 200,
 'classifier__l2_leaf_reg': 1,
 'classifier__learning_rate': 0.1,
 'classifier__random_strength': 0.1,
 'classifier__task_type': 'GPU'}

# Optimized Model

In [49]:
params = {'bagging_temperature': 0.4,
 'border_count': 128,
 'depth': 7,
 'iterations': 20000,
 'l2_leaf_reg': 1,
 'learning_rate': 0.05,
 'random_strength': 0.1,
 'task_type': 'GPU'}

In [50]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', catboost.CatBoostRegressor(**params))
])


In [51]:
pipe.fit(X_train, y_train)

y_hat = pipe.predict(X_test)

0:	learn: 9.4238995	total: 11.6ms	remaining: 3m 51s
1:	learn: 9.0028878	total: 22.5ms	remaining: 3m 45s
2:	learn: 8.6046806	total: 36.8ms	remaining: 4m 4s
3:	learn: 8.2265712	total: 50.5ms	remaining: 4m 12s
4:	learn: 7.8677092	total: 63.7ms	remaining: 4m 14s
5:	learn: 7.5268710	total: 77.2ms	remaining: 4m 17s
6:	learn: 7.2049167	total: 90.1ms	remaining: 4m 17s
7:	learn: 6.8990723	total: 103ms	remaining: 4m 18s
8:	learn: 6.6082784	total: 117ms	remaining: 4m 19s
9:	learn: 6.3329523	total: 130ms	remaining: 4m 20s
10:	learn: 6.0735341	total: 144ms	remaining: 4m 21s
11:	learn: 5.8282657	total: 157ms	remaining: 4m 20s
12:	learn: 5.5956835	total: 171ms	remaining: 4m 23s
13:	learn: 5.3742797	total: 185ms	remaining: 4m 23s
14:	learn: 5.1658247	total: 198ms	remaining: 4m 23s
15:	learn: 4.9659629	total: 211ms	remaining: 4m 24s
16:	learn: 4.7771362	total: 226ms	remaining: 4m 25s
17:	learn: 4.5988928	total: 242ms	remaining: 4m 28s
18:	learn: 4.4296893	total: 268ms	remaining: 4m 41s
19:	learn: 4.270

### Accuracy

In [52]:
def accuracy(y_test,y_hat):
    acc = r2_score(y_test, y_hat)
    print("R2 Accuracy:", acc)
    accMSE = mean_squared_error(y_test, y_hat)
    print("MSE Accuracy:", accMSE)
    accMAE = mean_absolute_error(y_test, y_hat)
    print("MAE Accuracy:", accMAE)
    accRMSE = mean_squared_error(y_test, y_hat, squared=False)
    print("RMSE Accuracy:", accRMSE)

In [53]:
accuracy = accuracy(y_test,y_hat)

R2 Accuracy: 0.9997176599438494
MSE Accuracy: 0.027527198871785064
MAE Accuracy: 0.12880222934477334
RMSE Accuracy: 0.16591322693439803


# Submission

In [54]:
prediction = pipe.predict(test)

Feature names must be in the same order as they were in fit.



In [55]:
prediction_df = pd.concat([pd.Series(prediction), test['index'].reset_index(drop=True)],axis=1)
index = ['contest-tmp2m-14d__tmp2m', 'index']
prediction_df.columns = index
prediction_df.head()

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
0,26.654529,375734
1,26.764611,375735
2,26.834774,375736
3,26.771782,375737
4,26.804726,375738


In [56]:
prediction_df.tail()

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
31349,3.802956,407083
31350,3.958872,407084
31351,3.042797,407085
31352,3.407395,407086
31353,3.470453,407087


In [57]:
files = os.listdir('predictions')
if files is not None:
    highest = 0
    for i in files:
        curr = i[10:-4]
        highest = int(curr) if int(curr)>highest else highest

    file_name = f'prediction{highest+1}.csv'
else:
    file_name = 'prediction1.csv'


In [58]:
file_name

'prediction4.csv'

In [59]:
check = prediction_df.shape[0]==31354

In [66]:
check = False #comment out in order to save model

In [60]:
if check:
    prediction_df.to_csv(f"predictions/{file_name}",index=False)
else:
    print('Shape does not match requirements')