# Import Dependencies

In [39]:
# Basics
import pandas as pd #DataFrame
import numpy as np #Linear Algebra

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

#Model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from xgboost import XGBRegressor

#metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

#pipeline
from sklearn.pipeline import Pipeline

#os
import os

# Load Data

In [40]:
train = pd.read_csv('new_data/full_train.csv')
test = pd.read_csv('new_data/full_test.csv')

In [41]:
train.head()

Unnamed: 0,index,lat,lon,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,nmme0-tmp2m-34w__gfdlflora0,...,Csa,Csb,Dfa,Dfb,Dfc,Dsb,Dsc,Dwa,Dwb,date
0,0,0.0,0.833333,237.0,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,0
1,1,0.0,0.833333,228.9,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,1
2,2,0.0,0.833333,220.69,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,2
3,3,0.0,0.833333,225.28,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,3
4,4,0.0,0.833333,237.24,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,4


In [42]:
test.head()

Unnamed: 0,index,lat,lon,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,nmme0-tmp2m-34w__gfdlflora0,...,Cfb,Csa,Csb,Dfa,Dfb,Dfc,Dsb,Dsc,Dwa,Dwb
0,375734,0.0,0.833333,339.88,30.88,30.92,29.17,31.02,29.47,30.93,...,0,0,0,0,0,0,0,0,0,0
1,375735,0.0,0.833333,334.63,30.88,30.92,29.17,31.02,29.47,30.93,...,0,0,0,0,0,0,0,0,0,0
2,375736,0.0,0.833333,337.83,30.88,30.92,29.17,31.02,29.47,30.93,...,0,0,0,0,0,0,0,0,0,0
3,375737,0.0,0.833333,345.81,30.88,30.92,29.17,31.02,29.47,30.93,...,0,0,0,0,0,0,0,0,0,0
4,375738,0.0,0.833333,357.39,30.88,30.92,29.17,31.02,29.47,30.93,...,0,0,0,0,0,0,0,0,0,0


# Split into Train and Validation Data

In [43]:
y = train['contest-tmp2m-14d__tmp2m']
X = train.drop(['contest-tmp2m-14d__tmp2m'],axis=1)

In [44]:
X.head()

Unnamed: 0,index,lat,lon,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,nmme0-tmp2m-34w__gfdlflora0,...,Csa,Csb,Dfa,Dfb,Dfc,Dsb,Dsc,Dwa,Dwb,date
0,0,0.0,0.833333,237.0,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,0
1,1,0.0,0.833333,228.9,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,1
2,2,0.0,0.833333,220.69,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,2
3,3,0.0,0.833333,225.28,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,3
4,4,0.0,0.833333,237.24,29.02,31.64,29.57,30.73,29.71,31.52,...,0,0,0,0,0,0,0,0,0,4


In [45]:
y.head()

0    28.744480
1    28.370585
2    28.133059
3    28.256798
4    28.372353
Name: contest-tmp2m-14d__tmp2m, dtype: float64

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=3)

# Pipeline

In [47]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', XGBRegressor())
])


# Baseline Model

In [48]:
pipe.fit(X_train, y_train)

y_hat = pipe.predict(X_test)

### Accuracy

In [49]:
prediction_df = pd.concat([pd.Series(y_hat), X_test['index'].reset_index(drop=True)],axis=1)
index = ['contest-tmp2m-14d__tmp2m', 'index']
prediction_df.columns = index
prediction_df

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
0,20.080624,329308
1,13.280202,295719
2,4.287400,300581
3,3.875517,16207
4,12.409850,23915
...,...,...
75142,13.048269,129579
75143,13.839178,68410
75144,5.370245,224253
75145,26.333185,60229


In [50]:
def accuracy(y_test,y_hat):
    acc = r2_score(y_test, y_hat)
    print("R2 Accuracy:", acc)
    accMSE = mean_squared_error(y_test, y_hat)
    print("MSE Accuracy:", accMSE)
    accMAE = mean_absolute_error(y_test, y_hat)
    print("MAE Accuracy:", accMAE)
    accRMSE = mean_squared_error(y_test, y_hat, squared=False)
    print("RMSE Accuracy:", accRMSE)

In [51]:
accuracy = accuracy(y_test,y_hat)

R2 Accuracy: 0.9946822381948968
MSE Accuracy: 0.5184637587654655
MAE Accuracy: 0.560720075540332
RMSE Accuracy: 0.7200442755591252


# Tune Hyperparameters

In [52]:
param_grid = {
    'classifier__learning_rate': [ 0.15, 0.2],
    'classifier__max_depth': [3, 5, 10, 20],
    'classifier__min_child_weight': [1, 3, 5],
    'classifier__gamma': [0, 0.1, 0.2],
    'classifier__subsample': [0.6, 0.7, 0.8],
    'classifier__colsample_bytree': [0.6, 0.7, 0.8],
    'classifier__n_estimators': [200, 300],
    'classifier__tree_method':['gpu_hist']
}


In [53]:
grid = GridSearchCV(pipe, param_grid, verbose=2)

In [54]:
# grid.fit(X_train,y_train)

# Optimized Model

In [73]:
params = {'colsample_bytree': 0.6,
 'gamma': 0,
 'learning_rate': 0.05,
 'max_depth': 20,
 'min_child_weight': 5,
 'n_estimators': 20000,
 'subsample': 0.6,
 'tree_method': 'gpu_hist',
 }

In [56]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', XGBRegressor(**params))
])


In [57]:
pipe.fit(X_train, y_train)

y_hat = pipe.predict(X_test)



### Accuracy

In [68]:
def accuracy(y_test,y_hat):
    acc = r2_score(y_test, y_hat)
    print("R2 Accuracy:", acc)
    accMSE = mean_squared_error(y_test, y_hat)
    print("MSE Accuracy:", accMSE)
    accMAE = mean_absolute_error(y_test, y_hat)
    print("MAE Accuracy:", accMAE)
    accRMSE = mean_squared_error(y_test, y_hat, squared=False)
    print("RMSE Accuracy:", accRMSE)

In [69]:
accuracy = accuracy(y_test,y_hat)

R2 Accuracy: 0.9995641052913177
MSE Accuracy: 0.04249825723155688
MAE Accuracy: 0.14707319083871034
RMSE Accuracy: 0.20615105440321396


# Submission

In [70]:
prediction = pipe.predict(test)

Feature names must be in the same order as they were in fit.



In [71]:
prediction_df = pd.concat([pd.Series(prediction), test['index'].reset_index(drop=True)],axis=1)
index = ['contest-tmp2m-14d__tmp2m', 'index']
prediction_df.columns = index
prediction_df.head()

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
0,28.227489,375734
1,28.38328,375735
2,28.335598,375736
3,28.458715,375737
4,28.447557,375738


In [72]:
prediction_df.tail()

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
31349,3.180504,407083
31350,3.637101,407084
31351,2.705214,407085
31352,1.781725,407086
31353,1.849773,407087


In [63]:
files = os.listdir('predictions')
if files is not None:
    highest = 0
    for i in files:
        curr = i[10:-4]
        highest = int(curr) if int(curr)>highest else highest

    file_name = f'prediction{highest+1}.csv'
else:
    file_name = 'prediction1.csv'


In [64]:
file_name

'prediction3.csv'

In [74]:
check = prediction_df.shape[0]==31354

In [66]:
check = False #comment out in order to save model

In [76]:
if check:
    prediction_df.to_csv(f"predictions/{file_name}",index=False)
else:
    print('Shape does not match requirements')