# Import Dependencies

In [55]:
# Basics
import pandas as pd #DataFrame
import numpy as np #Linear Algebra
from pandas.api.types import is_numeric_dtype
import datetime
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

#Model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from xgboost import XGBRegressor
import catboost
#metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

#pipeline
from sklearn.pipeline import Pipeline

#os
import os

# Load Data

In [17]:
# Read the raw train and test tables from disk.
train = pd.read_csv('new_data/train.csv')
test = pd.read_csv('new_data/test.csv')

# Names of the train columns whose dtype is numeric. The target column is
# numeric as well, so it is still part of this list at this point.
features = [name for name, dtype in train.dtypes.items() if is_numeric_dtype(dtype)]

In [21]:
# Keep only the numeric train columns collected in `features`.
train = train.loc[:, features]

In [23]:
# Recompute `features` from the test table (this overwrites the train-derived
# list) and keep only the numeric test columns.
features = [name for name, dtype in test.dtypes.items() if is_numeric_dtype(dtype)]
test = test.loc[:, features]

In [24]:
# Preview the first five rows of the numeric-only train table.
train.head(n=5)

Unnamed: 0,index,lat,lon,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,nmme0-tmp2m-34w__gfdlflora0,...,wind-vwnd-925-2010-11,wind-vwnd-925-2010-12,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20
0,0,0.0,0.833333,237.0,29.02,31.64,29.57,30.73,29.71,31.52,...,-27.68,-37.21,8.32,9.56,-2.03,48.13,28.09,-13.5,11.9,4.58
1,1,0.0,0.833333,228.9,29.02,31.64,29.57,30.73,29.71,31.52,...,-21.13,-36.57,8.77,21.17,4.44,48.6,27.41,-23.77,15.44,3.42
2,2,0.0,0.833333,220.69,29.02,31.64,29.57,30.73,29.71,31.52,...,-10.72,-34.16,6.99,32.16,5.01,48.53,19.21,-33.16,15.11,4.82
3,3,0.0,0.833333,225.28,29.02,31.64,29.57,30.73,29.71,31.52,...,0.33,-31.04,6.17,39.66,-1.41,50.59,8.29,-37.22,18.24,9.74
4,4,0.0,0.833333,237.24,29.02,31.64,29.57,30.73,29.71,31.52,...,9.83,-31.8,7.47,38.62,-5.21,54.73,-2.58,-42.3,21.91,10.95


In [25]:
# Preview the first five rows of the numeric-only test table.
test.head(n=5)

Unnamed: 0,index,lat,lon,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,nmme0-tmp2m-34w__gfdlflora0,...,wind-vwnd-925-2010-11,wind-vwnd-925-2010-12,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20
0,375734,0.0,0.833333,339.88,30.88,30.92,29.17,31.02,29.47,30.93,...,-19.28,-39.77,-29.25,40.88,-8.31,14.91,-24.62,31.05,-23.69,6.27
1,375735,0.0,0.833333,334.63,30.88,30.92,29.17,31.02,29.47,30.93,...,-19.58,-43.14,-28.62,45.37,-5.42,16.97,-23.94,28.84,-20.61,14.16
2,375736,0.0,0.833333,337.83,30.88,30.92,29.17,31.02,29.47,30.93,...,-13.73,-44.22,-27.67,49.76,-1.31,21.44,-19.06,26.85,-16.78,13.42
3,375737,0.0,0.833333,345.81,30.88,30.92,29.17,31.02,29.47,30.93,...,-7.97,-49.47,-19.32,52.62,-0.44,21.65,-23.12,23.7,-18.62,10.69
4,375738,0.0,0.833333,357.39,30.88,30.92,29.17,31.02,29.47,30.93,...,-0.8,-56.07,-9.89,51.23,-7.57,19.86,-30.56,20.66,-25.08,19.64


# Split into Train and Validation Data

In [26]:
# Separate the regression target from the remaining columns.
target_col = 'contest-tmp2m-14d__tmp2m'
X = train.drop(columns=[target_col])
y = train[target_col]

In [27]:
# Preview the feature matrix (train table without the target column).
X.head(n=5)

Unnamed: 0,index,lat,lon,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,nmme0-tmp2m-34w__gfdlflora0,...,wind-vwnd-925-2010-11,wind-vwnd-925-2010-12,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20
0,0,0.0,0.833333,237.0,29.02,31.64,29.57,30.73,29.71,31.52,...,-27.68,-37.21,8.32,9.56,-2.03,48.13,28.09,-13.5,11.9,4.58
1,1,0.0,0.833333,228.9,29.02,31.64,29.57,30.73,29.71,31.52,...,-21.13,-36.57,8.77,21.17,4.44,48.6,27.41,-23.77,15.44,3.42
2,2,0.0,0.833333,220.69,29.02,31.64,29.57,30.73,29.71,31.52,...,-10.72,-34.16,6.99,32.16,5.01,48.53,19.21,-33.16,15.11,4.82
3,3,0.0,0.833333,225.28,29.02,31.64,29.57,30.73,29.71,31.52,...,0.33,-31.04,6.17,39.66,-1.41,50.59,8.29,-37.22,18.24,9.74
4,4,0.0,0.833333,237.24,29.02,31.64,29.57,30.73,29.71,31.52,...,9.83,-31.8,7.47,38.62,-5.21,54.73,-2.58,-42.3,21.91,10.95


In [28]:
# Preview the first five target values.
y.head(n=5)

0    28.744480
1    28.370585
2    28.133059
3    28.256798
4    28.372353
Name: contest-tmp2m-14d__tmp2m, dtype: float64

In [29]:
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
# X_test / y_test are a hold-out slice of the train table, not the
# competition test set loaded into `test`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

# Pipeline

In [41]:
# Two-stage pipeline: standardise the features, then fit a CatBoost regressor.
# The second step keeps its original key 'classifier' even though it holds a
# regressor, because step keys are part of the pipeline's parameter names.
scaler_step = ('scaler', StandardScaler())
model_step = ('classifier', catboost.CatBoostRegressor())
pipe = Pipeline(steps=[scaler_step, model_step])


# Baseline Model

In [42]:
# Fit the pipeline on the training split, then predict the hold-out split.
# `fit` returns the fitted pipeline, so the two calls can be chained.
y_hat = pipe.fit(X_train, y_train).predict(X_test)

Learning rate set to 0.100855
0:	learn: 8.9960031	total: 347ms	remaining: 5m 46s
1:	learn: 8.2086925	total: 481ms	remaining: 4m
2:	learn: 7.5017985	total: 606ms	remaining: 3m 21s
3:	learn: 6.8759188	total: 731ms	remaining: 3m 2s
4:	learn: 6.3178592	total: 856ms	remaining: 2m 50s
5:	learn: 5.8174695	total: 969ms	remaining: 2m 40s
6:	learn: 5.3655609	total: 1.1s	remaining: 2m 35s
7:	learn: 4.9661144	total: 1.23s	remaining: 2m 32s
8:	learn: 4.6077196	total: 1.35s	remaining: 2m 28s
9:	learn: 4.2823632	total: 1.46s	remaining: 2m 24s
10:	learn: 3.9963629	total: 1.58s	remaining: 2m 21s
11:	learn: 3.7438538	total: 1.69s	remaining: 2m 19s
12:	learn: 3.5229139	total: 1.81s	remaining: 2m 17s
13:	learn: 3.3179588	total: 1.93s	remaining: 2m 15s
14:	learn: 3.1415330	total: 2.04s	remaining: 2m 13s
15:	learn: 2.9793123	total: 2.16s	remaining: 2m 12s
16:	learn: 2.8361238	total: 2.28s	remaining: 2m 12s
17:	learn: 2.7080245	total: 2.39s	remaining: 2m 10s
18:	learn: 2.5940117	total: 2.5s	remaining: 2m 9s


### Evaluation Metrics (R², RMSE, MAE)

In [43]:
# Pair every hold-out prediction with the `index` value of the row it belongs to.
index = ['contest-tmp2m-14d__tmp2m', 'index']
target_name, id_name = index
prediction_df = pd.DataFrame({
    target_name: pd.Series(y_hat),
    # Reset to a 0..n-1 index so the ids line up positionally with y_hat.
    id_name: X_test['index'].reset_index(drop=True),
})
prediction_df

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
0,20.054153,329308
1,12.387040,295719
2,4.208315,300581
3,3.664922,16207
4,12.583979,23915
...,...,...
75142,13.568841,129579
75143,14.107631,68410
75144,5.554737,224253
75145,25.890036,60229


In [44]:
# Coefficient of determination (R^2) on the hold-out split. This is a
# regression score, so it is labelled as R^2 rather than "Accuracy" to keep it
# distinguishable from the RMSE and MAE printed by the following cells.
acc = r2_score(y_test, y_hat)
print("R^2:", acc)

Accuracy: 0.9969097696898827


In [50]:
# Root-mean-squared error on the hold-out split (lower is better).
# The square root is taken explicitly instead of passing `squared=False`,
# which scikit-learn deprecated in 1.4 and removed in 1.6; this form works on
# both older and newer releases.
accRMSE = np.sqrt(mean_squared_error(y_test, y_hat))
print("RMSE:", accRMSE)

Accuracy: 0.5488961515218181


In [46]:
# Mean absolute error on the hold-out split (lower is better). Labelled as MAE
# rather than "Accuracy" so the printed value cannot be mistaken for a score
# where higher is better.
accMAE = mean_absolute_error(y_test, y_hat)
print("MAE:", accMAE)

Accuracy: 0.42943275298929645


# Submission

In [47]:
# Predict the competition test set. The numeric column lists of train and test
# were derived independently, so select exactly the columns the pipeline was
# fitted on, in the same order. Extra numeric test columns are ignored, and a
# column missing from `test` fails here with a KeyError that names it.
prediction = pipe.predict(test[list(X_train.columns)])

In [48]:
# Build the submission table: one predicted target value per test-row `index`.
index = ['contest-tmp2m-14d__tmp2m', 'index']
target_name, id_name = index
prediction_df = pd.DataFrame({
    target_name: pd.Series(prediction),
    # Reset to a 0..n-1 index so the ids line up positionally with the predictions.
    id_name: test['index'].reset_index(drop=True),
})
prediction_df.head()

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
0,26.17343,375734
1,26.115324,375735
2,26.155679,375736
3,26.374494,375737
4,26.518237,375738


In [49]:
prediction_df.tail()

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
31349,5.048506,407083
31350,4.395699,407084
31351,3.879957,407085
31352,4.093185,407086
31353,4.240641,407087


In [53]:
def next_prediction_filename(directory='predictions'):
    """Return the next unused ``prediction<N>.csv`` file name for ``directory``.

    Scans ``directory`` for entries named exactly ``prediction<digits>.csv``
    and returns the name whose number is one above the highest found.

    Parameters
    ----------
    directory : str
        Folder holding earlier prediction files.

    Returns
    -------
    str
        ``'prediction1.csv'`` when the folder is missing, empty, or holds no
        matching file; otherwise ``'prediction<highest + 1>.csv'``.
    """
    prefix, suffix = 'prediction', '.csv'

    # os.listdir never returns None: it returns [] for an empty folder and
    # raises for a missing one, so a missing folder is treated as "no files yet".
    existing = os.listdir(directory) if os.path.isdir(directory) else []

    highest = 0
    for name in existing:
        if not (name.startswith(prefix) and name.endswith(suffix)):
            # Unrelated entries (.DS_Store, checkpoints, ...) must not break int().
            continue
        number = name[len(prefix):-len(suffix)]
        if number.isdecimal():
            highest = max(highest, int(number))

    return f'{prefix}{highest + 1}{suffix}'


file_name = next_prediction_filename('predictions')


In [54]:
# Display the file name chosen for the next prediction file.
file_name

'prediction1.csv'

In [None]:
# Writing the submission is currently commented out.
# NOTE(review): when re-enabling, the 'predictions' folder must already exist,
# and index=False is probably wanted so the DataFrame's row index is not
# written as an extra CSV column — confirm against the submission format.
# prediction_df.to_csv(f"predictions/{file_name}")