# Import Dependencies

In [1]:
# Basics
import pandas as pd #DataFrame
import numpy as np #Linear Algebra
import scipy as sp #Scientific Computing
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

#Model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from xgboost import XGBRegressor

#metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

#pipeline
from sklearn.pipeline import Pipeline

#os
import os

# Load Data

In [4]:
# Read the training and test tables from the local new_data folder
# (train first, then test).
train, test = (
    pd.read_csv(f'new_data/{split}.csv') for split in ('train', 'test')
)

In [5]:
# Preview the first five training rows (5 is the default for head()).
train.head(n=5)

Unnamed: 0,index,contest-tmp2m-14d__tmp2m,nmme0-tmp2m-34w__nmme0mean,nmme0-tmp2m-34w__nasa0,wind-uwnd-250-2010-1,wind-uwnd-925-2010-1,nasa0,nmme0mean,nmme-tmp2m-34w__nmmemean,nmme-tmp2m-34w__nasa,...,wind-hgt-850-2010-7,icec-2010-2,contest-wind-vwnd-250-14d__wind-vwnd-250,contest-wind-h850-14d__wind-hgt-850,wind-uwnd-250-2010-18,wind-hgt-500-2010-9,sst-2010-3,wind-vwnd-250-2010-7,wind-hgt-10-2010-4,wind-hgt-10-2010-5
0,0,28.74448,30.46,29.66,628.66,143.64,29.51,30.65,28.17,28.55,...,77.17,0.97,-3.52,1535.52,35.85,1549.2,-19.69,-125.92,-4560.34,7128.13
1,1,28.370585,30.46,29.66,615.58,143.41,29.51,30.65,28.17,28.55,...,77.48,0.97,-4.49,1538.0,28.76,1667.04,-20.66,-109.57,-5318.37,8109.37
2,2,28.133059,30.46,29.66,602.14,145.35,29.51,30.65,28.17,28.55,...,116.42,0.97,-5.44,1540.32,24.45,1743.11,-21.34,-88.66,-5947.6,9137.52
3,3,28.256798,30.46,29.66,589.63,146.9,29.51,30.65,28.17,28.55,...,183.76,0.97,-5.76,1541.1,20.95,1746.57,-21.92,-77.34,-6369.78,10010.87
4,4,28.372353,30.46,29.66,576.23,148.67,29.51,30.65,28.17,28.55,...,264.03,0.97,-6.09,1539.73,11.62,1698.71,-22.54,-78.18,-6673.71,10458.26


In [6]:
# Preview the first five test rows (5 is the default for head()).
test.head(n=5)

Unnamed: 0,index,nmme0-tmp2m-34w__nmme0mean,nmme0-tmp2m-34w__nasa0,wind-uwnd-250-2010-1,wind-uwnd-925-2010-1,nasa0,nmme0mean,nmme-tmp2m-34w__nmmemean,nmme-tmp2m-34w__nasa,nmme-tmp2m-34w__gfdlflorb,...,wind-hgt-850-2010-7,icec-2010-2,contest-wind-vwnd-250-14d__wind-vwnd-250,contest-wind-h850-14d__wind-hgt-850,wind-uwnd-250-2010-18,wind-hgt-500-2010-9,sst-2010-3,wind-vwnd-250-2010-7,wind-hgt-10-2010-4,wind-hgt-10-2010-5
0,375734,30.37,29.39,675.14,162.81,29.28,30.77,28.06,27.94,27.83,...,-1064.16,0.97,-4.82,1549.82,142.32,-529.25,-19.05,-211.71,-5435.01,7408.07
1,375735,30.37,29.39,671.74,161.69,29.28,30.77,28.06,27.94,27.83,...,-828.01,0.97,-4.34,1550.55,140.27,-438.92,-19.2,-216.37,-5446.56,6913.6
2,375736,30.37,29.39,666.01,159.32,29.28,30.77,28.06,27.94,27.83,...,-601.71,0.97,-3.62,1550.16,137.79,-325.51,-19.35,-204.15,-5740.4,6445.81
3,375737,30.37,29.39,660.21,157.58,29.28,30.77,28.06,27.94,27.83,...,-490.6,0.97,-2.8,1550.15,134.52,-274.45,-19.38,-189.01,-6151.83,5895.94
4,375738,30.37,29.39,654.84,154.62,29.28,30.77,28.06,27.94,27.83,...,-396.34,0.97,-2.59,1551.19,126.65,-279.28,-19.27,-182.05,-6527.22,5383.52


# Split into Train and Validation Data

In [7]:
# Features: every column of the training table except the target.
# NOTE(review): X keeps the 'Unnamed: 0' and 'index' identifier columns, so they
# are passed to the model as features — confirm this is intended.
X = train.drop(columns=['contest-tmp2m-14d__tmp2m'])
# Target: the column the model is trained to predict.
y = train.loc[:, 'contest-tmp2m-14d__tmp2m']

In [8]:
# Preview the feature table to confirm the target column is gone.
X.head(n=5)

Unnamed: 0,index,nmme0-tmp2m-34w__nmme0mean,nmme0-tmp2m-34w__nasa0,wind-uwnd-250-2010-1,wind-uwnd-925-2010-1,nasa0,nmme0mean,nmme-tmp2m-34w__nmmemean,nmme-tmp2m-34w__nasa,nmme-tmp2m-34w__gfdlflorb,...,wind-hgt-850-2010-7,icec-2010-2,contest-wind-vwnd-250-14d__wind-vwnd-250,contest-wind-h850-14d__wind-hgt-850,wind-uwnd-250-2010-18,wind-hgt-500-2010-9,sst-2010-3,wind-vwnd-250-2010-7,wind-hgt-10-2010-4,wind-hgt-10-2010-5
0,0,30.46,29.66,628.66,143.64,29.51,30.65,28.17,28.55,28.3,...,77.17,0.97,-3.52,1535.52,35.85,1549.2,-19.69,-125.92,-4560.34,7128.13
1,1,30.46,29.66,615.58,143.41,29.51,30.65,28.17,28.55,28.3,...,77.48,0.97,-4.49,1538.0,28.76,1667.04,-20.66,-109.57,-5318.37,8109.37
2,2,30.46,29.66,602.14,145.35,29.51,30.65,28.17,28.55,28.3,...,116.42,0.97,-5.44,1540.32,24.45,1743.11,-21.34,-88.66,-5947.6,9137.52
3,3,30.46,29.66,589.63,146.9,29.51,30.65,28.17,28.55,28.3,...,183.76,0.97,-5.76,1541.1,20.95,1746.57,-21.92,-77.34,-6369.78,10010.87
4,4,30.46,29.66,576.23,148.67,29.51,30.65,28.17,28.55,28.3,...,264.03,0.97,-6.09,1539.73,11.62,1698.71,-22.54,-78.18,-6673.71,10458.26


In [9]:
# Preview the first five target values.
y.head(n=5)

0    28.744480
1    28.370585
2    28.133059
3    28.256798
4    28.372353
Name: contest-tmp2m-14d__tmp2m, dtype: float64

In [10]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

# Pipeline

In [11]:
# Two-stage pipeline: standardise the features, then fit an XGBoost regressor
# with its default hyper-parameters.
# The second step is named 'classifier' although it wraps a regressor; the name
# is kept because it is the prefix for any `classifier__*` parameter lookups.
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classifier', XGBRegressor()),
    ]
)


# Baseline Model

In [12]:
# Fit the pipeline on the training split, then predict the held-out split.
# Pipeline.fit returns the fitted pipeline itself, so the calls can be chained.
y_hat = pipe.fit(X_train, y_train).predict(X_test)

### Accuracy

In [13]:
# Column labels for the prediction table: predicted target first, row id second.
index = ['contest-tmp2m-14d__tmp2m', 'index']
# Pair each validation prediction with the 'index' value of the row it belongs to.
# reset_index(drop=True) aligns the ids positionally with the prediction Series.
prediction_df = pd.concat(
    [pd.Series(y_hat), X_test['index'].reset_index(drop=True)],
    axis=1,
).set_axis(index, axis=1)
prediction_df

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
0,19.443832,329308
1,13.300938,295719
2,3.329164,300581
3,4.362996,16207
4,13.106089,23915
...,...,...
75142,12.675848,129579
75143,14.128506,68410
75144,5.357001,224253
75145,25.906443,60229


In [24]:
def accuracy(y_test,y_hat):
    """Print regression metrics for a set of predictions and return them.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_hat : array-like
        Predicted target values, aligned with ``y_test``.

    Returns
    -------
    dict
        The computed scores keyed by ``'r2'``, ``'mse'``, ``'mae'`` and
        ``'rmse'``, so callers can keep the numbers instead of only
        reading the printed text.
    """
    acc = r2_score(y_test, y_hat)
    print("R2 Accuracy:", acc)
    accMSE = mean_squared_error(y_test, y_hat)
    print("MSE Accuracy:", accMSE)
    accMAE = mean_absolute_error(y_test, y_hat)
    print("MAE Accuracy:", accMAE)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
    # and removed in 1.6. The square root of the MSE gives the same RMSE for a
    # single target column (as used in this notebook) on every version.
    accRMSE = np.sqrt(accMSE)
    print("RMSE Accuracy:", accRMSE)
    return {"r2": acc, "mse": accMSE, "mae": accMAE, "rmse": accRMSE}

In [25]:
# Report the validation metrics of the baseline model.
acc = accuracy(y_test=y_test, y_hat=y_hat)

R2 Accuracy: 0.9926440040830077
MSE Accuracy: 0.7171846788863872
MAE Accuracy: 0.6565045211178339
RMSE Accuracy: 0.8468675686826053


# Submission

In [16]:
# Run the fitted pipeline on the competition test table.
prediction = pipe.predict(X=test)

In [23]:
# Column labels for the submission table: predicted target first, row id second.
index = ['contest-tmp2m-14d__tmp2m', 'index']
# Pair each test prediction with the 'index' value of its row.
# reset_index(drop=True) aligns the ids positionally with the prediction Series.
prediction_df = pd.concat(
    [pd.Series(prediction), test['index'].reset_index(drop=True)],
    axis=1,
).set_axis(index, axis=1)
prediction_df.head(n=5)

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
0,28.230339,375734
1,28.255669,375735
2,28.236938,375736
3,28.63356,375737
4,29.178391,375738


In [26]:
# True only when the submission has exactly the hard-coded number of rows.
check = len(prediction_df) == 31354

In [29]:
def next_prediction_name(existing_names, prefix='prediction', suffix='.csv'):
    """Return the next unused ``<prefix><N><suffix>`` file name.

    Parameters
    ----------
    existing_names : iterable of str
        File names already present in the output folder.
    prefix, suffix : str
        Fixed parts of the name surrounding the running number.

    Returns
    -------
    str
        ``<prefix><highest N found + 1><suffix>``, or ``<prefix>1<suffix>``
        when no existing name matches the pattern.
    """
    highest = 0
    for name in existing_names:
        # Skip anything that is not a numbered prediction file (e.g.
        # '.ipynb_checkpoints' or '.DS_Store'); calling int() on such names
        # would raise ValueError.
        if not (name.startswith(prefix) and name.endswith(suffix)):
            continue
        number = name[len(prefix):len(name) - len(suffix)]
        if number.isdecimal():
            highest = max(highest, int(number))
    return f'{prefix}{highest + 1}{suffix}'


# Create the output folder if it is missing; otherwise os.listdir would raise
# FileNotFoundError.
os.makedirs('predictions', exist_ok=True)
# os.listdir always returns a list (possibly empty), never None, so no
# separate "no files yet" branch is needed.
files = os.listdir('predictions')
file_name = next_prediction_name(files)


In [30]:
# Show the file name chosen for this submission.
file_name

'prediction1.csv'

In [32]:
# Write the submission only when the row-count check passed; otherwise report
# the mismatch and write nothing.
if not check:
    print('Shape does not match requirements')
else:
    prediction_df.to_csv(f"predictions/{file_name}", index=False)