In [2]:
import pandas as pd
#import torch
import numpy as np
import matplotlib.pyplot as plt
#print(torch.__version__)
import seaborn as sns
color_pal = sns.color_palette()
import missingno as msno
from scipy.stats import norm 
from scipy import stats
import math

In [None]:
# Read the training CSV, parse 'startdate' into timestamps and use it as the index.
train = (
    pd.read_csv('/kaggle/input/widsdatathon2023/train_data.csv')
    .assign(startdate=lambda frame: pd.to_datetime(frame['startdate']))
    .set_index('startdate')
)
train.head()

In [None]:
# Show every column label present in the training frame.
train.columns

In [None]:
# Read the test CSV, parse 'startdate' into timestamps and use it as the index.
test = (
    pd.read_csv('/kaggle/input/widsdatathon2023/test_data.csv')
    .assign(startdate=lambda frame: pd.to_datetime(frame['startdate']))
    .set_index('startdate')
)
test.head()


In [None]:
# Dot plot of the temperature target against the frame's index.
tmp2m_series = train['contest-tmp2m-14d__tmp2m']
tmp2m_series.plot(
    style='.',
    figsize=(20, 5),
    color=color_pal[0],
    title='Temperature in °c',
)
plt.show()

In [None]:
# Dot plot of the precipitation column against the frame's index.
precip_series = train['contest-precip-14d__precip']
precip_series.plot(
    style='.',
    figsize=(20, 5),
    color=color_pal[0],
    title='precipitation',
)
plt.show()

## Feature creation

In [None]:
def create_features(df):
    """
    Return a new frame with calendar columns derived from the index of ``df``.

    The index must expose the ``hour``, ``month``, ``year`` and ``day``
    datetime attributes. Four columns are added (or overwritten if they
    already exist): ``hour``, ``month``, ``year`` and ``dayofmonth``.
    The input frame itself is not modified.
    """
    stamps = df.index
    # Only these four calendar fields are generated; day-of-week, quarter,
    # day-of-year and ISO-week columns are not produced.
    return df.assign(
        hour=stamps.hour,
        month=stamps.month,
        year=stamps.year,
        dayofmonth=stamps.day,
    )

In [None]:
# Add the calendar columns to the training frame and preview the first rows.
train = create_features(train)
train.head()

In [None]:
# Add the same calendar columns to the test frame and preview the first rows.
test = create_features(test)
test.head()

In [None]:
# Box plot of the temperature target grouped by day of month.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(
    x='dayofmonth',
    y='contest-tmp2m-14d__tmp2m',
    data=train,
    ax=ax,
)
ax.set_title('Temp by day')
plt.show()

In [None]:
# Box plot of the temperature target grouped by calendar month.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(
    x='month',
    y='contest-tmp2m-14d__tmp2m',
    data=train,
    palette='Blues',
    ax=ax,
)
ax.set_title('Temp by month')
plt.show()

In [None]:
# Box plot of the temperature target grouped by year.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(
    x='year',
    y='contest-tmp2m-14d__tmp2m',
    data=train,
    palette='Blues',
    ax=ax,
)
ax.set_title('Temp by year')
plt.show()

## Missing values

In [None]:
# Count missing entries per column and list the columns with the most gaps first.
missing_per_column = train.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Heatmap from missingno showing how the missingness of one column correlates
# with the missingness of the other columns of the training frame.
msno.heatmap(train, cmap="RdYlGn", figsize=(10,5), fontsize=10)

In [None]:
def fill_na(df):
    """
    Forward-fill missing values and return the filled result.

    Each gap takes the last valid value above it in the same column; gaps
    with no earlier valid value stay missing. The input is not modified.
    """
    return df.ffill()

# Forward-fill the training frame only; the matching call for the test frame is disabled below.
train = fill_na(train)
#test = fill_na(test)

In [None]:
# Distribution of the temperature target: density histogram + KDE with a
# fitted normal curve, a normal probability plot, and skewness / kurtosis.
# `sns.distplot` is deprecated, so the same picture is built from
# `sns.histplot` plus an explicit `norm.fit` overlay.
sns.set() # for style

# Drop missing values up front so the normal fit and probability plot are well defined.
target_values = train['contest-tmp2m-14d__tmp2m'].dropna()

# Distribution plot
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(target_values, kde=True, stat='density', ax=ax)
# Overlay the maximum-likelihood normal fit (what `fit=norm` used to draw).
mu, sigma = norm.fit(target_values)
grid = np.linspace(target_values.min(), target_values.max(), 200)
ax.plot(grid, norm.pdf(grid, mu, sigma), color='black', label='normal fit')
ax.legend()
ax.set_title("Histogram of contest-tmp2m-14d__tmp2m") # for histogram title

# probability plot
fig_qq, ax_qq = plt.subplots(figsize=(8, 5))
res = stats.probplot(target_values, plot=ax_qq)
plt.show()

# skewness and kurtosis
print("Skewness: %f" % target_values.skew())
print("Kurtosis: %f" % target_values.kurt())

## Categorical Data

In [None]:
from sklearn.preprocessing import LabelEncoder

def categorical_encode(train, test):
    """
    Label-encode the ``climateregions__climateregion`` column of both frames.

    One ``LabelEncoder`` is fitted on the training column and reused for the
    test column, so both frames share the same label-to-integer mapping.
    Both frames are modified in place and also returned as ``(train, test)``.
    """
    region_col = 'climateregions__climateregion'
    encoder = LabelEncoder()
    train[region_col] = encoder.fit_transform(train[region_col])
    # Reuse the mapping learned from the training column for the test column.
    test[region_col] = encoder.transform(test[region_col])
    return train, test

# Encode the climate-region column of both frames with one shared encoder.
train, test = categorical_encode(train,test)

## Outliers

In [None]:
def outliers(feat):
    """
    Cap values lying more than three standard deviations from the mean.

    Values above ``mean + 3 * std`` are replaced by that upper bound, values
    below ``mean - 3 * std`` by that lower bound; everything else is kept.
    ``feat`` must provide ``mean()`` and ``std()`` (e.g. a pandas Series).
    The result is returned as a NumPy array.
    """
    center = feat.mean()
    spread = 3 * feat.std()
    ceiling = center + spread
    floor = center - spread

    # Cap the low tail first, then cap the high tail on top of that result.
    floored = np.where(feat < floor, floor, feat)
    return np.where(feat > ceiling, ceiling, floored)

# Cap outliers in every column of the training frame, one column at a time.
# NOTE(review): this loop covers all columns, so the prediction target and the
# label-encoded region column are capped as well — confirm that is intended.
for col_name in train.columns:
    train[col_name] = outliers(train[col_name])

## Scale the data

In [None]:
# col_name = list(train.columns)
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()

# train_scaled = scaler.fit_transform(train)
# #X_test_minmax =scaler.transform(test)

# train_minmax = pd.DataFrame(train_scaled, columns=col_name)
# #X_test_minmax = pd.DataFrame(X_test_minmax, columns=col_name)

## Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

# Separate the temperature target from the feature columns.
target_col = 'contest-tmp2m-14d__tmp2m'
y = train[target_col]
X = train.drop(columns=[target_col])

# Unshuffled hold-out split: 30% of the rows are kept aside for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, shuffle=False)

In [None]:
# Report the shape of each split; every label names the variable it prints.
print('X_train shape:',X_train.shape)
print('y_train shape:',y_train.shape)
print('X_val shape:',X_val.shape)
print('y_val shape:',y_val.shape)

## Model

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
# Gradient-boosted tree regressor for the temperature target.
# 'reg:squarederror' is the current name of the squared-error objective;
# the former alias 'reg:linear' is deprecated in XGBoost.
reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                       n_estimators=80,
                       early_stopping_rounds=5,
                       objective='reg:squarederror',
                       max_depth=5,
                       learning_rate=0.02)
# Both splits are scored every round so their loss curves can be compared later.
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=100)

In [None]:
# Plot the training and validation losses  

# Plot the per-round RMSE recorded for the two evaluation sets passed to fit().
history = reg.evals_result()
train_rmse = history['validation_0']['rmse']
val_rmse = history['validation_1']['rmse']
rounds = range(len(train_rmse))

fig, ax = plt.subplots()
ax.plot(rounds, train_rmse, label='Train')
ax.plot(rounds, val_rmse, label='Validation')
ax.legend()
ax.set_ylabel('RMSE')
ax.set_title('XGBRegressor Loss')
plt.show()

In [None]:
# Predict the target for the test frame with the fitted regressor and print the raw values.
preds = reg.predict(test)
print('Predicted values', preds)

## Feature importance

In [None]:
import matplotlib.pyplot as plt
# Horizontal bar chart of the (at most) 50 largest feature importances,
# labelled with the corresponding feature column names.
importances = reg.feature_importances_
top_n = 50
top_idx = np.argsort(importances)[-top_n:]
bar_positions = range(len(top_idx))

fig, ax = plt.subplots(figsize=(8, 12))
ax.barh(bar_positions, importances[top_idx], align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(np.array(X_val.columns)[top_idx])
ax.set_title('Feature Importance')

## Submission

In [None]:
# Build the submission from the sample file and save it.
# /kaggle/input is mounted read-only, so the CSV is written to
# /kaggle/working, the notebook's writable output directory.
sub = pd.read_csv('/kaggle/input/widsdatathon2023/sample_solution.csv')
sub['contest-tmp2m-14d__tmp2m'] = preds
sub.to_csv('/kaggle/working/submission.csv' ,index = False)

In [None]:
# Display the submission frame as a final visual check.
sub