In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the containing directory and the bare file name into a usable path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Import Libraries

import pandas as pd
import numpy as np

#Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#plt.style.use('seaborn-dark')

#DateTime
import datetime as dt

#Models
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor

#Sklearn
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler

#Time to run Program
import time 

## Load Data

In [None]:
def load_data(data_dir = '/kaggle/input/widsdatathon2023'):
  '''
  Load the train, test and sample-submission CSV files.

  data_dir: directory containing `train_data.csv`, `test_data.csv` and
            `sample_solution.csv`. Defaults to the Kaggle input directory
            this notebook reads from, so `load_data()` behaves as before.

  returns: train, test, submission dataframes (in that order)
  '''

  def read(file_name):
    # os.path.join keeps the path valid whether or not data_dir ends with a slash.
    return pd.read_csv(os.path.join(data_dir, file_name))

  train = read('train_data.csv')
  test = read('test_data.csv')
  submission = read('sample_solution.csv')

  return train, test, submission

In [None]:
#Declare Target and Feature
# TARGET: name of the column the models are trained to predict.
TARGET = 'contest-tmp2m-14d__tmp2m'
# feature: column name(s) that join_df / split_df_and_get_features try to leave out of the model inputs.
# NOTE(review): the date column used later in this notebook is 'startdate', not 'date' - confirm this value.
feature = ['date']

In [None]:
# Read the three competition CSVs into the module-level frames used by the cells below.
train, test, submission = load_data()

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Preview the sample submission to see the layout the predictions are written into.
submission.head()

In [None]:
#RMSE
def rmse(start = 10000, stop = 11322, actual_col = None, forecast_col = 'forecast'):
  '''
  Print the RMSE between observed and forecast values stored in the
  global `train` dataframe.

  start, stop:   positional row slice [start, stop) to score. The defaults
                 cover rows 10000..11321, the range the forecasting cells
                 write into the 'forecast' column.
  actual_col:    column holding the observed values; defaults to TARGET.
  forecast_col:  column holding the predictions; defaults to 'forecast'.

  returns: None (the score is printed)
  '''
  if actual_col is None:
    actual_col = TARGET

  # Pick both series by column name. Selecting them by position (columns 0 and 2)
  # scores whatever happens to sit there instead of target vs. forecast.
  rows = train.iloc[start:stop]
  y = rows[actual_col]
  y_pred = rows[forecast_col]

  score = np.sqrt(mean_squared_error(y, y_pred))
  print(f"RMSE of Data is: {score}")

#Hackathon Metric
def predict(model, model_features):
  '''
  Print the RMSE of `model` on the global training and validation splits.

  model:           fitted estimator exposing `.predict`
  model_features:  selection applied to the global X_train / X_val before predicting

  Reads the globals X_train, X_val, y_train and y_val.
  '''
  # Predict both splits first, training split before validation split.
  pred_train, pred_val = (model.predict(part[model_features]) for part in (X_train, X_val))

  train_rmse = np.sqrt(mean_squared_error(y_train, pred_train))
  print(f"Train RMSE = {train_rmse}")

  # The second line is labelled "Test" but is computed on the validation split.
  val_rmse = np.sqrt(mean_squared_error(y_val, pred_val))
  print(f"Test RMSE = {val_rmse}")

def run_gradient_boosting(clf, fit_params, train, test, features, n_splits = 5):
  '''
  Fit `clf` with stratified K-fold cross validation and average its test predictions.

  The continuous target is cut into (up to) 10 quantile bins so that
  StratifiedKFold can stratify on it. Inside every fold the features are
  standardised with a scaler fitted on that fold's training rows only.

  clf:         regressor exposing `fit(X, y, eval_set=..., **fit_params)`,
               `predict` and `feature_importances_`
  fit_params:  extra keyword arguments forwarded to `clf.fit`
  train:       dataframe holding `features` and the TARGET column
  test:        dataframe holding `features`
  features:    list of column names used as model inputs
  n_splits:    number of folds (default 5)

  returns: oofs  - out-of-fold predictions, one per row of `train`
           preds - test predictions averaged over the folds
           fi    - mean importance per feature for the 20 most important
                   features (fewer if there are fewer), ordered for plotting

  Side effects: prints the RMSE per fold and overall, and draws a horizontal
  bar chart of `fi`.
  '''
  oofs = np.zeros(len(train))
  preds = np.zeros(len(test))

  target = train[TARGET]

  folds = StratifiedKFold(n_splits = n_splits)
  stratified_target = pd.qcut(target, 10, labels = False, duplicates='drop')

  # The test inputs are the same for every fold, so select them once.
  X_test_raw = test[features]

  # One importance frame per fold, concatenated once after the loop.
  fold_importances = []

  for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, stratified_target)):
    print(f'\n------------- Fold {fold_ + 1} -------------')

    ### Training Set
    X_trn, y_trn = train[features].iloc[trn_idx], target.iloc[trn_idx]

    ### Validation Set
    X_val, y_val = train[features].iloc[val_idx], target.iloc[val_idx]

    # Fit the scaler on the training rows only, then apply it to all three sets.
    scaler = StandardScaler()
    _ = scaler.fit(X_trn)

    X_trn = scaler.transform(X_trn)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test_raw)

    _ = clf.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], **fit_params)

    fold_importances.append(
        pd.DataFrame({'fold': fold_ + 1, 'feature': features, 'importance': clf.feature_importances_})
    )

    ### Regression: predict() returns the estimated target value directly.
    preds_val = clf.predict(X_val)
    preds_test = clf.predict(X_test)

    fold_score = metric(y_val, preds_val)
    print(f'\nRMSE score for validation set is {fold_score}')

    oofs[val_idx] = preds_val
    # Each fold contributes an equal share to the averaged test prediction.
    preds += preds_test / n_splits


  oofs_score = metric(target, oofs)
  print(f'\n\nRMSE for oofs is {oofs_score}')

  feature_importances = pd.concat(fold_importances, axis=0).reset_index(drop = True)
  # Top 20 by mean importance, reversed so the largest bar is drawn at the top.
  fi = feature_importances.groupby('feature')['importance'].mean().sort_values(ascending = False)[:20][::-1]
  fi.plot(kind = 'barh', figsize=(12, 6))

  return oofs, preds, fi

def metric(y_true, y_pred):
  '''
  Root mean squared error between `y_true` and `y_pred`.
  '''
  mse = mean_squared_error(y_true, y_pred)
  return np.sqrt(mse)

def download_preds(preds_test, file_name = 'hacklive_sub.csv'):
  '''
  Store `preds_test` in the global `submission` frame, save it as CSV and,
  when running on Google Colab, start a browser download of the file.

  preds_test:  predictions, one per row of `submission`
  file_name:   path of the CSV file to write

  Mutates the global `submission` (sets its 'prediction' column).
  '''

  ## 1. Setting the target column with our obtained predictions
  submission['prediction'] = preds_test

  ## 2. Saving our predictions to a csv file

  submission.to_csv(file_name, index = False)

  ## 3. Downloading and submitting the csv file
  try:
    from google.colab import files
  except ImportError:
    # Not running on Colab (e.g. Kaggle): the CSV is already on disk, so
    # report where it is instead of failing after the work is done.
    print(f"Saved predictions to {file_name}")
    return

  files.download(file_name)

#Download Submission File
def download(model, model_features, file_name = 'prophet.csv'):
  '''
  Predict with `model`, store the result in the global `submission` frame,
  save it as CSV and, when running on Google Colab, start a browser download.

  model:           fitted estimator exposing `.predict`
  model_features:  input passed straight to `model.predict`
  file_name:       path of the CSV file to write

  Mutates the global `submission` (sets its 'prediction' column).
  '''

  pred_test = model.predict(model_features)

  #Setting the target column with our obtained predictions
  submission['prediction'] = pred_test

  #Saving our predictions to a csv file
  submission.to_csv(file_name, index = False)

  #Downloading the csv file
  # `files` has to be imported here: nothing at module level defines it.
  try:
    from google.colab import files
  except ImportError:
    # Not running on Colab (e.g. Kaggle): the CSV is already on disk.
    print(f"Saved predictions to {file_name}")
    return

  files.download(file_name)

def join_df(train, test):
  '''
  Stack `train` on top of `test` and log-transform the target column.

  returns: df       - concatenated frame with a fresh 0..n-1 index and
                      TARGET replaced by log1p(TARGET)
           features - columns of df other than TARGET and the names listed
                      in the global `feature`
  '''

  df = pd.concat([train, test], axis=0).reset_index(drop = True)

  # `feature` is a list, so it must be unpacked: testing membership against
  # [feature, TARGET] compares each column name with the list object itself
  # and never excludes the names inside it.
  excluded = {TARGET, *feature}
  features = [c for c in df.columns if c not in excluded]

  # NOTE(review): log1p is NaN below -1 and -inf at -1; confirm the target
  # never goes that low, and that predictions are mapped back with expm1.
  df[TARGET] = np.log1p(df[TARGET])

  return df, features

def split_df_and_get_features(df, train_nrows):
  '''
  Split a combined frame back into its train and test parts.

  df:           frame whose first `train_nrows` rows are the training rows
  train_nrows:  number of training rows at the top of `df`

  returns: train, test - the two parts, each with a fresh 0..n-1 index
           features    - columns other than TARGET and the names listed in
                         the global `feature`
  '''

  train = df.iloc[:train_nrows].reset_index(drop = True)
  test = df.iloc[train_nrows:].reset_index(drop = True)

  # Unpack `feature` (a list) so its names are actually excluded; a membership
  # test against [feature, TARGET] only ever matches TARGET.
  excluded = {TARGET, *feature}
  features = [c for c in train.columns if c not in excluded]

  return train, test, features

## EDA and Data Preprocessing

In [None]:
#Combine Train and Test Dataframe
# join_df also replaces the target column with its log1p and returns a first-pass feature list.
df, features = join_df(train, test)

In [None]:
# Preview the combined train+test frame returned by join_df.
df.head()

### Data Details

In [None]:
# Row and column counts of the two raw frames.
print(f"train.shape: {train.shape}")
print(f"test.shape: {test.shape}")

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
#Check Datatypes: list the dtype pandas assigned to every training column
train.dtypes

### Null Values

In [None]:
# Count the missing values per column in each raw frame.
print(f"Train Null Value Count: {train.isnull().sum()}")
print(f"Test Null Value Count: {test.isnull().sum()}")

### Target Distribution

In [None]:
#Temperature Distribution
# Density plot of the raw target column.
train[TARGET].plot(kind = 'density', title = 'Temperature Distribution', fontsize=14, figsize=(10, 6))

In [None]:
#Log Temperature Distribution
# Density plot of log1p(target); assigning to `_` keeps the Axes repr out of the cell output.
# NOTE(review): log1p is NaN for values below -1 - confirm the target has no such values.
_ = pd.Series(np.log1p(train[TARGET])).plot(kind = 'density', title = 'Log Temperature Distribution', fontsize=14, figsize=(10, 6))

In [None]:
#Temperature Boxplot
# Horizontal box plot of the raw target column.
train[TARGET].plot(kind = 'box', vert=False, figsize=(12, 4), title = 'Temperature Boxplot', fontsize=14)

In [None]:
#Log Temperature BoxPlot
# Horizontal box plot of log1p(target).
# NOTE(review): log1p is NaN for values below -1 - confirm the target has no such values.
pd.Series(np.log1p(train[TARGET])).plot(kind = 'box', vert=False, figsize=(12, 4), title = 'Log Temperature Boxplot', fontsize=14)

## Date Feature

In [None]:
#Convert `startdate` column datatype to `datetime`
# Only the combined frame `df` is converted here; `train` and `test` are left as loaded.
df['startdate'] = pd.to_datetime(df['startdate'])

# Show the dtypes to confirm the conversion.
df.dtypes

In [None]:
# Print the null counts of `train` and `test` again. No cell since the previous
# null check has modified these two frames; the datetime conversion above touched only `df`.
print(f"Train Null Value Count: {train.isnull().sum()}")
print(f"Test Null Value Count: {test.isnull().sum()}")

In [None]:
#Make basic datetime features
df['year'] = df['startdate'].dt.year
df['month'] = df['startdate'].dt.month
df['week'] = df['startdate'].dt.isocalendar().week

#Get Train and Test sets from df
train, test, features = split_df_and_get_features(df, train.shape[0])

#Define the features
# `feature` is a list, so unpack it into the exclusion set; testing against
# [feature, TARGET] never matches the names inside the list.
excluded = {TARGET, *feature}
# Keep numeric columns only: the features are standardised with StandardScaler
# before fitting, which cannot handle the datetime `startdate` column or text columns.
features = [c for c in df.select_dtypes(include = 'number').columns if c not in excluded]
# Drop the first remaining column by position.
# NOTE(review): presumably a row-identifier column - confirm against the data.
features = features[1:]
features

In [None]:
# Preview the combined frame again, now including the year / month / week columns.
df.head()

## Model

In [None]:
#Declare Features and Target from Training Dataset
X = train[features]
y = train[TARGET]

#Split Training and Validation Datasets
# 70/30 random split with a fixed seed. X_train, X_val, y_train and y_val are the
# globals that predict() reads.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state = 42)

In [None]:
# Check that the feature matrix and the target have the same number of rows.
X.shape, y.shape

In [None]:
#LGBMRegressor
from lightgbm import early_stopping, log_evaluation

model = LGBMRegressor(n_estimators = 5000,
                        learning_rate = 0.01,
                        colsample_bytree = 0.76,
                        metric = 'None',
                        )

# Early stopping and periodic logging are passed as callbacks. The `verbose` and
# `early_stopping_rounds` keyword arguments of fit() were removed in LightGBM 4.0,
# while the callbacks work on both older (3.3+) and current releases.
fit_params = {
    'eval_metric': 'rmse',
    'callbacks': [
        early_stopping(stopping_rounds = 200),  # stop after 200 rounds without improvement
        log_evaluation(period = 300),           # log the validation metric every 300 rounds
    ],
}

lgb_oofs, lgb_preds, fi = run_gradient_boosting(clf = model, fit_params = fit_params, train = train, test = test, features = features)

## Time Series Forecasting using ARIMA and SARIMAX



### Preprocess Data

In [None]:
#Load Data
# Reload from disk: the `train` / `test` frames in memory were rebuilt from `df`
# above (log-transformed target, added date columns), and this section starts from the raw files.
train, test, submission = load_data()

In [None]:
#Convert `startdate` column to datetime
train.startdate = pd.to_datetime(train.startdate)

In [None]:
# Preview the reloaded training frame.
train.head()

In [None]:
# Summary statistics of the reloaded training frame.
train.describe()

### Visualize Data

In [None]:
# Plot the columns of `train` against the row index on a single set of axes.
train.plot(figsize = (20, 10))

### Make Data Stationary

In [None]:
#Import adfuller test
from statsmodels.tsa.stattools import adfuller

In [None]:
#H0: It is not stationary
#H1: It is stationary

def adfuller_test(temp):
    """
    Run `adfuller` on `temp` and print the first four result values
    followed by a verdict based on the p-value.

    temp: the series to test

    H0: the series is not stationary; H1: it is stationary.
    The verdict rejects H0 when the p-value is at most 0.05.
    """
    result = adfuller(temp)

    # Names for the leading entries of the result tuple, in order.
    names = ('ADF Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used')
    for name, stat in zip(names, result):
        print(' : '.join((name, str(stat))))

    p_value = result[1]
    verdict = (
        "strong evidence against the null hypothesis(Ho), reject the null hypothesis. Data has no unit root and is stationary"
        if p_value <= 0.05
        else "weak evidence against null hypothesis, time series has a unit root, indicating it is non-stationary "
    )
    print(verdict)

In [None]:
# Test the raw target series for stationarity.
adfuller_test(train['contest-tmp2m-14d__tmp2m'])

In [None]:
# Difference the target against the value 12 rows earlier.
# NOTE(review): shift(12) moves by 12 rows, which equals one year only if each row is one month - confirm the row frequency.
train['Seasonal First Difference']=train['contest-tmp2m-14d__tmp2m']-train['contest-tmp2m-14d__tmp2m'].shift(12) #Because 1 year has 12 months

## Repeat the Dickey-Fuller test on the differenced series (the leading NaN rows from the shift are dropped)
adfuller_test(train['Seasonal First Difference'].dropna())

# Plot the differenced series.
train['Seasonal First Difference'].plot()

In [None]:
from statsmodels.graphics.tsaplots import plot_pacf ,plot_acf

In [None]:
# Two stacked panels: partial autocorrelation on top, autocorrelation below,
# both for the differenced series from row 13 onwards and up to 40 lags.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize = (12, 8))
fig = plot_pacf(train['Seasonal First Difference'].iloc[13:], lags = 40, ax = ax1)
# Assign the return value so the figure is not echoed a second time as the cell result.
fig = plot_acf(train['Seasonal First Difference'].iloc[13:], lags = 40, ax = ax2)

In [None]:
# Import ARIMA in this cell: `sm` is only imported in a later cell, and the
# `sm.tsa.statespace` namespace does not provide an ARIMA class.
from statsmodels.tsa.arima.model import ARIMA

# ARIMA with order (p, d, q) = (2, 0, 2) on the raw target series.
model = ARIMA(train['contest-tmp2m-14d__tmp2m'], order=(2, 0, 2))
model_fit = model.fit()

# A bare expression in the middle of a cell is not displayed, so print the summary.
print(model_fit.summary())

# Dynamic forecast over rows 10000..11321, the same rows rmse() scores.
train['forecast'] = model_fit.predict(start=10000, end=11321, dynamic=True)
train[['contest-tmp2m-14d__tmp2m', 'forecast']].plot(figsize=(12, 8))

In [None]:
import statsmodels.api as sm

# Time the fit of a SARIMAX model with order (2, 1, 2) and seasonal order (2, 1, 2, 12).
begin = time.time()

model = sm.tsa.statespace.SARIMAX(
    train['contest-tmp2m-14d__tmp2m'],
    order = (2, 1, 2),
    seasonal_order = (2, 1, 2, 12),
)
results = model.fit()

end = time.time()
print(f"\n\nTime of execution = {end - begin}")

# Dynamic forecast over rows 10000..11321, stored next to the target for plotting.
train['forecast'] = results.predict(start = 10000, end = 11321, dynamic = True)
train[['contest-tmp2m-14d__tmp2m', 'forecast']].plot(figsize = (12, 8))

# Print the RMSE of the forecast.
rmse()

In [None]:
# ignore_index gives the combined frame one continuous 0..n-1 index. Without it
# both source frames keep their own labels, so a label such as 11322 can refer to
# a train row and a test row at once and the forecast is assigned to the wrong rows.
df = pd.concat([train, test], ignore_index = True)

# Forecast positions 11322..14883 and align them with the combined frame by index label.
# NOTE(review): the range assumes the test rows start at position 11322 - confirm against len(train).
df['forecast'] = results.predict(start = 11322, end = 14883, dynamic= True)
df[['contest-tmp2m-14d__tmp2m', 'forecast']].plot(figsize=(12, 8))

In [None]:
# Print the RMSE again; it is computed from `train`, which the previous cell did not modify.
rmse()