In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from statsmodels.tools.eval_measures import rmse, aic
from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.stattools import grangercausalitytests

# Read the Dera Ghazi Khan CSV and discard the separate Year/Month columns in
# the same step.
data = pd.read_csv(
    '/content/drive/MyDrive/FYP Data/Main/Dera Ghazi Khan CSV.csv'
).drop(columns=['Year', 'Month'])

data

Unnamed: 0,M_Date,Maximum Temperature,Minimum Temperature,Precipitation,Soil Moisture,Wind Speed,Runoff,Flood
0,1/1/1958,22.95,7.13,5.8,1.8,0.81,0.3,False
1,2/1/1958,24.60,8.79,1.6,1.8,1.41,0.1,False
2,3/1/1958,31.00,14.77,1.4,1.7,1.81,0.1,False
3,4/1/1958,38.57,22.08,0.3,1.6,2.10,0.0,False
4,5/1/1958,40.25,23.32,1.4,1.6,2.65,0.1,False
...,...,...,...,...,...,...,...,...
773,6/1/2022,34.40,18.00,110.0,0.1,2.00,0.9,True
774,7/1/2022,32.70,14.80,184.0,0.2,1.20,4.7,True
775,8/1/2022,30.60,13.10,155.0,0.2,0.90,2.9,True
776,9/1/2022,31.20,16.20,2.0,0.1,0.60,2.1,False


In [None]:
# Re-index the frame by the parsed M_Date timestamps (month/day/year text) and
# drop the now redundant text column.
data = data.set_index(
    pd.to_datetime(data['M_Date'], format='%m/%d/%Y')
).drop(columns=['M_Date'])

# Replace the Flood labels with the integer codes produced by LabelEncoder.
le = LabelEncoder()
data['Flood'] = le.fit_transform(data['Flood'].values)

data

Unnamed: 0_level_0,Maximum Temperature,Minimum Temperature,Precipitation,Soil Moisture,Wind Speed,Runoff,Flood
M_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1958-01-01,22.95,7.13,5.8,1.8,0.81,0.3,0
1958-02-01,24.60,8.79,1.6,1.8,1.41,0.1,0
1958-03-01,31.00,14.77,1.4,1.7,1.81,0.1,0
1958-04-01,38.57,22.08,0.3,1.6,2.10,0.0,0
1958-05-01,40.25,23.32,1.4,1.6,2.65,0.1,0
...,...,...,...,...,...,...,...
2022-06-01,34.40,18.00,110.0,0.1,2.00,0.9,1
2022-07-01,32.70,14.80,184.0,0.2,1.20,4.7,1
2022-08-01,30.60,13.10,155.0,0.2,0.90,2.9,1
2022-09-01,31.20,16.20,2.0,0.1,0.60,2.1,0


In [None]:
# Default lag depth for the Granger-causality tests in this cell.
maxlag = 12

# Name of the statsmodels test whose p-value is reported by default.
test = 'ssr_chi2test'


def grangers_causation_matrix(data, variables, test='ssr_chi2test', verbose=False, maxlag=maxlag):
    """Build a matrix of minimum Granger-causality p-values for every variable pair.

    For each ordered pair (row r, column c) the function runs
    ``grangercausalitytests(data[[r, c]], ...)`` for lags 1..``maxlag`` and
    stores the smallest p-value (rounded to 4 decimals) of the selected test.
    statsmodels tests whether the series in the SECOND column Granger-causes
    the series in the FIRST one, so a cell answers "does column (``_x``) help
    predict row (``_y``)?".

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding one column per series named in ``variables``.
    variables : iterable
        Column labels to test against each other.
    test : str, default 'ssr_chi2test'
        Key of the statsmodels test result to read the p-value from.
    verbose : bool, default False
        When True, print the per-lag p-values of every pair.
    maxlag : int, default taken from the cell-level ``maxlag`` constant
        Highest lag order to test. Previously the function read the
        module-level ``maxlag`` implicitly; it is now an explicit,
        backward-compatible keyword so the function no longer depends on
        hidden kernel state.

    Returns
    -------
    pandas.DataFrame
        Square frame whose index is suffixed with ``_y`` (responses) and whose
        columns are suffixed with ``_x`` (predictors).

    Notes
    -----
    The diagonal tests a series against itself and carries no information.
    NOTE(review): taking the minimum p-value over several lags is optimistic
    (no multiple-testing correction) - treat small values as a screening hint.
    """
    variables = list(variables)
    matrix = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
    for c in matrix.columns:
        for r in matrix.index:
            test_result = grangercausalitytests(data[[r, c]], maxlag=maxlag, verbose=False)
            # Results are keyed by lag order, starting at 1.
            p_values = [round(test_result[lag][0][test][1], 4) for lag in range(1, maxlag + 1)]
            if verbose: print(f'Y = {r}, X = {c}, P Values = {p_values}')
            matrix.loc[r, c] = np.min(p_values)
    matrix.columns = [f'{var}_x' for var in variables]
    matrix.index = [f'{var}_y' for var in variables]
    return matrix

# Rows (suffix _y) are the response series and columns (suffix _x) the
# candidate predictors; each cell holds the smallest p-value found across the
# tested lags.
grangers_causation_matrix(data, variables = data.columns)

Unnamed: 0,Maximum Temperature_x,Minimum Temperature_x,Precipitation_x,Soil Moisture_x,Wind Speed_x,Runoff_x,Flood_x
Maximum Temperature_y,1.0,0.0,0.0,0.0617,0.0,0.0,0.0
Minimum Temperature_y,0.0,1.0,0.0,0.0861,0.0,0.0,0.0138
Precipitation_y,0.0,0.0,1.0,0.0003,0.0,0.0,0.0
Soil Moisture_y,0.162,0.0761,0.0115,1.0,0.0558,0.0072,0.0
Wind Speed_y,0.0,0.0,0.0,0.0064,1.0,0.0,0.0018
Runoff_y,0.0,0.0,0.0019,0.0009,0.0,1.0,0.0
Flood_y,0.0001,0.0114,0.0001,0.4428,0.0028,0.002,1.0


In [None]:
from statsmodels.tsa.vector_ar.vecm import coint_johansen

def cointegration_test(data, alpha=0.05):
    """Print Johansen trace statistics next to their critical values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame of the series passed to ``coint_johansen``; its column names
        label the printed rows.
    alpha : float, default 0.05
        Significance level. Only 0.10, 0.05 and 0.01 are available, matching
        the three critical-value columns (90%, 95%, 99%).

    Raises
    ------
    ValueError
        If ``alpha`` is not one of the supported levels. (The previous lookup
        used ``str(1 - alpha)``, which produced ``'0.9'`` for ``alpha=0.1``
        and failed with a ``KeyError`` even though 90% is supported.)

    Notes
    -----
    Nothing is returned; the table is written to stdout.
    NOTE(review): statsmodels documents ``lr1`` as the trace statistics, which
    are ordered by hypothesised cointegration rank rather than by variable.
    Pairing them with column names keeps the original layout but should not be
    read as a per-variable result - confirm before interpreting.
    """
    # Map the confidence level to its column in the critical-value table.
    # Rounding removes floating-point noise from ``1 - alpha``.
    level = round(1 - alpha, 10)
    cv_column = {0.9: 0, 0.95: 1, 0.99: 2}
    if level not in cv_column:
        raise ValueError(f'alpha must be one of 0.10, 0.05 or 0.01, got {alpha!r}')

    # det_order=-1 and k_ar_diff=5 are kept exactly as in the original call.
    out = coint_johansen(data,-1,5)
    traces = out.lr1
    cvts = out.cvt[:, cv_column[level]]
    def adjust(val, length= 6): return str(val).ljust(length)

    # The header now reflects the requested level instead of always saying 95%.
    print(f'Name   ::  Test Stat > C({level:.0%})    =>   Signif  \n', '--'*20)
    for col, trace, cvt in zip(data.columns, traces, cvts):
        print(adjust(col), ':: ', adjust(round(trace,2), 9), ">", adjust(cvt, 8), ' =>  ' , trace > cvt)

# Run the Johansen check on the whole frame at the default 5% level.
cointegration_test(data)

Name   ::  Test Stat > C(95%)    =>   Signif  
 ----------------------------------------
Maximum Temperature ::  924.98    > 111.7797  =>   True
Minimum Temperature ::  524.95    > 83.9383   =>   True
Precipitation ::  296.33    > 60.0627   =>   True
Soil Moisture ::  157.96    > 40.1749   =>   True
Wind Speed ::  47.07     > 24.2761   =>   True
Runoff ::  12.52     > 12.3212   =>   True
Flood  ::  0.33      > 4.1296    =>   False


In [None]:
# Hold out the final `nobs` rows for evaluation; every earlier row is used for
# fitting.
nobs = 150
df_train = data.iloc[:-nobs]
df_test = data.iloc[-nobs:]

In [None]:
def adfuller_test(series, signif=0.05, name='', verbose=False):
    """Run an Augmented Dickey-Fuller test on `series` and print a short report.

    The report lists the significance level, the test statistic, the number of
    lags chosen (lag selection uses autolag='AIC'), every critical value and a
    verdict obtained by comparing the p-value, rounded to 4 decimals, with
    `signif`. `name` only labels the report and `verbose` is accepted but not
    used. Nothing is returned.
    """
    result = adfuller(series, autolag='AIC')
    test_statistic = round(result[0], 4)
    p_value = round(result[1], 4)
    n_lags = round(result[2], 4)

    print(f'    Augmented Dickey-Fuller Test on "{name}"', "\n   ", '-'*47)
    print(' Null Hypothesis: Data has unit root. Non-Stationary.')
    print(f' Significance Level    = {signif}')
    print(f' Test Statistic        = {test_statistic}')
    print(f' No. Lags Chosen       = {n_lags}')

    # Critical values are keyed by their percentage label; pad the label to
    # six characters so the '=' signs line up.
    for level, threshold in result[4].items():
        print(f' Critical value {str(level).ljust(6)} = {round(threshold, 3)}')

    rejects_null = p_value <= signif
    if rejects_null:
        verdict = [
            f" => P-Value = {p_value}. Rejecting Null Hypothesis.",
            " => Series is Stationary.",
        ]
    else:
        verdict = [
            f" => P-Value = {p_value}. Weak evidence to reject the Null Hypothesis.",
            " => Series is Non-Stationary.",
        ]
    for line in verdict:
        print(line)

# Run the ADF report for every training series. DataFrame.iteritems() was
# removed in pandas 2.0; items() yields the same (label, Series) pairs and is
# available in older versions too.
for name, column in df_train.items():
    adfuller_test(column, name=name)
    print('\n')

    Augmented Dickey-Fuller Test on "Maximum Temperature" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -5.2208
 No. Lags Chosen       = 18
 Critical value 1%     = -3.441
 Critical value 5%     = -2.866
 Critical value 10%    = -2.569
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "Minimum Temperature" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -5.0595
 No. Lags Chosen       = 15
 Critical value 1%     = -3.441
 Critical value 5%     = -2.866
 Critical value 10%    = -2.569
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "Precipitation" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-S

In [None]:
# Display the training portion of the split.
df_train

Unnamed: 0_level_0,Maximum Temperature,Minimum Temperature,Precipitation,Soil Moisture,Wind Speed,Runoff,Flood
M_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1958-01-01,22.95,7.13,5.8,1.8,0.81,0.3,0
1958-02-01,24.60,8.79,1.6,1.8,1.41,0.1,0
1958-03-01,31.00,14.77,1.4,1.7,1.81,0.1,0
1958-04-01,38.57,22.08,0.3,1.6,2.10,0.0,0
1958-05-01,40.25,23.32,1.4,1.6,2.65,0.1,0
...,...,...,...,...,...,...,...
2009-12-01,23.27,5.57,2.6,0.1,0.42,0.1,0
2010-01-01,20.88,4.82,2.6,0.1,0.53,0.1,0
2010-02-01,24.41,9.20,1.2,0.1,1.26,0.1,0
2010-03-01,32.69,16.95,12.9,0.1,1.07,0.6,0


In [None]:
# Build a VAR model on the training rows and tabulate the information criteria
# (AIC, BIC, FPE, HQIC) for candidate lag orders up to 12.
model = VAR(df_train)

x = model.select_order(maxlags=12)
x.summary()



0,1,2,3,4
,AIC,BIC,FPE,HQIC
0.0,-4.473,-4.423,0.01141,-4.453
1.0,-12.93,-12.52,2.433e-06,-12.77
2.0,-14.94,-14.19,3.233e-07,-14.65
3.0,-15.52,-14.41*,1.827e-07,-15.09
4.0,-15.72,-14.26,1.489e-07,-15.15
5.0,-15.94,-14.13,1.196e-07,-15.24*
6.0,-16.02,-13.86,1.108e-07,-15.18
7.0,-16.03,-13.52,1.097e-07,-15.05
8.0,-16.00,-13.13,1.130e-07,-14.89


In [None]:
# Fit the VAR with 6 lags and show the regression summary.
# NOTE(review): the lag order is hard-coded; the order-selection table rendered
# in this notebook stars lag 3 for BIC and lag 5 for HQIC - confirm why 6 was
# chosen.
model_fitted = model.fit(6)
model_fitted.summary()

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Wed, 04, Jan, 2023
Time:                     03:32:46
--------------------------------------------------------------------
No. of Equations:         7.00000    BIC:                   -13.7948
Nobs:                     622.000    HQIC:                  -15.1063
Log likelihood:          -919.714    FPE:                1.19677e-07
AIC:                     -15.9400    Det(Omega_mle):     7.49537e-08
--------------------------------------------------------------------
Results for equation Maximum Temperature
                            coefficient       std. error           t-stat            prob
-----------------------------------------------------------------------------------------
const                         27.052114         2.602281           10.396           0.000
L1.Maximum Temperature         0.267743         0.059289            4.516           0.000
L1.Minimum

In [None]:
from statsmodels.stats.stattools import durbin_watson
# Durbin-Watson statistic computed from the fitted model's residuals, printed
# next to the column names of `data` (rounded to 2 decimals).
out = durbin_watson(model_fitted.resid)

for col, val in zip(data.columns, out):
    print((col), ':', round(val, 2))

Maximum Temperature : 2.02
Minimum Temperature : 2.11
Precipitation : 2.03
Soil Moisture : 1.99
Wind Speed : 2.01
Runoff : 2.03
Flood : 2.0


In [None]:
# Read the lag order back from the fitted model and take the last `lag_order`
# training rows as a plain array; they are the starting values handed to the
# forecast call.
lag_order = model_fitted.k_ar
print(lag_order)

forecast_input = df_train.values[-lag_order:]
forecast_input

6


array([[27.94,  9.83,  0.  ,  0.1 ,  0.56,  0.  ,  0.  ],
       [23.27,  5.57,  2.6 ,  0.1 ,  0.42,  0.1 ,  0.  ],
       [20.88,  4.82,  2.6 ,  0.1 ,  0.53,  0.1 ,  0.  ],
       [24.41,  9.2 ,  1.2 ,  0.1 ,  1.26,  0.1 ,  0.  ],
       [32.69, 16.95, 12.9 ,  0.1 ,  1.07,  0.6 ,  0.  ],
       [38.65, 22.24,  3.8 ,  0.1 ,  1.49,  0.2 ,  0.  ]])

In [None]:
# Forecast `nobs` steps ahead and label the rows with the last `nobs` dates of
# `data` (the held-out period).
# NOTE(review): the '_2d' suffix on the column names looks like a leftover - no
# differencing step is visible in the cells shown. Confirm before renaming,
# because the accuracy cell looks the columns up by these names.
fc = model_fitted.forecast(y=forecast_input, steps=nobs)
df_forecast = pd.DataFrame(fc, index=data.index[-nobs:], columns=data.columns + '_2d')
df_forecast

Unnamed: 0_level_0,Maximum Temperature_2d,Minimum Temperature_2d,Precipitation_2d,Soil Moisture_2d,Wind Speed_2d,Runoff_2d,Flood_2d
M_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-05-01,41.267502,25.838463,16.510871,0.098430,1.760385,0.825699,0.035127
2010-06-01,41.231044,27.871749,26.682653,0.096295,1.708170,1.339366,0.016802
2010-07-01,39.276425,27.941333,38.019433,0.099175,1.408654,1.896452,0.045297
2010-08-01,37.015679,25.670226,40.926531,0.096958,0.986333,2.043201,0.064940
2010-09-01,34.998399,21.114320,18.057010,0.098188,0.753984,0.902128,0.040055
...,...,...,...,...,...,...,...
2022-06-01,39.294024,25.769659,24.663235,0.135863,1.839237,1.231582,0.033351
2022-07-01,38.700276,25.242200,25.354640,0.135880,1.658710,1.265672,0.039855
2022-08-01,36.512104,22.624293,23.087669,0.135910,1.415804,1.151698,0.039979
2022-09-01,33.290045,18.608837,18.626289,0.135977,1.175757,0.928039,0.033761


In [None]:
from statsmodels.tsa.stattools import acf
def forecast_accuracy(forecast, actual):
    """Return a dict of point-forecast error metrics.

    Parameters
    ----------
    forecast, actual : array-like
        One-dimensional sequences of equal length (lists, ndarrays or pandas
        Series). Both are converted to float ndarrays first, so the function
        no longer relies on ``Series[:, None]`` indexing, which pandas 2.0
        removed.

    Returns
    -------
    dict
        Keys, in order: 'mape', 'me', 'mae', 'mpe', 'rmse', 'corr', 'minmax'.

    Notes
    -----
    The ratio-based metrics are undefined where their denominator is zero.
    Previously such points produced ``inf``/``nan`` (plus runtime warnings)
    for the whole metric. They are now left out of the average:

    * 'mape' and 'mpe' average only over points where ``actual != 0``;
    * 'minmax' averages only over points where ``max(forecast, actual) != 0``.

    If no point qualifies, the metric is ``nan``. For inputs without zero
    denominators the results are identical to the previous implementation.
    """
    forecast = np.asarray(forecast, dtype=float)
    actual = np.asarray(actual, dtype=float)
    error = forecast - actual

    def _mean_or_nan(values):
        # Mean of the selected points, or nan when nothing was selected.
        return np.mean(values) if values.size else np.nan

    # Percentage errors, restricted to points with a non-zero actual value.
    has_actual = actual != 0
    relative_error = error[has_actual] / actual[has_actual]
    mape = _mean_or_nan(np.abs(relative_error))   # MAPE
    mpe = _mean_or_nan(relative_error)            # MPE

    me = np.mean(error)                           # ME
    mae = np.mean(np.abs(error))                  # MAE
    rmse_value = np.mean(error**2)**.5            # RMSE
    corr = np.corrcoef(forecast, actual)[0,1]     # corr

    # Element-wise smaller/larger of the two series for the min-max error.
    lower = np.minimum(forecast, actual)
    upper = np.maximum(forecast, actual)
    has_upper = upper != 0
    minmax = 1 - _mean_or_nan(lower[has_upper] / upper[has_upper])  # minmax

    return({'mape':mape, 'me':me, 'mae': mae,
            'mpe': mpe, 'rmse':rmse_value, 'corr':corr, 'minmax':minmax})

# Print the accuracy metrics of every forecast variable. One loop replaces the
# seven copy-pasted stanzas; a blank line separates all reports after the
# first, as before. Actual values are passed as a plain ndarray so the metric
# function never has to index a pandas Series with [:, None].
for position, column in enumerate(df_test.columns):
    heading = f'Forecast Accuracy of: {column}'
    print(heading if position == 0 else '\n' + heading)
    accuracy_prod = forecast_accuracy(df_forecast[f'{column}_2d'].values, df_test[column].values)
    for k, v in accuracy_prod.items():
        print((k), ': ', round(v,4))

Forecast Accuracy of: Maximum Temperature
mape :  0.0925
me :  0.5077
mae :  2.5364
mpe :  0.0342
rmse :  3.3105
corr :  0.8787
minmax :  0.0803

Forecast Accuracy of: Minimum Temperature
mape :  0.219
me :  0.3628
mae :  2.5295
mpe :  0.111
rmse :  3.1561
corr :  0.9193
minmax :  0.1636

Forecast Accuracy of: Precipitation
mape :  inf
me :  -3.9872
mae :  15.2042
mpe :  inf
rmse :  30.0521
corr :  0.498
minmax :  0.7261

Forecast Accuracy of: Soil Moisture
mape :  0.8077
me :  -1.6686
mae :  1.6719
mpe :  -0.7742
rmse :  2.9033
corr :  -0.2341
minmax :  0.8033

Forecast Accuracy of: Wind Speed
mape :  inf
me :  0.2054
mae :  0.2631
mpe :  inf
rmse :  0.323
corr :  0.8402
minmax :  0.2016

Forecast Accuracy of: Runoff
mape :  inf
me :  -0.1211
mae :  0.6602
mpe :  nan
rmse :  1.232
corr :  0.5325
minmax :  inf

Forecast Accuracy of: Flood
mape :  inf
me :  -0.0236
mae :  0.058
mpe :  nan
rmse :  0.1966
corr :  0.0888
minmax :  inf


  actual[:,None]]), axis=1)
  actual[:,None]]), axis=1)
  minmax = 1 - np.mean(mins/maxs)             # minmax
