# overview of aqua systems and first interpretation

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


In [None]:
# Import
# Load every ACEA water-prediction CSV into its own DataFrame.
# One frame per water body: four aquifers, one lake, one river, three springs.
aq_auser = pd.read_csv("../input/acea-water-prediction/Aquifer_Auser.csv")
aq_doganella = pd.read_csv("../input/acea-water-prediction/Aquifer_Doganella.csv")
aq_luco = pd.read_csv("../input/acea-water-prediction/Aquifer_Luco.csv")
# NOTE: the variable is spelled "petrignago" while the file is "Petrignano";
# later cells reference this spelling, so it is kept as is.
aq_petrignago = pd.read_csv("../input/acea-water-prediction/Aquifer_Petrignano.csv")
lakeBilancino = pd.read_csv("../input/acea-water-prediction/Lake_Bilancino.csv")
riverArno = pd.read_csv("../input/acea-water-prediction/River_Arno.csv")
ws_Amiata = pd.read_csv("../input/acea-water-prediction/Water_Spring_Amiata.csv")
ws_Lupa = pd.read_csv("../input/acea-water-prediction/Water_Spring_Lupa.csv")
ws_Madonna = pd.read_csv("../input/acea-water-prediction/Water_Spring_Madonna_di_Canneto.csv")


In [None]:
riverArno

# prepare data
- split date
- copy the data with a time lag (the example call below uses a lag of 1 day)

In [None]:
def datumsplit(data,datumvar,isnafilter,diff,pref):
    """Build a lagged feature table from a dated DataFrame.

    Steps:
      1. Keep only rows where ``isnafilter`` is not NaN; ``reset_index()``
         preserves the original row number in an ``index`` column.
      2. Split the date column into calendar components (year, month, day,
         ISO week, day of week, quarter), each prefixed with ``pref``.
      3. For every remaining column add a ``<col>_lag<diff>`` copy shifted
         ``diff`` rows back and a ``<col>_MA5*<diff>`` rolling mean over
         ``diff*5`` rows, also shifted ``diff`` rows back, so that a row only
         carries values from earlier rows.
      4. Re-attach the ``isnafilter`` column as the last column.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table; it is not modified.
    datumvar : str
        Name of the date column, parsed with the format ``%d/%m/%Y``.
    isnafilter : str
        Column whose NaN rows are dropped; it is excluded from the lag and
        moving-average features and appended unchanged at the end.
    diff : int
        Lag in rows (>= 1). The first ``diff`` rows are dropped.
    pref : str
        Prefix for the generated calendar columns.

    Returns
    -------
    pandas.DataFrame
        The feature table (``diff`` rows shorter than the filtered input).

    Raises
    ------
    ValueError
        If ``diff`` is smaller than 1.
    """
    if diff < 1:
        # data[:-0] would be an empty slice and fail later with an obscure
        # length-mismatch error, so reject it up front.
        raise ValueError("diff must be a positive number of rows")

    # Drop rows without a value in the filter column; losing them leaves us
    # with the usable data only.
    data = data[data[isnafilter].notna()].reset_index()

    # Split the date into components so a regression can discover seasonal,
    # weekly and yearly effects.
    dates = pd.to_datetime(data[datumvar], format='%d/%m/%Y')
    # Series.dt.week / .dt.weekofyear were removed in pandas 2.0;
    # isocalendar().week yields the same ISO week number. Cast back to a plain
    # numpy dtype (float when dates are missing, as .dt.week used to do).
    iso_week = dates.dt.isocalendar().week
    iso_week = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    calendar = pd.DataFrame({pref+"year": dates.dt.year,
                             pref+"month": dates.dt.month,
                             pref+"day": dates.dt.day,
                             pref+"week": iso_week,
                             pref+"weekofyear": iso_week,
                             pref+"dayofweek": dates.dt.dayofweek,
                             pref+"weekday": dates.dt.weekday,
                             pref+"quarter": dates.dt.quarter,
                             })

    # Drop the date (it cannot be regressed on) and set the filter column aside.
    datalabel = data[isnafilter]
    data = data.drop([datumvar, isnafilter], axis=1)

    # Work on an explicit copy so the column assignments below do not write
    # into a view of `data`.
    data2 = data.iloc[diff:].copy()

    # Move the values of `diff` rows earlier forward and add them as columns.
    for col in data.columns:
        data2[col+'_lag'+str(diff)] = data[col].iloc[:-diff].values

    # Add the calendar components of each row's own date.
    for col in calendar.columns:
        data2[col] = calendar[col].iloc[diff:].values

    # Add a lagged moving average (a moving average filters noise).
    moving_avg = data.rolling(diff*5).mean()
    for col in data.columns:
        data2[col+'_MA5*'+str(diff)] = moving_avg[col].iloc[:-diff].values

    # Same index as data2 (both come from the reset frame), so this aligns.
    data2[isnafilter] = datalabel.iloc[diff:]
    print(data.shape,data2.shape)
    return data2

train=datumsplit(riverArno,'Date','Hydrometry_Nave_di_Rosano',1,'ext_')
train

# ARIMA Timeseries analysis
- there is a month effect
- there is a 1-5 day lag effect in the autocorrelation
- the series is non-stationary and needs to be differenced per day before doing an ARIMA forecast
- indeed, differencing the seasonal component gives the better forecast
- a week effect in the forecast seems rather intuitive to me, given the weekly structure of our habits, but the monthly effect is rather counterintuitive to me. I wonder what habit could be monthly...
- ARIMA does not use any information other than the production itself; it just uses the 'monthly' and '5-day' peak effects to forecast the consumption up to 14 days ahead. It's not such a strong forecaster, but as such you can always use it


In [None]:
    from statsmodels.tsa.statespace.sarimax import SARIMAX
    from statsmodels.graphics.tsaplots import plot_acf
    from statsmodels.graphics.tsaplots import plot_pacf
    from statsmodels.tsa.stattools import adfuller
    from sklearn.metrics import mean_squared_error
    from math import sqrt
    

    # Column to analyse and column used to decide which rows are usable.
    label='Hydrometry_Nave_di_Rosano'
    isnafilter='Rainfall_Le_Croci'

    # Keep only the rows where the filter column has a value.
    train = riverArno[riverArno[isnafilter].notna()]

    # Copy so that setting the index does not write into a slice of `train`.
    day_df = train[['Date', label]].copy()
    # Use the date as index. The dates are day-first (same format as in
    # datumsplit), so give the format explicitly instead of letting pandas
    # guess month-first.
    day_df.index = pd.DatetimeIndex(pd.to_datetime(day_df['Date'], format='%d/%m/%Y'))
    # Resample to one row per day, summing the label only; the string Date
    # column is not a quantity that can be aggregated.
    day_df = day_df[[label]].resample('1D').sum()

    day_df.plot()


In [None]:
##Checking trend and autocorrelation
def initial_plots(time_series, num_lag):
    """Plot a time series together with its ACF and PACF.

    Draws three figures: the raw series, the autocorrelation plot and the
    partial autocorrelation plot, then calls ``plt.show()``.

    Parameters
    ----------
    time_series : array-like or pandas object
        The series to plot; passed unchanged to ``plt.plot``, ``plot_acf``
        and ``plot_pacf``.
    num_lag : int
        Number of lags forwarded as ``lags`` to ``plot_acf`` / ``plot_pacf``.
    """
    import matplotlib.pyplot as plt

    # Original time series.
    _, series_ax = plt.subplots()
    series_ax.plot(time_series)
    series_ax.set_title('Original data across time')

    # Pass explicit axes: without `ax`, plot_acf/plot_pacf open their own
    # figure, which left a pre-created figure empty.
    _, acf_ax = plt.subplots()
    plot_acf(time_series, lags=num_lag, ax=acf_ax, title='Autocorrelation plot')

    _, pacf_ax = plt.subplots()
    plot_pacf(time_series, lags=num_lag, ax=pacf_ax, title='Partial autocorrelation plot')

    plt.show()

    
# Augmented Dickey-Fuller stationarity test on the daily series;
# element [1] of the result is reported as the p-value.
print(f'p-value: {adfuller(day_df)[1]}')

# Raw series plus ACF/PACF with 45 lags.
initial_plots(day_df, 45)

In [None]:
# First-order (one-day) difference of the daily series.
diff_series = day_df.diff(periods=1)

# Augmented Dickey-Fuller stationarity test on the differenced series
# (the leading NaN produced by diff() is dropped first); report the p-value.
print(f'p-value: {adfuller(diff_series.dropna())[1]}')

# Differenced series plus ACF/PACF with 45 lags.
initial_plots(diff_series.dropna(), 45)

# check that you don't create a self-fulfilling prophecy
- this means you have to take care not to forecast with data you could not possibly know: on day 5 you cannot know the temperature of day 6, and if the moving average has even a slight knowledge of the day ahead, this narrows the error of the forecast, but your forecast is wrong.


In [None]:
train

In [None]:
    # Install and import dabl; `dabl` is used later inside aquaforecast.
    # NOTE: `!pip` is IPython shell syntax, so this cell only runs in a notebook.
    if True:
        !pip install dabl
        import dabl
        # Earlier experiments with dabl, kept for reference:
        #data = dabl.clean(train, verbose=1)
        #dabl.plot(data.drop('Hydrometry_Nave_di_Rosano',axis=1), data['Hydrometry_Nave_di_Rosano'])
        #model = dabl.SimpleClassifier(random_state=0).fit(data.fillna(0), target_col="Hydrometry_Nave_di_Rosano")

In [None]:
#train[['index','Rainfall_Le_Croci','Rainfall_Le_Croci_1']]

# let's create an aquaforecast function
- it's a forecast that works as long as you have all variables available
- although beware: for a real forecast, temperature and water levels can merely be estimates based on historical values, so maybe we need to drop all those values and fill in historical values for everything

In [None]:

def aquaforecast(aqua,label,indxvar,dropvalue,diff,dayahead):
    """Train a LightGBM regressor on lagged features and score a hold-out tail.

    The data is prepared with ``datumsplit`` (date split, lag and moving
    average features), cleaned and plotted with ``dabl``, and the last
    ``dayahead`` rows are held out as test set. A 5-fold cross-validation is
    run on the remaining rows; the test prediction is the average of the
    fold models. For every fold a scatter plot of real versus predicted test
    values is shown. Finally the predictions are written to ``submit.csv``,
    the test MSE/RMSE is printed and the gain importance of the last fold
    model is plotted.

    Parameters
    ----------
    aqua : pandas.DataFrame
        Raw data of one water body; must contain a ``Date`` column.
    label : str
        Column to forecast.
    indxvar : str
        Column of the prepared data written as identifier to ``submit.csv``
        (``"index"`` is the row number added by ``datumsplit``).
    dropvalue : str
        Column passed to ``datumsplit`` as NaN filter; it gets no lag or
        moving-average features.
    diff : int
        Lag in rows passed to ``datumsplit``.
    dayahead : int
        Number of trailing rows held out as test set.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``dayahead`` does not leave both a training and a test part.
    """
    # Local imports so the function does not depend on which notebook cells
    # were executed before it.
    import time
    from math import sqrt

    import dabl
    import lightgbm as lgb
    import matplotlib.pyplot as plt
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import KFold

    # Prepare data with the datumsplit function, then let dabl clean and plot it.
    train = datumsplit(aqua, 'Date', dropvalue, diff, 'ext_')
    train = dabl.clean(train, verbose=1)
    dabl.plot(train.drop(label, axis=1), train[label])

    # train[-0:] would be the whole frame and train[:-0] empty, so guard the split.
    if dayahead < 1 or dayahead >= len(train):
        raise ValueError("dayahead must be between 1 and the number of prepared rows - 1")

    # Split train/test by the number of days ahead.
    test = train[-dayahead:]
    train = train[:-dayahead]
    print('Availbable columns to regress',aqua.columns)
    param = {'num_leaves': 200,   # tree size; there is an optimum, beware of overfitting
             'min_data_in_leaf': 50, # leaf size; there is an optimum, beware of overfitting
             'objective':'regression',
             'max_depth': -1,
             'learning_rate': 0.1,
             "boosting": "gbdt", # gradient boosting
             "feature_fraction": 0.8,
             "bagging_freq": 1,
             "bagging_fraction": 0.8 ,
             "bagging_seed": 11,
             "metric": 'mse', # mean squared error
             "num_classes": 1,
             "lambda_l1": 0.1,

             "random_state": 133,
             "verbosity": -1}

    max_iter = 10  # upper bound on the number of folds actually trained

    target = train[[label]]
    # Drop the column to forecast from the features.
    train = train.drop([label], axis=1)

    # NOTE(review): shuffled K-fold on lagged time-series rows lets validation
    # rows sit between training rows; confirm this is acceptable for the CV score.
    folds = KFold(n_splits=5, shuffle=True, random_state=15)
    n_used = min(folds.n_splits, max_iter)
    oof = np.zeros(len(train))

    features = [c for c in train.columns if c not in ['ID']]
    predictions = np.zeros(len(test))
    start_time = time.time()
    score = []

    # lgb.train() no longer accepts verbose_eval / early_stopping_rounds in
    # LightGBM 4; callbacks work across versions. log_evaluation replaced
    # print_evaluation, so fall back to the old name when needed.
    log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(train.iloc[trn_idx][features],
                               label=target.iloc[trn_idx])
        val_data = lgb.Dataset(train.iloc[val_idx][features],
                               label=target.iloc[val_idx])
        print(train.shape,target.shape,target.iloc[val_idx].shape,train.iloc[val_idx][features].shape,val_data,trn_data)
        num_round = 20000
        clf = lgb.train(param,
                        trn_data,
                        num_round,
                        valid_sets=[trn_data, val_data],
                        callbacks=[lgb.early_stopping(100), log_evaluation(1000)])

        # Out-of-fold predictions and the fold's validation error.
        oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
        score.append(mean_squared_error(target.iloc[val_idx], oof[val_idx]))

        # The test prediction is the average over the fold models.
        current_pred = clf.predict(test[features], num_iteration=clf.best_iteration)
        predictions += current_pred / n_used

        # Real versus predicted test values; the point at (0, 0) anchors the
        # axes at the origin, the red dot marks the means.
        plt.scatter(x=0,y=0)
        plt.scatter(x=test[label], y=current_pred, marker='.', alpha=1,c=np.abs(test[label].values-current_pred))
        plt.scatter(x=[np.mean(test[label])], y=[np.mean(current_pred)], marker='o', color='red')
        plt.xlabel('Real test'); plt.ylabel('Pred. test')
        plt.show()

        # Elapsed time in seconds (it was divided by 3600 while labelled "s").
        print("time elapsed: {:<5.2f}s".format(time.time() - start_time))

        if fold_ == max_iter - 1: break

    # Mean validation MSE over the trained folds. This used to print 0 for
    # every run because the per-fold scores were never filled in.
    print("CV score: {:<8.5f}".format(sum(score) / len(score)))

    sub_df = pd.DataFrame({indxvar: test[indxvar].values})
    sub_df[label] = predictions
    sub_df.to_csv("submit.csv", index=False)

    # Missing test labels are counted as 0, as before.
    mse = mean_squared_error(predictions, test[label].fillna(0).values)
    print( 'mse',mse, 'rmse',sqrt(mse) )

    # Gain importance of the model trained in the last fold.
    lgb.plot_importance(
        clf,
        max_num_features=20,
        importance_type='gain',
        figsize=(12,8));
    return

# riverbed Arno


This system is autocorrelated, like all such series: lagged values and moving averages always give the best forecast, but they have the drawback that during the 400-day test period you feed in information you cannot know beforehand. You could feed in forecasted data instead, which still carries a huge autocorrelation. In other words, thanks to the autocorrelation a narrow forecast for the next day, or even for this week, is perfectly possible, but 400 days ahead you cannot know the fluctuations in the system beforehand. So you simply expect the outflow to be what it was the day before.


The first regression drops the moving-average and lagged information of the label Hydrometry_Nave_di_Rosano, which still gives a rather good forecast of the river system. The best predictors are the 'week effect', temperature, the rainfall moving average and the month effect, as expected.
But when one adds the lagged information of the riverbed itself, you can use this forecast a week ahead, but not much further.


In [None]:
aquaforecast(riverArno,'Hydrometry_Nave_di_Rosano' ,"index",'Hydrometry_Nave_di_Rosano',1,400)

In [None]:
aquaforecast(riverArno,'Hydrometry_Nave_di_Rosano' ,"index",'Rainfall_Le_Croci',1,400)

# aqua petrignago

Here there are two variables, Depth to groundwater P24 and P25.
Evidently you again drop the other variable to get a forecast without the autocorrelation effect.
The test/prediction graph shows the great variability of the forecast. Despite this fuzziness, the forecast is still good.


In [None]:
aquaforecast(aq_petrignago.drop('Depth_to_Groundwater_P25',axis=1),'Depth_to_Groundwater_P24' ,"index",'Depth_to_Groundwater_P24',1,400)

In [None]:
aquaforecast(aq_petrignago,'Depth_to_Groundwater_P24' ,"index",'Depth_to_Groundwater_P24',1,400)

# Aqua Lupa
- a flat prediction; there seems to be no information available other than rainfall...
- here I could try to add weather and rain data from Italy in general to see if the error rate becomes smaller; for the time being it's an unsolved problem
- except if we use the 'autocorrelation power' by forgetting to omit the P25, the forecast has a higher forecast error, although the parameter is the most influential

In [None]:
aquaforecast(ws_Lupa,'Flow_Rate_Lupa' ,"index",'Flow_Rate_Lupa',1,400)

In [None]:
ws_Lupa['Dummy']=1

In [None]:
aquaforecast(ws_Lupa,'Flow_Rate_Lupa' ,"index",'Dummy',1,400)

# lake Bilancino

- again a relatively flat and uncorrelated forecast if we drop the lake level
- keeping the variable Lake_Level in the prediction narrows the error. You could keep this variable for the forecast, since a lake level is something very visible, and you can trust that any prediction of that lake level stays within a deviation limit (e.g. 1 meter) of the forecast.
- imho the flatness means there is no overdraining of the lake, and the source is not stressed to its limits

In [None]:
aquaforecast(lakeBilancino.drop('Lake_Level',axis=1),'Flow_Rate' ,"index",'Flow_Rate',1,400)

In [None]:
aquaforecast(lakeBilancino,'Flow_Rate' ,"index",'Flow_Rate',1,400)

# water system Madonna

- again an unpredictable system..., which imho means that the limit of that system has not yet been reached

In [None]:
aquaforecast(ws_Madonna,'Flow_Rate_Madonna_di_Canneto' ,"index",'Flow_Rate_Madonna_di_Canneto',1,400)

In [None]:
ws_Madonna['Dummy']=1
aquaforecast(ws_Madonna,'Flow_Rate_Madonna_di_Canneto' ,"index",'Dummy',1,400)

# Aqua Auser
- a nice forecast is possible
- beware of outliers like 'zero depth' values...
- it's not clear whether, when we forecast the SAL depth, PAG is a variable we should drop too; let's drop all depth levels, the forecast remains narrow



In [None]:
aquaforecast(aq_auser,'Depth_to_Groundwater_SAL' ,"index",'Depth_to_Groundwater_SAL',1,400)

In [None]:
aquaforecast(aq_auser.drop(['Depth_to_Groundwater_LT2', 'Depth_to_Groundwater_PAG',
       'Depth_to_Groundwater_CoS', 'Depth_to_Groundwater_DIEC'],axis=1),'Depth_to_Groundwater_SAL' ,"index",'Depth_to_Groundwater_SAL',1,400)