In [1]:
folder_path = "/content/drive/MyDrive/filtered/" # Folder holding one CSV file per site; every cell that loops over os.listdir(folder_path) reads from here. Change to your own folder.

In [3]:
# Import libraries
import pandas as pd
import numpy as np
import os
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Function to get SMAPE score
def smape(A, F):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``100/len(A) * sum(2*|F - A| / (|A| + |F|))``.

    Parameters
    ----------
    A : array-like of numbers
        Actual (observed) values. Lists, NumPy arrays and pandas Series are
        accepted; values are paired with ``F`` by position.
    F : array-like of numbers
        Forecast values, same length as ``A``.

    Returns
    -------
    float
        SMAPE score; 0 means every forecast equals its actual value.

    Raises
    ------
    ZeroDivisionError
        If ``A`` is empty.

    Notes
    -----
    A term whose denominator is not strictly positive (actual and forecast
    both 0, or either value NaN) contributes 0 to the sum instead of turning
    the whole score into NaN. The mean is still taken over ``len(A)``.
    """
    actual = np.asarray(A, dtype=float)
    forecast = np.asarray(F, dtype=float)

    numerator = 2 * np.abs(forecast - actual)
    denominator = np.abs(actual) + np.abs(forecast)

    # Divide only where the denominator is a positive number; every other
    # position keeps the 0 it was initialised with. `NaN > 0` is False, so
    # NaN pairs are excluded by the same mask.
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator > 0,
    )
    return 100/len(actual) * np.sum(terms)

## **Data Cleaning Process**

In [None]:
# Data Cleaning
# Rewrites every file in folder_path IN PLACE as a two-column CSV with the
# header "Date,Levels".
# NOTE(review): the raw file layout is not visible here. The parsing below
# suggests semicolon-delimited records whose numeric value uses a decimal
# comma, so pandas' default comma delimiter splits the value across columns
# 0 and 1 - confirm against a raw file.
# NOTE(review): the output overwrites the input, so this cell presumably
# cannot be re-run on files it has already cleaned - keep a backup of the raw data.

for i in os.listdir(folder_path):
  path=os.path.join(folder_path, i)
  # header=None: the columns get the integer labels 0, 1, ...
  df=pd.read_csv(path, encoding='latin-1', header=None)
  # Split column 0 on ';' into a list of parts per row
  df["data"]=df[0].str.split(';')
  # Expand the parts into named columns; assumes the split yields three parts - TODO confirm
  df_=pd.DataFrame(df["data"].to_list(), columns=['Date', 'Data1', 'Data2'])
  # Strip surrounding spaces and semicolons from column 1
  df[1]=df[1].str.strip(' ;')
  df_[1]=df[1]
  # Join the two text pieces with a '.' so they can be parsed as one decimal number
  df_["Levels"]=df_["Data1"]+"."+df_[1]
  df_=df_[["Date","Levels"]]
  df_["Levels"]=df_["Levels"].astype("float64")
  # Overwrite the source file with the cleaned table
  df_.to_csv(path, index=False)

## **Feature Extraction Technique with LightGBM Model**

In [5]:
# Instantiate Model
model_name="LightGBM"
model=LGBMRegressor(verbose=-1)

# Number of most recent observations per site held out for testing
TEST_SIZE=26

# Pooled hold-out predictions and the matching observed values across all sites
Pred=[]
actuals=[]

# os.listdir returns entries in arbitrary order and also lists sub-folders and
# non-CSV files, so keep only the CSV files and process them in sorted order
csv_files=sorted(name for name in os.listdir(folder_path) if name.lower().endswith(".csv"))

# Loop through each csv file
for i in csv_files:

  # Read the csv file
  path=os.path.join(folder_path, i)
  df=pd.read_csv(path)

  # Remove last row and forward-fill na's (returns a new frame rather than
  # mutating a slice in place)
  df=df.iloc[:-1,:].ffill()

  # Convert date to datetime feature; the trailing spaces are part of the stored format
  df["Date"]=pd.to_datetime(df["Date"], format="%d.%m.%Y %H:%M:%S   ")
  df=df.sort_values(by="Date")

  # Extract datetime features
  df["Day"]=df["Date"].dt.day
  df["Dayofweek"]=df["Date"].dt.dayofweek
  df["Month"]=df["Date"].dt.month
  df["Year"]=df["Date"].dt.year
  df["Quarter"]=df["Date"].dt.quarter
  df["Week"]=df["Date"].dt.isocalendar().week
  df=df.drop("Date", axis=1)

  # Chronological split: the last TEST_SIZE rows are the test set
  train=df.iloc[:-TEST_SIZE,:]
  test=df.iloc[-TEST_SIZE:, :]

  # Split data to dependent and independent variables
  X_train=train.drop("Levels", axis=1)
  y_train=train["Levels"]
  X_test=test.drop("Levels", axis=1)
  y_test=test["Levels"]

  # Train model on this site's history and predict its hold-out rows
  model.fit(X_train, y_train)
  pred=model.predict(X_test)

  # Pool this site's results with the others
  actuals.extend(y_test.values)
  Pred.extend(pred)

# Create dataframe for y_tests and corresponding Predictions
test=pd.DataFrame({"Predictions":Pred, "Actuals":actuals}).dropna()

# Calculate SMAPE Score over the pooled hold-out rows of all sites
A = test["Actuals"]
F = test["Predictions"]
print(model_name+" SMAPE Score: ", smape(A, F))

LightGBM SMAPE Score:  0.1208221921834496


## **Forecast for January 2022 – February 2024**

In [6]:
# Forecast horizon: the first day of every month from 2022-01-01 to 2024-02-01
date_range = pd.date_range(start='2022-01-01', end='2024-02-01', freq='MS')

# Build the model inputs straight from the timestamps. The result holds only
# the calendar features (no "Date" column), in the order listed here.
_stamps = pd.Series(date_range).dt
df_ = pd.DataFrame({
    "Day": _stamps.day,
    "Dayofweek": _stamps.dayofweek,
    "Month": _stamps.month,
    "Year": _stamps.year,
    "Quarter": _stamps.quarter,
    "Week": _stamps.isocalendar().week,
})

In [7]:
# One [site id, forecast array] pair per CSV file
Pred=[]

# os.listdir returns entries in arbitrary order and also lists sub-folders and
# non-CSV files, so keep only the CSV files and process them in sorted order
csv_files=sorted(name for name in os.listdir(folder_path) if name.lower().endswith(".csv"))

# Loop through each CSV file
for i in csv_files:

    # Read CSV, drop the last row (the 2022 record, per the original notes) and forward-fill na
    path=os.path.join(folder_path, i)
    df=pd.read_csv(path)
    df=df.iloc[:-1,:].ffill()

    # Change Date column to datetime; the trailing spaces are part of the stored format
    df["Date"]=pd.to_datetime(df["Date"], format="%d.%m.%Y %H:%M:%S   ")
    df=df.sort_values(by="Date")

    # Extract Date features
    df["Day"]=df["Date"].dt.day
    df["Dayofweek"]=df["Date"].dt.dayofweek
    df["Month"]=df["Date"].dt.month
    df["Year"]=df["Date"].dt.year
    df["Quarter"]=df["Date"].dt.quarter
    df["Week"]=df["Date"].dt.isocalendar().week
    df=df.drop("Date", axis=1)

    # Split to dependent and independent variables
    X=df.drop("Levels", axis=1)
    y=df["Levels"]

    # Train a fresh model on the site's full history
    model=LGBMRegressor(verbose=-1)
    model.fit(X, y)

    # Make forecast for the dates described by df_
    pred=model.predict(df_)

    # The site id is the file name without its extension; splitext keeps
    # any dots that are part of the name itself
    Pred.append([os.path.splitext(i)[0], pred])

# Generate dataframe for forecasted data: one row per forecast date,
# one column per site
preds=pd.DataFrame(
    [values for _, values in Pred],
    index=pd.Index([site for site, _ in Pred], name="Sites"),
).T
preds.index=pd.Index(date_range, name="Date")
preds=preds.round(2)
preds.to_csv("groundwater_forecasts.csv")

## **Train and Make Prediction For Specific Geographic Location**

In [8]:
# CSV file the single-site model is trained on; it is read by train_model()
file_path = "/content/drive/MyDrive/filtered/330803.csv"  # Change to the specific data file path
# Bounds of the forecast horizon, passed to make_forecast_dates(), which
# generates month-start dates between them
start_date = "2022-01-01"  # Change to desired start date
end_date = "2024-02-01"    # Change to desired end date


In [9]:
def make_forecast_dates(start_date = "2022-01-01", end_date="2024-02-01"):
  """Build the calendar-feature table for a monthly forecast horizon.

  Parameters
  ----------
  start_date, end_date : str
      Bounds handed to ``pd.date_range``; one row is produced for every
      month-start date between them.

  Returns
  -------
  pandas.DataFrame
      Columns "Date", "Day", "Dayofweek", "Month", "Year", "Quarter" and
      "Week" (ISO week number), in that order.
  """
  # Month-start timestamps covering the requested horizon
  month_starts = pd.date_range(start=start_date, end=end_date, freq='MS')
  horizon = pd.DataFrame({"Date": month_starts})

  # Derive every calendar feature from the same datetime accessor
  stamps = horizon["Date"].dt
  return horizon.assign(
      Day=stamps.day,
      Dayofweek=stamps.dayofweek,
      Month=stamps.month,
      Year=stamps.year,
      Quarter=stamps.quarter,
      Week=stamps.isocalendar().week,
  )

In [10]:
def train_model(file_path):
  """Fit a LightGBM regressor on calendar features for one location.

  Parameters
  ----------
  file_path : str
      Path to a CSV with a "Date" column (text in the form
      ``"%d.%m.%Y %H:%M:%S   "``) and a "Levels" target column.

  Returns
  -------
  LGBMRegressor
      Model fitted on every column other than "Date" and "Levels", including
      the derived features Day, Dayofweek, Month, Year, Quarter and Week.
  """
  # Load the history for the location, discard the final record (the 2022
  # row, per the original notes) and forward-fill missing values
  history = pd.read_csv(file_path).iloc[:-1, :].ffill()

  # Parse the timestamps and put the records in chronological order
  history["Date"] = pd.to_datetime(history["Date"], format="%d.%m.%Y %H:%M:%S   ")
  history = history.sort_values(by="Date")

  # Replace the raw timestamp with its calendar features
  stamps = history["Date"].dt
  table = history.assign(
      Day=stamps.day,
      Dayofweek=stamps.dayofweek,
      Month=stamps.month,
      Year=stamps.year,
      Quarter=stamps.quarter,
      Week=stamps.isocalendar().week,
  ).drop("Date", axis=1)

  # Everything except the target is a predictor
  regressor = LGBMRegressor(verbose=-1)
  regressor.fit(table.drop("Levels", axis=1), table["Levels"])
  return regressor

In [11]:
def make_forecasts(model, forecast_dates):
  """Predict levels for each forecast date.

  Parameters
  ----------
  model : object with a ``predict`` method
      Fitted estimator; it is called with every column of
      ``forecast_dates`` except "Date".
  forecast_dates : pandas.DataFrame
      Must contain a "Date" column plus the feature columns the model
      expects. The frame is left unmodified, so it can be reused for
      further calls.

  Returns
  -------
  pandas.DataFrame
      Columns "Date" and "Levels", with the predictions rounded to two
      decimals, on the same index as ``forecast_dates``.
  """
  # Drop "Date" on a copy; removing it in place would strip the column from
  # the caller's frame and make a second call fail on the missing column
  features = forecast_dates.drop("Date", axis=1)

  forecasts = pd.DataFrame({
      "Date": forecast_dates["Date"],
      "Levels": model.predict(features),
  })
  return forecasts.round(2)

In [12]:
# Build the table of forecast dates and their calendar features for the
# configured start_date / end_date
forecast_dates = make_forecast_dates(start_date=start_date, end_date=end_date)

# Train the model on the history stored in file_path
model = train_model(file_path)

# Predict a level for every forecast date
forecasts = make_forecasts(model, forecast_dates)

# Show the forecasts and write them to forecasts.csv in the working directory
print(forecasts)
forecasts.to_csv("forecasts.csv", index=False)
print("\nFORECASTS SAVED SUCCESSFULLY")

         Date  Levels
0  2022-01-01  637.85
1  2022-02-01  637.77
2  2022-03-01  637.84
3  2022-04-01  638.24
4  2022-05-01  638.45
5  2022-06-01  638.48
6  2022-07-01  638.44
7  2022-08-01  638.31
8  2022-09-01  638.33
9  2022-10-01  638.33
10 2022-11-01  638.26
11 2022-12-01  638.23
12 2023-01-01  637.75
13 2023-02-01  637.74
14 2023-03-01  637.81
15 2023-04-01  638.28
16 2023-05-01  638.48
17 2023-06-01  638.52
18 2023-07-01  638.44
19 2023-08-01  638.31
20 2023-09-01  638.35
21 2023-10-01  638.24
22 2023-11-01  638.22
23 2023-12-01  638.24
24 2024-01-01  637.82
25 2024-02-01  637.78

FORECASTS SAVED SUCCESSFULLY
