In [97]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [98]:
import os
import pandas as pd
import sys


def read_csv(filename="water_consumption_updated.csv", delimiter=";"):
    if not os.path.exists(filename):
        print(f"File '{filename}' does not exist.")
        sys.exit(1)

    df = pd.read_csv(filename, delimiter=delimiter)
    if "Consumption" in df.columns:
        df["Consumption"] = pd.to_numeric(df["Consumption"].str.replace(",", "."))
        df["Month"] = pd.to_numeric(df["Month"], errors="coerce")
        df["Year"] = pd.to_numeric(df["Year"], errors="coerce")
        df["Day"] = pd.to_numeric(df["Day"], errors="coerce")
        df["Area"] = pd.to_numeric(df["Area"].str.replace(",", "."), errors="coerce")

    if df.empty or len(df) == 1:
        raise ValueError("Empty file or file with only header.")
    return df


read_csv()

Unnamed: 0,Park,Year,Month,Day,Consumption,Park name,Area
0,Oriental,2023,1,1,20.8000,Oriental,15.0
1,ArcaAgua,2023,1,1,5.2000,Cidade,69.9
2,Covelo,2023,1,1,9.2700,SaoRoque,5.8
3,Cidade,2023,1,1,68.2500,Covelo,7.7
4,Oriental,2023,1,2,19.0000,ArcaAgua,2.6
...,...,...,...,...,...,...,...
3941,Varas,2024,3,27,3.9312,,
3942,Varas,2024,3,28,3.8556,,
3943,Varas,2024,3,29,3.7422,,
3944,Varas,2024,3,30,3.9312,,


In [99]:
def average_park_cost(data: pd.DataFrame):
    grouped_data = data.groupby(["Park name", "Year", "Month"])

    monthly_costs = (
        grouped_data["Consumption"]
        .sum()
        .apply(lambda x: x * 0.7 if x <= 1000 else 1000 * 0.7 + (x - 1000) * 0.7 * 1.15)
    )

    average_costs = monthly_costs.groupby(level=0).mean()
    average_area = data.groupby("Park name")["Area"].mean()

    average_park_data = pd.DataFrame({
        "Cost": average_costs,
        "Area": average_area
    })

    return average_park_data

In [100]:
from sklearn.linear_model import LinearRegression

df = read_csv("water_consumption_updated.csv", delimiter=";")

parks = df[df['Park name'].notna()]['Park name'].unique()

filtered_data = df[df["Park name"].isin(parks)]

monthly_cost = average_park_cost(filtered_data)

# Perform a linear regression
X = monthly_cost["Area"].values.reshape(-1, 1)
y = monthly_cost["Cost"]

model = LinearRegression().fit(X, y)

# Predict the average monthly cost
new_park_area = 55
predicted_cost = model.predict([[new_park_area]])
print(
    f"The predicted average monthly cost for a 55-hectare park is {predicted_cost[0]:.3f}"
)


The predicted average monthly cost for a 55-hectare park is 5.038
