<a href="https://colab.research.google.com/github/Tanvir007/Deep-Learning-for-Sewage-Treatment-Plant/blob/main/Removal_efficiency_and_ANomalies_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
# Read the plant dataset from Google Drive into `data`.
# NOTE(review): no drive.mount() call is visible in this notebook — presumably Drive is
# mounted manually before this cell runs; confirm.
data = pd.read_csv(
    '/content/drive/MyDrive/BUET Thesis/DSTP - Copy-copy.csv'
)

In [None]:
# Preview the first rows of the loaded frame to check the available column names.
data.head()

Unnamed: 0,Date,EfDischargetoGojaria,Reinef,REpH,RECOD,REBOD,REAmmonia,RESS,REPhosphate,REFecalColiform
0,01-10-2022,257.37,0.01,0.01,0.97,0.98,0.99,0.99,0.63,0.989348
1,02-10-2022,265.45,0.0,-0.02,0.99,0.98,0.99,1.0,0.52,0.993721
2,03-10-2022,285.96,0.01,0.0,0.99,0.97,0.99,0.99,0.5,0.995696
3,04-10-2022,201.39,0.01,-0.01,0.97,0.98,0.99,0.98,0.49,0.995426
4,05-10-2022,276.07,0.03,0.02,0.98,0.98,0.99,0.99,0.59,0.994762


In [None]:
# Select the working frame `df` that every later cell reads from.
# NOTE(review): `data` was loaded with pd.read_csv above, so `data['Sheet1']` is a column
# lookup, and the preview of that CSV shows no 'Sheet1' column — presumably this cell was
# written for a dict of sheets returned by pd.read_excel(..., sheet_name=None); confirm
# which file `df` is supposed to come from.
df = data['Sheet1']

# BOD removal efficiency = (influent - effluent) / influent, i.e. the removed fraction.
# NOTE(review): the column labels contain a newline ('In-BOD₅\n(mg/L)'); later cells use
# labels such as 'In-BOD5_(mg/L)' — presumably columns are renamed in a step not shown.
df['BOD_Removal_Efficiency'] = (df['In-BOD₅\n(mg/L)'] - df['Ef-BOD₅\n(mg/L)']) / df['In-BOD₅\n(mg/L)']

# Plotting the removal efficiency over time
# (matplotlib is first imported here; later cells rely on this `plt` binding)
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
# NOTE(review): pd.to_datetime is called without an explicit format — verify that the
# date strings are parsed with the intended day/month order.
plt.plot(pd.to_datetime(df['date']), df['BOD_Removal_Efficiency'], color='orange', label='BOD Removal Efficiency')
plt.xlabel('Date')
plt.ylabel('Removal Efficiency')
plt.title('Removal Efficiency of BOD Over Time')
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# COD removal efficiency = (influent - effluent) / influent, stored as a new column of `df`
cod_influent = df['In-CODcr\n(mg/L)']
cod_effluent = df['Ef-CODcr\n(mg/L)']
df['COD_Removal_Efficiency'] = (cod_influent - cod_effluent) / cod_influent

# Time series of the efficiency, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    pd.to_datetime(df['date']),
    df['COD_Removal_Efficiency'],
    color='blue',
    label='COD Removal Efficiency',
)
ax.set_xlabel('Date')
ax.set_ylabel('Removal Efficiency')
ax.set_title('Removal Efficiency of COD Over Time')
ax.grid(True)
ax.legend()
plt.show()


In [None]:
# SS removal efficiency = (influent - effluent) / influent, stored as a new column of `df`
ss_influent = df['In-SS\n(mg/L)']
ss_effluent = df['Ef-SS\n(mg/L)']
df['SS_Removal_Efficiency'] = (ss_influent - ss_effluent) / ss_influent

# Time series of the efficiency, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    pd.to_datetime(df['date']),
    df['SS_Removal_Efficiency'],
    color='green',
    label='SS Removal Efficiency',
)
ax.set_xlabel('Date')
ax.set_ylabel('Removal Efficiency')
ax.set_title('Removal Efficiency of SS Over Time')
ax.grid(True)
ax.legend()
plt.show()


In [None]:
from PIL import Image

# File paths for the uploaded images
# NOTE(review): no visible cell writes these PNG files (the plotting cells only call
# plt.show()), and /mnt/data is not the Drive path used for the CSV — confirm where the
# images are expected to come from.
file_path_bod = "/mnt/data/Removal Efficiency of BOD Over Time.png"
file_path_cod = "/mnt/data/Removal Efficiency of COD Over Time.png"
file_path_ss = "/mnt/data/Removal Efficiency of SS Over Time.png"

# Open the images
image_bod = Image.open(file_path_bod)
image_cod = Image.open(file_path_cod)
image_ss = Image.open(file_path_ss)

# Combine the images vertically.
# The canvas is as wide as the widest image so that no plot is cropped when the three
# images differ in width; narrower images leave a white margin on the right.
stacked_images = (image_bod, image_cod, image_ss)
canvas_width = max(image.width for image in stacked_images)
canvas_height = sum(image.height for image in stacked_images)
combined_image = Image.new('RGB', (canvas_width, canvas_height), 'white')

# Paste top to bottom, advancing the vertical offset by each image's height
y_offset = 0
for image in stacked_images:
    combined_image.paste(image, (0, y_offset))
    y_offset += image.height

# Save and display the combined image
combined_image_path = "/mnt/data/Combined_Removal_Efficiency.png"
combined_image.save(combined_image_path)

combined_image.show()


In [None]:
from sklearn.ensemble import IsolationForest
import numpy as np
import matplotlib.pyplot as plt

# Fit Isolation Forest model on the single 'In-COD' column.
# NOTE(review): `in_cod_data` is never assigned in the visible notebook cells — presumably
# it was built in a cell that was not saved; this cell fails on a fresh kernel until it is
# defined.
model = IsolationForest(contamination=0.05, random_state=42)
in_cod_data['anomaly'] = model.fit_predict(in_cod_data[['In-COD']])

# Mark anomalies: fit_predict labels inliers 1 and outliers -1; store 0 = normal, 1 = anomaly
in_cod_data['anomaly'] = in_cod_data['anomaly'].map({1: 0, -1: 1})

# Visualization: the full series as a line, flagged points overlaid as red crosses
anomalous_points = in_cod_data[in_cod_data['anomaly'] == 1]
plt.figure(figsize=(10, 6))
plt.plot(in_cod_data.index, in_cod_data['In-COD'], label='In-COD', color='orange')
plt.scatter(anomalous_points.index,
            anomalous_points['In-COD'],
            color='red', label='Anomaly', marker='x')
plt.title('Anomaly Detection in In-COD Levels (Isolation Forest)')
plt.xlabel('Date')
plt.ylabel('In-COD (mg/L)')
plt.legend()
plt.grid(True)
plt.show()

# Display the results.
# `ace_tools` may not be installed in the running environment, so fall back to a small
# stand-in that exposes the same `display_dataframe_to_user(name=..., dataframe=...)` call.
# The `tools` name is kept because later cells call it.
try:
    import ace_tools as tools
except ImportError:
    from types import SimpleNamespace

    def _show_dataframe(name, dataframe):
        """Print the table title, then render the frame (rich display when available)."""
        print(name)
        try:
            display(dataframe)  # injected by IPython in notebook kernels
        except NameError:
            print(dataframe)

    tools = SimpleNamespace(display_dataframe_to_user=_show_dataframe)

tools.display_dataframe_to_user(name="In-COD Anomaly Detection Results", dataframe=in_cod_data)


In [None]:
# Influent columns to scan for anomalies.
# NOTE(review): earlier cells address the same measurements with labels containing a
# newline (e.g. 'In-CODcr\n(mg/L)') — presumably the columns are renamed in a step that is
# not shown; confirm.
influents = ['In-pH', 'In-CODcr_(mg/L)', 'In-BOD5_(mg/L)', 'In-Ammonia-N_(mg/L)',
             'In-SS_(mg/L)', 'In-Phosphate_(mg/L)', 'In-Fecal_Coliform_(mg/L)']

# One result frame per influent: date index, 'value' column, 'anomaly' flag (0/1)
anomaly_results = {}

for influent in influents:
    # Date-indexed single-column frame for this parameter.
    # Note: this rebinds `data`, which held the CSV loaded at the top of the notebook.
    data = (
        df[['date', influent]]
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
        .set_index('date')
        .rename(columns={influent: 'value'})
    )

    # Isolation Forest labels inliers 1 and outliers -1; store 0 = normal, 1 = anomaly
    model = IsolationForest(contamination=0.05, random_state=42)
    raw_labels = pd.Series(model.fit_predict(data[['value']]), index=data.index)
    data['anomaly'] = raw_labels.map({1: 0, -1: 1})

    anomaly_results[influent] = data

    # Series as a line with the flagged points overlaid as red crosses
    flagged = data[data['anomaly'] == 1]
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(data.index, data['value'], label=influent, color='orange')
    ax.scatter(flagged.index, flagged['value'], color='red', label='Anomaly', marker='x')
    ax.set_title(f'Anomaly Detection in {influent} Levels (Isolation Forest)')
    ax.set_xlabel('Date')
    ax.set_ylabel(f'{influent} (Units)')
    ax.legend()
    ax.grid(True)
    plt.show()

# Show every result table
for influent, result in anomaly_results.items():
    tools.display_dataframe_to_user(
        name=f"{influent} Anomaly Detection Results",
        dataframe=result,
    )


In [None]:
# Effluent columns to scan for anomalies
effluents = ['Ef-pH', 'Ef-CODcr_(mg/L)', 'Ef-BOD5_(mg/L)', 'Ef-Ammonia-N_(mg/L)',
             'Ef-SS_(mg/L)', 'Ef-Phosphate_(mg/L)', 'Ef-Fecal_Coliform_(mg/L)']

# One result frame per effluent: date index, 'value' column, 'anomaly' flag (0/1)
effluent_anomaly_results = {}

for effluent in effluents:
    # Date-indexed single-column frame for this parameter (rebinds the global `data`)
    data = (
        df[['date', effluent]]
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
        .set_index('date')
        .rename(columns={effluent: 'value'})
    )

    # Isolation Forest labels inliers 1 and outliers -1; store 0 = normal, 1 = anomaly
    model = IsolationForest(contamination=0.05, random_state=42)
    raw_labels = pd.Series(model.fit_predict(data[['value']]), index=data.index)
    data['anomaly'] = raw_labels.map({1: 0, -1: 1})

    effluent_anomaly_results[effluent] = data

    # Series as a line with the flagged points overlaid as red crosses
    flagged = data[data['anomaly'] == 1]
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(data.index, data['value'], label=effluent, color='orange')
    ax.scatter(flagged.index, flagged['value'], color='red', label='Anomaly', marker='x')
    ax.set_title(f'Anomaly Detection in {effluent} Levels (Isolation Forest)')
    ax.set_xlabel('Date')
    ax.set_ylabel(f'{effluent} (Units)')
    ax.legend()
    ax.grid(True)
    plt.show()

# Show every result table
for effluent, result in effluent_anomaly_results.items():
    tools.display_dataframe_to_user(
        name=f"{effluent} Anomaly Detection Results",
        dataframe=result,
    )


In [None]:
# Create a function to calculate time lags between influent and effluent anomalies
def calculate_time_lag(influent_data, effluent_data):
    """Return the day lag from each effluent anomaly back to the latest earlier influent anomaly.

    Args:
        influent_data: DataFrame indexed by timestamp with an 'anomaly' column (1 = anomaly).
        effluent_data: DataFrame with the same layout for the effluent parameter.

    Returns:
        list of int: one whole-day lag per effluent anomaly that has an influent anomaly on
        or before its date, in the order the effluent anomalies appear in the index.
        Effluent anomalies without any earlier influent anomaly are skipped.
    """
    # Filter dates with anomalies
    influent_anomalies = influent_data[influent_data['anomaly'] == 1].index
    effluent_anomalies = effluent_data[effluent_data['anomaly'] == 1].index

    time_lags = []

    for eff_date in effluent_anomalies:
        influent_before = influent_anomalies[influent_anomalies <= eff_date]
        if influent_before.empty:
            continue
        # max() instead of [-1]: the last element is only the latest date when the index
        # is sorted chronologically, which this function cannot assume.
        time_lags.append((eff_date - influent_before.max()).days)

    return time_lags

# Lags for every influent/effluent combination (Isolation Forest results);
# combinations that produce no lag are left out of the dict.
time_lag_results = {}

for influent in influents:
    for effluent in effluents:
        infl_data = anomaly_results[influent]
        effl_data = effluent_anomaly_results[effluent]

        lags = calculate_time_lag(infl_data, effl_data)
        if not lags:
            continue
        time_lag_results[(influent, effluent)] = lags

# Same content keyed by "influent_effluent" strings; the bare name renders it as cell output
time_lag_summary = {
    f'{pair[0]}_{pair[1]}': pair_lags
    for pair, pair_lags in time_lag_results.items()
}
time_lag_summary


In [None]:
# Correlation between influent and effluent parameters

# All monitored parameters side by side, indexed by date
monitored_columns = influents + effluents
merged_data = df.set_index('date')[monitored_columns]

# Full pairwise correlation matrix (DataFrame.corr defaults)
correlation_matrix = merged_data.corr()

# Keep only the influent-rows x effluent-columns quadrant
correlations_influent_effluent = correlation_matrix.loc[influents, effluents]

tools.display_dataframe_to_user(
    name="Correlation between Influent and Effluent Parameters",
    dataframe=correlations_influent_effluent,
)


In [None]:
# Pool the lags of all influent/effluent pairs into one flat list
time_lag_values = []
for pair_lags in time_lag_results.values():
    time_lag_values.extend(pair_lags)

# Histogram of the pooled lags (Isolation Forest results)
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(time_lag_values, bins=20, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Time Lags between Influent and Effluent Anomalies')
ax.set_xlabel('Time Lag (Days)')
ax.set_ylabel('Frequency')
ax.grid(axis='y')
plt.show()


In [None]:
# Build one record per effluent anomaly that has an influent anomaly on or before it,
# tagged with the month of the effluent anomaly.
# Each effluent anomaly is visited exactly once per pair, so a lag value that occurs
# several times is not counted multiple times in the monthly averages.
time_lag_with_month = []

for influent, effluent in time_lag_results:
    influent_frame = anomaly_results[influent]
    influent_anomalies = influent_frame[influent_frame['anomaly'] == 1].index
    effluent_data = effluent_anomaly_results[effluent]
    effluent_anomalies = effluent_data[effluent_data['anomaly'] == 1].index

    for eff_date in effluent_anomalies:
        influent_before = influent_anomalies[influent_anomalies <= eff_date]
        if influent_before.empty:
            continue
        # Days since the latest influent anomaly on or before this effluent anomaly
        lag = (eff_date - influent_before.max()).days
        time_lag_with_month.append({'influent': influent, 'effluent': effluent, 'lag': lag, 'month': eff_date.month})

# Convert to DataFrame; explicit columns keep the groupby below valid when no record exists
time_lag_df = pd.DataFrame(time_lag_with_month, columns=['influent', 'effluent', 'lag', 'month'])

# Group by month to find average lag per month
average_lag_per_month = time_lag_df.groupby('month')['lag'].mean().reset_index()

# Visualization of average time lag by month
plt.figure(figsize=(10, 6))
plt.plot(average_lag_per_month['month'], average_lag_per_month['lag'], marker='o', color='orange')
plt.title('Average Time Lag between Influent and Effluent Anomalies by Month')
plt.xlabel('Month')
plt.ylabel('Average Time Lag (Days)')
# Fixed ticks for all twelve months, whether or not each month has data
plt.xticks(ticks=np.arange(1, 13), labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.grid(True)
plt.show()


In [None]:
# Add month information to the effluent anomaly data for seasonal analysis.
# dict.copy() is shallow, so writing a column through it would also modify the frames
# stored in `effluent_anomaly_results`; assign() returns a new frame per parameter instead.
effluent_data_with_month = {
    effluent: frame.assign(month=frame.index.month)
    for effluent, frame in effluent_anomaly_results.items()
}

# Calculate monthly average for each effluent parameter
monthly_average_effluent = {
    effluent: frame.groupby('month')['value'].mean().reset_index()
    for effluent, frame in effluent_data_with_month.items()
}

# Plot seasonal variations for each effluent parameter
for effluent, monthly_avg in monthly_average_effluent.items():
    plt.figure(figsize=(10, 6))
    plt.plot(monthly_avg['month'], monthly_avg['value'], marker='o', color='orange')
    plt.title(f'Seasonal Variation in {effluent} Levels')
    plt.xlabel('Month')
    plt.ylabel(f'{effluent} (Units)')
    # Fixed ticks for all twelve months, whether or not each month has data
    plt.xticks(ticks=np.arange(1, 13), labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
    plt.grid(True)
    plt.show()


In [None]:
# Add month information to the influent anomaly data for seasonal analysis.
# dict.copy() is shallow, so writing a column through it would also modify the frames
# stored in `anomaly_results`; assign() returns a new frame per parameter instead.
influent_data_with_month = {
    influent: frame.assign(month=frame.index.month)
    for influent, frame in anomaly_results.items()
}

# Calculate monthly average for each influent parameter
monthly_average_influent = {
    influent: frame.groupby('month')['value'].mean().reset_index()
    for influent, frame in influent_data_with_month.items()
}

# Plot seasonal variations for each influent parameter
for influent, monthly_avg in monthly_average_influent.items():
    plt.figure(figsize=(10, 6))
    plt.plot(monthly_avg['month'], monthly_avg['value'], marker='o', color='orange')
    plt.title(f'Seasonal Variation in {influent} Levels')
    plt.xlabel('Month')
    plt.ylabel(f'{influent} (Units)')
    # Fixed ticks for all twelve months, whether or not each month has data
    plt.xticks(ticks=np.arange(1, 13), labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
    plt.grid(True)
    plt.show()


In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Local Outlier Factor pass over every influent series
lof_influents = influent_data_with_month.copy()

# One result frame per influent: 'value' column plus 'anomaly' flag (0/1)
lof_results = {}

for influent, data in lof_influents.items():
    # Fresh single-column frame, so the source frame is left untouched
    data = data[['value']].copy()

    # fit_predict labels inliers 1 and outliers -1; store 0 = normal, 1 = anomaly
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
    raw_labels = pd.Series(lof.fit_predict(data), index=data.index)
    data['anomaly'] = raw_labels.map({1: 0, -1: 1})

    lof_results[influent] = data

    # Series as a line with the flagged points overlaid as red crosses
    flagged = data[data['anomaly'] == 1]
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(data.index, data['value'], label=influent, color='orange')
    ax.scatter(flagged.index, flagged['value'], color='red', label='Anomaly', marker='x')
    ax.set_title(f'Anomaly Detection in {influent} Levels (Local Outlier Factor)')
    ax.set_xlabel('Date')
    ax.set_ylabel(f'{influent} (Units)')
    ax.legend()
    ax.grid(True)
    plt.show()

# Show every result table
for influent, result in lof_results.items():
    tools.display_dataframe_to_user(
        name=f"{influent} LOF Anomaly Detection Results",
        dataframe=result,
    )


In [None]:
# Local Outlier Factor pass over every effluent series
lof_effluents = effluent_data_with_month.copy()

# One result frame per effluent: 'value' column plus 'anomaly' flag (0/1)
lof_effluent_results = {}

for effluent, data in lof_effluents.items():
    # Fresh single-column frame, so the source frame is left untouched
    data = data[['value']].copy()

    # fit_predict labels inliers 1 and outliers -1; store 0 = normal, 1 = anomaly
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
    raw_labels = pd.Series(lof.fit_predict(data), index=data.index)
    data['anomaly'] = raw_labels.map({1: 0, -1: 1})

    lof_effluent_results[effluent] = data

    # Series as a line with the flagged points overlaid as red crosses
    flagged = data[data['anomaly'] == 1]
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(data.index, data['value'], label=effluent, color='orange')
    ax.scatter(flagged.index, flagged['value'], color='red', label='Anomaly', marker='x')
    ax.set_title(f'Anomaly Detection in {effluent} Levels (Local Outlier Factor)')
    ax.set_xlabel('Date')
    ax.set_ylabel(f'{effluent} (Units)')
    ax.legend()
    ax.grid(True)
    plt.show()

# Show every result table
for effluent, result in lof_effluent_results.items():
    tools.display_dataframe_to_user(
        name=f"{effluent} LOF Anomaly Detection Results",
        dataframe=result,
    )


In [None]:
# Reusing the previous function to calculate time lags between influent and effluent anomalies for LOF results

def calculate_time_lag_lof(influent_data, effluent_data):
    """Return the day lag from each effluent anomaly back to the latest earlier influent anomaly.

    Used with the LOF result frames.

    Args:
        influent_data: DataFrame indexed by timestamp with an 'anomaly' column (1 = anomaly).
        effluent_data: DataFrame with the same layout for the effluent parameter.

    Returns:
        list of int: one whole-day lag per effluent anomaly that has an influent anomaly on
        or before its date, in the order the effluent anomalies appear in the index.
        Effluent anomalies without any earlier influent anomaly are skipped.
    """
    # Filter dates with anomalies
    influent_anomalies = influent_data[influent_data['anomaly'] == 1].index
    effluent_anomalies = effluent_data[effluent_data['anomaly'] == 1].index

    time_lags = []

    for eff_date in effluent_anomalies:
        influent_before = influent_anomalies[influent_anomalies <= eff_date]
        if influent_before.empty:
            continue
        # max() instead of [-1]: the last element is only the latest date when the index
        # is sorted chronologically, which this function cannot assume.
        time_lags.append((eff_date - influent_before.max()).days)

    return time_lags

# Lags for every influent/effluent combination (LOF results);
# combinations that produce no lag are left out of the dict.
time_lag_results_lof = {}

for influent in influents:
    for effluent in effluents:
        infl_data = lof_results[influent]
        effl_data = lof_effluent_results[effluent]

        lags = calculate_time_lag_lof(infl_data, effl_data)
        if not lags:
            continue
        time_lag_results_lof[(influent, effluent)] = lags

# Same content keyed by "influent_effluent" strings; the bare name renders it as cell output
time_lag_summary_lof = {
    f'{pair[0]}_{pair[1]}': pair_lags
    for pair, pair_lags in time_lag_results_lof.items()
}
time_lag_summary_lof


In [None]:
# Calculate correlations between influent and effluent parameters using LOF results

# Create a new DataFrame with influent and effluent parameters aligned using LOF anomaly results
# Index by parsed dates. The LOF result frames are indexed by dates that went through
# pd.to_datetime, so reindexing them against unparsed `df['date']` values would match
# nothing and leave every column NaN. Parsing is a no-op if the column is already datetime.
merged_lof_data = pd.DataFrame(index=pd.to_datetime(df['date']))

# Add influent parameters to the merged DataFrame
for influent, data in lof_results.items():
    merged_lof_data[influent] = data['value'].reindex(merged_lof_data.index)

# Add effluent parameters to the merged DataFrame
for effluent, data in lof_effluent_results.items():
    merged_lof_data[effluent] = data['value'].reindex(merged_lof_data.index)

# Calculate correlation matrix
correlation_matrix_lof = merged_lof_data.corr()

# Extracting only the correlations between influents and effluents
correlations_influent_effluent_lof = correlation_matrix_lof.loc[influents, effluents]

# Display the results
tools.display_dataframe_to_user(name="LOF Correlation between Influent and Effluent Parameters", dataframe=correlations_influent_effluent_lof)


In [None]:
# Build one record per effluent anomaly (LOF results) that has an influent anomaly on or
# before it, tagged with the month of the effluent anomaly.
# Each effluent anomaly is visited exactly once per pair, so a lag value that occurs
# several times is not counted multiple times in the monthly averages.
time_lag_with_month_lof = []

for influent, effluent in time_lag_results_lof:
    influent_frame = lof_results[influent]
    influent_anomalies = influent_frame[influent_frame['anomaly'] == 1].index
    effluent_data = lof_effluent_results[effluent]
    effluent_anomalies = effluent_data[effluent_data['anomaly'] == 1].index

    for eff_date in effluent_anomalies:
        influent_before = influent_anomalies[influent_anomalies <= eff_date]
        if influent_before.empty:
            continue
        # Days since the latest influent anomaly on or before this effluent anomaly
        lag = (eff_date - influent_before.max()).days
        time_lag_with_month_lof.append({'influent': influent, 'effluent': effluent, 'lag': lag, 'month': eff_date.month})

# Convert to DataFrame; explicit columns keep the groupby below valid when no record exists
time_lag_df_lof = pd.DataFrame(time_lag_with_month_lof, columns=['influent', 'effluent', 'lag', 'month'])

# Group by month to find average lag per month
average_lag_per_month_lof = time_lag_df_lof.groupby('month')['lag'].mean().reset_index()

# Visualization of average time lag by month
plt.figure(figsize=(10, 6))
plt.plot(average_lag_per_month_lof['month'], average_lag_per_month_lof['lag'], marker='o', color='orange')
plt.title('Average Time Lag between Influent and Effluent Anomalies by Month (LOF)')
plt.xlabel('Month')
plt.ylabel('Average Time Lag (Days)')
# Fixed ticks for all twelve months, whether or not each month has data
plt.xticks(ticks=np.arange(1, 13), labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.grid(True)
plt.show()


In [None]:
# Tag every LOF effluent frame with its calendar month (written into the stored frames)
# and average the values per month.
monthly_average_effluent_lof = {}
for effluent, data in lof_effluent_results.items():
    data['month'] = data.index.month
    monthly_avg = data.groupby('month')['value'].mean().reset_index()
    monthly_average_effluent_lof[effluent] = monthly_avg

# One seasonal curve per effluent parameter; ticks cover all twelve months
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
for effluent, monthly_avg in monthly_average_effluent_lof.items():
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(monthly_avg['month'], monthly_avg['value'], marker='o', color='orange')
    ax.set_title(f'Seasonal Variation in {effluent} Levels (LOF)')
    ax.set_xlabel('Month')
    ax.set_ylabel(f'{effluent} (Units)')
    ax.set_xticks(np.arange(1, 13))
    ax.set_xticklabels(month_labels)
    ax.grid(True)
    plt.show()


In [None]:
# Tag every LOF influent frame with its calendar month (written into the stored frames)
# and average the values per month.
monthly_average_influent_lof = {}
for influent, data in lof_results.items():
    data['month'] = data.index.month
    monthly_avg = data.groupby('month')['value'].mean().reset_index()
    monthly_average_influent_lof[influent] = monthly_avg

# One seasonal curve per influent parameter; ticks cover all twelve months
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
for influent, monthly_avg in monthly_average_influent_lof.items():
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(monthly_avg['month'], monthly_avg['value'], marker='o', color='orange')
    ax.set_title(f'Seasonal Variation in {influent} Levels (LOF)')
    ax.set_xlabel('Month')
    ax.set_ylabel(f'{influent} (Units)')
    ax.set_xticks(np.arange(1, 13))
    ax.set_xticklabels(month_labels)
    ax.grid(True)
    plt.show()


In [None]:
from sklearn.svm import OneClassSVM

# One-Class SVM pass over every influent series
ocsvm_influents = influent_data_with_month.copy()

# One result frame per influent: 'value' column plus 'anomaly' flag (0/1)
ocsvm_results = {}

for influent, data in ocsvm_influents.items():
    # Fresh single-column frame, so the source frame is left untouched
    data = data[['value']].copy()

    # fit_predict labels inliers 1 and outliers -1; store 0 = normal, 1 = anomaly
    ocsvm = OneClassSVM(nu=0.05, kernel='rbf', gamma='scale')
    raw_labels = pd.Series(ocsvm.fit_predict(data), index=data.index)
    data['anomaly'] = raw_labels.map({1: 0, -1: 1})

    ocsvm_results[influent] = data

    # Series as a line with the flagged points overlaid as red crosses
    flagged = data[data['anomaly'] == 1]
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(data.index, data['value'], label=influent, color='orange')
    ax.scatter(flagged.index, flagged['value'], color='red', label='Anomaly', marker='x')
    ax.set_title(f'Anomaly Detection in {influent} Levels (OCSVM)')
    ax.set_xlabel('Date')
    ax.set_ylabel(f'{influent} (Units)')
    ax.legend()
    ax.grid(True)
    plt.show()

# Show every result table
for influent, result in ocsvm_results.items():
    tools.display_dataframe_to_user(
        name=f"{influent} OCSVM Anomaly Detection Results",
        dataframe=result,
    )


In [None]:
# One-Class SVM pass over every effluent series
ocsvm_effluents = effluent_data_with_month.copy()

# One result frame per effluent: 'value' column plus 'anomaly' flag (0/1)
ocsvm_effluent_results = {}

for effluent, data in ocsvm_effluents.items():
    # Fresh single-column frame, so the source frame is left untouched
    data = data[['value']].copy()

    # fit_predict labels inliers 1 and outliers -1; store 0 = normal, 1 = anomaly
    ocsvm = OneClassSVM(nu=0.05, kernel='rbf', gamma='scale')
    raw_labels = pd.Series(ocsvm.fit_predict(data), index=data.index)
    data['anomaly'] = raw_labels.map({1: 0, -1: 1})

    ocsvm_effluent_results[effluent] = data

    # Series as a line with the flagged points overlaid as red crosses
    flagged = data[data['anomaly'] == 1]
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(data.index, data['value'], label=effluent, color='orange')
    ax.scatter(flagged.index, flagged['value'], color='red', label='Anomaly', marker='x')
    ax.set_title(f'Anomaly Detection in {effluent} Levels (OCSVM)')
    ax.set_xlabel('Date')
    ax.set_ylabel(f'{effluent} (Units)')
    ax.legend()
    ax.grid(True)
    plt.show()

# Show every result table
for effluent, result in ocsvm_effluent_results.items():
    tools.display_dataframe_to_user(
        name=f"{effluent} OCSVM Anomaly Detection Results",
        dataframe=result,
    )


In [None]:
# Reusing the previous function to calculate time lags between influent and effluent anomalies for OCSVM results

def calculate_time_lag_ocsvm(influent_data, effluent_data):
    """Return the day lag from each effluent anomaly back to the latest earlier influent anomaly.

    Used with the One-Class SVM result frames.

    Args:
        influent_data: DataFrame indexed by timestamp with an 'anomaly' column (1 = anomaly).
        effluent_data: DataFrame with the same layout for the effluent parameter.

    Returns:
        list of int: one whole-day lag per effluent anomaly that has an influent anomaly on
        or before its date, in the order the effluent anomalies appear in the index.
        Effluent anomalies without any earlier influent anomaly are skipped.
    """
    # Filter dates with anomalies
    influent_anomalies = influent_data[influent_data['anomaly'] == 1].index
    effluent_anomalies = effluent_data[effluent_data['anomaly'] == 1].index

    time_lags = []

    for eff_date in effluent_anomalies:
        influent_before = influent_anomalies[influent_anomalies <= eff_date]
        if influent_before.empty:
            continue
        # max() instead of [-1]: the last element is only the latest date when the index
        # is sorted chronologically, which this function cannot assume.
        time_lags.append((eff_date - influent_before.max()).days)

    return time_lags

# Lags for every influent/effluent combination (OCSVM results);
# combinations that produce no lag are left out of the dict.
time_lag_results_ocsvm = {}

for influent in influents:
    for effluent in effluents:
        infl_data = ocsvm_results[influent]
        effl_data = ocsvm_effluent_results[effluent]

        lags = calculate_time_lag_ocsvm(infl_data, effl_data)
        if not lags:
            continue
        time_lag_results_ocsvm[(influent, effluent)] = lags

# Same content keyed by "influent_effluent" strings; the bare name renders it as cell output
time_lag_summary_ocsvm = {
    f'{pair[0]}_{pair[1]}': pair_lags
    for pair, pair_lags in time_lag_results_ocsvm.items()
}
time_lag_summary_ocsvm


In [None]:
# Calculate correlations between influent and effluent parameters using OCSVM results

# Create a new DataFrame with influent and effluent parameters aligned using OCSVM anomaly results
# Index by parsed dates. The OCSVM result frames are indexed by dates that went through
# pd.to_datetime, so reindexing them against unparsed `df['date']` values would match
# nothing and leave every column NaN. Parsing is a no-op if the column is already datetime.
merged_ocsvm_data = pd.DataFrame(index=pd.to_datetime(df['date']))

# Add influent parameters to the merged DataFrame
for influent, data in ocsvm_results.items():
    merged_ocsvm_data[influent] = data['value'].reindex(merged_ocsvm_data.index)

# Add effluent parameters to the merged DataFrame
for effluent, data in ocsvm_effluent_results.items():
    merged_ocsvm_data[effluent] = data['value'].reindex(merged_ocsvm_data.index)

# Calculate correlation matrix
correlation_matrix_ocsvm = merged_ocsvm_data.corr()

# Extracting only the correlations between influents and effluents
correlations_influent_effluent_ocsvm = correlation_matrix_ocsvm.loc[influents, effluents]

# Display the results
tools.display_dataframe_to_user(name="OCSVM Correlation between Influent and Effluent Parameters", dataframe=correlations_influent_effluent_ocsvm)


In [None]:
# Pool the OCSVM lags of all influent/effluent pairs into one flat list
time_lag_values_ocsvm = []
for pair_lags in time_lag_results_ocsvm.values():
    time_lag_values_ocsvm.extend(pair_lags)

# Histogram of the pooled lags
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(time_lag_values_ocsvm, bins=30, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Time Lags between Influent and Effluent Anomalies (OCSVM)')
ax.set_xlabel('Time Lag (Days)')
ax.set_ylabel('Frequency')
ax.grid(axis='y')
plt.show()


In [None]:
# Build one record per effluent anomaly (OCSVM results) that has an influent anomaly on or
# before it, tagged with the month of the effluent anomaly.
# Each effluent anomaly is visited exactly once per pair, so a lag value that occurs
# several times is not counted multiple times in the monthly averages.
time_lag_with_month_ocsvm = []

for influent, effluent in time_lag_results_ocsvm:
    influent_frame = ocsvm_results[influent]
    influent_anomalies = influent_frame[influent_frame['anomaly'] == 1].index
    effluent_data = ocsvm_effluent_results[effluent]
    effluent_anomalies = effluent_data[effluent_data['anomaly'] == 1].index

    for eff_date in effluent_anomalies:
        influent_before = influent_anomalies[influent_anomalies <= eff_date]
        if influent_before.empty:
            continue
        # Days since the latest influent anomaly on or before this effluent anomaly
        lag = (eff_date - influent_before.max()).days
        time_lag_with_month_ocsvm.append({'influent': influent, 'effluent': effluent, 'lag': lag, 'month': eff_date.month})

# Convert to DataFrame; explicit columns keep the groupby below valid when no record exists
time_lag_df_ocsvm = pd.DataFrame(time_lag_with_month_ocsvm, columns=['influent', 'effluent', 'lag', 'month'])

# Group by month to find average lag per month
average_lag_per_month_ocsvm = time_lag_df_ocsvm.groupby('month')['lag'].mean().reset_index()

# Visualization of average time lag by month
plt.figure(figsize=(10, 6))
plt.plot(average_lag_per_month_ocsvm['month'], average_lag_per_month_ocsvm['lag'], marker='o', color='orange')
plt.title('Average Time Lag between Influent and Effluent Anomalies by Month (OCSVM)')
plt.xlabel('Month')
plt.ylabel('Average Time Lag (Days)')
# Fixed ticks for all twelve months, whether or not each month has data
plt.xticks(ticks=np.arange(1, 13), labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.grid(True)
plt.show()


In [None]:
# Optimize by pre-computing anomaly dates
# Create dictionaries for quicker lookups: parameter name -> index of its anomalous dates
influent_anomaly_dates = {influent: ocsvm_results[influent][ocsvm_results[influent]['anomaly'] == 1].index for influent in ocsvm_results}
effluent_anomaly_dates = {effluent: ocsvm_effluent_results[effluent][ocsvm_effluent_results[effluent]['anomaly'] == 1].index for effluent in ocsvm_effluent_results}

# Re-calculate time lags with optimization.
# Each effluent anomaly is visited exactly once per pair and its lag is computed directly,
# so a lag value that occurs several times is not counted multiple times in the averages.
optimized_time_lag_with_month_ocsvm = []

for influent, effluent in time_lag_results_ocsvm:
    influent_dates = influent_anomaly_dates[influent]

    for eff_date in effluent_anomaly_dates[effluent]:
        influent_before = influent_dates[influent_dates <= eff_date]
        if influent_before.empty:
            continue
        # Days since the latest influent anomaly on or before this effluent anomaly
        lag = (eff_date - influent_before.max()).days
        optimized_time_lag_with_month_ocsvm.append({'influent': influent, 'effluent': effluent, 'lag': lag, 'month': eff_date.month})

# Convert to DataFrame; explicit columns keep the groupby below valid when no record exists
optimized_time_lag_df_ocsvm = pd.DataFrame(optimized_time_lag_with_month_ocsvm, columns=['influent', 'effluent', 'lag', 'month'])

# Group by month to find average lag per month
optimized_average_lag_per_month_ocsvm = optimized_time_lag_df_ocsvm.groupby('month')['lag'].mean().reset_index()

# Visualization of average time lag by month
plt.figure(figsize=(10, 6))
plt.plot(optimized_average_lag_per_month_ocsvm['month'], optimized_average_lag_per_month_ocsvm['lag'], marker='o', color='orange')
plt.title('Average Time Lag between Influent and Effluent Anomalies by Month (OCSVM)')
plt.xlabel('Month')
plt.ylabel('Average Time Lag (Days)')
# Fixed ticks for all twelve months, whether or not each month has data
plt.xticks(ticks=np.arange(1, 13), labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.grid(True)
plt.show()


In [None]:
# Tag every OCSVM effluent frame with its calendar month (written into the stored frames)
# and average the values per month.
monthly_average_effluent_ocsvm = {}
for effluent, data in ocsvm_effluent_results.items():
    data['month'] = data.index.month
    monthly_avg = data.groupby('month')['value'].mean().reset_index()
    monthly_average_effluent_ocsvm[effluent] = monthly_avg

# One seasonal curve per effluent parameter; ticks cover all twelve months
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
for effluent, monthly_avg in monthly_average_effluent_ocsvm.items():
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(monthly_avg['month'], monthly_avg['value'], marker='o', color='orange')
    ax.set_title(f'Seasonal Variation in {effluent} Levels (OCSVM)')
    ax.set_xlabel('Month')
    ax.set_ylabel(f'{effluent} (Units)')
    ax.set_xticks(np.arange(1, 13))
    ax.set_xticklabels(month_labels)
    ax.grid(True)
    plt.show()


In [None]:
# Tag every OCSVM influent frame with its calendar month (written into the stored frames)
# and average the values per month.
monthly_average_influent_ocsvm = {}
for influent, data in ocsvm_results.items():
    data['month'] = data.index.month
    monthly_avg = data.groupby('month')['value'].mean().reset_index()
    monthly_average_influent_ocsvm[influent] = monthly_avg

# One seasonal curve per influent parameter; ticks cover all twelve months
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
for influent, monthly_avg in monthly_average_influent_ocsvm.items():
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(monthly_avg['month'], monthly_avg['value'], marker='o', color='orange')
    ax.set_title(f'Seasonal Variation in {influent} Levels (OCSVM)')
    ax.set_xlabel('Month')
    ax.set_ylabel(f'{influent} (Units)')
    ax.set_xticks(np.arange(1, 13))
    ax.set_xticklabels(month_labels)
    ax.grid(True)
    plt.show()


In [None]:
# Pool the LOF lags of all influent/effluent pairs into one flat list
time_lag_values_lof = []
for pair_lags in time_lag_results_lof.values():
    time_lag_values_lof.extend(pair_lags)

# Histogram of the pooled lags
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(time_lag_values_lof, bins=30, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Time Lags between Influent and Effluent Anomalies (LOF)')
ax.set_xlabel('Time Lag (Days)')
ax.set_ylabel('Frequency')
ax.grid(axis='y')
plt.show()


In [None]:
# Combining anomaly results from all three models: Isolation Forest, LOF, and OCSVM

# Function to combine results from all models
def combine_anomaly_results(df, isolation_forest_results, lof_results, ocsvm_results):
    """Merge the 0/1 anomaly flags of three detectors onto `df` and add a majority vote.

    Args:
        df: DataFrame whose index identifies the observations (dates, parsed or not).
        isolation_forest_results: DataFrame with an 'anomaly' column (1 = anomaly).
        lof_results: same layout, Local Outlier Factor flags.
        ocsvm_results: same layout, One-Class SVM flags.

    Returns:
        A copy of `df` (index unchanged) with the added columns 'Isolation_Forest', 'LOF',
        'OCSVM', 'Combined_Anomaly' (number of detectors that flagged the row) and
        'Anomaly_Flag' (True when at least two detectors flagged it). Rows absent from a
        detector's results count as not flagged by that detector.
    """
    target_index = df.index

    def _flags_for(results):
        """Return the detector's flags as an array aligned row by row with `df`."""
        flags = results['anomaly']
        lookup = target_index
        # Detector results indexed by parsed dates never match unparsed date labels, which
        # would silently turn every flag into the fill value. Parse the labels for the
        # lookup only; the returned frame keeps the caller's original index.
        if isinstance(flags.index, pd.DatetimeIndex) and not isinstance(lookup, pd.DatetimeIndex):
            try:
                lookup = pd.to_datetime(lookup)
            except (ValueError, TypeError):
                lookup = target_index  # labels are not dates; keep the plain lookup
        # `lookup` has one entry per row of `df`, in row order, so values map by position
        return flags.reindex(lookup, fill_value=0).to_numpy()

    combined_df = df.copy()
    combined_df['Isolation_Forest'] = _flags_for(isolation_forest_results)
    combined_df['LOF'] = _flags_for(lof_results)
    combined_df['OCSVM'] = _flags_for(ocsvm_results)
    combined_df['Combined_Anomaly'] = (combined_df['Isolation_Forest'] +
                                       combined_df['LOF'] +
                                       combined_df['OCSVM'])
    combined_df['Anomaly_Flag'] = combined_df['Combined_Anomaly'] >= 2  # Anomaly if detected by at least 2 models
    return combined_df

# Majority-vote frames for every influent parameter, keyed by parameter name
combined_results_influent = {
    influent: combine_anomaly_results(
        df.set_index('date')[[influent]],
        anomaly_results[influent],
        lof_results[influent],
        ocsvm_results[influent],
    )
    for influent in influents
}

# Majority-vote frames for every effluent parameter, keyed by parameter name
combined_results_effluent = {
    effluent: combine_anomaly_results(
        df.set_index('date')[[effluent]],
        effluent_anomaly_results[effluent],
        lof_effluent_results[effluent],
        ocsvm_effluent_results[effluent],
    )
    for effluent in effluents
}

# Show the influent tables, then the effluent tables
for influent, result in combined_results_influent.items():
    tools.display_dataframe_to_user(
        name=f"{influent} Combined Anomaly Detection Results",
        dataframe=result,
    )

for effluent, result in combined_results_effluent.items():
    tools.display_dataframe_to_user(
        name=f"{effluent} Combined Anomaly Detection Results",
        dataframe=result,
    )



In [None]:
# Calculating anomaly frequencies for each influent and effluent parameter

# Function to calculate anomaly frequencies
def calculate_anomaly_frequency(combined_results):
    """Count the flagged rows of every parameter.

    Args:
        combined_results: mapping of parameter name to a DataFrame that has an
            'Anomaly_Flag' column.

    Returns:
        dict mapping each parameter name to the sum of its 'Anomaly_Flag' column, in the
        iteration order of `combined_results`.
    """
    return {
        parameter: result['Anomaly_Flag'].sum()
        for parameter, result in combined_results.items()
    }

# Majority-vote anomaly counts per parameter, influent and effluent separately
anomaly_frequency_influent = calculate_anomaly_frequency(combined_results_influent)
anomaly_frequency_effluent = calculate_anomaly_frequency(combined_results_effluent)

# Long-format table: influent rows first, then effluent rows
parameter_names = list(anomaly_frequency_influent.keys()) + list(anomaly_frequency_effluent.keys())
parameter_counts = list(anomaly_frequency_influent.values()) + list(anomaly_frequency_effluent.values())
parameter_types = ['Influent'] * len(anomaly_frequency_influent) + ['Effluent'] * len(anomaly_frequency_effluent)
anomaly_frequency_df = pd.DataFrame({
    'Parameter': parameter_names,
    'Frequency': parameter_counts,
    'Type': parameter_types,
})

# Bar chart with one bar per parameter; labels rotated so the long names stay readable
fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(anomaly_frequency_df['Parameter'], anomaly_frequency_df['Frequency'], color='skyblue')
ax.set_title('Anomaly Frequencies Across All Models (Isolation Forest, LOF, OCSVM)')
ax.set_xlabel('Parameter')
ax.set_ylabel('Anomaly Frequency')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y')
plt.show()
