In [1]:
import mlflow
import pickle
import sys
sys.path.insert(1, '../library')
import database_helper
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import os
import numpy as np
from dotenv import load_dotenv


In [None]:
# Read the .env file into the process environment. MLFLOW_TRACKING_USERNAME
# and MLFLOW_TRACKING_PASSWORD are expected to be provided this way.
load_dotenv()

# Tracking server and experiment used by this notebook.
mlflow_tracking_uri = "http://mlflow.local:80"
mlflow_experiment_name = "DWD Isolation Forest with PCA"

mlflow.set_tracking_uri(uri=mlflow_tracking_uri)
mlflow.set_experiment(mlflow_experiment_name)

# Turn on MLflow's automatic logging for the models fitted below.
mlflow.autolog()

In [None]:
# Load the column selection stored in 'selected_columns.pkl'.
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only use a pickle file that comes from a trusted source.
with open('selected_columns.pkl', 'rb') as f:
    selected_columns = pickle.load(f)

# Fall back to a fixed default set of fields when the pickle holds no selection
# or the selection has more than 100 entries.
# `is None` (identity) is used rather than `== None`: for array-like selections
# (numpy array, pandas Index) `==` is evaluated element-wise and the result
# cannot be used as a single truth value in the `or` below.
if selected_columns is None or len(selected_columns) > 100:
    selected_columns = ['10838_days_0_precipitation',
                        '10838_days_0_sunrise',
                        '10838_days_0_sunset',
                        '10838_days_0_sunshine',
                        '10838_days_0_temperatureMax',
                        '10838_days_0_temperatureMin',
                        '10838_days_0_windDirection',
                        '10838_days_0_windGust',
                        '10838_days_0_windSpeed']

# Query the selected fields, index the rows by timestamp, keep only the
# float64 columns and fill gaps by (default, linear) interpolation.
# The steps are chained instead of using inplace=True so the object returned
# by query_data is never mutated and re-running the cell is safe.
# NOTE(review): default interpolation does not fill NaNs that precede the first
# valid value of a column — confirm the queried data has none, since PCA in a
# later cell does not accept NaN.
dataframe = (
    database_helper.query_data(field_list=selected_columns)
    .assign(_time=lambda frame: pd.to_datetime(frame['_time']))
    .set_index('_time')
    .select_dtypes(include='float64')
    .interpolate()
)
display(dataframe)

In [None]:
# Standardize the features with StandardScaler before PCA.
# A later cell writes the model output into dataframe['anomaly']; drop that
# column here (when present) so that re-running this cell after the model cell
# does not feed the anomaly labels back in as an input feature.
feature_frame = dataframe.drop(columns=['anomaly'], errors='ignore')

scaler = StandardScaler()
data_scaled = scaler.fit_transform(feature_frame)
display(data_scaled)

In [None]:
# Fraction of the total variance that the retained components must explain.
threshold = 0.99

# Fit a PCA with all components once, only to read off the explained variance.
pca = PCA()
pca.fit(data_scaled)
cumulative_explained_variance = np.cumsum(pca.explained_variance_ratio_)

# Smallest number of components whose cumulative explained variance reaches
# the threshold. searchsorted returns the first index i with
# cumulative[i] >= threshold; if the threshold is never reached (e.g. a
# threshold of 1.0 missed by floating-point rounding) it returns the array
# length, so the count is clamped to "all components" instead of silently
# collapsing to a single component.
n_components = int(np.searchsorted(cumulative_explained_variance, threshold)) + 1
n_components = min(n_components, len(cumulative_explained_variance))

# Refit with only the required number of components and project the data.
pca_optimal = PCA(n_components=n_components)
data_pca = pca_optimal.fit_transform(data_scaled)

# Print the result (the percentage is derived from `threshold` so the message
# stays correct when the threshold is changed).
print(f"Number of components needed to explain {threshold * 100:g}% of the variance: {n_components}")
print("Shape of the transformed dataset:", data_pca.shape)

# Plot the cumulative explained variance with the threshold as a red line.
plt.figure(figsize=(8, 6))
plt.plot(range(1, len(cumulative_explained_variance) + 1), cumulative_explained_variance, marker='o', linestyle='--')
plt.axhline(y=threshold, color='r', linestyle='-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by Principal Components')
plt.show()



In [6]:
# Fit the Isolation Forest on the PCA-reduced data.
# contamination is the expected share of anomalies; adjust it to your needs.
# random_state is fixed because the forest is built from random sub-samples
# and splits: without a seed the flagged rows change from run to run.
model = IsolationForest(contamination=0.05, random_state=42)
model.fit(data_pca)

# Predict on the same data: -1 marks anomalies, 1 marks normal instances.
anomalies = model.predict(data_pca)

# Store the label next to the original features (one label per row).
dataframe['anomaly'] = anomalies

In [None]:
# Select the rows labelled -1 (anomalous) and show them.
is_anomaly = dataframe['anomaly'] == -1
anomalies = dataframe[is_anomaly]
display(anomalies)

# Optional visual check: scatter the first two principal components,
# coloured by the anomaly label of each row.
first_component = data_pca[:, 0]
second_component = data_pca[:, 1]

ax = plt.gca()
ax.scatter(first_component, second_component, c=dataframe['anomaly'], cmap='coolwarm')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('Anomaly Detection using Isolation Forest')
plt.show()