In [1]:
import seaborn as sns
import pickle
import sys
import os
from influxdb_client import InfluxDBClient
from dotenv import load_dotenv
sys.path.insert(1, '../library')  # noqa: E402
import database_helper  # noqa: E402

In [None]:
# Load variables from a .env file (if one is found) into the environment
# so the INFLUXDB2_* lookups below can see them.
load_dotenv()

# InfluxDB connection settings.
# The hostname below may need an entry in /etc/hosts to resolve.
url = 'http://tig.influxdb.local'
token = os.environ.get('INFLUXDB2_TOKEN')
org = os.environ.get('INFLUXDB2_ORGANIZATION')
bucket = os.environ.get('INFLUXDB2_BUCKET')

client = InfluxDBClient(url=url, token=token, org=org)
query_api = client.query_api()

# NOTE(review): query_data() is not handed the client/query_api created
# above; how it connects and the unit of query_range are defined in
# ../library/database_helper -- confirm there.
dataframe = database_helper.query_data(query_range=0.2)

In [None]:
# Sanity check of the query result: element count, type, shape and a
# preview of the first rows.
print(f"Database Query returned: {len(dataframe)} elements!")
display(
    f'Type: {type(dataframe)}',
    f'Dataframe Shape: {dataframe.shape}',
    dataframe.head(),
)

In [None]:
# TODO: use polynomial instead of the default linear interpolation

# Remove every column whose name contains 'forecast'. A non-mutating drop
# keeps this cell safe to re-run and leaves the queried frame untouched.
cols_to_drop = dataframe.columns[dataframe.columns.str.contains('forecast')]
trimmed = dataframe.drop(columns=cols_to_drop)

# Interpolate only the float64 columns, which are the ones kept below.
# Calling interpolate() on the whole frame also touches object columns,
# which pandas deprecated in 2.1 and which fails outright when no numeric
# column is present. Interpolation works column by column, so the values
# of the float columns are the same as when interpolating the full frame.
float_features = trimmed.select_dtypes(include='float64').interpolate()

# Overview of the dtypes present, followed by a preview per dtype.
print(trimmed.dtypes.value_counts())
display(float_features.head(5))
display(trimmed.select_dtypes(include='object').head(5))
display(trimmed.select_dtypes(include='datetime64[ns, UTC]').head(5))
display(trimmed.select_dtypes(include='int64').head(5))

# From here on only the interpolated float64 columns are analysed.
dataframe = float_features

In [None]:
# Boolean masks for missing entries and for exact zeros, computed once and
# reused for both the yes/no answer and the total count.
null_mask = dataframe.isnull()
zero_mask = dataframe == 0

nan_or_zero_present = null_mask.values.any() or zero_mask.values.any()
print("Are there any NaN or zero values in the DataFrame?")
print(nan_or_zero_present)

# Sum per column first, then across columns, for each mask.
total_nan_or_zero_count = null_mask.sum().sum() + zero_mask.sum().sum()
print("\nTotal number of NaN or zero values in the DataFrame:")
print(total_nan_or_zero_count)

In [None]:
# Pairwise correlation between all columns, using the pandas default
# method.
corr_matrix = dataframe.corr()
print(type(corr_matrix))
display(corr_matrix)

# Draw the matrix as a heatmap; display() additionally echoes the object
# returned by seaborn.
corr_axes = sns.heatmap(corr_matrix)
display(corr_axes)

In [None]:
# Columns of the correlation matrix in which every entry is NaN.
all_nan_columns = corr_matrix.columns[corr_matrix.isna().all()]

# The correlation matrix is symmetric, so each all-NaN column has a
# matching all-NaN row. Drop the labels from both axes; dropping only the
# columns would leave empty rows in the heatmap.
df_without_all_nan = corr_matrix.drop(
    index=all_nan_columns, columns=all_nan_columns)
display(sns.heatmap(df_without_all_nan))

In [None]:
# Show the data behind the all-NaN correlations next to the remaining
# columns, then list the affected column labels.
nan_corr_data = dataframe[all_nan_columns]
remaining_data = dataframe.drop(columns=all_nan_columns)
display(nan_corr_data)
display(remaining_data)
print(all_nan_columns)

In [None]:
# Find the columns that hold exactly one distinct value; NaN is not
# counted as a value.
# NOTE(review): a column consisting only of NaN has zero distinct values
# and is therefore NOT selected here -- confirm that this is intended.
unique_counts = dataframe.nunique(dropna=True)
one_value_columns = dataframe.columns[unique_counts == 1]
print("\nColumns that only have one unique value:")
display(dataframe[one_value_columns])

# Everything except the single-value columns forms the feature set.
feature_selection_df = dataframe.drop(columns=one_value_columns)
display(feature_selection_df)
display(sns.heatmap(feature_selection_df.corr()))

print(type(feature_selection_df.columns), feature_selection_df.columns)

In [None]:
# Store the labels of the selected feature columns as a plain Python list
# in selected_columns.pkl (written to the current working directory).
with open('selected_columns.pkl', 'wb') as pickle_file:
    selected_columns = feature_selection_df.columns.to_list()
    pickle.dump(selected_columns, pickle_file)
