In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.colors import ListedColormap
from scipy.stats import shapiro, anderson, boxcox
import numpy as np
import math
import seaborn as sns

In [None]:
# Load dataset/base1.csv ... dataset/base9.csv and stack them into one frame.
# NOTE: `dir` shadows the built-in of the same name; the name is kept as-is
# because other cells may read it.
dir = 'dataset/'
index = list(range(1, 10))
dfs = []
for i in index:
    df = pd.read_csv(f'{dir}base{i}.csv')
    dfs.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
df_all = pd.concat(dfs, ignore_index=True)

In [None]:
def printBoxPlot(df_data, l_remove):
    """Draw one box plot per column of ``df_data`` on a three-column grid.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Data to plot; missing values are dropped per column before plotting.
    l_remove : collection of str
        Column names to leave out. Names that are not columns of
        ``df_data`` are simply ignored.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``; nothing is drawn when
        no column is left to plot.
    """
    n_grid_cols = 3

    # Resolve the plotted columns first so the grid is sized from what is
    # actually drawn. Sizing from len(l_remove) under-allocates axes when
    # l_remove names a column that does not exist in df_data.
    columns = [column for column in df_data.columns if column not in l_remove]
    if not columns:
        # plt.subplots cannot build a grid with zero rows.
        return

    num_rows = math.ceil(len(columns) / n_grid_cols)
    fig, axs = plt.subplots(num_rows, n_grid_cols, figsize=(15, 5 * num_rows))
    axs = axs.flatten()

    for ax, column in zip(axs, columns):
        ax.boxplot(df_data[column].dropna())
        ax.set_title(f'{column}')
        ax.set_xlabel(f'{column}')

    # Remove the unused axes left over in the last grid row.
    for ax in axs[len(columns):]:
        fig.delaxes(ax)

    plt.tight_layout()

    # Show the figure
    plt.show()

In [None]:
# Box plots for every column of the combined dataset except 'label'.
printBoxPlot(df_all, ['label'])

In [None]:
# Summary statistics for the whole dataset, then one table per label value.
separator = "\n" + "="*50 + "\n"

print("Descriptive statistics for all dataset:")
# Only the last expression of a cell is echoed, so a bare describe() call
# in the middle of the cell shows nothing; print it explicitly.
print(df_all.describe())
print(separator)

for label in [0, 1, 2]:
    print(f"Descriptive statistics for label {label}:")
    print(df_all[df_all['label'] == label].describe())
    print(separator)

In [None]:
# Per-column skewness and kurtosis of the combined dataset, side by side.
skewness = df_all.skew()
kurtosis = df_all.kurt()
stats = pd.concat({'Skewness': skewness, 'Kurtosis': kurtosis}, axis=1)
stats

In [None]:
def printHistogram(df_data, l_remove):
    """Draw one 30-bin histogram per column of ``df_data`` on a three-column grid.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Data to plot; missing values are dropped per column before plotting.
    l_remove : collection of str
        Column names to leave out. Names that are not columns of
        ``df_data`` are simply ignored.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``; nothing is drawn when
        no column is left to plot.
    """
    n_grid_cols = 3

    # Resolve the plotted columns first so the grid is sized from what is
    # actually drawn. Sizing from len(l_remove) under-allocates axes when
    # l_remove names a column that does not exist in df_data.
    columns = [column for column in df_data.columns if column not in l_remove]
    if not columns:
        # plt.subplots cannot build a grid with zero rows.
        return

    num_rows = math.ceil(len(columns) / n_grid_cols)
    fig, axs = plt.subplots(num_rows, n_grid_cols, figsize=(15, 5 * num_rows))
    axs = axs.flatten()

    for ax, column in zip(axs, columns):
        ax.hist(df_data[column].dropna(), bins=30, edgecolor='black')
        ax.set_title(f'{column}')
        ax.set_xlabel(f'{column}')
        ax.set_ylabel('Frequency')

    # Remove the unused axes left over in the last grid row.
    for ax in axs[len(columns):]:
        fig.delaxes(ax)

    plt.tight_layout()

    plt.show()

# Histograms for every column except 'label' and 'SMA pressure forecast'.
printHistogram(df_all, ['label', 'SMA pressure forecast'])

In [None]:
# Shapiro-Wilk test on every column; a column is flagged 'Normal' when its
# p-value is above 0.05.
# NOTE(review): `df_all_flow` is not defined in the visible part of the
# notebook -- confirm which cell creates it before running on a fresh kernel.
result = []

for column in df_all_flow.columns:
    stat, p = shapiro(df_all_flow[column].dropna())
    row = {'Column': column, 'Stat': stat, 'p-value': p}
    row['Normal'] = p > 0.05
    result.append(row)

result_df = pd.DataFrame(result)

print(result_df)

In [None]:
def plotCorrelation(correlation_matrix):
    """Show ``correlation_matrix`` as an annotated, square-celled seaborn heatmap.

    Cell values are written with two decimals on the 'coolwarm' colormap.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    heatmap_options = dict(annot=True, cmap='coolwarm', square=True, fmt='.2f')
    sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
    ax.set_title('Matriz de Correlação Geral')
    plt.show()

# Heatmap of the pairwise column correlations of the combined dataset.
correlation_matrix = df_all.corr()
plotCorrelation(correlation_matrix)

In [None]:
dataframes = []

# Reuse the frames already loaded into `dfs` instead of re-reading each CSV.
# NOTE(review): this cell previously looped over `idBase`, a name that is
# never defined in the visible notebook (NameError on a fresh kernel). It is
# assumed to cover the same files as `index` -- confirm.
for raw_df in dfs:
    # drop() returns a new frame, so the entries of `dfs` stay untouched.
    df = raw_df.drop(columns=['number data'])
    dataframes.append(df)

total_df = pd.concat(dataframes, ignore_index=True)

# Min-max scale every dataset with the extrema of the combined data so the
# per-dataset plots share a common scale.
min_values = total_df.min()
max_values = total_df.max()
# A constant column has max == min; use 1 as its divisor so it maps to 0
# instead of producing NaN from a 0/0 division.
value_range = (max_values - min_values).replace(0, 1)
normalized_dfs = [(df - min_values) / value_range for df in dataframes]

# Text shown on the plots for each value of the 'label' column.
description_map = {0: 'Activation', 1: 'Reaction', 2: 'Irregularity'}

In [None]:
def plot_data_1(dataset_idx, df, normalized_df):
    """Line-plot every normalized series of one dataset and mark label changes.

    Parameters
    ----------
    dataset_idx : int
        Dataset number, shown zero-padded to two digits in the title.
    df : pandas.DataFrame
        Frame with a 'label' column. Every label value must be a key of the
        module-level ``description_map``, otherwise ``KeyError`` is raised.
    normalized_df : pandas.DataFrame
        Frame whose columns, except 'label', are each drawn as one line.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(10, 5))

    # Cycle through the tab10 palette so frames with more columns than the
    # palette has colors still get a valid color for every series.
    palette = plt.colormaps.get_cmap('tab10').colors
    for idx, col in enumerate(normalized_df.columns):
        if col != 'label':
            ax.plot(normalized_df[col], label=col, color=palette[idx % len(palette)], alpha=0.7)

    # Height at which the label descriptions are written; computed once
    # because it does not change while walking the rows.
    text_y = normalized_df.max().max()

    previous_value = None
    for position, value in enumerate(df['label']):
        if position == 0 or value != previous_value:
            if position > 0:
                # Dashed vertical line where the label value changes.
                ax.axvline(x=position, color='black', linestyle='--', alpha=0.5)
            ax.text(position + 6, text_y, description_map[value],
                    rotation=0, verticalalignment='bottom', horizontalalignment='left', fontsize=8, color='black')
        previous_value = value

    ax.grid(True, linestyle='--', alpha=0.7)

    ax.set_title(f'Dataset {dataset_idx:02d}')
    ax.set_xlabel('Number Data')
    ax.set_ylabel('Value')
    ax.legend(bbox_to_anchor=(1.0, 1), loc='upper left', fontsize=8)
    fig.tight_layout()
    plt.show()

# One line plot per dataset.
# NOTE(review): `dfCluster` is not defined in the visible part of the
# notebook; presumably its 'id', 'dfDisp' and 'dataNorm' entries hold the
# dataset ids, raw frames and normalized frames -- confirm.
for dataset_idx, df, normalized_df in zip(dfCluster['id'], dfCluster['dfDisp'], dfCluster['dataNorm']):
    plot_data_1(dataset_idx, df, normalized_df)

In [None]:
def plot_data_2(dataset_idx, df, normalized_df):
    """Scatter-plot every normalized series of one dataset and mark label changes.

    Parameters
    ----------
    dataset_idx : int
        Dataset number, shown zero-padded to two digits in the title.
    df : pandas.DataFrame
        Frame with a 'label' column. Every label value must be a key of the
        module-level ``description_map``, otherwise ``KeyError`` is raised.
    normalized_df : pandas.DataFrame
        Frame whose columns, except 'label', are each drawn as one scatter
        series against the row position.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(10, 5))

    # Cycle through the tab10 palette so frames with more columns than the
    # palette has colors still get a valid color for every series.
    palette = plt.colormaps.get_cmap('tab10').colors
    for idx, col in enumerate(normalized_df.columns):
        if col != 'label':
            ax.scatter(range(len(normalized_df[col])), normalized_df[col], label=col,
                       color=palette[idx % len(palette)], s=1, alpha=0.7)

    # Height at which the label descriptions are written; computed once
    # because it does not change while walking the rows.
    text_y = normalized_df.max().max()

    previous_value = None
    for position, value in enumerate(df['label']):
        if position == 0 or value != previous_value:
            if position > 0:
                # Dashed vertical line where the label value changes.
                ax.axvline(x=position, color='black', linestyle='--', alpha=0.5)
            ax.text(position + 6, text_y, description_map[value],
                    rotation=0, verticalalignment='bottom', horizontalalignment='left', fontsize=8, color='black')
        previous_value = value

    ax.grid(True, linestyle='--', alpha=0.7)

    ax.set_title(f'Dataset {dataset_idx:02d}')
    ax.set_xlabel('Number Data')
    ax.set_ylabel('Value')
    ax.legend(bbox_to_anchor=(1.0, 1), loc='upper left', fontsize=8)
    fig.tight_layout()
    plt.show()

# One scatter plot per dataset, numbered from 1 in load order. Numbering
# with enumerate keeps the loop in step with however many frames were
# loaded instead of truncating at a hard-coded list of nine ids.
for dataset_idx, (df, normalized_df) in enumerate(zip(dataframes, normalized_dfs), start=1):
    plot_data_2(dataset_idx, df, normalized_df)