In [None]:
# === Pacotes necessários / Required packages ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.spatial.distance import mahalanobis

# Sem classe indexada / Without an indexed class column

In [None]:
# ===== 1. Read .XLSX data =====
# Loads the workbook into `df` and prints its first rows.
# "NAME.xlsx" is presumably a placeholder for the real file name - TODO confirm.
# The CSV cell that follows also assigns `df`; run only the cell that matches
# the format of the input file.
df = pd.read_excel(io="NAME.xlsx")
print(df.head())

In [None]:
# === Read .CSV data ===
# Semicolon-delimited file; the result replaces any `df` loaded before.
# NOTE(review): files that use ';' as separator often use ',' as the decimal
# mark. If numeric columns arrive as text, passing decimal=',' may be needed -
# confirm against the actual data.
df = pd.read_csv(filepath_or_buffer='NAME.csv', sep=';')
print(df.head())

In [None]:
# Keep only the numeric columns; every non-numeric column is dropped from `df`.
df = df.select_dtypes(include="number")

print(df.head())

In [None]:
# === Normalize and standardize ===
# df_pad: z-score standardization of every column via StandardScaler.
# The row index of `df` is passed through so df_pad stays aligned with df.
scaler_std = StandardScaler()
df_pad = pd.DataFrame(scaler_std.fit_transform(df), columns=df.columns, index=df.index)

# df_norm: min-max normalization, (x - min) / (max - min), column by column.
# A constant column has max == min, so the plain division would be 0/0 and
# fill the column with NaN. A zero range is replaced by 1, which maps such a
# column to 0.0 instead.
col_min = df.min()
col_range = (df.max() - col_min).replace(0, 1)
df_norm = (df - col_min) / col_range

In [None]:
# === Histogram plotting ===
# One figure per column: a density-scaled histogram of the non-missing values
# with the normal curve for the column's mean and standard deviation on top.
for col in df.columns:
    valores = df[col].dropna()
    mu = valores.mean()
    sigma = valores.std()

    # Normal density evaluated on 100 points spanning the observed range
    grade = np.linspace(valores.min(), valores.max(), 100)
    coef = 1/(sigma*np.sqrt(2*np.pi))
    curva = coef * np.exp(-0.5*((grade-mu)/sigma)**2)

    fig, ax = plt.subplots()
    ax.hist(valores, bins=20, density=True, color="blue", alpha=0.7)
    ax.plot(grade, curva, color="red", linewidth=2)

    ax.set_title(f"Hist of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Density")
    plt.show()


In [None]:
# === Scatter plots ===
# One figure per column: the values in row order, with red reference lines at
# the mean and at mean +/- 3 standard deviations.
for col in df.columns:
    serie = df[col]
    centro = serie.mean()
    faixa = 3*serie.std()
    teto = centro + faixa
    piso = centro - faixa

    fig, ax = plt.subplots()
    ax.scatter(range(len(serie)), serie, color="blue", s=20)
    for nivel in (centro, teto, piso):
        ax.axhline(nivel, color="red", linewidth=2)

    # Widen the y-axis so both the 3-sigma lines and the extreme points show
    ax.set_ylim(min(piso, serie.min()), max(teto, serie.max()))

    ax.set_title(f"Dispersion of {col}")
    ax.set_xlabel("Observation index")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# === Univariate outlier cleaning ===
# Drops every row whose value lies outside mean +/- 3 standard deviations in
# any column. The filter is applied column after column, so the limits of a
# column are computed from the rows that survived the previous columns.
# Rows where the column is NaN fail the range test and are dropped as well.
df_cleaned = df.copy()
for col in df.columns:
    serie = df_cleaned[col]
    centro = serie.mean()
    espalhamento = serie.std()
    dentro = serie.between(centro - 3*espalhamento, centro + 3*espalhamento)
    df_cleaned = df_cleaned[dentro]

In [None]:
# === Multivariate outliers (Mahalanobis distance) ===

# Mean vector and covariance matrix of the univariate-cleaned data.
# np.atleast_2d keeps the covariance a matrix when only one column is present
# (np.cov returns a 0-d array in that case, which np.linalg.inv rejects).
medias_sdi = df_cleaned.mean().values
cov_sdi = np.atleast_2d(np.cov(df_cleaned.values, rowvar=False))
# np.linalg.inv raises LinAlgError if the covariance matrix is singular
# (for example with a constant column or perfectly collinear columns).
inv_cov_sdi = np.linalg.inv(cov_sdi)

# Squared Mahalanobis distance of every observation to the mean vector.
# (A previous loop computed the unsquared distances and was immediately
# overwritten by this line, so it was removed.)
sdi = np.array([mahalanobis(x, medias_sdi, inv_cov_sdi)**2 for x in df_cleaned.values])

# Distance statistics: mean and mean +/- 3 standard deviations.
# ndarray.std uses ddof=0 (population standard deviation).
medias_sdi_val = sdi.mean()
desvpad_sdi = sdi.std()
cortesup = medias_sdi_val + 3*desvpad_sdi
corteinf = medias_sdi_val - 3*desvpad_sdi

# Plot the distances with the mean (solid) and both cut-offs (dashed)
plt.figure()
plt.scatter(range(len(sdi)), sdi, c="blue", s=20)
plt.axhline(medias_sdi_val, color="red", linewidth=3)
plt.axhline(cortesup, color="red", linestyle="--", linewidth=2)
plt.axhline(corteinf, color="red", linestyle="--", linewidth=2)
plt.title("Statistical Distances - Mahalanobis")
plt.xlabel("Observation")
plt.ylabel("Statistical Distances")
plt.show()

# Remove multivariate outliers: only the upper cut-off is applied, the lower
# one is drawn on the plot but not used for filtering.
df_cleaned = df_cleaned[sdi <= cortesup]

In [None]:
# === Post-cleaning view ===
# Same scatter plot as before the cleaning, now drawn from df_cleaned: values
# in row order plus red lines at the mean and at mean +/- 3 standard deviations
# (all recomputed from the cleaned data).
for col in df_cleaned.columns:
    serie = df_cleaned[col]
    centro = serie.mean()
    faixa = 3*serie.std()
    teto = centro + faixa
    piso = centro - faixa

    fig, ax = plt.subplots()
    ax.scatter(range(len(serie)), serie, color="blue", s=20)
    for nivel in (centro, teto, piso):
        ax.axhline(nivel, color="red", linewidth=2)

    # The y-axis covers both the 3-sigma lines and the extreme points
    ax.set_ylim(min(piso, serie.min()), max(teto, serie.max()))

    ax.set_title(f"Dispersion of {col} (clean data)")
    ax.set_xlabel("Observation index")
    ax.set_ylabel(col)
    plt.show()


In [None]:
# === Save to Excel ===
# Writes the cleaned data to "Dados_limpos.xlsx" without the index column.
df_cleaned.to_excel(excel_writer="Dados_limpos.xlsx", index=False)


In [None]:
# === Save to CSV ===
# Writes the cleaned data to "Dados_limpos.csv" without the index column.
df_cleaned.to_csv(path_or_buf="Dados_limpos.csv", index=False)

# Com classe indexada / With an indexed class column (first column)

In [None]:
# ===== 1. Read .XLSX data =====
# Loads the workbook into `df` and prints its first rows.
# The CSV cell that follows also assigns `df`; run only the cell that matches
# the format of the input file.
df = pd.read_excel(io="generated_data.xlsx")
print(df.head())

In [None]:
# === Read .CSV data ===
# Semicolon-delimited file; the result replaces any `df` loaded before.
# NOTE(review): files that use ';' as separator often use ',' as the decimal
# mark. If numeric columns arrive as text, passing decimal=',' may be needed -
# confirm against the actual data.
df = pd.read_csv(filepath_or_buffer='generated_data.csv', sep=';')
print(df.head())

In [None]:
# Separate class and numeric data:
#   classe - the first column of df, taken as the class label of each row
#   X      - the numeric columns among the remaining ones
classe = df.iloc[:, 0]
X = df.iloc[:, 1:].select_dtypes(include="number")

print(X.head())

In [None]:
# === Normalize and standardize ===
# df_pad: z-score standardization of every column via StandardScaler.
# The row index of `X` is passed through so df_pad stays aligned with X.
scaler_std = StandardScaler()
df_pad = pd.DataFrame(scaler_std.fit_transform(X), columns=X.columns, index=X.index)

# df_norm: min-max normalization, (x - min) / (max - min), column by column.
# A constant column has max == min, so the plain division would be 0/0 and
# fill the column with NaN. A zero range is replaced by 1, which maps such a
# column to 0.0 instead.
col_min = X.min()
col_range = (X.max() - col_min).replace(0, 1)
df_norm = (X - col_min) / col_range

In [None]:
# === Histogram plotting ===
# One figure per feature column: a density-scaled histogram of the non-missing
# values with the normal curve for the column's mean and standard deviation.
for col in X.columns:
    valores = X[col].dropna()
    mu = valores.mean()
    sigma = valores.std()

    # Normal density evaluated on 100 points spanning the observed range
    grade = np.linspace(valores.min(), valores.max(), 100)
    coef = 1/(sigma*np.sqrt(2*np.pi))
    curva = coef * np.exp(-0.5*((grade-mu)/sigma)**2)

    fig, ax = plt.subplots()
    ax.hist(valores, bins=20, density=True, color="blue", alpha=0.7)
    ax.plot(grade, curva, color="red", linewidth=2)
    ax.set_title(f"Hist of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Density")
    plt.show()

In [None]:
# === Scatter plots ===
# One figure per feature column: the values in row order, with red reference
# lines at the mean and at mean +/- 3 standard deviations.
for col in X.columns:
    serie = X[col]
    centro = serie.mean()
    faixa = 3*serie.std()
    teto = centro + faixa
    piso = centro - faixa

    fig, ax = plt.subplots()
    ax.scatter(range(len(serie)), serie, color="blue", s=20)
    for nivel in (centro, teto, piso):
        ax.axhline(nivel, color="red", linewidth=2)
    # Widen the y-axis so both the 3-sigma lines and the extreme points show
    ax.set_ylim(min(piso, serie.min()), max(teto, serie.max()))
    ax.set_title(f"Dispersion of {col}")
    ax.set_xlabel("Observation index")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# === Univariate outlier cleaning ===
# Drops every row whose value lies outside mean +/- 3 standard deviations in
# any feature column. The filter is applied column after column, so the limits
# of a column are computed from the rows that survived the previous columns.
# Rows where the column is NaN fail the range test and are dropped as well.
# Index labels are kept, which lets the class column be matched back later.
df_cleaned = X.copy()
for col in X.columns:
    serie = df_cleaned[col]
    centro = serie.mean()
    espalhamento = serie.std()
    dentro = serie.between(centro - 3*espalhamento, centro + 3*espalhamento)
    df_cleaned = df_cleaned[dentro]

In [None]:
# === Multivariate outliers (Mahalanobis distance) ===

# Mean vector and covariance matrix of the univariate-cleaned data.
# np.atleast_2d keeps the covariance a matrix when only one column is present
# (np.cov returns a 0-d array in that case, which np.linalg.inv rejects).
medias_sdi = df_cleaned.mean().values
cov_sdi = np.atleast_2d(np.cov(df_cleaned.values, rowvar=False))
# np.linalg.inv raises LinAlgError if the covariance matrix is singular
# (for example with a constant column or perfectly collinear columns).
inv_cov_sdi = np.linalg.inv(cov_sdi)

# Squared Mahalanobis distance of every observation to the mean vector.
# (A previous loop computed the unsquared distances and was immediately
# overwritten by this line, so it was removed.)
sdi = np.array([mahalanobis(x, medias_sdi, inv_cov_sdi)**2 for x in df_cleaned.values])

# Distance statistics: mean and mean +/- 3 standard deviations.
# ndarray.std uses ddof=0 (population standard deviation).
medias_sdi_val = sdi.mean()
desvpad_sdi = sdi.std()
cortesup = medias_sdi_val + 3*desvpad_sdi
corteinf = medias_sdi_val - 3*desvpad_sdi

# Plot the distances with the mean (solid) and both cut-offs (dashed)
plt.figure()
plt.scatter(range(len(sdi)), sdi, c="blue", s=20)
plt.axhline(medias_sdi_val, color="red", linewidth=3)
plt.axhline(cortesup, color="red", linestyle="--", linewidth=2)
plt.axhline(corteinf, color="red", linestyle="--", linewidth=2)
plt.title("Statistical Distances (Mahalanobis)")
plt.xlabel("Observation")
plt.ylabel("Statistical distance")
plt.show()

# Remove multivariate outliers: only the upper cut-off is applied, the lower
# one is drawn on the plot but not used for filtering.
df_cleaned = df_cleaned[sdi <= cortesup]

In [None]:
# === Post-cleaning view ===
# Same scatter plot as before the cleaning, now drawn from df_cleaned: values
# in row order plus red lines at the mean and at mean +/- 3 standard deviations
# (all recomputed from the cleaned data).
for col in df_cleaned.columns:
    serie = df_cleaned[col]
    centro = serie.mean()
    faixa = 3*serie.std()
    teto = centro + faixa
    piso = centro - faixa

    fig, ax = plt.subplots()
    ax.scatter(range(len(serie)), serie, color="blue", s=20)
    for nivel in (centro, teto, piso):
        ax.axhline(nivel, color="red", linewidth=2)
    # The y-axis covers both the 3-sigma lines and the extreme points
    ax.set_ylim(min(piso, serie.min()), max(teto, serie.max()))
    ax.set_title(f"Dispersion of {col} (Clean Data)")
    ax.set_xlabel("Observation index")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# === Add class column back ===
# The class labels of the surviving rows are looked up by index label, which
# the outlier filters kept unchanged.
rotulos = classe.loc[df_cleaned.index]
df_cleaned["Classe"] = rotulos

# Move 'Classe' to the front; the other columns keep their current order
demais = [nome for nome in df_cleaned.columns if nome != 'Classe']
cols = ['Classe', *demais]
df_cleaned = df_cleaned[cols]

In [None]:
# === Save to Excel ===
# Writes the cleaned data (class column first) to "Dados_limpos.xlsx" without
# the index column.
df_cleaned.to_excel(excel_writer="Dados_limpos.xlsx", index=False)

In [None]:
# === Save to CSV ===
# Writes the cleaned data (class column first) to "Dados_limpos.csv" without
# the index column.
df_cleaned.to_csv(path_or_buf="Dados_limpos.csv", index=False)