In [1]:
# `plt` must be the pyplot interface: the cells below call plt.figure / plt.subplots /
# plt.show, which do not exist as functions on the top-level `matplotlib` package.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pygwalker as pyg
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from scipy.stats import kurtosis, norm, skew
from tabulate import tabulate

In [5]:
# Read the training and test splits from CSV files in the working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./train_data.csv", "./test_data.csv")
)

In [None]:
# Correlation is only computed over numeric columns. Select them explicitly so
# that object/string columns in `train_df` cannot make `DataFrame.corr` raise
# (recent pandas versions no longer drop non-numeric columns silently).
correlation_matrix = train_df.select_dtypes(include="number").corr()

# Explicit figure/axes so the heatmap and its title target the same Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title('Matriz de correlaciones')
plt.show()

In [None]:
def test_normalidad(data, p_thres = 0.05):
    mean, std = norm.fit(data)

    if std == 0:
        return 'No normal', 1e-8

    normal = norm(loc = mean, scale = std)
    _, p_value = stats.kstest(data, normal.cdf)

    if p_value > p_thres:
        normality = "Normal"
    else:
        normality = "No normal"

    return normality, p_value

In [None]:
def _numeric_summary(series):
    """Build the statistics column for one numeric Series."""
    # Shape statistics and the normality test are computed on observed values
    # only; a single NaN would otherwise turn skewness/kurtosis into NaN.
    observed = series.dropna()

    summary = series.describe(percentiles=[0.25, 0.75])
    summary['skewness'] = skew(observed)
    summary['kurtosis'] = kurtosis(observed)
    # An all-null column has nothing to test; label it like other degenerate cases.
    summary['normalness'] = test_normalidad(observed, 0.05)[0] if len(observed) else 'No normal'
    summary['uniques'] = series.nunique()
    summary['null count'] = series.isnull().sum()
    # dtype.type is the scalar class of the column's elements; unlike `series[0]`
    # it does not depend on the index containing label 0 or on the column
    # being non-empty.
    summary['data type'] = series.dtype.type
    return summary


def _categorical_summary(series):
    """Build the statistics column for one non-numeric Series."""
    modes = series.mode()
    return {
        'count': series.count(),
        'unique': series.nunique(),
        # mode() is empty when every value is null; report no top value then.
        'top': modes.iloc[0] if not modes.empty else None,
    }


def dataframe_statistics(dataframe):
    """Print a grid table of per-column summary statistics.

    Numeric columns get ``describe()`` statistics (25th/75th percentiles) plus
    skewness, kurtosis, a normality label from ``test_normalidad``, the number
    of unique values, the null count and the element type. Other columns get
    count, number of unique values and the most frequent value.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        The table is printed with ``tabulate``.
    """
    resultados = {}

    for col in dataframe.columns:
        column = dataframe[col]
        if pd.api.types.is_numeric_dtype(column):
            resultados[col] = _numeric_summary(column)
        else:
            resultados[col] = _categorical_summary(column)

    # Display the results in a table format
    print(tabulate(pd.DataFrame(resultados), headers='keys', tablefmt='grid'))

In [None]:
def plot_function(data, type, max_plots=16, num_cols=4):
    """Draw one QQ plot or box plot per numeric column of ``data`` in a grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric columns are plotted; other columns are ignored.
    type : str
        ``'QQPlot'`` or ``'BoxPlot'``. (The name shadows the builtin but is
        kept for compatibility with existing callers.)
    max_plots : int, default 16
        Maximum number of columns to plot; extra numeric columns are skipped.
    num_cols : int, default 4
        Number of subplot columns in the grid.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when
        ``data`` has no numeric columns.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported plot kinds.
    """
    supported = ('QQPlot', 'BoxPlot')
    if type not in supported:
        # Previously an unknown kind silently produced a grid of empty axes.
        raise ValueError(f"type must be one of {supported}, got {type!r}")

    # Size the grid from the columns that will actually be plotted, so that
    # non-numeric columns do not leave blank axes behind.
    numeric_cols = list(data.select_dtypes(include=[np.number]).columns)[:max_plots]
    num_plots = len(numeric_cols)
    if num_plots == 0:
        return

    num_rows = (num_plots + num_cols - 1) // num_cols

    # squeeze=False guarantees a 2-D array of Axes for every grid shape.
    fig, axs = plt.subplots(num_rows, num_cols, figsize=(10, 10), squeeze=False)
    axs = axs.ravel()

    for ax, col in zip(axs, numeric_cols):
        if type == 'QQPlot':
            # fit=True fits a distribution to the sample, which fails on NaN.
            sm.qqplot(data[col].dropna(), line='r', ax=ax, fit=True)
            ax.set_xlabel("Theoretical Quantiles")
            ax.set_ylabel("Sample Quantiles")
            ax.set_title(f"{col}")
        else:
            sns.boxplot(y=data[col], orient="v", ax=ax)
            ax.set_xlabel("")
            ax.set_ylabel(col)
            ax.set_title(f"Boxplot de {col}")

    # Remove unused subplots
    for ax in axs[num_plots:]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()