In [14]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# Set up directory paths
main_dir = "C:/Users/reicd/Downloads/MY_ML_PROJECT/data/raw"
plots_dir_quantiles = "C:/Users/reicd/Downloads/MY_ML_PROJECT/plots/feature quantile label split"
plots_dir_histograms = "C:/Users/reicd/Downloads/MY_ML_PROJECT/plots/histograms"

# Create directories if they don't exist
Path(plots_dir_quantiles).mkdir(parents=True, exist_ok=True)
Path(plots_dir_histograms).mkdir(parents=True, exist_ok=True)

# Read the data
df_path = os.path.join(main_dir, "final_combined_data_with_metadata.csv")
df = pd.read_csv(df_path)

# Ensure all columns are numbers for calculation
df = df.apply(pd.to_numeric, errors='ignore')

# Remove categorical columns
numeric_features = df.columns.difference(['phase', 'Phase', 'Subject', 'Story'])

def plot_feature_quantile_split(df, numeric_features, plots_dir_quantiles):
    num_features = len(numeric_features)
    grid_size_quantiles = 5  # Number of features per grid for quantile plots
    num_grids_quantiles = int(np.ceil(num_features / grid_size_quantiles))

    for i in range(num_grids_quantiles):
        selected_features = numeric_features[i*grid_size_quantiles:(i+1)*grid_size_quantiles]
        num_selected = len(selected_features)

        # Create figure for the current grid
        fig, ax = plt.subplots(nrows=1, ncols=5, figsize=(25, 5))  # Adjust 1x5 grid
        ax = ax.flatten()

        for j, feature in enumerate(selected_features):
            # Compute quartiles
            quantiles = df[feature].quantile([0, 0.25, 0.50, 0.75, 1.0])
            bins = [quantiles[0], quantiles[0.25], quantiles[0.50], quantiles[0.75], quantiles[1.0]]

            # Format bin labels with 4 decimal places
            bin_labels = [f"{x:.4f}" for x in bins]

            # Bin data
            df['quantile_bin'] = pd.cut(df[feature], bins=bins, labels=bin_labels[:-1], include_lowest=True)

            # Calculate label distribution proportions
            label_counts = df.groupby(['quantile_bin', 'phase']).size()
            bin_totals = label_counts.groupby(level=0).sum()
            label_proportions = (label_counts / bin_totals).unstack().fillna(0)

            # Plot stacked bar chart
            label_proportions.plot(
                kind='bar', stacked=True, ax=ax[j], alpha=0.75, width=0.8
            )
            ax[j].set_title(feature)
            ax[j].set_xlabel('Quartile Range')
            ax[j].set_ylabel('Proportion within Quartile')
            ax[j].legend(title='Phase', bbox_to_anchor=(1, 1))

        # Hide any unused subplots
        for j in range(num_selected, len(ax)):
            ax[j].axis('off')

        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(os.path.join(plots_dir_quantiles, f"feature_quantile_split_grid_{i+1}.png"), bbox_inches='tight')
        plt.close()

def plot_feature_histograms(df, numeric_features, plots_dir_histograms):
    num_features = len(numeric_features)
    grid_size_histograms = 8  # Number of features per grid for histograms
    num_grids_histograms = int(np.ceil(num_features / grid_size_histograms))

    for i in range(num_grids_histograms):
        selected_features = numeric_features[i*grid_size_histograms:(i+1)*grid_size_histograms]
        num_selected = len(selected_features)

        # Create figure for the current grid
        fig, ax = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))  # Adjust 2x4 grid
        ax = ax.flatten()

        for j, feature in enumerate(selected_features):
            # Plot histogram
            ax[j].hist(df[feature].dropna(), bins=30, alpha=0.75)
            ax[j].set_title(feature)
            ax[j].set_xlabel('Value')
            ax[j].set_ylabel('Frequency')

        # Hide any unused subplots
        for j in range(num_selected, len(ax)):
            ax[j].axis('off')

        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(os.path.join(plots_dir_histograms, f"feature_histograms_grid_{i+1}.png"), bbox_inches='tight')
        plt.close()

# Execute the plotting functions
plot_feature_quantile_split(df, numeric_features, plots_dir_quantiles)
plot_feature_histograms(df, numeric_features, plots_dir_histograms)

  df = df.apply(pd.to_numeric, errors='ignore')
  label_counts = df.groupby(['quantile_bin', 'phase']).size()
  bin_totals = label_counts.groupby(level=0).sum()
  label_counts = df.groupby(['quantile_bin', 'phase']).size()
  bin_totals = label_counts.groupby(level=0).sum()
  label_counts = df.groupby(['quantile_bin', 'phase']).size()
  bin_totals = label_counts.groupby(level=0).sum()
  label_counts = df.groupby(['quantile_bin', 'phase']).size()
  bin_totals = label_counts.groupby(level=0).sum()
  label_counts = df.groupby(['quantile_bin', 'phase']).size()
  bin_totals = label_counts.groupby(level=0).sum()
  label_counts = df.groupby(['quantile_bin', 'phase']).size()
  bin_totals = label_counts.groupby(level=0).sum()
  label_counts = df.groupby(['quantile_bin', 'phase']).size()
  bin_totals = label_counts.groupby(level=0).sum()
  label_counts = df.groupby(['quantile_bin', 'phase']).size()
  bin_totals = label_counts.groupby(level=0).sum()
  label_counts = df.groupby(['quantile_bin', 'ph