In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# Set up directory paths
main_dir = "C:/Users/reicd/Downloads/MY_ML_PROJECT/data/raw"
processed_dir = "C:/Users/reicd/Downloads/MY_ML_PROJECT/data/processed"
output_dir = "C:/Users/reicd/Downloads/MY_ML_PROJECT/plots/CorrelationMatrices"

# Create the plot output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Read the original data
df_path = os.path.join(main_dir, "final_combined_data_with_metadata.csv")
df = pd.read_csv(df_path)

# Numeric feature columns only; the listed identifier/metadata columns are
# excluded even when they are stored as numbers.
numerical_columns = df.select_dtypes(include=[np.number]).columns.difference(['phase', 'Phase', 'Subject', 'Story'])

# Absolute per-column z-scores.
# Missing values are skipped (nan_policy='omit') rather than replaced by 0:
# zero-filling shifts each column's mean, distorts its standard deviation and
# can make the imputed cells themselves look like outliers. A missing cell
# gets a NaN z-score, and NaN > 3 is False, so it never counts as an outlier.
# When the data contains no NaNs the result is the same as a plain zscore().
z_scores = np.abs(
    zscore(df[numerical_columns].to_numpy(dtype=float), nan_policy='omit')
)

# Count outlier features per record (|z| > 3 in a given column)
outlier_feature_count = (z_scores > 3).sum(axis=1)

# Create datasets based on outlier thresholds: each entry keeps the rows whose
# outlier-feature count is below the stated limit.
datasets = {
    "3_or_more_outliers_dropped": df[outlier_feature_count < 3],
    "2_or_more_outliers_dropped": df[outlier_feature_count < 2],
    "all_outliers_dropped": df[outlier_feature_count == 0]
}

# Function to plot correlation matrix
def plot_correlation_matrix(dataframe, title, save_path_png, save_path_pdf):
    """Draw an annotated correlation heatmap and save it as PNG and PDF.

    Args:
        dataframe: DataFrame whose pairwise column correlations
            (``dataframe.corr()``) are plotted.
        title: Title placed above the heatmap.
        save_path_png: Destination path of the PNG image.
        save_path_pdf: Destination path of the PDF document.

    The figure is always closed before returning, including when drawing or
    saving raises, so repeated calls do not accumulate open figures.
    Note that ``sns.set(style='white')`` changes seaborn's global theme and
    that change persists after the call.
    """
    # Calculating correlation matrix
    corr = dataframe.corr()

    # Keep a handle on the figure so it can be saved and closed explicitly
    # instead of relying on pyplot's "current figure".
    fig = plt.figure(figsize=(12, 10))
    try:
        sns.set(style='white')

        # No ax is passed, so the heatmap draws on the current axes of `fig`.
        ax = sns.heatmap(
            corr, annot=True, fmt=".2f", cmap='coolwarm',
            annot_kws={"size": 5},  # Small font so the cell numbers fit
            cbar_kws={"shrink": .8}, vmin=-1, vmax=1, center=0
        )
        ax.set_title(title)

        # Save the plot as PNG and PDF
        fig.tight_layout()
        fig.savefig(save_path_png, bbox_inches='tight')
        fig.savefig(save_path_pdf, format='pdf', bbox_inches='tight')
    finally:
        # Release the figure even if plotting or saving failed.
        plt.close(fig)

# Render and save one correlation heatmap (PNG + PDF) per filtered dataset
for name, dataset in datasets.items():
    # Output locations first: both files live in output_dir, named after the dataset key
    plot_save_path_png = os.path.join(output_dir, f'{name}_correlation_matrix.png')
    plot_save_path_pdf = os.path.join(output_dir, f'{name}_correlation_matrix.pdf')

    # Human-readable title derived from the dataset key
    plot_title = f'Correlation Matrix ({name.replace("_", " ").title()})'

    # Only the numeric feature columns take part in the correlation
    processed_data = dataset.loc[:, numerical_columns]

    plot_correlation_matrix(
        processed_data,
        plot_title,
        plot_save_path_png,
        plot_save_path_pdf,
    )

    print(f"Correlation matrix plot saved to {plot_save_path_png} and {plot_save_path_pdf}")

Correlation matrix plot saved to C:/Users/reicd/Downloads/MY_ML_PROJECT/plots/CorrelationMatrices\3_or_more_outliers_dropped_correlation_matrix.png and C:/Users/reicd/Downloads/MY_ML_PROJECT/plots/CorrelationMatrices\3_or_more_outliers_dropped_correlation_matrix.pdf
Correlation matrix plot saved to C:/Users/reicd/Downloads/MY_ML_PROJECT/plots/CorrelationMatrices\2_or_more_outliers_dropped_correlation_matrix.png and C:/Users/reicd/Downloads/MY_ML_PROJECT/plots/CorrelationMatrices\2_or_more_outliers_dropped_correlation_matrix.pdf
Correlation matrix plot saved to C:/Users/reicd/Downloads/MY_ML_PROJECT/plots/CorrelationMatrices\all_outliers_dropped_correlation_matrix.png and C:/Users/reicd/Downloads/MY_ML_PROJECT/plots/CorrelationMatrices\all_outliers_dropped_correlation_matrix.pdf
