# In [None]:
import os
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis, zscore

# Set up directory paths
# NOTE: machine-specific absolute path; adjust when running elsewhere.
main_dir = "C:/Users/reicd/Downloads/MY_ML_PROJECT/data/raw"

# Read the data
df_path = os.path.join(main_dir, "final_combined_data_with_metadata.csv")
df = pd.read_csv(df_path)


def _to_numeric_if_possible(column):
    """Return *column* converted to a numeric dtype, or unchanged on failure.

    Replaces ``pd.to_numeric(column, errors='ignore')``, which recent pandas
    releases deprecate in favour of catching the conversion error explicitly.
    A column is converted only if every value can be parsed as a number.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Ensure all columns are numbers where possible
df = df.apply(_to_numeric_if_possible)

# Lists to store results (one dict per feature, filled by the loops below)
numerical_summary = []
categorical_summary = []

# Numeric Features
# Number of most extreme z-scores averaged at each tail of a feature.
TOP_N_EXTREMES = 300


def _summarize_numeric(feature, feature_data, top_n=TOP_N_EXTREMES):
    """Build the summary row for one numeric column.

    Args:
        feature: Column name, stored under the 'Feature' key.
        feature_data: The column's values as a Series, missing values
            already dropped.
        top_n: How many of the largest / smallest z-scores are averaged for
            the two tail columns (must be positive). If the column has fewer
            values than this, all of them are averaged.

    Returns:
        A dict mapping statistic name to value. For an empty column every
        statistic is NaN instead of raising.
    """
    # Compute on a plain float array. Depending on the SciPy version, zscore
    # can hand back a pandas Series for Series input; indexing that with
    # positional indices would be a label lookup, which goes wrong as soon as
    # dropna() has removed rows and labels no longer match positions.
    values = feature_data.to_numpy(dtype=float)

    if values.size == 0:
        skewness = excess_kurtosis = np.nan
        highest_z = lowest_z = np.nan
        top_average = bottom_average = np.nan
    else:
        # zscore uses the population std (ddof=0) by default, whereas the
        # 'Std' column below is pandas' sample std (ddof=1).
        z_scores = zscore(values)
        ordered = np.sort(z_scores)
        highest_z = z_scores.max()
        lowest_z = z_scores.min()
        top_average = ordered[-top_n:].mean()
        bottom_average = ordered[:top_n].mean()
        skewness = skew(values)
        excess_kurtosis = kurtosis(values)

    return {
        'Feature': feature,
        'Mean': feature_data.mean(),
        'Median': feature_data.median(),
        'Std': feature_data.std(),
        'Min': feature_data.min(),
        'Max': feature_data.max(),
        'Skewness': skewness,
        'Kurtosis': excess_kurtosis,
        'Highest Z-Score': highest_z,
        'Lowest Z-Score': lowest_z,
        f'Avg Z-Score Top {top_n}': top_average,
        f'Avg Z-Score Bottom {top_n}': bottom_average,
    }


# 'phase' / 'Phase' are left out here even when stored as numbers; they are
# reported with the categorical features instead.
numeric_features = df.select_dtypes(include=[np.number]).columns.difference(['phase', 'Phase'])
for feature in numeric_features:
    numerical_summary.append(_summarize_numeric(feature, df[feature].dropna()))

# Create DataFrame for numerical features
numerical_summary_df = pd.DataFrame(numerical_summary)

# Categorical Features
def _summarize_categorical(feature, feature_data):
    """Build the summary entry for one categorical column.

    Args:
        feature: Column name, stored under the 'Feature' key.
        feature_data: The column's values as a Series, missing values
            already dropped.

    Returns:
        A dict with the number of distinct values, the most frequent value
        (first one if several tie, None if the column is empty) and the
        percentage share of every value.
    """
    modes = feature_data.mode()
    shares = feature_data.value_counts(normalize=True) * 100
    return {
        'Feature': feature,
        'Unique Values': feature_data.nunique(),
        # mode() is empty when the column has no non-missing values.
        'Mode': modes.iloc[0] if not modes.empty else None,
        'Distribution': shares.to_dict(),
    }


# Treat 'phase' / 'Phase' as categorical even when stored as numbers, but
# only the spellings that actually exist in the data; adding a missing name
# would make df[feature] raise KeyError.
phase_columns = df.columns.intersection(['phase', 'Phase'])
categorical_features = df.select_dtypes(include=[object]).columns.union(phase_columns)
for feature in categorical_features:
    categorical_summary.append(_summarize_categorical(feature, df[feature].dropna()))

# Create DataFrame for categorical features
categorical_summary_df = pd.DataFrame(categorical_summary)

# Report: the numeric table first, then one section per categorical feature.
print("Numerical Feature Summary")
numeric_table = numerical_summary_df.to_string(index=False)
print(numeric_table)

print("\nCategorical Feature Summary")
for entry in categorical_summary:
    name = entry['Feature']
    n_unique = entry['Unique Values']
    most_common = entry['Mode']
    shares = entry['Distribution']

    print(f"\nFeature: {name}")
    print(f"Unique Values: {n_unique}")
    print(f"Mode: {most_common}")
    print("Distribution:")
    # Each share is a percentage, shown with two decimals.
    for category in shares:
        print(f"  {category}: {shares[category]:.2f}%")