In [1]:
import os
import pandas as pd
import numpy as np
from scipy.stats import zscore

# Absolute z-score above which a single feature value is counted as an outlier.
Z_SCORE_THRESHOLD = 3


def coerce_numeric_columns(frame):
    """Convert every column that parses cleanly as numeric; leave the rest as-is.

    This replaces ``frame.apply(pd.to_numeric, errors='ignore')``: the
    ``errors='ignore'`` option is deprecated in pandas (it emits a
    FutureWarning), so the "keep the column if it cannot be parsed" behaviour
    is spelled out with an explicit try/except instead.

    Parameters
    ----------
    frame : pd.DataFrame
        Input table; it is not modified.

    Returns
    -------
    pd.DataFrame
        New frame with the same columns in the same order.
    """
    def _numeric_or_unchanged(column):
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            # Column holds values that are not numbers: keep it untouched.
            return column

    return frame.apply(_numeric_or_unchanged)


def absolute_z_scores(frame, columns):
    """Return ``|z-score|`` of ``frame[columns]``, computed column-wise.

    Missing values are replaced with 0 before standardising.

    NOTE(review): filling with 0 pulls each column's mean/std towards 0 and
    lets the filled cells themselves be scored. It is kept so results stay
    identical to earlier runs; consider ``zscore(..., nan_policy='omit')``
    if missing values should not influence the statistics.
    """
    return np.abs(zscore(frame[columns].fillna(0)))


def split_by_outlier_count(frame, outlier_counts):
    """Build the three filtered datasets from a per-record outlier count.

    Parameters
    ----------
    frame : pd.DataFrame
        Records to filter.
    outlier_counts : array-like of int
        Number of outlying features per record, positionally aligned with
        the rows of ``frame``.

    Returns
    -------
    dict[str, pd.DataFrame]
        Keyed by dataset name; each value keeps only the records whose
        outlier count is below the limit named in the key.
    """
    counts = np.asarray(outlier_counts)
    return {
        "3_or_more_outliers_dropped": frame[counts < 3],
        "2_or_more_outliers_dropped": frame[counts < 2],
        "all_outliers_dropped": frame[counts == 0],
    }


# In a notebook kernel __name__ is "__main__", so the pipeline below runs when
# the cell is executed and its variables (df, z_scores, datasets, ...) remain
# globals. The guard only keeps the helpers above importable without touching
# the filesystem.
if __name__ == "__main__":
    # Set up directory paths
    # NOTE(review): absolute, machine-specific paths; consider a configurable
    # project-relative data directory.
    main_dir = "C:/Users/reicd/Downloads/MY_ML_PROJECT/data/raw"
    output_dir = "C:/Users/reicd/Downloads/MY_ML_PROJECT/data/processed"

    # Create directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Read the data
    df_path = os.path.join(main_dir, "final_combined_data_with_metadata.csv")
    df = pd.read_csv(df_path)

    # Ensure all columns are treated as numeric where applicable
    df = coerce_numeric_columns(df)

    # Prepare data for outlier analysis
    numeric_features = df.select_dtypes(include=[np.number]).columns
    z_scores = absolute_z_scores(df, numeric_features)

    # Count outlier features per record
    outlier_feature_count = (z_scores > Z_SCORE_THRESHOLD).sum(axis=1)

    # Datasets after dropping based on outlier thresholds
    datasets = split_by_outlier_count(df, outlier_feature_count)

    # Save the datasets to the processed data folder
    for key, dataset in datasets.items():
        dataset_path = os.path.join(output_dir, f"{key}.csv")
        dataset.to_csv(dataset_path, index=False)
        print(f"Dataset saved to {dataset_path}")

    # Display sizes of the new datasets
    for key, dataset in datasets.items():
        print(f"Dataset '{key}' size: {dataset.shape}")

  df = df.apply(pd.to_numeric, errors='ignore')


Dataset saved to C:/Users/reicd/Downloads/MY_ML_PROJECT/data/processed\3_or_more_outliers_dropped.csv
Dataset saved to C:/Users/reicd/Downloads/MY_ML_PROJECT/data/processed\2_or_more_outliers_dropped.csv
Dataset saved to C:/Users/reicd/Downloads/MY_ML_PROJECT/data/processed\all_outliers_dropped.csv
Dataset '3_or_more_outliers_dropped' size: (8889, 55)
Dataset '2_or_more_outliers_dropped' size: (8227, 55)
Dataset 'all_outliers_dropped' size: (7529, 55)
