In [4]:
import os
import pandas as pd
import numpy as np  # Ensure numpy is imported
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Input location: the narrowed CSV files produced by an earlier step
narrowed_dir = "C:/Users/reicd/Downloads/MY_ML_PROJECT/data/narrowed"

# Names of the narrowed datasets to prepare (read from narrowed_dir)
files = [
    "narrowed_3_or_more_outliers_dropped.csv",
    "narrowed_2_or_more_outliers_dropped.csv",
    "narrowed_all_outliers_dropped.csv",
]

# Output location: created up front so later writes cannot fail on a
# missing directory; exist_ok makes re-running the cell harmless
prepared_dir = "C:/Users/reicd/Downloads/MY_ML_PROJECT/data/prepared"
os.makedirs(prepared_dir, exist_ok=True)

# Function to process each file
def process_dataset(file_path, filename):
    """Clean, encode and standardize one narrowed dataset, then save it.

    The CSV at ``file_path`` is read, rows containing missing values are
    dropped, the lowercase ``phase`` column is removed when present, the
    ``Subject``, ``Story`` and ``Phase`` columns are one-hot encoded, and the
    remaining numeric columns are standardized with ``StandardScaler``. The
    result is written to ``prepared_dir`` as ``prepared_<filename>`` without
    the index.

    Args:
        file_path: Path of the CSV file to read.
        filename: Name of the source file; prefixed with ``prepared_`` to
            build the output file name.

    Errors raised by pandas or scikit-learn (unreadable file, missing
    categorical column, no rows left to scale) are propagated unchanged.
    """
    # Read the data and discard incomplete rows
    df = pd.read_csv(file_path).dropna()

    # Drop the lowercase 'phase' column if it exists; 'Phase' is kept
    df = df.drop(columns=['phase'], errors='ignore')

    categorical_columns = ['Subject', 'Story', 'Phase']

    # Pick the numeric feature columns BEFORE one-hot encoding. Depending on
    # the pandas version, get_dummies emits integer indicator columns, which
    # would otherwise be selected as numeric and standardized, destroying
    # their 0/1 encoding. Categorical columns stored as numbers are excluded
    # because they are replaced by indicator columns below.
    numerical_columns = [
        column
        for column in df.select_dtypes(include=[np.number]).columns
        if column not in categorical_columns
    ]

    # One-hot encode categorical features
    df_encoded = pd.get_dummies(df, columns=categorical_columns)

    # Standardize numerical features (nothing to do if there are none)
    if numerical_columns:
        scaler = StandardScaler()
        df_encoded[numerical_columns] = scaler.fit_transform(
            df_encoded[numerical_columns]
        )

    # Save the prepared dataset
    save_path = os.path.join(prepared_dir, f"prepared_{filename}")
    df_encoded.to_csv(save_path, index=False)
    print(f"Prepared dataset saved to {save_path}")

# Run the preparation pipeline once per narrowed dataset; each input is
# looked up inside narrowed_dir under its listed file name
for dataset_name in files:
    process_dataset(os.path.join(narrowed_dir, dataset_name), dataset_name)

print("Data preprocessing and preparation complete.")

Prepared dataset saved to C:/Users/reicd/Downloads/MY_ML_PROJECT/data/prepared\prepared_narrowed_3_or_more_outliers_dropped.csv
Prepared dataset saved to C:/Users/reicd/Downloads/MY_ML_PROJECT/data/prepared\prepared_narrowed_2_or_more_outliers_dropped.csv
Prepared dataset saved to C:/Users/reicd/Downloads/MY_ML_PROJECT/data/prepared\prepared_narrowed_all_outliers_dropped.csv
Data preprocessing and preparation complete.
