In [None]:
import sys
sys.path.append("../src")

from encoder import detect_feature_types, auto_encode
from scaler import detect_numerical, apply_standard_scaling, scaling_report, save_scaled_data
from feature_utils import clean_text_columns, standardize_column_names

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
# FUNCTION 1 — Save matplotlib plots
def save_plot(fig, filename):
    save_dir = "../outputs/screenshots"
    os.makedirs(save_dir, exist_ok=True)
    fig.savefig(f"{save_dir}/{filename}", dpi=300, bbox_inches='tight')
    plt.close(fig)

# FUNCTION 2 — Save text output (list, dict, df head, etc.)
def save_text_output(text, filename):
    save_dir = "../outputs/screenshots"
    os.makedirs(save_dir, exist_ok=True)
    with open(f"{save_dir}/{filename}", "w") as f:
        f.write(str(text))


In [None]:
df = pd.read_csv("../data/adult.csv")

# Clean text formatting
df = clean_text_columns(df)

# Standardize column names
df = standardize_column_names(df)

# Save the cleaned df.head() as text
save_text_output(df.head(), "head_output.txt")

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [4]:
numerical, categorical = detect_feature_types(df)

save_text_output(
    {"numerical": numerical, "categorical": categorical},
    "feature_types.txt"
)

numerical, categorical

(['age',
  'fnlwgt',
  'educational-num',
  'capital-gain',
  'capital-loss',
  'hours-per-week'],
 ['workclass',
  'education',
  'marital-status',
  'occupation',
  'relationship',
  'race',
  'gender',
  'native-country',
  'income'])

In [5]:
df_encoded, label_cols, onehot_cols = auto_encode(df)

save_text_output(
    {"label_encoded": label_cols, "onehot_encoded": onehot_cols},
    "encoding_columns.txt"
)

df_encoded.head()

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,relationship,race,gender,capital-gain,capital-loss,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25,4,226802,7,4,3,2,1,0,0,...,False,False,False,False,False,False,False,True,False,False
1,38,4,89814,9,2,0,4,1,0,0,...,False,False,False,False,False,False,False,True,False,False
2,28,2,336951,12,2,0,4,1,0,0,...,False,False,False,False,False,False,False,True,False,False
3,44,4,160323,10,2,0,2,1,7688,0,...,False,False,False,False,False,False,False,True,False,False
4,18,0,103497,10,4,3,4,0,0,0,...,False,False,False,False,False,False,False,True,False,False


In [6]:
numerical_encoded = detect_numerical(df_encoded)

save_text_output(numerical_encoded, "numerical_after_encoding.txt")

numerical_encoded

['age',
 'workclass',
 'fnlwgt',
 'educational-num',
 'marital-status',
 'relationship',
 'race',
 'gender',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'income']

In [7]:
df_scaled, scaler = apply_standard_scaling(df_encoded, numerical_encoded)

save_text_output(df_scaled.head(), "scaled_head_output.txt")
df_scaled.head()

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,relationship,race,gender,capital-gain,capital-loss,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,-0.995129,0.088484,0.351675,-1.197259,0.916138,0.971649,-1.971746,0.70422,-0.144804,-0.217127,...,False,False,False,False,False,False,False,True,False,False
1,-0.046942,0.088484,-0.945524,-0.419335,-0.410397,-0.900852,0.392384,0.70422,-0.144804,-0.217127,...,False,False,False,False,False,False,False,True,False,False
2,-0.776316,-1.277432,1.394723,0.74755,-0.410397,-0.900852,0.392384,0.70422,-0.144804,-0.217127,...,False,False,False,False,False,False,False,True,False,False
3,0.390683,0.088484,-0.277844,-0.030373,-0.410397,-0.900852,-1.971746,0.70422,0.886874,-0.217127,...,False,False,False,False,False,False,False,True,False,False
4,-1.505691,-2.643348,-0.815954,-0.030373,0.916138,0.971649,0.392384,-1.42001,-0.144804,-0.217127,...,False,False,False,False,False,False,False,True,False,False


In [8]:
report = scaling_report(df_encoded, df_scaled, numerical_encoded)

save_text_output(report, "scaling_report.txt")
report

{'age': {'Before Min': np.int64(17),
  'Before Max': np.int64(90),
  'After Min': np.float64(-1.578628884921691),
  'After Max': np.float64(3.7458081832961794)},
 'workclass': {'Before Min': np.int64(0),
  'Before Max': np.int64(8),
  'After Min': np.float64(-2.6433478829717005),
  'After Max': np.float64(2.8203168356107704)},
 'fnlwgt': {'Before Min': np.int64(12285),
  'Before Max': np.int64(1490400),
  'After Min': np.float64(-1.6796798205441077),
  'After Max': np.float64(12.317231053897183)},
 'educational-num': {'Before Min': np.int64(1),
  'Before Max': np.int64(16),
  'After Min': np.float64(-3.5310298194574443),
  'After Max': np.float64(2.303397451736234)},
 'marital-status': {'Before Min': np.int64(0),
  'Before Max': np.int64(6),
  'After Min': np.float64(-1.7369321712502335),
  'After Max': np.float64(2.2426733691838656)},
 'relationship': {'Before Min': np.int64(0),
  'Before Max': np.int64(5),
  'After Min': np.float64(-0.9008520968335757),
  'After Max': np.float64(2.21

In [9]:
save_scaled_data(df_scaled, "../outputs/processed/adult_processed.csv")

print("Processed dataset saved successfully!")

Processed dataset saved successfully!
