In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import os


In [5]:
# Source CSV location -- point this at your own dataset before running.
file_path = 'data/input.csv'

# Read the raw table into memory, then preview its first rows as the
# cell's displayed output.
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)


Unnamed: 0,age,gender,city,income,target
0,25,Male,New York,50000.0,1
1,30,Female,Los Angeles,60000.0,0
2,22,Female,Chicago,,1
3,28,Male,Houston,52000.0,0
4,35,,Phoenix,72000.0,1


In [6]:
# Split the frame into a feature matrix X and, when a 'target' column is
# present, a label vector y (y stays None for unlabeled data).
if 'target' in df.columns:
    X = df.drop(columns='target')
    y = df['target']
else:
    X = df.copy()
    y = None

# Group feature columns by dtype so each group can get its own preprocessing.
# 'number' matches every numeric dtype (int32, float32, unsigned ints, ...),
# not only int64/float64, so narrower numeric columns are not left out of
# both lists.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# Columns that fall into neither group (e.g. bool or datetime) would not be
# routed to any preprocessing step; report them instead of losing them
# silently.
unassigned_cols = [
    col for col in X.columns
    if col not in categorical_cols and col not in numerical_cols
]
if unassigned_cols:
    print(f"Warning: columns not assigned to any preprocessing group: {unassigned_cols}")


In [7]:
# Numeric features: fill missing values with the column mean, then
# standardize with StandardScaler.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on categories that were not seen during fit.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group to its pipeline. The numeric block is listed first,
# so its columns come first in the transformed output.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numerical_cols),
        ('cat', cat_pipeline, categorical_cols),
    ]
)

# Single-step outer pipeline wrapping the preprocessor.
full_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])


In [8]:
# Fit the preprocessing steps on X and transform X in the same call.
# NOTE(review): this fits on every row of X; if a train/test split is added
# later, fit on the training rows only and reuse the fitted pipeline for the
# rest -- confirm against the intended workflow.
X_processed = full_pipeline.fit_transform(X)

# The result may be a sparse matrix (anything exposing .toarray()); densify it
# so it can be sliced and displayed as a plain array.
X_processed = X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed

X_processed[:5]  # preview the first few transformed rows


array([[-0.86865942, -0.89391653,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ],
       [ 0.02554881,  0.44695826,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ],
       [-1.40518436,  0.        ,  1.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [-0.33213448, -0.62574157,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ],
       [ 0.91975703,  2.05600801,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ]])

In [9]:
# Convert back to a DataFrame, labelling the columns with the feature names
# reported by the fitted pipeline so the CSV header is meaningful instead of
# the positional labels 0..N-1.
try:
    feature_names = full_pipeline.get_feature_names_out()
except AttributeError:
    # Older scikit-learn releases do not expose get_feature_names_out on every
    # step; fall back to the default integer column labels.
    feature_names = None
processed_df = pd.DataFrame(X_processed, columns=feature_names)

# Add target column if it exists. Raw values are used so rows are matched by
# position rather than by index label.
if y is not None:
    processed_df['target'] = y.values

# Save to CSV, creating the destination directory first so the write does not
# fail when it is missing.
output_path = 'data/processed_output.csv'
os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
processed_df.to_csv(output_path, index=False)

print(f"Processed data saved to {output_path}")


Processed data saved to data/processed_output.csv
