In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy import stats

# Generate synthetic dataset for testing
np.random.seed(42)
n_samples = 100
data = {
    'income': np.random.normal(50000, 15000, n_samples),
    'credit_score': np.random.normal(650, 50, n_samples),
    'job_title': np.random.choice(['Engineer', 'Teacher', 'Doctor', 'Artist'], n_samples),
    'target': np.random.choice([0, 1], n_samples)
}

# Introduce missing values and outliers for testing
data['income'][np.random.randint(0, n_samples, 5)] = np.nan
data['credit_score'][np.random.randint(0, n_samples, 3)] = np.nan
data['income'][np.random.randint(0, n_samples, 2)] = 150000  # Outliers
df = pd.DataFrame(data)

In [23]:
# Handle missing values by filling with median
for column in df.select_dtypes(include=['float64', 'int64']).columns:
    df[column].fillna(df[column].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)


In [34]:
# Remove duplicates
df.drop_duplicates(inplace=True)


# 1) Build a float array of your incomes
income = df['income'].to_numpy(dtype=float)

# 2) Allocate an output array full of NaNs
log_out = np.full_like(income, np.nan, dtype=float)

# 3) Compute log1p **only** where income > –1
mask = (income > -1) & ~np.isnan(income)
np.log1p(income, out=log_out, where=mask)

# 4) Stick back into your DataFrame
df['income_log'] = log_out

In [35]:
scaler = StandardScaler()
numeric_features = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_features] = scaler.fit_transform(df[numeric_features])

In [36]:
# One-hot encode categorical features
df = pd.get_dummies(df, drop_first=True)

In [37]:
# Detect and remove outliers using Z-score
z_scores = np.abs(stats.zscore(df.select_dtypes(include=['float64', 'int64'])))
df = df[(z_scores < 3).all(axis=1)]

In [38]:
# Apply log transformation to skewed data
df['income_log'] = np.log1p(df['income'])

In [None]:
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)