In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_classif

from imblearn.over_sampling import SMOTE

In [None]:
# ===================== Basic Config (only modify this) =====================
# File prefix of the input dataset, written WITHOUT the ".csv" suffix.
TARGET_FILE = 'event_7-abortion_finger'   # <<< Change this to your file prefix
# Path of the input CSV, derived from the prefix above.
CSV_PATH = f'{TARGET_FILE}.csv'
# ===========================================================================


### Preprocessing

In [None]:
# Read the raw table from the configured CSV file
Data = pd.read_csv(CSV_PATH)
print('Raw shape:', Data.shape)

# Class balance on the raw label (last column): a value equal to 0 is counted
# as inactive, every other value as active.
raw_label_is_zero = Data.iloc[:, -1] == 0
num_active = (~raw_label_is_zero).sum()
num_inactive = raw_label_is_zero.sum()
print('number of active: ', num_active)
print('number of inactive: ', num_inactive)

# Remember the ID (first) and label (last) columns by NAME before any column is
# dropped. Selecting them by position afterwards is unsafe:
#   * a non-numeric ID column is coerced to all-NaN and removed below, so a
#     positional `iloc[:, 1:-1]` would silently discard the first real feature;
#   * a label column holding NaN / non-numeric values would be removed as well,
#     silently promoting the last feature to "label".
id_col = Data.columns[0]
label_col = Data.columns[-1]

# The label must be fully numeric, otherwise it cannot survive the cleaning
# steps below. Fail loudly instead of training on the wrong column.
label_numeric = pd.to_numeric(Data[label_col], errors='coerce')
if label_numeric.isna().any():
    raise ValueError(
        f'Label column {label_col!r} contains {int(label_numeric.isna().sum())} '
        'missing or non-numeric value(s); clean the labels before preprocessing.'
    )

# Drop columns containing '#NAME?'
mask_bad = Data.apply(lambda c: (c == '#NAME?').any())
bad_cols = Data.columns[mask_bad].tolist()
if bad_cols:
    print('Drop columns containing #NAME?:', bad_cols)
    Data = Data.drop(columns=bad_cols)

# Convert all possible columns to numeric, others to NaN
Data_num = Data.apply(pd.to_numeric, errors='coerce')

# Drop all-NaN columns
all_nan_cols = [c for c in Data_num.columns if Data_num[c].isna().all()]
if all_nan_cols:
    print('Drop all-NaN columns:', all_nan_cols)
    Data_num = Data_num.drop(columns=all_nan_cols)

# Drop columns containing any NaN
na_cols = Data_num.columns[Data_num.isna().any()].tolist()
if na_cols:
    print('Drop columns containing NaN:', na_cols)
    Data_num = Data_num.drop(columns=na_cols)

# Split features and labels by column name.
# X = everything except the ID and label columns, y = the label column.
# The ID column may already have been dropped above (e.g. text IDs), hence
# errors='ignore'.
# If the first column is NOT an ID, use `non_feature_cols = [label_col]` instead.
non_feature_cols = [id_col, label_col]
X = Data_num.drop(columns=non_feature_cols, errors='ignore')
y = Data_num[label_col].copy()

print('X shape:', X.shape)
print('y shape:', y.shape)
print('y unique:', y.unique())

# Remove every feature whose largest value reaches 10000 or more.
# The per-column maximum is computed in one vectorised pass.
column_max = X.max(axis=0)
extreme_cols = column_max.index[column_max >= 10000].tolist()
if extreme_cols:
    print('Drop extreme columns (>=10000):', extreme_cols)
    X = X.drop(columns=extreme_cols)

print('X shape after extreme filter:', X.shape)

# Variance threshold (remove features with 0 variance)
selector = VarianceThreshold()
X_var0 = selector.fit_transform(X)

# Get feature names after VarianceThreshold
try:
    list4 = selector.get_feature_names_out(input_features=X.columns)
except (AttributeError, TypeError):
    # A scikit-learn release that does not provide get_feature_names_out raises
    # AttributeError (not TypeError), so both are caught here. The fallback
    # derives the surviving names from the selector's support mask.
    support_idx = selector.get_support(indices=True)
    list4 = X.columns[support_idx].to_list()

# Rebuild a labelled DataFrame from the array returned by fit_transform
X_var0 = pd.DataFrame(X_var0, columns=list4)
print('After VarianceThreshold:', X_var0.shape)

# Mutual information feature selection.
# mutual_info_classif is randomised. Without a fixed seed, the scores used to
# choose k and the scores SelectKBest recomputes internally come from two
# different random draws, so the k features kept are not guaranteed to be the
# ones with mi > 0 and the result changes from run to run. A seeded scorer is
# used for both steps so they agree and the cell is reproducible.
MI_RANDOM_STATE = 0


def _seeded_mutual_info(features, labels):
    """Score features with mutual_info_classif using a fixed random_state."""
    return mutual_info_classif(features, labels, random_state=MI_RANDOM_STATE)


mi = _seeded_mutual_info(X_var0, y)
k = max(int((mi > 0).sum()), 1)  # Keep at least one feature
mlc = SelectKBest(_seeded_mutual_info, k=k).fit(X_var0, y)
X_fsmic = mlc.transform(X_var0)

# Cross-validation using RandomForest.
# NOTE: the features above were selected using all rows, so this score is an
# optimistic sanity check rather than an unbiased performance estimate.
cr = cross_val_score(
    RandomForestClassifier(n_estimators=10, random_state=0),
    X_fsmic, y, cv=5
).mean()

print('k (mi>0):', k)
print('CV mean score:', cr)
print('X_fsmic shape:', X_fsmic.shape)

# Get selected feature names
try:
    list5 = mlc.get_feature_names_out(input_features=X_var0.columns)
except (AttributeError, TypeError):
    # A scikit-learn release without get_feature_names_out raises
    # AttributeError, so it is caught alongside TypeError.
    support_idx2 = mlc.get_support(indices=True)
    list5 = X_var0.columns[support_idx2].to_list()

X_fsmic = pd.DataFrame(X_fsmic, columns=list5)

# Persist the selected feature matrix, then read it back as X_
features_csv = f'{TARGET_FILE}-2.csv'
X_fsmic.to_csv(features_csv)
X_ = pd.read_csv(features_csv, index_col=0)

# Persist the labels, then read them back as a flat array y_
labels_csv = f'{TARGET_FILE}-3.csv'
y.to_csv(labels_csv)
y_ = pd.read_csv(labels_csv, index_col=0).values.ravel()

print('X_ shape:', X_.shape, '| y_ shape:', y_.shape)

# Train/test split FIRST, on the unscaled features, so the scaler fitted below
# never sees the test rows. Fitting the scaler on all rows would leak test-set
# mean/variance into training. The split arguments are unchanged.
X_train_raw, X_test_raw, Ytrain, Ytest = train_test_split(
    X_, y_, test_size=0.2, random_state=0, stratify=y_
)

# Standardization: fit on the training rows only, then apply to both partitions
scaler = StandardScaler().fit(X_train_raw)
Xtrain = pd.DataFrame(
    scaler.transform(X_train_raw), columns=X_.columns, index=X_train_raw.index
)
Xtest = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X_.columns, index=X_test_raw.index
)
# Full matrix scaled with the train-fitted scaler, kept under its original name
X_std = pd.DataFrame(scaler.transform(X_), columns=X_.columns, index=X_.index)

print('Xtrain/Xtest/Ytrain/Ytest shapes:',
      Xtrain.shape, Xtest.shape, (len(Ytrain),), (len(Ytest),))

# Show class distribution.
# The label Series reuse the row index of their feature frames so that the
# features and labels stay aligned by index label, not only by position.
Ytrain = pd.Series(Ytrain, index=Xtrain.index)
Ytest = pd.Series(Ytest, index=Xtest.index)
print('Train class counts:\n', Ytrain.value_counts())
print('Test class counts:\n', Ytest.value_counts())

# Resample the training partition with SMOTE (the test partition is untouched)
sm = SMOTE(random_state=42)
Xtrain_, Ytrain_ = sm.fit_resample(Xtrain, Ytrain)

# Report the size and class balance of the resampled training set
n_resampled = Xtrain_.shape[0]
resampled_counts = pd.Series(Ytrain_).value_counts()
print('After SMOTE n_samples:', n_resampled)
print('Resampled train class counts:\n', resampled_counts)

# Write every split to "<TARGET_FILE><suffix>", in the order listed.
# Each entry is (object to save, file suffix, extra to_csv keyword arguments);
# only the resampled labels are written without an index column.
split_outputs = [
    (Xtrain, '-Xtrain.csv', {}),
    (Xtrain_, '-Xtrain_.csv', {}),
    (Xtest, '-Xtest.csv', {}),
    (Ytrain, '-Ytrain.csv', {}),
    (pd.Series(Ytrain_), '-Ytrain_.csv', {'index': False, 'header': True}),
    (Ytest, '-Ytest.csv', {}),
]
for split_obj, file_suffix, csv_kwargs in split_outputs:
    split_obj.to_csv(TARGET_FILE + file_suffix, **csv_kwargs)

print('All done ✔')
