In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import lightgbm as lgb
from sklearn.metrics import f1_score

In [2]:
# --- A. LOAD DATA AND SPLIT ---
try:
    df_train = pd.read_csv('data/train_processed.csv')
    df_test = pd.read_csv('data/test_processed.csv')
    test_ids = pd.read_csv('data/ids.csv')
except FileNotFoundError:
    print("FATAL ERROR: Could not find files. Please check file paths ('data/').")
    raise

# Define X/y
X_train_full = df_train.drop(['has_copd_risk'], axis=1, errors='ignore')
y_train_full = df_train['has_copd_risk']

# Split for Preprocessor fitting (80% Train, 20% Val)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full
)

# --- B. PREPROCESSING PIPELINE (Fit/Transform) ---
all_features = X_train.columns.tolist()

numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[('num', numerical_pipeline, all_features)],
    remainder='passthrough'
)
X_train_processed = preprocessor.fit_transform(X_train)

# ----------------------------------------------------------------------
# C. TRAIN MODEL, PREDICT, AND SAVE SUBMISSION
# ----------------------------------------------------------------------

# 1. Train LightGBM Model
lgbm_model = lgb.LGBMClassifier(
    objective='binary',
    n_estimators=300,
    random_state=42,
    is_unbalance=True # Key for Imbalance
)
lgbm_model.fit(X_train_processed, y_train)
print("✅ LightGBM Model trained successfully.")

# 2. Prepare Test Features and Predict
X_test_full = df_test.drop('patient_id', axis=1, errors='ignore')
X_test_full = X_test_full[all_features] # Ensure column order

# Apply Preprocessor to Test Data (Transform only!)
X_test_processed = preprocessor.transform(X_test_full)

# Prediction
y_test_pred = lgbm_model.predict(X_test_processed)

# 3. Combine IDs and Predictions and Save
submission_df = test_ids.copy()
submission_df['has_copd_risk'] = y_test_pred.astype(int)

submission_df.to_csv('submission/lightgbm.csv', index=False)
print("\n✅ LightGBM submission file saved to 'submission/lightgbm.csv'")
print("\nFirst 5 predictions in submission:")
print(submission_df.head())

[LightGBM] [Info] Number of positive: 13074, number of negative: 22568
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003217 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2375
[LightGBM] [Info] Number of data points in the train set: 35642, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.366814 -> initscore=-0.545907
[LightGBM] [Info] Start training from score -0.545907
✅ LightGBM Model trained successfully.

✅ LightGBM submission file saved to 'submission/lightgbm.csv'

First 5 predictions in submission:
   patient_id  has_copd_risk
0       42427              0
1       27412              0
2       19283              1
3       45261              1
4       11155              1


