In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Print current working directory and list files
print(f"Current working directory: {os.getcwd()}")
print(f"Files in directory: {os.listdir()}")

# Load the datasets
try:
    train_data = pd.read_csv("Content/Train_Data.csv")
    test_data = pd.read_csv("Content/Test_Data.csv")
    print("Files loaded successfully!")
except FileNotFoundError as e:
    print(f"Error: {e}")
    # You might want to stop execution here or provide alternative paths
    raise

# Separate features and target
X = train_data.drop(columns=["Sepssis"])
y = train_data["Sepssis"]  # 'Negative' or 'Positive'

# Convert target to binary (LightGBM needs numeric labels for training)
y = y.map({'Negative': 0, 'Positive': 1})

# Split for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify numerical and categorical columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Preprocessing for numerical data
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Preprocessing for categorical data (LightGBM handles categories natively, but we impute missing)
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing"))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ])

# Full pipeline with preprocessing and LightGBM
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("lgbm", LGBMClassifier(random_state=42, objective="binary"))
])

# Define parameter grid for tuning
param_grid = {
    "lgbm__n_estimators": [100, 200],
    "lgbm__learning_rate": [0.01, 0.1],
    "lgbm__max_depth": [3, 5, -1],  # -1 means no limit
    "lgbm__num_leaves": [31, 50]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="f1", n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train, lgbm__categorical_feature=categorical_cols.tolist())

# Best model
best_model = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation F1 score: {grid_search.best_score_:.4f}")

# Evaluate on validation set
y_val_pred = best_model.predict(X_val)
f1 = f1_score(y_val, y_val_pred)
print(f"F1 Score on validation set: {f1:.4f}")

# Train on full dataset
best_model.fit(X, y, lgbm__categorical_feature=categorical_cols.tolist())

# Predict on test set
test_predictions = best_model.predict(test_data)

# Convert predictions back to 'Negative'/'Positive' for submission
test_predictions = np.where(test_predictions == 0, 'Negative', 'Positive')

# Prepare submission
submission_df = pd.DataFrame({"Sepssis": test_predictions})
submission_df.to_csv("submission.csv", index=False)

print("Submission file 'submission.csv' created successfully!")

Current working directory: c:\Users\ssnay\Documents\GitHub\Hack-MITWPU-Syntax_Terror-sepsis_Detection
Files in directory: ['.git', '.gitignore', 'Best__1_21.ipynb', 'Content', 'LICENSE', 'README.md', 'submission.csv']
Files loaded successfully!
Fitting 5 folds for each of 24 candidates, totalling 120 fits
[LightGBM] [Info] Number of positive: 40078, number of negative: 39922
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000736 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500975 -> initscore=0.003900
[LightGBM] [Info] Start training from score 0.003900
Best parameters: {'lgbm__learning_rate': 0.1, 'lgbm__max_depth': -1, 'lgbm__n_estimators': 200, 'lgbm__num_leaves': 50}
Best cross-validat



[LightGBM] [Info] Number of positive: 50000, number of negative: 50000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018512 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 100000, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Submission file 'submission.csv' created successfully!


