In [None]:
%pip install lightgbm

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# ============================
# 1) LOAD & PREP DATA
# ============================
# Load the dataset; every column except the target is treated as a feature.
df = pd.read_csv("cosmicclassifierTraining.csv")
target_col = "Prediction"
features = [col for col in df.columns if col != target_col]

# Separate features and target (copies, so `df` itself is left untouched).
# NOTE(review): rows whose target is missing are not filtered out here --
# confirm the label column contains no NaNs.
X = df[features].copy()
y = df[target_col].copy()

# One-hot encode categorical (object-dtype) columns, if any
cat_cols = X.select_dtypes(include=["object"]).columns
if len(cat_cols) > 0:
    X = pd.get_dummies(X, columns=cat_cols)

# Encode the target labels as integers
le = LabelEncoder()
y = le.fit_transform(y)

# Split BEFORE imputing (80/20, stratified). Fitting the imputer on the full
# matrix would let validation rows influence the column means used to fill
# the training data, leaking information into the hold-out evaluation.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Learn the column means on the training split only, then apply them to both.
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train_raw)
X_val = imputer.transform(X_val_raw)

# Full feature matrix imputed with the training-split means; kept so the
# name `X_imputed` stays available to any code that relies on it.
X_imputed = imputer.transform(X)

# Apply SMOTE to balance classes on the training data only; the validation
# split keeps its original class distribution.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

print("Train shape:", X_train.shape, "Validation shape:", X_val.shape)

# ============================
# 2) DEFINE OPTUNA OBJECTIVE FUNCTION (BAYESIAN OPTIMIZATION)
# ============================
def objective(trial):
    """Score one LightGBM hyperparameter configuration for Optuna.

    Samples a configuration from the search space below, builds an
    ``LGBMClassifier`` with it and evaluates it with 3-fold cross-validation
    on the module-level ``X_train`` / ``y_train``.

    NOTE: ``X_train`` / ``y_train`` have already been resampled with SMOTE
    when this runs, so the cross-validation folds include synthetic samples
    and the returned accuracy may be optimistic compared with the score on
    the untouched validation split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean cross-validated accuracy of the sampled configuration
        (the study maximizes this value).
    """
    # Define hyperparameter search space for LightGBM
    param = {
        "objective": "multiclass",
        "num_class": len(np.unique(y)),
        "boosting_type": "gbdt",
        "metric": "multi_logloss",
        "verbosity": -1,
        "seed": 42,
        "n_estimators": trial.suggest_int("n_estimators", 100, 500, step=50),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150, step=10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50, step=5),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0, step=0.1),
        # Row subsampling is only performed when subsample_freq > 0; with
        # LightGBM's default of 0 the "subsample" value above has no effect.
        # It is sampled through the trial (rather than hard-coded) so that it
        # is recorded in the trial's params and travels with them to any
        # model that is later rebuilt from the best trial.
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 5),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0, step=0.1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0, step=0.1),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0, step=0.1),
    }

    model = lgb.LGBMClassifier(**param)

    # Use 3-fold cross-validation on the training set
    fold_scores = cross_val_score(
        model, X_train, y_train, cv=3, scoring="accuracy", n_jobs=-1
    )
    return fold_scores.mean()

# ============================
# 3) RUN OPTUNA STUDY
# ============================
# Seed the sampler so the search is reproducible, in line with the fixed
# seed (42) used for the split, SMOTE and LightGBM. TPESampler is the sampler
# create_study falls back to for single-objective studies, so only the
# seeding changes, not the search algorithm.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Report the best configuration found (value = mean CV accuracy)
print("\nBest trial:")
print("  Accuracy: {:.4f}".format(study.best_trial.value))
print("  Params: ")
for key, value in study.best_trial.params.items():
    print("    {}: {}".format(key, value))

# Tuned hyperparameters only; the fixed settings are added when the final
# model is built.
best_params = study.best_trial.params

# ============================
# 4) TRAIN FINAL LIGHTGBM MODEL WITH BEST HYPERPARAMETERS
# ============================
# Fixed settings come first and the tuned values are unpacked last, so a
# tuned value takes precedence if a key ever appears in both.
final_params = {
    "objective": "multiclass",
    "num_class": len(np.unique(y)),
    "boosting_type": "gbdt",
    "metric": "multi_logloss",
    "verbosity": -1,
    "seed": 42,
    **best_params,
}

# Fit on the (resampled) training split; fit() returns the estimator itself.
final_model = lgb.LGBMClassifier(**final_params).fit(X_train, y_train)

# ============================
# 5) EVALUATE THE FINAL MODEL ON THE VALIDATION SET
# ============================
y_val_pred = final_model.predict(X_val)

# Accuracy plus macro-averaged precision / recall / F1 (every class is
# weighted equally, regardless of its support).
acc = accuracy_score(y_val, y_val_pred)
prec, rec, f1 = (
    scorer(y_val, y_val_pred, average="macro")
    for scorer in (precision_score, recall_score, f1_score)
)

print("\nFinal Validation Metrics:")
for template, metric_value in (
    ("Accuracy:  {:.4f}", acc),
    ("Precision: {:.4f}", prec),
    ("Recall:    {:.4f}", rec),
    ("F1 Score:  {:.4f}", f1),
):
    print(template.format(metric_value))

# Per-class breakdown; labels shown are the integer-encoded classes.
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))


In [None]:
import pickle 

# Persist the fitted classifier to disk.
with open("lightgbm1_model.pkl", "wb") as model_file:
    pickle.dump(final_model, model_file)