In [None]:
!pip install -U scikit-learn==1.3.2 imbalanced-learn==0.11.0 --quiet
!pip install --upgrade lightgbm


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE

# Read the engine fault CSV from the Kaggle input directory and preview the first rows
DATA_PATH = "/kaggle/input/engine-fault-detection-data/engine_fault_detection_dataset.csv"
df = pd.read_csv(DATA_PATH)
df.head()


In [None]:
# Insights
# DataFrame.info() prints its summary itself and returns None, so wrapping it in print()
# would emit an extra "None" line.
df.info()
print(df.describe())
print(df['Engine_Condition'].value_counts())

In [None]:
# Correlation heatmap
# numeric_only=True keeps DataFrame.corr() from raising if a non-numeric column is present;
# two-decimal annotations keep the cells readable.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df.corr(numeric_only=True), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()


In [None]:
# Bar chart of how many rows fall into each Engine_Condition class
ax = sns.countplot(data=df, x="Engine_Condition")
ax.set_title("Class Distribution (0=Normal, 1=Minor Fault, 2=Critical Fault)")
plt.show()


In [None]:
# Pairwise scatter plots of all columns, coloured by Engine_Condition;
# the diagonal panels show kernel density estimates.
sns.pairplot(data=df, hue="Engine_Condition", diag_kind="kde")


In [None]:
# 3D scatter of vibration amplitude vs. surface temperature vs. acoustic level,
# with points coloured by Engine_Condition
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
    df['Vibration_Amplitude'],
    df['Surface_Temperature'],
    df['Acoustic_dB'],
    c=df['Engine_Condition'],
    cmap='viridis',
)
ax.set(
    xlabel='Vibration Amplitude',
    ylabel='Surface Temperature',
    zlabel='Acoustic dB',
)
plt.show()


Preprocessing

Scaling for SVM/NN.

Train-Test Split.

Balancing (SMOTE / class weights).

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target is Engine_Condition; every other column is a feature
y = df["Engine_Condition"]
X = df.drop(columns="Engine_Condition")

# Stratified 80/20 hold-out split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn scaling statistics from the training rows only, then apply them to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Baseline Model (Logistic Regression)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Class-weighted multinomial logistic regression on the standardized features
lr = LogisticRegression(max_iter=1000, class_weight='balanced', multi_class='multinomial')
lr.fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)

# Per-class metrics, then the confusion matrix as a heatmap
print(classification_report(y_test, y_pred))
lr_cm = confusion_matrix(y_test, y_pred)
sns.heatmap(lr_cm, annot=True, fmt="d", cmap="Blues")


Tree-Based Models

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Class-weighted random forest trained on the unscaled features
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print(classification_report(y_test, y_pred))
rf_cm = confusion_matrix(y_test, y_pred)
sns.heatmap(rf_cm, annot=True, fmt="d", cmap="Greens")

# Forest feature importances, largest first, as a bar chart
importance_by_feature = pd.Series(data=rf.feature_importances_, index=X.columns)
feat_imp = importance_by_feature.sort_values(ascending=False)
feat_imp.plot.bar(figsize=(10, 6), title="Feature Importance")


XGBoost

In [None]:
from xgboost import XGBClassifier

# scale_pos_weight balances positive vs. negative weights for binary targets, so it is not
# passed for this 3-class target; random_state is pinned like in the other models.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

# Per-class metrics, then the confusion matrix as a heatmap
print(classification_report(y_test, y_pred))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Oranges")


Model Training & Comparison

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC


def _weighted_scores(y_true, y_hat):
    """Return accuracy plus support-weighted precision, recall and F1 as a dict."""
    return {
        'Accuracy': accuracy_score(y_true, y_hat),
        'Precision': precision_score(y_true, y_hat, average='weighted'),
        'Recall': recall_score(y_true, y_hat, average='weighted'),
        'F1-score': f1_score(y_true, y_hat, average='weighted'),
    }


# Model name -> metric dict
results = {}

# Logistic Regression (standardized features)
lr = LogisticRegression(multi_class='multinomial', class_weight='balanced', max_iter=1000, random_state=42)
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
results['Logistic Regression'] = _weighted_scores(y_test, y_pred_lr)

# Random Forest (raw features)
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
results['Random Forest'] = _weighted_scores(y_test, y_pred_rf)

# XGBoost (raw features)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
results['XGBoost'] = _weighted_scores(y_test, y_pred_xgb)

# Support Vector Machine with RBF kernel (standardized features)
svm = SVC(kernel='rbf', class_weight='balanced', random_state=42)
svm.fit(X_train_scaled, y_train)
y_pred_svm = svm.predict(X_test_scaled)
results['SVM (RBF)'] = _weighted_scores(y_test, y_pred_svm)

# One row per model, best weighted F1 first
comparison_df = pd.DataFrame(results).T.sort_values(by="F1-score", ascending=False)
print(comparison_df)


Visualization of Comparison

In [None]:
# Bar chart of the weighted F1-score per model, in the order of comparison_df
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=comparison_df.index, y=comparison_df["F1-score"], ax=ax)
ax.set_title("Model Comparison (F1-score)")
ax.set_ylabel("F1-score")
plt.xticks(rotation=45)
plt.show()


1. Observations from Your Results

Random Forest (59.6% Accuracy) is currently the best.

XGBoost (55.9%) is slightly behind but shows a better balance between precision and recall.

SVM & Logistic Regression are performing poorly (expected due to non-linear dataset & imbalance).

Overall, accuracy < 60% → models are not learning patterns strongly enough.

2. Why Performance is Low

Imbalanced Dataset:

Normal (60%), Minor fault (30%), Critical fault (10%).

Models tend to predict the majority class ("Normal").

That’s why Recall and F1 are low for minority classes.

Complex Feature Interactions:

The engine fault patterns may not be linearly separable.

Random Forest helps, but feature tuning is needed.

Default Hyperparameters:

The models above were trained with their default hyperparameters.

They usually underperform until tuned.

A. Handle Class Imbalance

Use SMOTE (Synthetic Minority Oversampling Technique) for training set.

Or use class_weight='balanced' (you already used this in some models, but SMOTE often works better).

In [None]:
# List the column names of df (features plus the Engine_Condition target)
print(df.columns)


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# 1. Target is Engine_Condition; all remaining columns are features
y = df['Engine_Condition']
X = df.drop(columns='Engine_Condition')

# 2. Stratified 80/20 split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Standardize using statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Oversample the minority classes with SMOTE; the test split is left untouched
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)

# Class counts of the resampled training labels
ax = sns.countplot(x=y_train_res)
ax.set_title("Class Distribution After SMOTE")
plt.show()

# 5. Candidate models
models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM (RBF)": SVC(kernel='rbf', probability=True),
    "Logistic Regression": LogisticRegression(max_iter=1000),
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Random-forest hyperparameter grid: 3 * 4 * 3 * 3 * 1 = 108 combinations,
# each evaluated with 3-fold cross-validation on weighted F1.
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced']
}

# NOTE(review): X_train_res / y_train_res were SMOTE-resampled before this search, so the
# validation folds contain synthetic rows interpolated from samples that may sit in the
# training folds; the CV scores are presumably optimistic. Resampling inside an
# imblearn Pipeline would avoid that — confirm before trusting best_params_.
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42),
                       param_grid, cv=3, scoring='f1_weighted', n_jobs=-1)
grid_rf.fit(X_train_res, y_train_res)

print("Best RF params:", grid_rf.best_params_)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Reuse models (re-define if necessary)
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

models_after_smote = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),

    # Take the winning hyperparameters straight from the grid search instead of
    # hand-copying them, so this cell cannot drift from what the search found.
    "Random Forest": RandomForestClassifier(random_state=42, **grid_rf.best_params_),

    "SVM (RBF)": SVC(kernel='rbf', probability=True),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

# Fit every model on the SMOTE-resampled training data and score it on the untouched,
# standardized test split. Metrics are support-weighted averages over the classes.
results_after_smote = []

for name, model in models_after_smote.items():
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test_scaled)

    results_after_smote.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='weighted'),
        "Recall": recall_score(y_test, y_pred, average='weighted'),
        "F1-score": f1_score(y_test, y_pred, average='weighted')
    })

# Show results: one row per model
import pandas as pd
results_df_smote = pd.DataFrame(results_after_smote)
print(" Evaluation AFTER SMOTE:")
print(results_df_smote)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# One confusion-matrix figure per fitted model, evaluated on the standardized test split
for name, model in models_after_smote.items():
    y_pred = model.predict(X_test_scaled)
    cm = confusion_matrix(y_test, y_pred)
    print(f"\n{name} Confusion Matrix:")
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.title(f"{name} Confusion Matrix After SMOTE")
    plt.show()


In [None]:
from imblearn.combine import SMOTEENN, SMOTETomek

# Alternative resampling of the standardized training split with SMOTEENN.
# NOTE(review): X_train_res2 / y_train_res2 are not read by any later cell visible here and
# are overwritten by the SMOTETomek cell further down — confirm whether this cell is still needed.
smote_enn = SMOTEENN(random_state=42)
X_train_res2, y_train_res2 = smote_enn.fit_resample(X_train_scaled, y_train)

# Repeat model training using X_train_res2 and y_train_res2


In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 (4 decimals) for each fitted model on the test split
for name, model in models_after_smote.items():
    y_pred = model.predict(X_test_scaled)
    print(f"\n{name} Classification Report:")
    report = classification_report(y_test, y_pred, digits=4)
    print(report)


In [None]:
from lightgbm import LGBMClassifier

# LightGBM hyperparameters collected in one place
lgbm_params = dict(
    class_weight='balanced',
    n_estimators=500,
    max_depth=10,
    learning_rate=0.05,
    random_state=42,
)

# Train on the SMOTE-resampled data and report on the standardized test split
lgbm = LGBMClassifier(**lgbm_params)
lgbm.fit(X_train_res, y_train_res)
y_pred = lgbm.predict(X_test_scaled)
print(classification_report(y_test, y_pred, digits=4))


In [None]:
from imblearn.combine import SMOTETomek

# Alternative resampling of the standardized training split with SMOTETomek.
# NOTE(review): the resampled arrays are not used by any later cell visible here — confirm
# whether a model was meant to be trained on them.
smote_tomek = SMOTETomek(random_state=42)
X_train_res2, y_train_res2 = smote_tomek.fit_resample(X_train_scaled, y_train)

# Train best model again (e.g., Random Forest, LightGBM)


In [None]:
# Horizontal bar chart of the importances of the best grid-search forest,
# labelled with the feature column names
feature_names = X.columns
importances = grid_rf.best_estimator_.feature_importances_
plt.barh(y=feature_names, width=importances)
plt.title("Feature Importances - Random Forest")
plt.show()


In [None]:
from catboost import CatBoostClassifier

# CatBoost settings; class_weights is ordered by class label
cat_params = dict(
    iterations=500,
    depth=6,
    learning_rate=0.05,
    loss_function='MultiClass',
    class_weights=[1, 2, 6],  # Tune this based on class distribution
    verbose=0,
    random_state=42,
)
cat = CatBoostClassifier(**cat_params)

# Fit on the standardized (non-resampled) training split and report on the test split
cat.fit(X_train_scaled, y_train)
y_pred = cat.predict(X_test_scaled)
print(classification_report(y_test, y_pred))


In [None]:
# Flag rows whose Engine_Condition equals 2 with 1, every other row with 0
df['binary_target'] = (df['Engine_Condition'] == 2).astype('int64')


In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report

# Per-class weights, ordered by class label (tunable); the last class gets the largest weight
class_weights = [1.0, 2.0, 6.0]  # More weight to rare class

cat_params = dict(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    loss_function='MultiClass',
    class_weights=class_weights,
    random_state=42,
    verbose=100,
)
cat = CatBoostClassifier(**cat_params)

# Fit on the standardized training split and report on the test split
cat.fit(X_train_scaled, y_train)
y_pred = cat.predict(X_test_scaled)
print(classification_report(y_test, y_pred))


In [None]:
import shap

# Explain the fitted CatBoost model `cat` with SHAP values computed on the
# standardized training features.
explainer = shap.TreeExplainer(cat)
shap_values = explainer.shap_values(X_train_scaled)

# X_train_scaled is a plain array, so the column names are passed explicitly for the plot labels
shap.summary_plot(shap_values, X_train_scaled, feature_names=X.columns)


In [None]:
# Flag rows whose Engine_Condition equals 2 with 1, every other row with 0
df['binary_target'] = (df['Engine_Condition'] == 2).astype('int64')


In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Binary task: the target is the binary_target flag; both label columns are
# removed from the features
y = df['binary_target']
X = df.drop(columns=['Engine_Condition', 'binary_target'])

# Stratified 80/20 split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize with statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# CatBoost with class weights
binary_cat_params = dict(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    class_weights=[1, 6],  # boost the importance of detecting class 1 (critical fault)
    verbose=100,
    random_state=42,
)
model = CatBoostClassifier(**binary_cat_params)

model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Evaluation
print(classification_report(y_test, y_pred))


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Integer-encode the resampled labels, then one-hot them for the softmax output
encoder = LabelEncoder()
y_enc = encoder.fit_transform(y_train_res)
y_cat = to_categorical(y_enc)

# Two hidden layers (128 and 64 units) with dropout, ending in a 3-way softmax
n_features = X_train_res.shape[1]
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(n_features,)))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# 20% of the resampled training data is held back by Keras for validation
model.fit(X_train_res, y_cat, epochs=50, batch_size=64, validation_split=0.2)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.utils import to_categorical

# The network has a 3-unit softmax and was fitted on standardized features, but X_val / y_val
# and class_weight_dict were never defined and y_train currently holds the binary target from
# the CatBoost cell. Rebuild a 3-class, standardized train/validation split for the Keras
# cells that follow.
nn_features = df.drop(columns=['Engine_Condition', 'binary_target'], errors='ignore')
nn_labels = df['Engine_Condition']

X_train, X_val, y_train, y_val = train_test_split(
    nn_features, nn_labels, test_size=0.2, stratify=nn_labels, random_state=42
)

# Scaling statistics come from the training rows only
nn_scaler = StandardScaler()
X_train = nn_scaler.fit_transform(X_train)
X_val = nn_scaler.transform(X_val)

y_train_onehot = to_categorical(y_train, num_classes=3)
y_val_onehot = to_categorical(y_val, num_classes=3)

# Balanced class weights (n_samples / (n_classes * class_count)), keyed by plain int labels
nn_classes = np.unique(y_train)
nn_class_weights = compute_class_weight('balanced', classes=nn_classes, y=y_train)
class_weight_dict = {int(cls): float(w) for cls, w in zip(nn_classes, nn_class_weights)}


In [None]:
# Re-compile the Keras network with one-hot (categorical) cross-entropy and fit it with
# per-class weights and an explicit validation set.
# NOTE(review): X_val, y_val_onehot and class_weight_dict must be provided by an earlier cell
# before this one runs — confirm they exist on a fresh Restart & Run All.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train_onehot, epochs=50, batch_size=64, validation_data=(X_val, y_val_onehot), class_weight=class_weight_dict)


In [None]:
# Variant of the previous cell that uses integer labels with sparse categorical cross-entropy
# instead of one-hot targets.
# NOTE(review): `model` is the same network object as in the earlier Keras cells, so this
# presumably continues training from its current weights; X_val, y_val and class_weight_dict
# must be provided by an earlier cell — confirm both points.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_val, y_val), class_weight=class_weight_dict)


In [None]:
# Required libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Split dataset (assuming X and y are your features and labels)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")

# XGBoost Model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {xgb_accuracy:.4f}")


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, classification_report

# Averaging mode for precision/recall/F1: 'binary' for a two-class target,
# 'weighted' or 'macro' for a multiclass one
average_type = 'binary'  # or 'weighted' or 'macro' if multiclass


def _precision_recall_f1(y_true, y_hat, average):
    """Return (precision, recall, F1) for the given averaging mode."""
    return (
        precision_score(y_true, y_hat, average=average),
        recall_score(y_true, y_hat, average=average),
        f1_score(y_true, y_hat, average=average),
    )


# Random Forest metrics
precision_rf, recall_rf, f1_rf = _precision_recall_f1(y_test, y_pred_rf, average_type)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", conf_matrix_rf)

# XGBoost metrics
precision_xgb, recall_xgb, f1_xgb = _precision_recall_f1(y_test, y_pred_xgb, average_type)
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)
print("XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("Confusion Matrix:\n", conf_matrix_xgb)

# ROC-AUC from the positive-class probability; only computed in the binary setting
if average_type == 'binary':
    y_prob_rf = rf_model.predict_proba(X_test)[:, 1]
    y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]

    roc_auc_rf = roc_auc_score(y_test, y_prob_rf)
    roc_auc_xgb = roc_auc_score(y_test, y_prob_xgb)
    print(f"Random Forest ROC-AUC: {roc_auc_rf:.4f}")
    print(f"XGBoost ROC-AUC: {roc_auc_xgb:.4f}")


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the current training split with SMOTE.
# NOTE(review): X_resampled / y_resampled are not read by any later cell visible here —
# confirm whether this cell is still needed.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with class_weight='balanced' as an alternative to resampling.
# The trailing fit() call is also the cell's displayed output.
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)


In [None]:
# Count the training rows per class label
from collections import Counter

counter = Counter(y_train)
# Label 0 is taken to be the majority class and label 1 the minority class
number_of_majority, number_of_minority = counter[0], counter[1]

print(f"Majority class count: {number_of_majority}")
print(f"Minority class count: {number_of_minority}")

# Weight the positive class by the majority/minority ratio via scale_pos_weight
import xgboost as xgb

pos_weight = number_of_majority / number_of_minority
xgb_model = xgb.XGBClassifier(scale_pos_weight=pos_weight, random_state=42)
xgb_model.fit(X_train, y_train)


In [None]:
import numpy as np
from collections import Counter
# Imported here so the cell does not depend on an earlier cell having rebound `xgb`
# from a classifier instance to the xgboost module.
import xgboost as xgb

# Get counts for each class
class_counts = Counter(y_train)
total = sum(class_counts.values())

# Balanced class weights: total_samples / (num_classes * class_count).
# The number of classes is derived from the labels actually present instead of being
# hard-coded to 3, so the formula stays correct when y_train is the binary target.
num_classes = len(class_counts)
class_weights = {cls: total / (num_classes * count) for cls, count in class_counts.items()}

# One weight per training row, in the row order of y_train
sample_weights = np.array([class_weights[label] for label in y_train])

# Train with sample weights; XGBClassifier chooses the objective that matches the
# number of classes it sees in y_train.
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train, sample_weight=sample_weights)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hand-picked per-class weights passed to the forest instead of 'balanced'.
# NOTE(review): the dict has a key for label 2 and weights label 1 above label 2 — confirm
# these match the labels and class frequencies of the y_train in scope when this cell runs.
class_weights = {0: 1, 1: 9, 2: 5}  # example weights, calculate based on your data

rf = RandomForestClassifier(class_weight=class_weights, random_state=42)
rf.fit(X_train, y_train)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Manually chosen per-class weights (to be adjusted to the data distribution)
class_weights = {0: 1, 1: 9, 2: 5}  # adjust weights based on your data distribution

# Build and train the weighted forest
rf = RandomForestClassifier(random_state=42, class_weight=class_weights)
rf.fit(X_train, y_train)

# Score the held-out split
y_pred = rf.predict(X_test)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


In [None]:
from collections import Counter

# Balanced weight per class: total / (num_classes * class_count)
counts = Counter(y_train)
num_classes = len(counts)
total = sum(counts.values())

weights = {}
for cls, count in counts.items():
    weights[cls] = total / (num_classes * count)
print(weights)


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter

# Class counts before resampling
print("Before SMOTE:", Counter(y_train))

# Oversample the training split only
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Class counts after resampling
print("After SMOTE:", Counter(y_train_res))

# Plain forest on the resampled data
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_res, y_train_res)

# Evaluate on the untouched test split
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep="\n")


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Search space; the class_weight dictionaries only name labels 0 and 1
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    class_weight=[{0: 1, 1: 5}, {0: 1, 1: 10}, 'balanced'],
)

rf = RandomForestClassifier(random_state=42)

# 3-fold search scored with 'f1', fitted on the resampled training data
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='f1', cv=3, n_jobs=-1)
grid_search.fit(X_train_res, y_train_res)

print("Best parameters:", grid_search.best_params_)
best_rf = grid_search.best_estimator_

# Evaluate the refit best estimator on the held-out split
y_pred = best_rf.predict(X_test)

print(classification_report(y_test, y_pred))


In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.preprocessing import label_binarize


def _roc_auc(y_true, proba, classes):
    """Return ROC-AUC for a binary or a multiclass target.

    y_true  : true labels, 1-D.
    proba   : predict_proba output, one column per entry of `classes`.
    classes : class labels in the column order of `proba` (the estimator's classes_).

    With two classes the score uses the probability of the second class; with more it is the
    macro-averaged one-vs-rest AUC over the binarized labels.
    """
    if len(classes) == 2:
        return roc_auc_score(y_true, proba[:, 1])
    return roc_auc_score(label_binarize(y_true, classes=classes), proba, multi_class='ovr')


# Works with whichever X / y are in scope (the 3-class Engine_Condition target or the
# binary critical-fault target); the class list is never hard-coded.

# 1. Train-test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2. Apply SMOTE to training data only
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", Counter(y_train))
print("After SMOTE:", Counter(y_train_res))

# 3. Random Forest with best hyperparameters
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    class_weight='balanced',
    random_state=42
)
rf_model.fit(X_train_res, y_train_res)
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)

print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

# 4. ROC-AUC (one-vs-rest when there are more than two classes)
roc_auc_rf = _roc_auc(y_test, y_proba_rf, rf_model.classes_)
print(f"Random Forest ROC-AUC (OvR): {roc_auc_rf:.4f}")

# 5. XGBoost with SMOTE data. eval_metric is left at the library default so it always
#    matches the objective XGBoost selects for the number of classes.
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    use_label_encoder=False,
    random_state=42
)
xgb_model.fit(X_train_res, y_train_res)
y_pred_xgb = xgb_model.predict(X_test)
y_proba_xgb = xgb_model.predict_proba(X_test)

print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))

print("XGBoost Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))

roc_auc_xgb = _roc_auc(y_test, y_proba_xgb, xgb_model.classes_)
print(f"XGBoost ROC-AUC (OvR): {roc_auc_xgb:.4f}")

# 6. Final comparison
print("\nFinal Model Comparison:")
print(f"Random Forest ROC-AUC: {roc_auc_rf:.4f}")
print(f"XGBoost ROC-AUC: {roc_auc_xgb:.4f}")


In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from imblearn.combine import SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier

# Balanced Random Forest on SMOTEENN-resampled data with per-class decision thresholds.
# Assume X, y are your dataset features and labels (binary classification: classes 0 and 1)

# 1. Train-test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

# 2. Apply SMOTEENN on training data
smote_enn = SMOTEENN(random_state=42)
X_train_res, y_train_res = smote_enn.fit_resample(X_train, y_train)

print("Before SMOTEENN:", Counter(y_train))
print("After SMOTEENN:", Counter(y_train_res))

# 3. Train Balanced Random Forest
brf = BalancedRandomForestClassifier(n_estimators=200, random_state=42)
brf.fit(X_train_res, y_train_res)

# 4. Predict probabilities on test set
y_proba = brf.predict_proba(X_test)

# 5. For every class present in y_test, pick the probability threshold that maximizes the
#    one-vs-rest F1 on the precision-recall curve.
# NOTE(review): the thresholds are chosen on the same test set that is scored below, so the
# reported metrics are presumably optimistic — consider tuning on a validation split.
optimal_thresholds = {}
for class_idx in np.unique(y_test):
    # class_idx doubles as the column index into y_proba, i.e. labels are assumed to be 0..k-1
    precision, recall, thresholds = precision_recall_curve((y_test == class_idx).astype(int), y_proba[:, class_idx])
    # 1e-6 guards against division by zero when precision + recall == 0
    f1_scores = 2 * recall * precision / (recall + precision + 1e-6)
    best_idx = np.argmax(f1_scores)
    # precision/recall have one more entry than thresholds; if the best F1 falls on that
    # extra last point, fall back to 0.5
    optimal_thresholds[class_idx] = thresholds[best_idx] if best_idx < len(thresholds) else 0.5

print("Optimal thresholds per class:", optimal_thresholds)

# 6. Predict classes with tuned thresholds
y_pred_tuned = []
for prob in y_proba:
    # One boolean per class: does its probability reach that class's threshold?
    class_preds = []
    for cls in range(len(optimal_thresholds)):
        class_preds.append(prob[cls] >= optimal_thresholds[cls])
    if sum(class_preds) == 1:
        # Exactly one class clears its threshold: predict it
        y_pred_tuned.append(class_preds.index(True))
    else:
        # If multiple or none meet threshold, pick the class with max probability
        y_pred_tuned.append(np.argmax(prob))
y_pred_tuned = np.array(y_pred_tuned)

# 7. Evaluation
print("Balanced Random Forest Classification Report with Threshold Tuning:")
print(classification_report(y_test, y_pred_tuned))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_tuned))

# 8. ROC-AUC for binary classification, from the probability column of class 1
roc_auc = roc_auc_score(y_test, y_proba[:, 1])
print(f"ROC-AUC: {roc_auc:.4f}")


In [None]:
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from imblearn.combine import SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier
import lightgbm as lgb

# Compares a Balanced Random Forest (on SMOTEENN-resampled data) with a sample-weighted
# LightGBM booster; both use F1-maximizing decision thresholds.
# NOTE(review): both sets of thresholds are tuned on the test set that is then scored, so the
# reported metrics are presumably optimistic — consider tuning on a validation split.

# 1. Train-test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2. Apply SMOTEENN on training data
smote_enn = SMOTEENN(random_state=42)
X_train_res, y_train_res = smote_enn.fit_resample(X_train, y_train)

print("Before SMOTEENN:", Counter(y_train))
print("After SMOTEENN:", Counter(y_train_res))

# --- Balanced Random Forest ---
brf = BalancedRandomForestClassifier(n_estimators=200, random_state=42)
brf.fit(X_train_res, y_train_res)
y_proba_brf = brf.predict_proba(X_test)

# Threshold tuning for BRF: per class, the threshold with the best one-vs-rest F1
optimal_thresholds_brf = {}
for class_idx in np.unique(y_test):
    # class_idx doubles as the column index into y_proba_brf, i.e. labels are assumed to be 0..k-1
    precision, recall, thresholds = precision_recall_curve((y_test == class_idx).astype(int), y_proba_brf[:, class_idx])
    # 1e-6 guards against division by zero when precision + recall == 0
    f1_scores = 2 * precision * recall / (precision + recall + 1e-6)
    best_idx = np.argmax(f1_scores)
    # precision/recall have one more entry than thresholds; fall back to 0.5 on that last point
    optimal_thresholds_brf[class_idx] = thresholds[best_idx] if best_idx < len(thresholds) else 0.5

# Predict with tuned threshold BRF
y_pred_brf = []
for prob in y_proba_brf:
    preds = [prob[cls] >= optimal_thresholds_brf[cls] for cls in range(len(optimal_thresholds_brf))]
    if sum(preds) == 1:
        # Exactly one class clears its threshold: predict it
        y_pred_brf.append(preds.index(True))
    else:
        # None or several clear their threshold: fall back to the most probable class
        y_pred_brf.append(np.argmax(prob))
y_pred_brf = np.array(y_pred_brf)

print("\nBalanced Random Forest Classification Report:")
print(classification_report(y_test, y_pred_brf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_brf))
print(f"Balanced RF ROC-AUC: {roc_auc_score(y_test, y_proba_brf[:,1]):.4f}")

# --- LightGBM with class weights ---
# Calculate class weights inversely proportional to class frequency
class_weights = {cls: 1.0/count for cls, count in Counter(y_train).items()}
# One weight per training row, in the row order of y_train (reused by the CV cells below)
weights = np.array([class_weights[label] for label in y_train])

# Trained on the original (non-resampled) training split, with the weights handling imbalance
lgb_train = lgb.Dataset(X_train, label=y_train, weight=weights)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42
}

lgb_model = lgb.train(params, lgb_train, num_boost_round=200)

# Predict probabilities on test set
y_proba_lgb = lgb_model.predict(X_test)

# Threshold tuning for LightGBM: single threshold with the best F1
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_lgb)
f1_scores = 2 * precision * recall / (precision + recall + 1e-6)
best_idx = np.argmax(f1_scores)
optimal_threshold_lgb = thresholds[best_idx] if best_idx < len(thresholds) else 0.5

y_pred_lgb = (y_proba_lgb >= optimal_threshold_lgb).astype(int)

print("\nLightGBM Classification Report:")
print(classification_report(y_test, y_pred_lgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lgb))
print(f"LightGBM ROC-AUC: {roc_auc_score(y_test, y_proba_lgb):.4f}")


In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# 5-fold stratified cross-validation of a sample-weighted LightGBM multiclass booster
# with early stopping.
# Assuming X_train, y_train, weights are pandas DataFrame/Series and numpy array

params = {
    'objective': 'multiclass',
    'num_class': len(np.unique(y_train)),  # derived from the labels present in y_train
    'metric': 'multi_logloss',
    'verbosity': -1,   # suppress logs
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'seed': 42
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
models = []         # one booster per fold
best_iteration = 0  # largest best_iteration seen over the folds

for train_idx, val_idx in skf.split(X_train, y_train):
    # Fold indices are positional, hence .iloc for the frames and plain indexing for weights
    lgb_train_cv = lgb.Dataset(X_train.iloc[train_idx], 
                               label=y_train.iloc[train_idx], 
                               weight=weights[train_idx])
    lgb_val_cv = lgb.Dataset(X_train.iloc[val_idx], 
                             label=y_train.iloc[val_idx], 
                             weight=weights[val_idx])
    
    # No num_boost_round is passed here, so the lgb.train default applies;
    # training stops early after 50 rounds without improvement on the validation fold
    model = lgb.train(
        params,
        lgb_train_cv,
        valid_sets=[lgb_val_cv],
        callbacks=[lgb.early_stopping(stopping_rounds=50)]
    )
    
    best_iteration = max(best_iteration, model.best_iteration)
    models.append(model)

print(f"Best iteration across folds: {best_iteration}")


In [None]:
# Same cross-validation as the previous cell, with a lower learning rate, more leaves
# and an explicit cap of 500 boosting rounds.
params = {
    'objective': 'multiclass',
    'num_class': len(np.unique(y_train)),  # derived from the labels present in y_train
    'metric': 'multi_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,    # lowered learning rate
    'num_leaves': 50,         # increased complexity
    'seed': 42
}

num_boost_round = 500

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
models = []         # one booster per fold
best_iteration = 0  # largest best_iteration seen over the folds

for train_idx, val_idx in skf.split(X_train, y_train):
    # Fold indices are positional, hence .iloc for the frames and plain indexing for weights
    lgb_train_cv = lgb.Dataset(X_train.iloc[train_idx], 
                               label=y_train.iloc[train_idx], 
                               weight=weights[train_idx])
    lgb_val_cv = lgb.Dataset(X_train.iloc[val_idx], 
                             label=y_train.iloc[val_idx], 
                             weight=weights[val_idx])
    
    # Stop after 50 rounds without improvement on the validation fold
    model = lgb.train(
        params,
        lgb_train_cv,
        valid_sets=[lgb_val_cv],
        num_boost_round=num_boost_round,
        callbacks=[lgb.early_stopping(stopping_rounds=50)]
    )
    
    best_iteration = max(best_iteration, model.best_iteration)
    models.append(model)

print(f"Best iteration across folds: {best_iteration}")


In [None]:
from sklearn.utils.class_weight import compute_sample_weight

# Cross-validation with a small learning rate and deeper, larger trees.
params = {
    'objective': 'multiclass',
    'num_class': len(np.unique(y_train)),
    'metric': 'multi_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,    # smaller learning rate
    'num_leaves': 100,        # more complex trees
    'max_depth': 10,          # deeper trees
    'seed': 42
}

num_boost_round = 1000

# LightGBM applies 'is_unbalance' only to the binary and multiclassova objectives, so with
# 'multiclass' it left the imbalance unhandled. Use balanced per-row weights instead, as the
# previous CV cells do with their own weights.
balanced_weights = compute_sample_weight('balanced', y_train)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
models = []         # one booster per fold
best_iteration = 0  # largest best_iteration seen over the folds

for train_idx, val_idx in skf.split(X_train, y_train):
    # Fold indices are positional, hence .iloc for the frames and plain indexing for weights
    lgb_train_cv = lgb.Dataset(X_train.iloc[train_idx],
                               label=y_train.iloc[train_idx],
                               weight=balanced_weights[train_idx])
    lgb_val_cv = lgb.Dataset(X_train.iloc[val_idx],
                             label=y_train.iloc[val_idx],
                             weight=balanced_weights[val_idx])

    # Stop after 50 rounds without improvement on the validation fold
    model = lgb.train(
        params,
        lgb_train_cv,
        valid_sets=[lgb_val_cv],
        num_boost_round=num_boost_round,
        callbacks=[lgb.early_stopping(stopping_rounds=50)]
    )

    best_iteration = max(best_iteration, model.best_iteration)
    models.append(model)

print(f"Best iteration across folds: {best_iteration}")


In [None]:
# Train the final booster on the whole training split for the number of rounds selected by
# the cross-validation cell above (best_iteration), rather than a hard-coded value copied
# from an earlier run's output.
final_model = lgb.train(
    params,
    lgb.Dataset(X_train, label=y_train),
    num_boost_round=best_iteration
)


In [None]:
import numpy as np
from collections import Counter
from imblearn.combine import SMOTEENN
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import lightgbm as lgb

# Synthetic demo: SMOTEENN resampling + LightGBM binary classifier with early
# stopping.
# NOTE: this cell rebinds X, y, X_train, X_test, y_train, y_test, params and
# model, replacing any objects of the same name created by earlier cells.

# Generate imbalanced classification data (roughly 90% / 10%)
X, y = make_classification(n_samples=10000, n_features=20, weights=[0.9, 0.1], random_state=42)

print("Original class distribution:", Counter(y))

# Split train-test
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Hold out a validation split for early stopping so the test set is only used
# for the final report. It is taken BEFORE resampling, so validation rows keep
# the original class distribution and contain no synthetic samples.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, stratify=y_train, test_size=0.2, random_state=42
)

# Apply SMOTEENN to balance the data the model is fitted on
smote_enn = SMOTEENN(random_state=42)
X_train_res, y_train_res = smote_enn.fit_resample(X_fit, y_fit)

print("Before SMOTEENN:", Counter(y_fit))
print("After SMOTEENN:", Counter(y_train_res))

# Create LightGBM datasets; the validation set shares the training set's binning
lgb_train = lgb.Dataset(X_train_res, label=y_train_res, free_raw_data=False)
valid_data = lgb.Dataset(X_val, label=y_val, reference=lgb_train, free_raw_data=False)

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbosity': -1,
    'seed': 42
}

num_round = 100
early_stopping_rounds = 10

# Train in a single call and let callbacks do the bookkeeping.
# A hand-rolled loop that re-enters lgb.train() one round at a time does not
# work: the Booster returned by lgb.train() (keep_training_booster=False by
# default) has its datasets freed, so Booster.eval_valid() returns an empty
# list and no validation score can be read back from it.
evals_result = {}
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=num_round,
    valid_sets=[valid_data],
    valid_names=['valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=early_stopping_rounds),
        lgb.record_evaluation(evals_result),  # per-iteration history
        lgb.log_evaluation(period=1),         # print the metric every round
    ],
)

best_iter = model.best_iteration
best_score = model.best_score['valid']['binary_logloss']

print(f"Best iteration: {best_iter}, Best valid logloss: {best_score:.6f}")

# Predict on the untouched test split using the best iteration
y_proba = model.predict(X_test, num_iteration=best_iter)
y_pred = (y_proba >= 0.5).astype(int)

print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC-AUC: {roc_auc:.4f}")


In [None]:
import lightgbm as lgb

# Single-call training with early stopping on the resampled training data.
# Relies on X_train_res / y_train_res / X_test / y_test from the previous cell.

# Ensure evaluation metric is explicitly set
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',  # Explicitly set!
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'seed': 42
}

# Build datasets; the validation set reuses the training set's feature binning
lgb_train = lgb.Dataset(X_train_res, label=y_train_res, free_raw_data=False)
# NOTE(review): early stopping is monitored on the test split here, so the
# reported "best" logloss is selected on the same data it is measured on.
valid_data = lgb.Dataset(X_test, label=y_test, reference=lgb_train, free_raw_data=False)

# Track eval results (filled in by the record_evaluation callback)
evals_result = {}

# Train once with early stopping.
# LightGBM >= 4.0 removed the `evals_result`, `early_stopping_rounds` and
# `verbose_eval` keyword arguments of lgb.train(); the callbacks below are the
# equivalent and match the callback style used by the CV cells.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[valid_data],
    valid_names=['valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.record_evaluation(evals_result),
        lgb.log_evaluation(period=1),  # remove this callback to silence output
    ],
)

# Access best iteration (1-based, hence the -1 when indexing the history)
best_iter = model.best_iteration
best_logloss = evals_result['valid']['binary_logloss'][best_iter - 1]

print(f"\nBest iteration: {best_iter}")
print(f"Best validation logloss: {best_logloss:.6f}")
