In [23]:
# Cell 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



In [24]:
# Cell 2: Load and preprocess the dataset
df = pd.read_csv('combined_data.csv')

# Replace infinite values with NaN and drop rows with NaN values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df = df.dropna()

# Drop rows with missing target values
df = df.dropna(subset=['Cell_Type'])

# Encode target variable
label_encoder = LabelEncoder()
df['Cell_Type'] = label_encoder.fit_transform(df['Cell_Type'])

# Split the data into features and target variable
X = df.drop('Cell_Type', axis=1)
y = df['Cell_Type']  # Target is encoded

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling (optional for some models)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Verify the separation
print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")


Training set size: 10160
Testing set size: 2541


In [25]:
# Cell 3: Create and train the Stacking model with Logistic Regression as final estimator
# Define the base models
base_models = [
    ('catboost', CatBoostClassifier(iterations=1000, learning_rate=0.05, depth=6, verbose=0)),
    ('xgboost', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')),
    ('lgbm', LGBMClassifier())
]

# Define the final estimator as Logistic Regression
final_estimator = LogisticRegression(max_iter=1000)

# Create the stacking model
stacking_model = StackingClassifier(estimators=base_models, final_estimator=final_estimator, cv=5)

# Train the stacking model
stacking_model.fit(X_train, y_train)



Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 5841, number of negative: 4319
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000444 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3570
[LightGBM] [Info] Number of data points in the train set: 10160, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.574902 -> initscore=0.301878
[LightGBM] [Info] Start training from score 0.301878


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 4672, number of negative: 3456
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000555 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3570
[LightGBM] [Info] Number of data points in the train set: 8128, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.574803 -> initscore=0.301475
[LightGBM] [Info] Start training from score 0.301475
[LightGBM] [Info] Number of positive: 4673, number of negative: 3455
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000498 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3570
[LightGBM] [Info] Number of data points in the train set: 8128, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.574926 -> initscore=0.301979
[LightGBM] [Info] Start training from score 0.301979
[LightGBM] [Info] Numb

In [26]:
# Cell 4: Make predictions and evaluate the stacking model
y_pred = stacking_model.predict(X_test)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[1063   17]
 [  12 1449]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1080
           1       0.99      0.99      0.99      1461

    accuracy                           0.99      2541
   macro avg       0.99      0.99      0.99      2541
weighted avg       0.99      0.99      0.99      2541



In [27]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.9886
