In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the dataset
file_path = "D:\Sem 5\projects\FDS\liver_cirrhosis.csv"
data = pd.read_csv(file_path)
# Function to remove outliers using IQR
def remove_outliers_iqr(data, columns):
    for column in columns:
        Q1 = data[column].quantile(0.25)  # 25th percentile
        Q3 = data[column].quantile(0.75)  # 75th percentile
        IQR = Q3 - Q1                   # Interquartile range
        lower_bound = Q1 - 1.5 * IQR    # Lower bound
        upper_bound = Q3 + 1.5 * IQR    # Upper bound
        
        # Filter the data
        data = data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]
    return data

# Apply the function to numeric columns
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns

# Save the shape before removing outliers
shape_before = data.shape

# Remove outliers
data = remove_outliers_iqr(data, numeric_columns)

# Print the shapes
print(f"Shape before removing outliers: {shape_before}")
print(f"Shape after removing outliers: {data.shape}")

Shape before removing outliers: (9639, 19)
Shape after removing outliers: (5636, 19)


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

import lightgbm as lgb
import catboost as cb

# Load the dataset
file_path = "D:\\Sem 5\\projects\\FDS\\liver_cirrhosis.csv"
data = pd.read_csv(file_path)

# Convert categorical columns to category codes
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'stage'

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling
scaler = StandardScaler()

# Function to evaluate models
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Predict probabilities for models that support it
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model: {name} | Accuracy: {accuracy:.4f}")
    
    if y_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
            print("ROC AUC Score:", roc_auc)
        except ValueError as e:
            print("Error calculating ROC AUC:", e)
    else:
        print("ROC AUC Score: N/A")
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n{conf_matrix}")
    print("="*40)

# Define the models (including XGBoost, LightGBM, CatBoost, and BaggingClassifier)
models = [
    ("Decision Tree", Pipeline([('scaler', scaler), ('classifier', DecisionTreeClassifier(class_weight='balanced'))])),
    ("Random Forest", Pipeline([('scaler', scaler), ('classifier', RandomForestClassifier(random_state=42))])),
    ("Bagging Classifier", Pipeline([('scaler', scaler), ('classifier', BaggingClassifier(base_estimator=DecisionTreeClassifier(), random_state=42))])),
    ("Gradient Boosting", Pipeline([('scaler', scaler), ('classifier', GradientBoostingClassifier(random_state=42))])),
    ("Support Vector Classifier", Pipeline([('scaler', scaler), ('classifier', SVC(probability=True, random_state=42))])),
    ("K-Nearest Neighbors", Pipeline([('scaler', scaler), ('classifier', KNeighborsClassifier())])),
    ("Naive Bayes", Pipeline([('scaler', scaler), ('classifier', GaussianNB())])),
    ("Neural Network", Pipeline([('scaler', scaler), ('classifier', MLPClassifier(random_state=42, max_iter=1000))])),
    ("LightGBM", Pipeline([('scaler', scaler), ('classifier', lgb.LGBMClassifier(objective='multiclass', random_state=42))])),
    ("CatBoost", Pipeline([('scaler', scaler), ('classifier', cb.CatBoostClassifier(iterations=1000, learning_rate=0.05, depth=10, random_state=42, verbose=0))])),
]

# Loop through models and evaluate
for name, model in models:
    evaluate_model(name, model, X_train_resampled, X_test, y_train_resampled, y_test)


Model: Decision Tree
              precision    recall  f1-score   support

           1       0.73      0.73      0.73       617
           2       0.69      0.70      0.70       640
           3       0.82      0.82      0.82       671

    accuracy                           0.75      1928
   macro avg       0.75      0.75      0.75      1928
weighted avg       0.75      0.75      0.75      1928

Model: Decision Tree | Accuracy: 0.7505
ROC AUC Score: 0.8172106191346468
Confusion Matrix:
[[448 123  46]
 [119 448  73]
 [ 44  76 551]]
Model: Random Forest
              precision    recall  f1-score   support

           1       0.86      0.85      0.86       617
           2       0.84      0.84      0.84       640
           3       0.91      0.91      0.91       671

    accuracy                           0.87      1928
   macro avg       0.87      0.87      0.87      1928
weighted avg       0.87      0.87      0.87      1928

Model: Random Forest | Accuracy: 0.8703
ROC AUC Score: 0.9



Model: Bagging Classifier
              precision    recall  f1-score   support

           1       0.80      0.84      0.82       617
           2       0.81      0.77      0.79       640
           3       0.89      0.89      0.89       671

    accuracy                           0.84      1928
   macro avg       0.83      0.83      0.83      1928
weighted avg       0.84      0.84      0.83      1928

Model: Bagging Classifier | Accuracy: 0.8351
ROC AUC Score: 0.9400717001017549
Confusion Matrix:
[[521  73  23]
 [ 98 495  47]
 [ 32  45 594]]
Model: Gradient Boosting
              precision    recall  f1-score   support

           1       0.79      0.79      0.79       617
           2       0.74      0.75      0.75       640
           3       0.86      0.84      0.85       671

    accuracy                           0.79      1928
   macro avg       0.79      0.79      0.79      1928
weighted avg       0.80      0.79      0.79      1928

Model: Gradient Boosting | Accuracy: 0.7946
