In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import KMeansSMOTE  # Import KMeansSMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier  # Already included in RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  # Import LDA

# Load your dataset
data = pd.read_csv('/content/dataset-god-class.csv')

# Handle missing values (optional)
data = data.dropna()  # Remove rows with missing values (consider imputation if necessary)

data = data.drop(columns=['column_to_drop'], errors='ignore')

# Select only numerical columns
numerical_data = data.select_dtypes(include=['int', 'float'])
data = numerical_data

# Separate features (X) and target variable (y)
y = data['severity']  # Assuming 'severity' is your target column
X = data.drop('severity', axis=1)  # Drop 'severity' from features

# Class distribution before KMeansSMOTE (optional)
class_distribution_before = y.value_counts().sort_values(ascending=False)
print("Class Distribution Before KMeansSMOTE:\n", class_distribution_before)

# Apply KMeansSMOTE to balance classes with sampling_strategy='all'
kmeans_smote = KMeansSMOTE(sampling_strategy='all',
                           cluster_balance_threshold=0.05,  # Lower cluster balance threshold
                           k_neighbors=5,  # Number of nearest neighbors for SMOTE
                           random_state=42)
X_resampled, y_resampled = kmeans_smote.fit_resample(X, y)

# Class distribution after KMeansSMOTE (optional)
class_distribution_after = y_resampled.value_counts().sort_values(ascending=False)
print("Class Distribution After KMeansSMOTE:\n", class_distribution_after)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Apply LDA for dimensionality reduction
lda = LinearDiscriminantAnalysis()
X_train_reduced = lda.fit_transform(X_train, y_train)
X_test_reduced = lda.transform(X_test)

# Print the number of features remaining after LDA
num_features_after_lda = X_train_reduced.shape[1]
print(f"Number of features remaining after LDA: {num_features_after_lda}")

# Define and train models (using reduced features)
models = {
    "KNN (3 neighbors)": KNeighborsClassifier(n_neighbors=3),
    "KNN (5 neighbors)": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(probability=True),  # Enable probability estimates for metrics
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
}

for name, model in models.items():
    model.fit(X_train_reduced, y_train)

    # Make predictions on the test set (using reduced features)
    y_pred = model.predict(X_test_reduced)

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

    # Classification report (precision, recall, F1-score for each class)
    print(f"{name} Classification Report:\n{classification_report(y_test, y_pred)}")

    # Confusion matrix
    print(f"{name} Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")


Class Distribution Before KMeansSMOTE:
 severity
1.0    154
4.0    127
3.0    110
2.0     29
Name: count, dtype: int64




Class Distribution After KMeansSMOTE:
 severity
4.0    160
3.0    159
2.0    155
1.0    154
Name: count, dtype: int64
Number of features remaining after LDA: 3
KNN (3 neighbors) Accuracy: 0.7381
KNN (3 neighbors) Classification Report:
              precision    recall  f1-score   support

         1.0       0.71      0.85      0.77        20
         2.0       0.69      0.92      0.79        26
         3.0       0.74      0.67      0.70        42
         4.0       0.83      0.63      0.72        38

    accuracy                           0.74       126
   macro avg       0.74      0.77      0.74       126
weighted avg       0.75      0.74      0.73       126

KNN (3 neighbors) Confusion Matrix:
[[17  2  1  0]
 [ 2 24  0  0]
 [ 2  7 28  5]
 [ 3  2  9 24]]
KNN (5 neighbors) Accuracy: 0.7302
KNN (5 neighbors) Classification Report:
              precision    recall  f1-score   support

         1.0       0.89      0.85      0.87        20
         2.0       0.70      0.88      0.78    