In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load your dataset
data = pd.read_csv('/content/dataset-god-class.csv')

# Handle missing values (optional)
data = data.dropna()

data = data.drop(columns=['column_to_drop'], errors='ignore')

# Select only numerical columns
numerical_data = data.select_dtypes(include=['int', 'float'])
data = numerical_data

# Separate features (X) and target variable (y)
y = data['severity']  # Assuming 'severity' is your target column
X = data.drop('severity', axis=1)  # Drop 'severity' from features

# Class distribution before random oversampling (optional)
class_distribution_before = y.value_counts().sort_values(ascending=False)
print("Class Distribution Before Oversampling:\n", class_distribution_before)

# Calculate the target number of samples per class
target_samples_per_class = len(y) // len(y.unique())

# Apply random oversampling with specified sampling strategy
X_resampled = []
y_resampled = []
for class_label in y.unique():
    X_class = X[y == class_label]
    y_class = y[y == class_label]
    X_resampled_class, y_resampled_class = resample(X_class, y_class,
                                                    n_samples=target_samples_per_class,
                                                    random_state=42)
    X_resampled.append(X_resampled_class)
    y_resampled.append(y_resampled_class)

X_resampled = pd.concat(X_resampled)
y_resampled = pd.concat(y_resampled)

# Class distribution after oversampling (optional)
class_distribution_after = y_resampled.value_counts().sort_values(ascending=False)
print("Class Distribution After Oversampling:\n", class_distribution_after)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Apply Linear Discriminant Analysis (LDA)
lda = LinearDiscriminantAnalysis()
X_train_reduced = lda.fit_transform(X_train, y_train)
X_test_reduced = lda.transform(X_test)

# Print the number of features remaining after LDA
num_features_after_lda = X_train_reduced.shape[1]
print(f"Number of features remaining after LDA: {num_features_after_lda}")

# Define and train models
models = {
    "KNN (3 neighbors)": KNeighborsClassifier(n_neighbors=3),
    "KNN (5 neighbors)": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(probability=True),  # Enable probability estimates for metrics
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
}

for name, model in models.items():
    model.fit(X_train_reduced, y_train)  # Use reduced feature set after LDA

    # Make predictions on the test set
    y_pred = model.predict(X_test_reduced)

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

    # Classification report (precision, recall, F1-score for each class)
    print(f"{name} Classification Report:\n{classification_report(y_test, y_pred)}")

    # Confusion matrix
    print(f"{name} Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")


Class Distribution Before Oversampling:
 severity
1.0    154
4.0    127
3.0    110
2.0     29
Name: count, dtype: int64
Class Distribution After Oversampling:
 severity
1.0    105
2.0    105
3.0    105
4.0    105
Name: count, dtype: int64
Number of features remaining after LDA: 3
KNN (3 neighbors) Accuracy: 0.7143
KNN (3 neighbors) Classification Report:
              precision    recall  f1-score   support

         1.0       0.95      0.72      0.82        29
         2.0       0.52      1.00      0.68        16
         3.0       0.62      0.62      0.62        16
         4.0       0.87      0.57      0.68        23

    accuracy                           0.71        84
   macro avg       0.74      0.73      0.70        84
weighted avg       0.78      0.71      0.72        84

KNN (3 neighbors) Confusion Matrix:
[[21  7  0  1]
 [ 0 16  0  0]
 [ 1  4 10  1]
 [ 0  4  6 13]]
KNN (5 neighbors) Accuracy: 0.7143
KNN (5 neighbors) Classification Report:
              precision    recall  