In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN  # Change import to ADASYN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier  # Already included in RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  # Import LDA

# Load your dataset
data = pd.read_csv('/content/dataset-data-class.csv')

# Handle missing values (optional)
data = data.dropna()  # Remove rows with missing values (consider imputation if necessary)

data = data.drop(columns=['column_to_drop'], errors='ignore')

# Select only numerical columns
numerical_data = data.select_dtypes(include=['int', 'float'])
data = numerical_data

# Separate features (X) and target variable (y)
y = data['severity']  # Assuming 'severity' is your target column
X = data.drop('severity', axis=1)  # Drop 'severity' from features

# Class distribution before ADASYN (optional)
class_distribution_before = y.value_counts().sort_values(ascending=False)
print("Class Distribution Before ADASYN:\n", class_distribution_before)

# Apply ADASYN to balance classes (optional)
adasyn = ADASYN(sampling_strategy='minority', random_state=42)  # Instantiate ADASYN with default settings
X_resampled, y_resampled = adasyn.fit_resample(X, y)  # Use ADASYN for oversampling

# Class distribution after ADASYN (optional)
class_distribution_after = y_resampled.value_counts().sort_values(ascending=False)
print("Class Distribution After ADASYN:\n", class_distribution_after)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Apply LDA for dimensionality reduction
lda = LinearDiscriminantAnalysis()
X_train_reduced = lda.fit_transform(X_train, y_train)
X_test_reduced = lda.transform(X_test)

# Print the number of features remaining after LDA
num_features_after_lda = X_train_reduced.shape[1]
print(f"Number of features remaining after LDA: {num_features_after_lda}")

# Define and train models (using reduced features)
models = {
    "KNN (3 neighbors)": KNeighborsClassifier(n_neighbors=3),
    "KNN (5 neighbors)": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(probability=True),  # Enable probability estimates for metrics
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
}

for name, model in models.items():
    model.fit(X_train_reduced, y_train)

    # Make predictions on the test set (using reduced features)
    y_pred = model.predict(X_test_reduced)

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

    # Classification report (precision, recall, F1-score for each class)
    print(f"{name} Classification Report:\n{classification_report(y_test, y_pred)}")

    # Confusion matrix
    print(f"{name} Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")


Class Distribution Before ADASYN:
 severity
1.0    151
4.0    124
3.0    113
2.0     32
Name: count, dtype: int64
Class Distribution After ADASYN:
 severity
1.0    151
2.0    148
4.0    124
3.0    113
Name: count, dtype: int64
Number of features remaining after LDA: 3
KNN (3 neighbors) Accuracy: 0.7870
KNN (3 neighbors) Classification Report:
              precision    recall  f1-score   support

         1.0       0.85      0.85      0.85        27
         2.0       0.77      0.94      0.85        32
         3.0       0.74      0.65      0.69        26
         4.0       0.79      0.65      0.71        23

    accuracy                           0.79       108
   macro avg       0.79      0.77      0.78       108
weighted avg       0.79      0.79      0.78       108

KNN (3 neighbors) Confusion Matrix:
[[23  3  1  0]
 [ 1 30  1  0]
 [ 1  4 17  4]
 [ 2  2  4 15]]
KNN (5 neighbors) Accuracy: 0.7315
KNN (5 neighbors) Classification Report:
              precision    recall  f1-score   s