In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier  # Already included in RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  # Import LDA

# Load your dataset
data = pd.read_csv('/content/dataset-long-method.csv')

# Handle missing values (optional)
data = data.dropna()  # Remove rows with missing values (consider imputation if necessary)

data = data.drop(columns=['column_to_drop'], errors='ignore')

# Select only numerical columns
numerical_data = data.select_dtypes(include=['int', 'float'])
data = numerical_data

# Separate features (X) and target variable (y)
y = data['severity']  # Assuming 'severity' is your target column
X = data.drop('severity', axis=1)  # Drop 'severity' from features

# Class distribution before BorderlineSMOTE (optional)
class_distribution_before = y.value_counts().sort_values(ascending=False)
print("Class Distribution Before BorderlineSMOTE:\n", class_distribution_before)

# Apply BorderlineSMOTE to balance classes (optional)
if class_distribution_before.std() > class_distribution_before.mean() * 0.1:  # Check for imbalance (standard deviation > 10% of mean)
    borderline_smote = BorderlineSMOTE()
    X_resampled, y_resampled = borderline_smote.fit_resample(X, y)

    # Class distribution after BorderlineSMOTE (optional)
    class_distribution_after = y_resampled.value_counts().sort_values(ascending=False)
    print("Class Distribution After BorderlineSMOTE:\n", class_distribution_after)
else:
    print("Data appears balanced. Skipping BorderlineSMOTE.")
    X_resampled = X
    y_resampled = y

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Apply LDA for dimensionality reduction
lda = LinearDiscriminantAnalysis()
X_train_reduced = lda.fit_transform(X_train, y_train)
X_test_reduced = lda.transform(X_test)

# Print the number of features remaining after LDA
num_features_after_lda = X_train_reduced.shape[1]
print(f"Number of features remaining after LDA: {num_features_after_lda}")

# Define and train models (using reduced features)
models = {
    "KNN (3 neighbors)": KNeighborsClassifier(n_neighbors=3),
    "KNN (5 neighbors)": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(probability=True),  # Enable probability estimates for metrics
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
}

for name, model in models.items():
    model.fit(X_train_reduced, y_train)

    # Make predictions on the test set (using reduced features)
    y_pred = model.predict(X_test_reduced)

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

    # Classification report (precision, recall, F1-score for each class)
    print(f"{name} Classification Report:\n{classification_report(y_test, y_pred)}")

    # Confusion matrix
    print(f"{name} Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")


Class Distribution Before BorderlineSMOTE:
 severity
1.0    280
3.0     95
4.0     34
2.0     11
Name: count, dtype: int64
Class Distribution After BorderlineSMOTE:
 severity
1.0    280
4.0    280
3.0    280
2.0    280
Name: count, dtype: int64
Number of features remaining after LDA: 3
KNN (3 neighbors) Accuracy: 0.9821
KNN (3 neighbors) Classification Report:
              precision    recall  f1-score   support

         1.0       1.00      0.99      0.99        70
         2.0       0.98      1.00      0.99        56
         3.0       0.96      0.96      0.96        51
         4.0       0.98      0.98      0.98        47

    accuracy                           0.98       224
   macro avg       0.98      0.98      0.98       224
weighted avg       0.98      0.98      0.98       224

KNN (3 neighbors) Confusion Matrix:
[[69  0  1  0]
 [ 0 56  0  0]
 [ 0  1 49  1]
 [ 0  0  1 46]]
KNN (5 neighbors) Accuracy: 0.9955
KNN (5 neighbors) Classification Report:
              precision    re

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Gradient Boosting Accuracy: 0.9777
Gradient Boosting Classification Report:
              precision    recall  f1-score   support

         1.0       1.00      0.97      0.99        70
         2.0       0.98      0.98      0.98        56
         3.0       0.94      0.96      0.95        51
         4.0       0.98      1.00      0.99        47

    accuracy                           0.98       224
   macro avg       0.98      0.98      0.98       224
weighted avg       0.98      0.98      0.98       224

Gradient Boosting Confusion Matrix:
[[68  0  2  0]
 [ 0 55  1  0]
 [ 0  1 49  1]
 [ 0  0  0 47]]
SVM Accuracy: 0.9911
SVM Classification Report:
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00        70
         2.0       0.98      0.98      0.98        56
         3.0       0.98      0.98      0.98        51
         4.0       1.00      1.00      1.00        47

    accuracy                           0.99       224
   macro avg      