In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier, KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier  # Already included in RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  # Import LDA

# Load your dataset
data = pd.read_csv('/content/dataset-feature-envy.csv')

# Handle missing values (optional)
data = data.dropna()  # Remove rows with missing values (consider imputation if necessary)

data = data.drop(columns=['column_to_drop'], errors='ignore')

# Select only numerical columns
numerical_data = data.select_dtypes(include=['int', 'float'])
data = numerical_data

# Separate features (X) and target variable (y)
y = data['severity']  # Assuming 'severity' is your target column
X = data.drop('severity', axis=1)  # Drop 'severity' from features

# Class distribution before SMOTE (optional)
class_distribution_before = y.value_counts().sort_values(ascending=False)
print("Class Distribution Before SMOTE:\n", class_distribution_before)

# Apply SMOTE to balance classes (optional)
if class_distribution_before.std() > class_distribution_before.mean() * 0.1:  # Check for imbalance (standard deviation > 10% of mean)
    smote = SMOTE()
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Class distribution after SMOTE (optional)
    class_distribution_after = y_resampled.value_counts().sort_values(ascending=False)
    print("Class Distribution After SMOTE:\n", class_distribution_after)
else:
    print("Data appears balanced. Skipping SMOTE.")
    X_resampled = X
    y_resampled = y

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Apply LDA for dimensionality reduction
lda = LinearDiscriminantAnalysis()
X_train_reduced = lda.fit_transform(X_train, y_train)
X_test_reduced = lda.transform(X_test)

# Print the number of features remaining after LDA
num_features_after_lda = X_train_reduced.shape[1]
print(f"Number of features remaining after LDA: {num_features_after_lda}")

# Define and train models (using reduced features)
models = {
    "KNN (3 neighbors)": KNeighborsClassifier(n_neighbors=3),
    "KNN (5 neighbors)": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(probability=True),  # Enable probability estimates for metrics
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
}

for name, model in models.items():
    model.fit(X_train_reduced, y_train)

    # Make predictions on the test set (using reduced features)
    y_pred = model.predict(X_test_reduced)

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

    # Classification report (precision, recall, F1-score for each class)
    print(f"{name} Classification Report:\n{classification_report(y_test, y_pred)}")

    # Confusion matrix
    print(f"{name} Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")


Class Distribution Before SMOTE:
 severity
1    280
3     95
2     23
4     22
Name: count, dtype: int64
Class Distribution After SMOTE:
 severity
1    280
2    280
3    280
4    280
Name: count, dtype: int64
Number of features remaining after LDA: 3
KNN (3 neighbors) Accuracy: 0.9643
KNN (3 neighbors) Classification Report:
              precision    recall  f1-score   support

           1       0.96      0.95      0.96        58
           2       0.93      0.95      0.94        57
           3       0.98      0.97      0.98        62
           4       0.98      1.00      0.99        47

    accuracy                           0.96       224
   macro avg       0.96      0.97      0.97       224
weighted avg       0.96      0.96      0.96       224

KNN (3 neighbors) Confusion Matrix:
[[55  3  0  0]
 [ 2 54  1  0]
 [ 0  1 60  1]
 [ 0  0  0 47]]
KNN (5 neighbors) Accuracy: 0.9554
KNN (5 neighbors) Classification Report:
              precision    recall  f1-score   support

          