In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pyloras import LORAS # Import LORA
from sklearn.neighbors import KNeighborsClassifier, KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier  # Already included in RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load your dataset
data = pd.read_csv('/content/dataset-data-class.csv')

# Handle missing values (optional)
data = data.dropna()  # Remove rows with missing values (consider imputation if necessary)

data = data.drop(columns=['column_to_drop'], errors='ignore')

# Select only numerical columns
numerical_data = data.select_dtypes(include=['int', 'float'])
data = numerical_data

# Separate features (X) and target variable (y)
y = data['severity']  # Assuming 'severity' is your target column
X = data.drop('severity', axis=1)  # Drop 'severity' from features

# Class distribution before SMOTE (optional)
class_distribution_before = y.value_counts().sort_values(ascending=False)
print("Class Distribution Before Oversampling:\n", class_distribution_before)

# Apply LORAS for oversampling (assuming imbalanced data)
oversample = LORAS()
X_resampled, y_resampled = oversample.fit_resample(X, y)

# Class distribution after oversampling (optional)
class_distribution_after = y_resampled.value_counts().sort_values(ascending=False)
print("Class Distribution After Oversampling:\n", class_distribution_after)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

lda = LinearDiscriminantAnalysis()
X_train_reduced = lda.fit_transform(X_train, y_train)
X_test_reduced = lda.transform(X_test)

# Print the number of features remaining after LDA
num_features_after_lda = X_train_reduced.shape[1]
print(f"Number of features remaining after LDA: {num_features_after_lda}")

# Define and train models
models = {
    "KNN (3 neighbors)": KNeighborsClassifier(n_neighbors=3),
    "KNN (5 neighbors)": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(probability=True),  # Enable probability estimates for metrics
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
}

for name, model in models.items():
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

    # Classification report (precision, recall, F1-score for each class)
    print(f"{name} Classification Report:\n{classification_report(y_test, y_pred)}")

    # Confusion matrix
    print(f"{name} Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")


Class Distribution Before Oversampling:
 severity
1.0    151
4.0    124
3.0    113
2.0     32
Name: count, dtype: int64
Class Distribution After Oversampling:
 severity
1.0    151
2.0    151
4.0    151
3.0    151
Name: count, dtype: int64
Number of features remaining after LDA: 3
KNN (3 neighbors) Accuracy: 0.6364
KNN (3 neighbors) Classification Report:
              precision    recall  f1-score   support

         1.0       0.54      0.70      0.61        30
         2.0       0.70      0.84      0.76        25
         3.0       0.58      0.56      0.57        32
         4.0       0.81      0.50      0.62        34

    accuracy                           0.64       121
   macro avg       0.66      0.65      0.64       121
weighted avg       0.66      0.64      0.63       121

KNN (3 neighbors) Confusion Matrix:
[[21  5  3  1]
 [ 2 21  1  1]
 [ 9  3 18  2]
 [ 7  1  9 17]]
KNN (5 neighbors) Accuracy: 0.6033
KNN (5 neighbors) Classification Report:
              precision    recall  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
! pip install -U pyloras

Collecting pyloras
  Downloading pyloras-0.1.0b6-py3-none-any.whl (13 kB)
Installing collected packages: pyloras
Successfully installed pyloras-0.1.0b6
