In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import xgboost as xgb
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
    classification_report
import seaborn as sns
import matplotlib.pyplot as plt

In [18]:
# Load the feature table and the label table from their (relative) CSV paths.
data1 = pd.read_csv('../data/Data.csv')
labels1 = pd.read_csv('../data/Label.csv')

# pd.concat(axis=1) aligns on the row index and pads unmatched rows with NaN,
# so a row-count mismatch would silently break the feature/label pairing.
# Fail loudly instead of filtering on a partially-NaN 'Label' column.
if len(data1) != len(labels1):
    raise ValueError(
        f"Data.csv has {len(data1)} rows but Label.csv has {len(labels1)} rows; "
        "features and labels must line up row for row."
    )

# Put features and labels side by side so a single row mask filters both.
combined_data = pd.concat([data1, labels1], axis=1)

# Keep only the rows whose 'Label' is greater than 0 (rows labelled 0 are discarded).
filtered_data = combined_data[combined_data['Label'] > 0]

# Split the surviving rows back into a feature frame and a label series.
filtered_data_only = filtered_data.drop(columns=['Label'])
filtered_labels_only = filtered_data['Label']

print("Filtered data shape:", filtered_data_only.shape)
print("Filtered labels shape:", filtered_labels_only.shape)

Filtered data shape: (89583, 76)
Filtered labels shape: (89583,)


In [19]:
# Hold out 20% of the filtered rows for testing; the fixed seed makes the split repeatable.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    filtered_data_only,
    filtered_labels_only,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Report the (features, labels) shapes of each split as a quick sanity check.
train_shapes = (X_train1.shape, y_train1.shape)
test_shapes = (X_test1.shape, y_test1.shape)
print("Training data shape:", *train_shapes)
print("Testing data shape:", *test_shapes)

Training data shape: (71666, 76) (71666,)
Testing data shape: (17917, 76) (17917,)


In [21]:
# Shift the class labels down by one so they start at 0.
# NOTE(review): presumably needed because XGBClassifier expects class ids 0..n-1 - confirm.
#
# The shifted labels are derived from the untouched `filtered_labels_only`
# (looked up by row index) instead of `y = y - 1`, so re-running this cell
# yields the same result rather than subtracting 1 a second time.
y_train1 = filtered_labels_only.loc[y_train1.index] - 1
y_test1 = filtered_labels_only.loc[y_test1.index] - 1

# Show the distinct class ids present in each split after the shift.
print(np.unique(y_train1))
print(np.unique(y_test1))

[0 1 2 3 4 5 6 7 8]
[0 1 2 3 4 5 6 7 8]


In [31]:
# Number of discriminant components the LDA projection keeps.
n_lda_components = 7
lda_optimal = LinearDiscriminantAnalysis(n_components=n_lda_components)

In [32]:
# Standardise the features. The scaler is fitted on the training split only,
# and the same fitted statistics are then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train1)
X_train_scaled = scaler.transform(X_train1)
X_test_scaled = scaler.transform(X_test1)

In [33]:
# Fit the LDA projection on the scaled training data and its labels,
# then project both splits with that single fitted model.
lda_optimal.fit(X_train_scaled, y_train1)
X_train_lda = lda_optimal.transform(X_train_scaled)
X_test_lda = lda_optimal.transform(X_test_scaled)

[[ 0.47245986  0.09093941  0.03396793 ...  0.98225703 -2.4754416
  -0.40803077]
 [-0.55189893  1.25000096  0.44722845 ...  1.02722269 -1.41792523
  -0.56734315]
 [-3.29327971  0.20417881  0.84389948 ... -9.14295484 -2.54882946
  -1.58072163]
 ...
 [ 1.71603543 -0.13257668  1.57037209 ... -0.3571975   0.63562598
  -1.36186746]
 [ 0.80917484 -0.77445565 -0.24805557 ...  1.14202442  0.65608288
   2.19585137]
 [ 0.44357636  0.25462376  0.3715815  ... -0.41373443 -0.12078115
   0.11201033]]


In [34]:
# Sanity check: print the shapes of the LDA-projected training and test matrices.
print(f"Transformed training shape: {X_train_lda.shape}")
print(f"Transformed test shape: {X_test_lda.shape}")

Transformed training shape: (71666, 7)
Transformed test shape: (17917, 7)


In [35]:
# XGBoost classifier; no hyperparameters are set here other than the fixed random seed.
xgb_classifier = xgb.XGBClassifier(random_state=42)

In [36]:
# Train on the LDA-projected training features and the zero-based class labels.
xgb_classifier.fit(X_train_lda, y_train1)

In [37]:
# Predict a class id for every row of the LDA-projected test split.
y_pred1 = xgb_classifier.predict(X_test_lda)

In [38]:
# Per-class precision / recall / F1 of the XGBoost predictions on the test split.
# The label set is passed explicitly so the report rows always line up with
# `target_names`; without `labels=`, the report uses the union of true and
# predicted labels and raises if its size differs from `target_names`.
# "Class i" refers to the zero-based ids, i.e. the original 'Label' value minus 1.
class_ids = np.unique(y_test1)
class_names = [f"Class {i}" for i in class_ids]

print("\nClassification Report:")
report = classification_report(y_test1, y_pred1, labels=class_ids, target_names=class_names)
print(report)


Classification Report:
              precision    recall  f1-score   support

     Class 0       0.43      0.29      0.35        68
     Class 1       0.67      0.26      0.37        77
     Class 2       0.67      0.17      0.27       917
     Class 3       0.76      0.75      0.76      6191
     Class 4       0.65      0.91      0.76      5859
     Class 5       0.81      0.58      0.68       929
     Class 6       0.88      0.65      0.75      3370
     Class 7       0.50      0.17      0.25       457
     Class 8       0.25      0.06      0.10        49

    accuracy                           0.73     17917
   macro avg       0.62      0.43      0.48     17917
weighted avg       0.74      0.73      0.71     17917



Using Random Forest (trained on the same LDA-projected features, for comparison with XGBoost)

In [39]:
# Build a seeded random forest and fit it on the LDA-projected training data
# in one step (`fit` returns the fitted estimator itself).
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train_lda, y_train1)

# Predicted class id for every row of the LDA-projected test split.
y_pred = rf_classifier.predict(X_test_lda)

In [40]:
# Per-class precision / recall / F1 of the random-forest predictions on the test split.
# The label set is passed explicitly so the report rows always line up with
# `target_names`; without `labels=`, the report uses the union of true and
# predicted labels and raises if its size differs from `target_names`.
# "Class i" refers to the zero-based ids, i.e. the original 'Label' value minus 1.
class_ids = np.unique(y_test1)
class_names = [f"Class {i}" for i in class_ids]

print("\nClassification Report:")
report = classification_report(y_test1, y_pred, labels=class_ids, target_names=class_names)
print(report)


Classification Report:
              precision    recall  f1-score   support

     Class 0       0.46      0.26      0.34        68
     Class 1       0.53      0.26      0.35        77
     Class 2       0.55      0.22      0.31       917
     Class 3       0.73      0.78      0.75      6191
     Class 4       0.70      0.82      0.75      5859
     Class 5       0.81      0.62      0.70       929
     Class 6       0.75      0.67      0.71      3370
     Class 7       0.46      0.22      0.30       457
     Class 8       0.24      0.08      0.12        49

    accuracy                           0.71     17917
   macro avg       0.58      0.44      0.48     17917
weighted avg       0.71      0.71      0.70     17917

