In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter

# ============================================================
# Load data
# ============================================================
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print("Train shape:", train.shape)
print("Category counts:\n", train['category'].value_counts())

# ============================================================
# Prepare features
# ============================================================
X = train[['signal_strength', 'response_level']]
y = train['category']

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Polynomial features (degree 2)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_poly)

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y_encoded, test_size=0.20, random_state=0, stratify=y_encoded
)

# SMOTE oversampling
print("\nOriginal distribution:", Counter(y_train))
smote = SMOTE(random_state=0)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print("Resampled distribution:", Counter(y_train_res))

# ============================================================
# 1. SVM with multi-kernel GridSearch
# ============================================================
svm_params = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 20, 50]},
    {'kernel': ['poly'], 'C': [1, 10, 20], 'degree': [2, 3, 4], 'gamma': ['scale', 0.01, 0.001]},
    {'kernel': ['rbf'], 'C': [1, 10, 20], 'gamma': ['scale', 0.01, 0.001]}
]

grid_svm = GridSearchCV(
    SVC(probability=True, random_state=0),
    svm_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

print("\nTraining SVM with multi-kernel search...")
grid_svm.fit(X_train_res, y_train_res)
svm = grid_svm.best_estimator_

y_pred_svm = svm.predict(X_val)
acc_svm = accuracy_score(y_val, y_pred_svm)

print("Best SVM params:", grid_svm.best_params_)
print("\nSVM Accuracy:", acc_svm)
print(classification_report(y_val, y_pred_svm, target_names=le.classes_))

# ============================================================
# 2. Improved MLP
# ============================================================
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='tanh',
    solver='adam',
    alpha=0.001,
    learning_rate_init=0.0005,
    max_iter=700,
    early_stopping=True,
    validation_fraction=0.15,
    n_iter_no_change=20,
    random_state=0
)

print("\nTraining MLP...")
mlp.fit(X_train_res, y_train_res)
y_pred_mlp = mlp.predict(X_val)
acc_mlp = accuracy_score(y_val, y_pred_mlp)

print("\nMLP Accuracy:", acc_mlp)
print(classification_report(y_val, y_pred_mlp, target_names=le.classes_))

# ============================================================
# 3. Random Forest
# ============================================================
rf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=12,
    min_samples_split=4,
    min_samples_leaf=2,
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=0
)

print("\nTraining Random Forest...")
rf.fit(X_train_res, y_train_res)
y_pred_rf = rf.predict(X_val)
acc_rf = accuracy_score(y_val, y_pred_rf)

print("\nRF Accuracy:", acc_rf)
print(classification_report(y_val, y_pred_rf, target_names=le.classes_))

# ============================================================
# 4. Tuned Bagging SVM (max performance)
# ============================================================
bag_svm = BaggingClassifier(
    estimator=SVC(
        C=50,                  # increased C
        gamma=0.0005,           # smaller gamma for smoother decision
        kernel='rbf',           # best kernel from tuning
        probability=True,
        random_state=0
    ),
    n_estimators=30,           # more bags
    max_samples=0.9,           # use more data per bag
    bootstrap=True,
    n_jobs=-1,
    random_state=0
)

print("\nTraining Bagging SVM...")
bag_svm.fit(X_train_res, y_train_res)
y_pred_bag_svm = bag_svm.predict(X_val)
acc_bag_svm = accuracy_score(y_val, y_pred_bag_svm)

print("\nBagging SVM Accuracy:", acc_bag_svm)
print(classification_report(y_val, y_pred_bag_svm, target_names=le.classes_))

# ============================================================
# 5. Bagging MLP
# ============================================================
bag_mlp = BaggingClassifier(
    estimator=MLPClassifier(
        hidden_layer_sizes=(64, 32),
        activation='tanh',
        solver='adam',
        alpha=0.001,
        learning_rate_init=0.0005,
        max_iter=500,
        random_state=0
    ),
    n_estimators=10,
    max_samples=1.0,    # use all samples
    bootstrap=False,    # no bootstrap
    n_jobs=-1,
    random_state=0
)

print("\nTraining Bagging MLP...")
bag_mlp.fit(X_train_res, y_train_res)
y_pred_bag_mlp = bag_mlp.predict(X_val)
acc_bag_mlp = accuracy_score(y_val, y_pred_bag_mlp)

print("\nBagging MLP Accuracy:", acc_bag_mlp)
print(classification_report(y_val, y_pred_bag_mlp, target_names=le.classes_))

# ============================================================
# 6. AdaBoost
# ============================================================
boost = AdaBoostClassifier(
    n_estimators=300,
    learning_rate=0.5,
    random_state=0
)

print("\nTraining AdaBoost...")
boost.fit(X_train_res, y_train_res)
y_pred_boost = boost.predict(X_val)
acc_boost = accuracy_score(y_val, y_pred_boost)

print("\nAdaBoost Accuracy:", acc_boost)
print(classification_report(y_val, y_pred_boost, target_names=le.classes_))

# ============================================================
# 7. Voting Ensemble
# ============================================================
ensemble = VotingClassifier(
    estimators=[
        ('svm', svm),
        ('mlp', mlp),
        ('rf', rf),
        ('bag_svm', bag_svm),
        ('bag_mlp', bag_mlp),
        ('boost', boost)
    ],
    voting='soft',
    n_jobs=-1
)

print("\nTraining Ensemble...")
ensemble.fit(X_train_res, y_train_res)
y_pred_ens = ensemble.predict(X_val)
acc_ens = accuracy_score(y_val, y_pred_ens)

print("\nENSEMBLE Accuracy:", acc_ens)
print(classification_report(y_val, y_pred_ens, target_names=le.classes_))

# ============================================================
# Predict on Test
# ============================================================
X_test = poly.transform(test[['signal_strength', 'response_level']])
X_test = scaler.transform(X_test)

models = {
    "svm": svm,
    "mlp": mlp,
    "random_forest": rf,
    "bagging_svm": bag_svm,
    "bagging_mlp": bag_mlp,
    "adaboost": boost,
    "ensemble": ensemble
}

for name, model in models.items():
    preds = model.predict(X_test)
    preds = le.inverse_transform(preds)
    pd.DataFrame({
        "sample_id": test["sample_id"],
        "category": preds
    }).to_csv(f"{name}_predictions_02.csv", index=False)

# ============================================================
# Accuracy Summary
# ============================================================
print("\n---------------- Accuracy Summary ----------------")
print("SVM:", acc_svm)
print("MLP:", acc_mlp)
print("RandomForest:", acc_rf)
print("Bagging SVM:", acc_bag_svm)
print("Bagging MLP:", acc_bag_mlp)
print("AdaBoost:", acc_boost)
print("Ensemble:", acc_ens)
print("--------------------------------------------------")


Train shape: (1444, 4)
Category counts:
 category
Group_B    709
Group_C    481
Group_A    254
Name: count, dtype: int64

Original distribution: Counter({np.int64(1): 567, np.int64(2): 385, np.int64(0): 203})
Resampled distribution: Counter({np.int64(1): 567, np.int64(2): 567, np.int64(0): 567})

Training SVM with multi-kernel search...
Best SVM params: {'C': 20, 'gamma': 'scale', 'kernel': 'rbf'}

SVM Accuracy: 1.0
              precision    recall  f1-score   support

     Group_A       1.00      1.00      1.00        51
     Group_B       1.00      1.00      1.00       142
     Group_C       1.00      1.00      1.00        96

    accuracy                           1.00       289
   macro avg       1.00      1.00      1.00       289
weighted avg       1.00      1.00      1.00       289


Training MLP...

MLP Accuracy: 0.9965397923875432
              precision    recall  f1-score   support

     Group_A       0.98      1.00      0.99        51
     Group_B       1.00      1.00      

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter

# ============================================================
# Load data
# ============================================================
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print("Train shape:", train.shape)
print("Category counts:\n", train['category'].value_counts())

# ============================================================
# Prepare features
# ============================================================
X = train[['signal_strength', 'response_level']]
y = train['category']

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Polynomial features (degree 2)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_poly)

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y_encoded, test_size=0.80, random_state=0, stratify=y_encoded
)

# SMOTE oversampling
print("\nOriginal distribution:", Counter(y_train))
smote = SMOTE(random_state=0)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print("Resampled distribution:", Counter(y_train_res))

# ============================================================
# 1. SVM with multi-kernel GridSearch
# ============================================================
svm_params = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 20, 50]},
    {'kernel': ['poly'], 'C': [1, 10, 20], 'degree': [2, 3, 4], 'gamma': ['scale', 0.01, 0.001]},
    {'kernel': ['rbf'], 'C': [1, 10, 20], 'gamma': ['scale', 0.01, 0.001]}
]

grid_svm = GridSearchCV(
    SVC(probability=True, random_state=0),
    svm_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

print("\nTraining SVM with multi-kernel search...")
grid_svm.fit(X_train_res, y_train_res)
svm = grid_svm.best_estimator_

y_pred_svm = svm.predict(X_val)
acc_svm = accuracy_score(y_val, y_pred_svm)

print("Best SVM params:", grid_svm.best_params_)
print("\nSVM Accuracy:", acc_svm)
print(classification_report(y_val, y_pred_svm, target_names=le.classes_))

# ============================================================
# 2. Improved MLP
# ============================================================
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='tanh',
    solver='adam',
    alpha=0.001,
    learning_rate_init=0.0005,
    max_iter=700,
    early_stopping=True,
    validation_fraction=0.15,
    n_iter_no_change=20,
    random_state=0
)

print("\nTraining MLP...")
mlp.fit(X_train_res, y_train_res)
y_pred_mlp = mlp.predict(X_val)
acc_mlp = accuracy_score(y_val, y_pred_mlp)

print("\nMLP Accuracy:", acc_mlp)
print(classification_report(y_val, y_pred_mlp, target_names=le.classes_))

# ============================================================
# 3. Random Forest
# ============================================================
rf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=12,
    min_samples_split=4,
    min_samples_leaf=2,
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=0
)

print("\nTraining Random Forest...")
rf.fit(X_train_res, y_train_res)
y_pred_rf = rf.predict(X_val)
acc_rf = accuracy_score(y_val, y_pred_rf)

print("\nRF Accuracy:", acc_rf)
print(classification_report(y_val, y_pred_rf, target_names=le.classes_))

# ============================================================
# 4. Tuned Bagging SVM (max performance)
# ============================================================
bag_svm = BaggingClassifier(
    estimator=SVC(
        C=50,                  # increased C
        gamma=0.0005,           # smaller gamma for smoother decision
        kernel='rbf',           # best kernel from tuning
        probability=True,
        random_state=0
    ),
    n_estimators=30,           # more bags
    max_samples=0.9,           # use more data per bag
    bootstrap=True,
    n_jobs=-1,
    random_state=0
)

print("\nTraining Bagging SVM...")
bag_svm.fit(X_train_res, y_train_res)
y_pred_bag_svm = bag_svm.predict(X_val)
acc_bag_svm = accuracy_score(y_val, y_pred_bag_svm)

print("\nBagging SVM Accuracy:", acc_bag_svm)
print(classification_report(y_val, y_pred_bag_svm, target_names=le.classes_))

# ============================================================
# 5. Bagging MLP
# ============================================================
bag_mlp = BaggingClassifier(
    estimator=MLPClassifier(
        hidden_layer_sizes=(64, 32),
        activation='tanh',
        solver='adam',
        alpha=0.001,
        learning_rate_init=0.0005,
        max_iter=500,
        random_state=0
    ),
    n_estimators=10,
    max_samples=1.0,    # use all samples
    bootstrap=False,    # no bootstrap
    n_jobs=-1,
    random_state=0
)

print("\nTraining Bagging MLP...")
bag_mlp.fit(X_train_res, y_train_res)
y_pred_bag_mlp = bag_mlp.predict(X_val)
acc_bag_mlp = accuracy_score(y_val, y_pred_bag_mlp)

print("\nBagging MLP Accuracy:", acc_bag_mlp)
print(classification_report(y_val, y_pred_bag_mlp, target_names=le.classes_))

# ============================================================
# 6. AdaBoost
# ============================================================
boost = AdaBoostClassifier(
    n_estimators=300,
    learning_rate=0.5,
    random_state=0
)

print("\nTraining AdaBoost...")
boost.fit(X_train_res, y_train_res)
y_pred_boost = boost.predict(X_val)
acc_boost = accuracy_score(y_val, y_pred_boost)

print("\nAdaBoost Accuracy:", acc_boost)
print(classification_report(y_val, y_pred_boost, target_names=le.classes_))

# ============================================================
# 7. Voting Ensemble
# ============================================================
ensemble = VotingClassifier(
    estimators=[
        ('svm', svm),
        ('mlp', mlp),
        ('rf', rf),
        ('bag_svm', bag_svm),
        ('bag_mlp', bag_mlp),
        ('boost', boost)
    ],
    voting='soft',
    n_jobs=-1
)

print("\nTraining Ensemble...")
ensemble.fit(X_train_res, y_train_res)
y_pred_ens = ensemble.predict(X_val)
acc_ens = accuracy_score(y_val, y_pred_ens)

print("\nENSEMBLE Accuracy:", acc_ens)
print(classification_report(y_val, y_pred_ens, target_names=le.classes_))

# ============================================================
# Predict on Test
# ============================================================
X_test = poly.transform(test[['signal_strength', 'response_level']])
X_test = scaler.transform(X_test)

models = {
    "svm": svm,
    "mlp": mlp,
    "random_forest": rf,
    "bagging_svm": bag_svm,
    "bagging_mlp": bag_mlp,
    "adaboost": boost,
    "ensemble": ensemble
}

for name, model in models.items():
    preds = model.predict(X_test)
    preds = le.inverse_transform(preds)
    pd.DataFrame({
        "sample_id": test["sample_id"],
        "category": preds
    }).to_csv(f"{name}_predictions_08.csv", index=False)

# ============================================================
# Accuracy Summary
# ============================================================
print("\n---------------- Accuracy Summary ----------------")
print("SVM:", acc_svm)
print("MLP:", acc_mlp)
print("RandomForest:", acc_rf)
print("Bagging SVM:", acc_bag_svm)
print("Bagging MLP:", acc_bag_mlp)
print("AdaBoost:", acc_boost)
print("Ensemble:", acc_ens)
print("--------------------------------------------------")


Train shape: (1444, 4)
Category counts:
 category
Group_B    709
Group_C    481
Group_A    254
Name: count, dtype: int64

Original distribution: Counter({np.int64(1): 141, np.int64(2): 96, np.int64(0): 51})
Resampled distribution: Counter({np.int64(0): 141, np.int64(1): 141, np.int64(2): 141})

Training SVM with multi-kernel search...
Best SVM params: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}

SVM Accuracy: 0.9852941176470589
              precision    recall  f1-score   support

     Group_A       0.93      1.00      0.96       203
     Group_B       1.00      0.99      0.99       568
     Group_C       1.00      0.97      0.99       385

    accuracy                           0.99      1156
   macro avg       0.97      0.99      0.98      1156
weighted avg       0.99      0.99      0.99      1156


Training MLP...

MLP Accuracy: 0.9022491349480969
              precision    recall  f1-score   support

     Group_A       0.66      0.92      0.77       203
     Group_B       0.99   




Bagging MLP Accuracy: 0.9749134948096886
              precision    recall  f1-score   support

     Group_A       0.89      0.98      0.93       203
     Group_B       0.99      0.98      0.99       568
     Group_C       1.00      0.97      0.98       385

    accuracy                           0.97      1156
   macro avg       0.96      0.97      0.97      1156
weighted avg       0.98      0.97      0.98      1156


Training AdaBoost...

AdaBoost Accuracy: 0.8347750865051903
              precision    recall  f1-score   support

     Group_A       0.53      0.53      0.53       203
     Group_B       0.86      0.94      0.90       568
     Group_C       0.98      0.84      0.90       385

    accuracy                           0.83      1156
   macro avg       0.79      0.77      0.78      1156
weighted avg       0.84      0.83      0.83      1156


Training Ensemble...





ENSEMBLE Accuracy: 0.9775086505190311
              precision    recall  f1-score   support

     Group_A       0.90      0.99      0.94       203
     Group_B       1.00      0.98      0.99       568
     Group_C       0.99      0.97      0.98       385

    accuracy                           0.98      1156
   macro avg       0.96      0.98      0.97      1156
weighted avg       0.98      0.98      0.98      1156


---------------- Accuracy Summary ----------------
SVM: 0.9852941176470589
MLP: 0.9022491349480969
RandomForest: 0.9757785467128027
Bagging SVM: 0.9247404844290658
Bagging MLP: 0.9749134948096886
AdaBoost: 0.8347750865051903
Ensemble: 0.9775086505190311
--------------------------------------------------
