In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

warnings.filterwarnings("ignore")

# Step 1: Load the letter-recognition data.
# The file has no header row, so the column labels are supplied explicitly:
# the 'letter' column first, followed by the 16 numeric attribute columns.
col_names = [
    'letter',
    'x-box', 'y-box', 'width', 'height',
    'onpix', 'x-bar', 'y-bar', 'x2bar',
    'y2bar', 'xybar', 'x2ybr', 'xy2br',
    'x-ege', 'xegvy', 'y-ege', 'yegvx',
]
df = pd.read_csv(
    "data/letter-recognition.csv",
    names=col_names,
    header=None,
)

# Step 2: Preprocess
# Separate the target column ('letter') from the remaining feature columns.
y = df["letter"]
X = df.drop(columns="letter")

# Standardise the features. The scaler is fitted on every row of X and then
# applied to that same data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Step 3: Define the model and the hyper-parameter grid
#
# Scaling is part of the estimator (a Pipeline) rather than a one-off global
# transform: this way every CV fold and every hold-out split is standardised
# with statistics learned from its own training rows only. Training on the
# globally pre-scaled `X_scaled` would leak test-set statistics into the model,
# so the raw feature frame `X` is split below instead.
svm_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("svc", SVC()),
])

# One sub-grid per kernel so that only hyper-parameters the kernel actually
# uses are searched: the linear kernel ignores gamma and degree, and the rbf
# kernel ignores degree. A single flat grid would refit identical models many
# times over. Keys use the Pipeline's "<step>__<param>" naming.
c_values = [0.1, 1, 10, 100]
gamma_values = ['scale', 'auto']
param_grid = [
    {'svc__kernel': ['linear'], 'svc__C': c_values},
    {'svc__kernel': ['rbf'], 'svc__C': c_values, 'svc__gamma': gamma_values},
    {
        'svc__kernel': ['poly'],
        'svc__C': c_values,
        'svc__gamma': gamma_values,
        'svc__degree': [2, 3],
    },
]

# Step 4: Run the grid search on 10 different random 70/30 splits
results = []                # one summary row per split
best_accuracy = 0           # best hold-out accuracy seen so far (fraction, 0..1)
best_convergence_data = []  # accuracies of the winning config on 100 re-splits
best_model = None           # fitted pipeline from the best-scoring split
best_sample = None          # index (1..10) of the best-scoring split

for sample_idx in range(1, 11):
    print(f"Processing sample {sample_idx}/10...")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=sample_idx
    )

    # GridSearchCV clones the estimator internally, so the shared, unfitted
    # `svm_pipeline` can safely be reused for every split.
    grid = GridSearchCV(
        estimator=svm_pipeline,
        param_grid=param_grid,
        cv=3,
        n_jobs=-1,
        scoring='accuracy'
    )
    grid.fit(X_train, y_train)
    acc = accuracy_score(y_test, grid.predict(X_test))

    # Gamma/degree are absent from best_params_ when the winning kernel does
    # not use them; report '-' in that case instead of a meaningless value.
    chosen = grid.best_params_
    results.append({
        "Sample": sample_idx,
        "Accuracy": round(acc * 100, 2),
        "Best Kernel": chosen['svc__kernel'],
        "Best C": chosen['svc__C'],
        "Best Gamma": chosen.get('svc__gamma', '-'),
        "Best Degree": chosen.get('svc__degree', '-')
    })

    if acc > best_accuracy:
        best_accuracy = acc
        best_model = grid.best_estimator_
        best_sample = sample_idx

# Step 5: Re-evaluate the overall winner on 100 further random splits.
# This runs once, after the search, rather than each time a new best shows up
# (only the final winner's curve is ever kept). A fresh clone is fitted for
# every split so `best_model` keeps the fit produced by the grid search.
if best_model is not None:
    for i in range(1, 101):
        X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(
            X, y, test_size=0.3, random_state=best_sample + i
        )
        model_i = clone(best_model).fit(X_train_i, y_train_i)
        acc_i = accuracy_score(y_test_i, model_i.predict(X_test_i))
        best_convergence_data.append(acc_i)

# Save results to CSV
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs("results", exist_ok=True)
results_df = pd.DataFrame(results)
results_df.to_csv("results/svm_results.csv", index=False)

# Plot convergence graph
# savefig fails if the target directory is missing, so create it up front.
os.makedirs("results", exist_ok=True)

# Derive the x-axis from the data itself (1-based iteration numbers) so the
# plot still works if the number of recorded accuracies is not exactly 100.
iterations = range(1, len(best_convergence_data) + 1)

plt.figure(figsize=(10, 6))
plt.plot(iterations, best_convergence_data, marker='o', linewidth=1.5)
plt.title("Figure 1: Convergence graph of best SVM")
plt.xlabel("Iteration")
plt.ylabel("Accuracy")
plt.grid(True)
# Save before show(): show() may leave an empty current figure behind.
plt.savefig("results/convergence_plot.png")
plt.show()

# Basic EDA
# Bar chart with one bar per distinct value of the 'letter' column, showing
# how many rows carry each value.
plt.figure(figsize=(12, 5))
letter_axes = sns.countplot(data=df, x="letter")
letter_axes.set_title("Class Distribution")
