In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
df = pd.read_csv(r"D:\BigData And DataMining\Đồ án\TH2\heart_cleveland_upload.csv")
df.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
5,64,1,0,170,227,0,2,155,0,0.6,1,0,2,0
6,63,1,0,145,233,1,2,150,0,2.3,2,0,1,0
7,61,1,0,134,234,0,0,145,0,2.6,1,2,0,1
8,60,0,0,150,240,0,0,171,0,0.9,0,0,0,0
9,59,1,0,178,270,0,2,145,0,4.2,2,0,2,0


In [3]:
# Tách đặc trưng và nhãn
X = df.drop('condition', axis=1)
y = df['condition']

In [4]:
# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Chia dữ liệu thành tập huấn luyện và kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [5]:
# Khởi tạo mô hình
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42)
}

# Huấn luyện và đánh giá mô hình
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n{name} Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(classification_report(y_test, y_pred))


Decision Tree Accuracy: 0.77
              precision    recall  f1-score   support

           0       0.80      0.75      0.77        32
           1       0.73      0.79      0.76        28

    accuracy                           0.77        60
   macro avg       0.77      0.77      0.77        60
weighted avg       0.77      0.77      0.77        60


Random Forest Accuracy: 0.70
              precision    recall  f1-score   support

           0       0.73      0.69      0.71        32
           1       0.67      0.71      0.69        28

    accuracy                           0.70        60
   macro avg       0.70      0.70      0.70        60
weighted avg       0.70      0.70      0.70        60


SVM Accuracy: 0.73
              precision    recall  f1-score   support

           0       0.79      0.69      0.73        32
           1       0.69      0.79      0.73        28

    accuracy                           0.73        60
   macro avg       0.74      0.74      0.73     

In [7]:

from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from deap import base, creator, tools, algorithms
import random
import warnings
warnings.filterwarnings("ignore")
# 3. GA-SVM setup
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
# C range: 0.1 to 1000, gamma range: 1e-5 to 1
toolbox.register("C", random.uniform, 0.1, 1000)
toolbox.register("gamma", random.uniform, 1e-5, 1)
toolbox.register("individual", tools.initCycle, creator.Individual, (toolbox.C, toolbox.gamma), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Fitness function: SVM cross-validation accuracy
def eval_svm(individual):
    C, gamma = individual
    clf = SVC(C=C, gamma=gamma, kernel='rbf')
    score = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy').mean()
    return (score,)

toolbox.register("evaluate", eval_svm)
toolbox.register("mate", tools.cxBlend, alpha=0.5)
toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=0.2, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)

# 4. Run GA
# 4. Run GA
population = toolbox.population(n=20)
NGEN = 10
for gen in range(NGEN):
    offspring = algorithms.varAnd(population, toolbox, cxpb=0.5, mutpb=0.2)

    # ✅ Giới hạn C và gamma sau đột biến
    for ind in offspring:
        ind[0] = max(ind[0], 0.1)       # C
        ind[1] = max(ind[1], 1e-5)      # gamma

    fits = list(map(toolbox.evaluate, offspring))
    for fit, ind in zip(fits, offspring):
        ind.fitness.values = fit

    population = toolbox.select(offspring, k=len(population))


# 5. Best result
top_ind = tools.selBest(population, k=1)[0]
print(f"\nBest SVM hyperparameters: C={top_ind[0]:.3f}, gamma={top_ind[1]:.5f}")

# 6. Train and test final model
best_model = SVC(C=top_ind[0], gamma=top_ind[1], kernel='rbf')
best_model.fit(X_train, y_train)
test_acc = best_model.score(X_test, y_test)
print(f"Test Accuracy with GA-optimized SVM: {test_acc:.2f}")


Best SVM hyperparameters: C=483.337, gamma=0.23027
Test Accuracy with GA-optimized SVM: 0.70
