In [1]:
from pandas import read_csv
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv"
dataset = read_csv(url, header=None)
print(dataset.shape)

(208, 61)


In [2]:
print(dataset.describe())
print(dataset.groupby(60).size())

               0           1           2           3           4           5   \
count  208.000000  208.000000  208.000000  208.000000  208.000000  208.000000   
mean     0.029164    0.038437    0.043832    0.053892    0.075202    0.104570   
std      0.022991    0.032960    0.038428    0.046528    0.055552    0.059105   
min      0.001500    0.000600    0.001500    0.005800    0.006700    0.010200   
25%      0.013350    0.016450    0.018950    0.024375    0.038050    0.067025   
50%      0.022800    0.030800    0.034300    0.044050    0.062500    0.092150   
75%      0.035550    0.047950    0.057950    0.064500    0.100275    0.134125   
max      0.137100    0.233900    0.305900    0.426400    0.401000    0.382300   

               6           7           8           9   ...          50  \
count  208.000000  208.000000  208.000000  208.000000  ...  208.000000   
mean     0.121747    0.134799    0.178003    0.208259  ...    0.016069   
std      0.061788    0.085152    0.118387    0.1

In [3]:
from sklearn.model_selection import train_test_split
array = dataset.values
X = array[:, 0:60]
Y = array[:, 60]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [4]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

kfold = KFold(n_splits=10, random_state=seed, shuffle=True)
scoring = 'accuracy'
results = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    print(f'{name}: {cv_results.mean():.3f} ({cv_results.std():.3f})')

LR: 0.772 (0.091)
LDA: 0.779 (0.094)
KNN: 0.759 (0.106)
CART: 0.745 (0.094)
NB: 0.683 (0.136)
SVM: 0.765 (0.088)


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
# Lặp lại cross_val_score trên X_train_scaled

In [6]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 0.01]}
model = SVC()
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid.fit(X_train_scaled, Y_train)
print(grid.best_score_)

0.8310160427807487


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

ensembles = []
ensembles.append(('RF', RandomForestClassifier(n_estimators=10)))
ensembles.append(('GBM', GradientBoostingClassifier(n_estimators=10)))
# Tương tự cross_val_score

In [8]:
model = SVC(C=1, gamma=0.1)
model.fit(X_train_scaled, Y_train)
predictions = model.predict(scaler.transform(X_validation))
from sklearn.metrics import accuracy_score
print(accuracy_score(Y_validation, predictions))

# Save với pickle (Chapter 17)
import pickle
with open('sonar_model.pkl', 'wb') as f:
    pickle.dump(model, f)

0.8571428571428571
