In [1]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo

## Chuẩn bị dataset

In [33]:
wine_quality = fetch_ucirepo(id=186)

In [34]:
data = wine_quality.data.original
data.head(5)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


## Chuẩn hóa dữ liệu

In [35]:
# One-Hot Encoding
data["is_red"] = np.where(data["color"] == "red", 1, 0)
data["is_white"] = np.where(data["color"] == "white", 1, 0)

data = data.drop(["color"], axis=1)
data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,is_red,is_white
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1,0


In [36]:
# MinMaxScaler
for label in data.columns[:11]:
    min_value = data[label].min()
    max_value = data[label].max()

    data[label] = (data[label] - min_value) / (max_value - min_value)

data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,is_red,is_white
0,0.297521,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899,5,1,0
1,0.330579,0.533333,0.0,0.030675,0.147841,0.083333,0.140553,0.186813,0.372093,0.258427,0.26087,5,1,0
2,0.330579,0.453333,0.024096,0.026074,0.137874,0.048611,0.110599,0.190669,0.418605,0.241573,0.26087,5,1,0
3,0.61157,0.133333,0.337349,0.019939,0.109635,0.055556,0.124424,0.209948,0.341085,0.202247,0.26087,6,1,0
4,0.297521,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899,5,1,0


# Chia data

In [44]:
# Tính vị trí cắt
data_size = int(0.8 * len(data))

# Chia data
train_set = data.iloc[:data_size]
test_set = data.iloc[data_size:]

# Tách đặc trưng
X_train = train_set.drop("quality", axis = 1).values
y_train = train_set["quality"].values

X_test = test_set.drop("quality", axis = 1).values
y_test = test_set["quality"].values

## Tính khoảng cách

In [45]:
# Số điểm
k = 5

predictions = []

for test_row in X_test:
    # Tính khoảng cách
    distance = np.sqrt(np.sum((test_row - X_train)**2, axis=1))

    # Tìm K ae gần nhất
    nearest_ids = np.argsort(distance)[:k]
    nearest_label = y_train[nearest_ids]

    # Lấy label xuất hiện nhiều nhất
    pred = pd.Series(nearest_label).mode()[0]
    predictions.append(pred)

## Kiểm tra độ chính xác

In [46]:
accuracy = np.mean(predictions == y_test)
print(f"Độ chính xác của mô hình: {accuracy * 100}%") 

Độ chính xác của mô hình: 49.46153846153846%
