# Đọc dữ liệu

In [25]:
import pandas as pd
# Relative path of the CSV file to load.
path = './dataset/processed.csv'
# Read the CSV into a DataFrame; later cells refer to it as `df`.
df = pd.read_csv(path)

In [26]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,pm2.5,pm10,so2,no2,co,dewp,wspm,label
0,4.0,4.0,4.0,7.0,300.0,-18.8,4.4,0.0
1,8.0,8.0,4.0,7.0,300.0,-18.2,4.7,0.0
2,7.0,7.0,5.0,10.0,300.0,-18.2,5.6,0.0
3,6.0,6.0,11.0,11.0,300.0,-19.4,3.1,0.0
4,3.0,3.0,12.0,12.0,300.0,-19.5,2.0,0.0


# Chia thuộc tính và nhãn

In [27]:
# Separate the target from the features.
# y stays a single-column DataFrame; X is every column except 'label'.
y = df.loc[:, ['label']]
X = df.drop('label', axis=1)

In [28]:
# Display the feature frame.
X

Unnamed: 0,pm2.5,pm10,so2,no2,co,dewp,wspm
0,4.0,4.0,4.0,7.0,300.0,-18.8,4.4
1,8.0,8.0,4.0,7.0,300.0,-18.2,4.7
2,7.0,7.0,5.0,10.0,300.0,-18.2,5.6
3,6.0,6.0,11.0,11.0,300.0,-19.4,3.1
4,3.0,3.0,12.0,12.0,300.0,-19.5,2.0
...,...,...,...,...,...,...,...
35039,12.0,29.0,5.0,35.0,400.0,-16.2,2.4
35040,13.0,37.0,7.0,45.0,500.0,-15.1,0.9
35041,16.0,37.0,10.0,66.0,700.0,-13.3,1.1
35042,21.0,44.0,12.0,87.0,700.0,-12.9,1.2


In [29]:
# Count how many rows carry each label value.
y.value_counts()

label
3.0      10677
1.0       9535
4.0       5229
2.0       4522
0.0       2603
5.0       2478
Name: count, dtype: int64

# Chia bộ test
Lấy 80% dữ liệu để huấn luyện mô hình, 20% còn lại dùng để test.

In [30]:
from sklearn.model_selection import train_test_split
import numpy as np

# Split the data: test_size=0.2 holds out 20% of the rows for testing,
# and random_state=42 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Flatten the label containers into 1-D arrays.
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

In [31]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transform to the test rows.
# NOTE(review): X_train / X_test are overwritten here, so re-running this cell
# without re-running the split cell would scale already-scaled data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [32]:
# Display the scaled training matrix.
X_train

array([[-0.66156052, -0.15919278, -0.33458603, ..., -0.29639074,
         0.832604  , -0.50265446],
       [-0.58878793, -0.56763846, -0.68346717, ..., -0.45840577,
         1.24957203, -0.00428125],
       [ 2.17657052,  1.2965495 , -0.11653531, ...,  0.02763933,
         0.81065831, -1.00102766],
       ...,
       [-0.71007558, -0.90277337, -0.59624689, ..., -0.94445087,
         1.09595223, -1.41633866],
       [-0.97690841, -1.10175973, -0.63985703, ..., -0.78243584,
        -0.86452903,  1.98921157],
       [-0.11576609,  0.51107704,  0.79927769, ...,  0.35166939,
        -0.79137674, -0.50265446]])

# Khoảng cách Euclidean

In [33]:
def euclidean_distance(x, y): # O(n)
    """Return the Euclidean (L2) distance between two equal-length sequences.

    The squared coordinate differences are accumulated in order and the
    square root of the total is returned. When the two inputs have different
    lengths the function returns ``None`` instead of a number.
    """
    if len(x) == len(y):
        total = 0
        for a, b in zip(x, y):
            total += (a - b) ** 2
        return total ** 0.5
    # Length mismatch: no distance is defined.
    return None

# Độ tương đồng cosine

In [34]:
def cosine_similarity(x, y): # O(n)
    """Return the cosine of the angle between vectors ``x`` and ``y``.

    Computed as dot(x, y) divided by the product of the two L2 norms.
    The dot product and each norm are a single pass over the n coordinates.
    """
    dot_product = np.dot(x, y)
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    return dot_product / norm_product

# Hàm KNN

In [35]:
def KNN(X_train, y_train, X_test, k = 5, metric = 'cosine'):
    """Classify each row of ``X_test`` by majority vote of its k nearest training rows.

    Parameters
    ----------
    X_train : array-like of shape (m, n)
        Training feature rows.
    y_train : array-like holding m labels
        Label of each training row. It is flattened with ``np.ravel`` so a
        single-column frame/array is accepted and lookup is positional.
    X_test : array-like of shape (m2, n)
        Rows to classify. An empty ``X_test`` yields an empty list.
    k : int, default 5
        Number of neighbours that vote (k >= 1). If k exceeds m, all m
        training rows vote.
    metric : str, default 'cosine'
        'cosine' ranks neighbours by largest cosine similarity;
        'euclidean' ranks them by smallest Euclidean distance. The historical
        spelling 'eclidean' is still accepted as an alias of 'euclidean'.

    Returns
    -------
    list
        One predicted label per row of ``X_test``, in order.

    Raises
    ------
    ValueError
        If ``metric`` is unknown, ``k`` is not a positive integer, ``X_train``
        is empty or not 2-D, the number of labels differs from the number of
        training rows, or the feature counts of train and test differ.

    Notes
    -----
    * Cost per test row is O(m*n) for the distances plus O(m log m) for the
      sort, i.e. O(m2 * (m*n + m log m)) overall, done with numpy rather than
      per-element Python loops.
    * Neighbours at exactly the same distance keep their training order
      (stable sort).
    * When several labels are tied for the most votes, the smallest label wins,
      so the result does not depend on set iteration order.
    * With 'cosine', a zero-norm vector has no defined similarity (NaN); such
      training rows are ranked after every row with a defined similarity.
    """
    # Resolve the metric explicitly so a typo cannot silently pick Euclidean.
    if metric == 'cosine':
        use_cosine = True
    elif metric in ('euclidean', 'eclidean'):
        use_cosine = False
    else:
        raise ValueError(
            f"Unknown metric {metric!r}; expected 'cosine' or 'euclidean'."
        )
    if not isinstance(k, (int, np.integer)) or k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}.")

    test = np.asarray(X_test, dtype=float)
    if test.size == 0:
        return []

    train = np.asarray(X_train, dtype=float)
    labels = np.ravel(y_train)
    if train.ndim != 2 or train.shape[0] == 0:
        raise ValueError("X_train must be a non-empty 2-D array of shape (m, n).")
    if labels.shape[0] != train.shape[0]:
        raise ValueError(
            f"y_train has {labels.shape[0]} labels but X_train has {train.shape[0]} rows."
        )
    if test.ndim != 2 or test.shape[1] != train.shape[1]:
        raise ValueError(
            "X_test must be 2-D with the same number of features as X_train."
        )

    if use_cosine:
        # Training norms do not depend on the test row: compute them once.
        train_norms = np.linalg.norm(train, axis=1)

    y_pred = []
    for test_point in test:
        if use_cosine:
            # 0/0 for zero-norm vectors gives NaN; silence the warning and let
            # argsort place those rows last.
            with np.errstate(divide='ignore', invalid='ignore'):
                similarity = (train @ test_point) / (
                    train_norms * np.linalg.norm(test_point)
                )
            ranking_key = -similarity  # larger similarity == nearer
        else:
            ranking_key = np.linalg.norm(train - test_point, axis=1)

        # Stable sort keeps training order among equal keys; NaN keys sort last.
        nearest = np.argsort(ranking_key, kind='stable')[:k]
        votes = [labels[i] for i in nearest]
        # max() returns the first maximum, so iterating the candidate labels in
        # ascending order makes the smallest label win a tied vote.
        y_pred.append(max(sorted(set(votes)), key=votes.count))
    return y_pred

# Sử dụng hàm tự build

In [36]:
# Predict the whole test set with the hand-written KNN using k=5.
# The metric string "eclidean" selects KNN's Euclidean-distance branch.
y_pred = KNN(X_train, y_train, X_test, 5, "eclidean")
y_pred

[2.0,
 1.0,
 1.0,
 1.0,
 0.0,
 3.0,
 1.0,
 5.0,
 2.0,
 1.0,
 3.0,
 1.0,
 3.0,
 3.0,
 5.0,
 1.0,
 1.0,
 1.0,
 4.0,
 1.0,
 1.0,
 3.0,
 3.0,
 1.0,
 1.0,
 3.0,
 1.0,
 1.0,
 1.0,
 5.0,
 4.0,
 3.0,
 3.0,
 5.0,
 1.0,
 3.0,
 4.0,
 1.0,
 1.0,
 1.0,
 3.0,
 1.0,
 1.0,
 3.0,
 3.0,
 3.0,
 4.0,
 3.0,
 3.0,
 1.0,
 2.0,
 3.0,
 1.0,
 3.0,
 3.0,
 3.0,
 3.0,
 1.0,
 3.0,
 1.0,
 1.0,
 4.0,
 3.0,
 5.0,
 3.0,
 5.0,
 4.0,
 3.0,
 2.0,
 1.0,
 1.0,
 3.0,
 5.0,
 3.0,
 4.0,
 1.0,
 4.0,
 2.0,
 3.0,
 5.0,
 4.0,
 4.0,
 3.0,
 3.0,
 3.0,
 1.0,
 1.0,
 3.0,
 4.0,
 4.0,
 2.0,
 2.0,
 3.0,
 3.0,
 2.0,
 3.0,
 3.0,
 1.0,
 2.0,
 2.0,
 3.0,
 1.0,
 2.0,
 0.0,
 3.0,
 3.0,
 2.0,
 4.0,
 3.0,
 4.0,
 3.0,
 1.0,
 1.0,
 1.0,
 3.0,
 2.0,
 1.0,
 2.0,
 3.0,
 4.0,
 2.0,
 2.0,
 2.0,
 3.0,
 1.0,
 4.0,
 3.0,
 5.0,
 5.0,
 1.0,
 2.0,
 3.0,
 5.0,
 1.0,
 2.0,
 3.0,
 0.0,
 4.0,
 1.0,
 1.0,
 4.0,
 2.0,
 4.0,
 2.0,
 3.0,
 1.0,
 3.0,
 2.0,
 3.0,
 1.0,
 1.0,
 1.0,
 2.0,
 3.0,
 1.0,
 1.0,
 3.0,
 1.0,
 4.0,
 1.0,
 1.0,
 2.0,
 3.0,
 3.0,
 3.0,
 1.0,
 1.0

In [37]:
from sklearn.metrics import accuracy_score
# Score the hand-written KNN predictions against the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8517620202596662

In [39]:
# One unlabeled sample; its values are matched positionally to X.columns below.
new_data = [[51.0,91.0,46.0,85.0,1000.0,-9.9,2.4]]

# Wrap the sample in a DataFrame that carries the feature column names.
new_data_df = pd.DataFrame(data=new_data, columns=X.columns)


# Apply the scaler that was fitted on the training data.
new_data_scaled = scaler.transform(new_data_df)

# Classify the sample with the hand-written KNN (k=5, Euclidean-distance branch).
new_label_prediction = KNN(X_train, y_train, new_data_scaled, 5, 'eclidean')
new_label_prediction

[2.0]

# Sử dụng hàm KNN của sklearn

In [40]:
from sklearn.neighbors import KNeighborsClassifier

# scikit-learn classifier configured with the same neighbour count (5).
knn = KNeighborsClassifier(n_neighbors=5)

# Fit it on the scaled training features and flattened labels.
knn.fit(X_train, y_train)

In [41]:
from sklearn.metrics import accuracy_score

# Predict the test set with the scikit-learn classifier.
# This rebinds y_pred, replacing the hand-written KNN's predictions.
y_pred = knn.predict(X_test)

# Score those predictions against the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8517620202596662

In [42]:
# One unlabeled sample; its values are matched positionally to X.columns below.
new_data = [[51.0,91.0,46.0,85.0,1000.0,-9.9,2.4]]

# Wrap the sample in a DataFrame that carries the feature column names.
new_data_df = pd.DataFrame(data=new_data, columns=X.columns)

# Apply the scaler that was fitted on the training data.
new_data_scaled = scaler.transform(new_data_df)

# Classify the sample with the fitted scikit-learn classifier.
new_label_prediction = knn.predict(new_data_scaled)
new_label_prediction

array([2.])