# Đọc dữ liệu

In [1]:
import pandas as pd
# Relative location of the training workbook.
path = './dataset/training.xlsx'
# With no sheet specified, read_excel loads the workbook's first sheet;
# the headers are lower-cased so later cells can refer to lower-case names.
df = pd.read_excel(path).rename(columns=str.lower)

In [2]:
# Preview the first rows to check that the workbook loaded as expected.
df.head()

Unnamed: 0,no,year,month,day,hour,pm2.5,pm10,so2,no2,co,o3,temp,pres,dewp,rain,wd,wspm,station,label
0,1,2013,3,1,0,4,4,4,7,300,77,-0.7,1023.0,-18.8,0.0,NNW,4.4,Aotizhongxin,Good
1,2,2013,3,1,1,8,8,4,7,300,77,-1.1,1023.2,-18.2,0.0,N,4.7,Aotizhongxin,Good
2,3,2013,3,1,2,7,7,5,10,300,73,-1.1,1023.5,-18.2,0.0,NNW,5.6,Aotizhongxin,Good
3,4,2013,3,1,3,6,6,11,11,300,72,-1.4,1024.5,-19.4,0.0,NW,3.1,Aotizhongxin,Good
4,5,2013,3,1,4,3,3,12,12,300,72,-2.0,1025.2,-19.5,0.0,N,2.0,Aotizhongxin,Good


# Loại bỏ các trường không cần thiết

In [3]:
# Features: everything except the target, the station name, the row number,
# the date/time parts and the wind direction.
X = df.drop(['label', 'station', 'no', 'month', 'year', 'day', 'hour', 'wd'], axis='columns')
# Target: kept as a one-column DataFrame (double brackets), not a Series.
y = df.loc[:, ['label']]

In [4]:
# Display the feature matrix that remains after the unused columns were dropped.
X

Unnamed: 0,pm2.5,pm10,so2,no2,co,o3,temp,pres,dewp,rain,wspm
0,4,4,4,7,300,77,-0.7,1023.0,-18.8,0.0,4.4
1,8,8,4,7,300,77,-1.1,1023.2,-18.2,0.0,4.7
2,7,7,5,10,300,73,-1.1,1023.5,-18.2,0.0,5.6
3,6,6,11,11,300,72,-1.4,1024.5,-19.4,0.0,3.1
4,3,3,12,12,300,72,-2.0,1025.2,-19.5,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
477,187,202,38,98,1600,26,7.7,1000.0,-0.8,0.0,1.4
478,188,205,40,107,1600,22,7.7,999.8,-0.8,0.0,1.7
479,187,211,40,114,1600,20,6.9,1000.2,-0.1,0.0,2.3
480,190,221,34,122,1600,15,6.1,1001.4,0.0,0.0,1.1


In [5]:
# Count the rows per label to see how the classes are distributed.
y.value_counts()

label                         
Unhealthy                         125
Very Unhealthy                     94
Hazardous                          89
Moderate                           87
Unhealthy for Sensitive Groups     47
Good                               40
Name: count, dtype: int64

# Chia bộ test
Lấy 80% dữ liệu để huấn luyện mô hình, 20% còn lại dùng để kiểm tra (test).

In [6]:
from sklearn.model_selection import train_test_split
import numpy as np

# Split the data: 20% of the rows are held out for testing, and the fixed
# random_state makes the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
# y is a one-column DataFrame, so flatten both label sets to 1-D arrays.
y_train, y_test = np.ravel(y_train), np.ravel(y_test)

In [7]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler is fitted on the training split only,
# and the test split is transformed with those same fitted parameters.
# NOTE(review): X_train and X_test are rebound to their scaled versions here,
# so running this cell a second time would fit a fresh scaler on data that is
# already scaled; later calls to scaler.transform on raw values would then be
# wrong. Re-run the split cell first if this cell must be re-executed.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Khoảng cách Euclidean

In [8]:
def euclidean_distance(x, y):
    """Return the Euclidean (straight-line) distance between two vectors.

    Parameters
    ----------
    x, y : sequences of numbers
        The two points; they must have the same length.

    Returns
    -------
    number or None
        The square root of the summed squared coordinate differences, or
        ``None`` when the two sequences differ in length.
    """
    if len(x) == len(y):
        # Accumulate the squared differences pairwise, then take the root.
        total = sum((a - b) ** 2 for a, b in zip(x, y))
        return total ** 0.5
    return None

# Độ tương đồng cosine

In [9]:
def cosine_similarity(x, y):
    """Return the cosine of the angle between two vectors.

    Parameters
    ----------
    x, y : array-like of numbers
        The two vectors; they must have the same length.

    Returns
    -------
    float
        ``dot(x, y) / (||x|| * ||y||)``. When either vector has zero length
        the ratio is undefined (0/0), so ``0.0`` is returned instead of NaN;
        a NaN score would make any later sort of similarities meaningless.
    """
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    if denominator == 0:
        # At least one vector is all zeros: treat it as "no similarity".
        return 0.0
    return np.dot(x, y) / denominator

# Hàm KNN

In [10]:
def KNN(X_train, y_train, X_test, k=5, metric='cosine'):
    """Classify each row of ``X_test`` by a vote among its k nearest training rows.

    Parameters
    ----------
    X_train : sequence of feature vectors
        Training points, iterated row by row.
    y_train : indexable sequence
        Label of each training point, aligned with ``X_train`` by position.
    X_test : sequence of feature vectors
        Points to classify.
    k : int, default 5
        Number of neighbours that vote. Must be at least 1.
    metric : str, default 'cosine'
        ``'cosine'`` ranks neighbours by ``cosine_similarity`` (largest
        first). Any other value ranks them by ``euclidean_distance``
        (smallest first).

    Returns
    -------
    list
        One predicted label per row of ``X_test``, in the same order.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, if ``euclidean_distance`` reports a
        length mismatch (returns ``None``), or if there are no training
        points to vote.
    """
    if k < 1:
        # A negative k would slice from the end of the neighbour list and
        # silently vote with the wrong points.
        raise ValueError("k must be at least 1")

    use_cosine = metric == 'cosine'
    y_pred = []
    for test_point in X_test:
        neighbors = []
        if use_cosine:
            for i, train_point in enumerate(X_train):
                similarity = cosine_similarity(test_point, train_point)
                neighbors.append((similarity, y_train[i]))
        else:
            for i, train_point in enumerate(X_train):
                distance = euclidean_distance(test_point, train_point)
                if distance is None:
                    # None is how euclidean_distance signals vectors of
                    # different lengths; ranking on it would be meaningless.
                    raise ValueError(
                        "test point and training point have different lengths"
                    )
                neighbors.append((distance, y_train[i]))

        # Sort once per test point: most similar first for cosine,
        # closest first for Euclidean.
        neighbors.sort(reverse=use_cosine)

        # Tally the labels of the k best neighbours in rank order.
        votes = {}
        for _, label in neighbors[:k]:
            votes[label] = votes.get(label, 0) + 1
        # dicts keep insertion order and max() returns the first maximum, so
        # a tied vote goes to the tied label whose first vote came from the
        # nearest neighbour, which makes the result reproducible.
        y_pred.append(max(votes, key=votes.get))
    return y_pred

# Sử dụng hàm KNN tự xây dựng

In [11]:
# Classify the held-out split with the hand-written KNN, using k=5 and
# Euclidean distance. The metric name was previously misspelled "eclidean";
# it only worked because KNN treats every metric other than 'cosine' as
# Euclidean.
y_pred = KNN(X_train, y_train, X_test, 5, "euclidean")
y_pred

['Unhealthy',
 'Very Unhealthy',
 'Unhealthy',
 'Moderate',
 'Unhealthy for Sensitive Groups',
 'Hazardous',
 'Very Unhealthy',
 'Unhealthy for Sensitive Groups',
 'Very Unhealthy',
 'Unhealthy',
 'Good',
 'Hazardous',
 'Very Unhealthy',
 'Very Unhealthy',
 'Moderate',
 'Unhealthy',
 'Moderate',
 'Moderate',
 'Unhealthy',
 'Moderate',
 'Unhealthy',
 'Very Unhealthy',
 'Moderate',
 'Unhealthy',
 'Moderate',
 'Hazardous',
 'Very Unhealthy',
 'Unhealthy',
 'Moderate',
 'Unhealthy for Sensitive Groups',
 'Unhealthy for Sensitive Groups',
 'Unhealthy',
 'Unhealthy for Sensitive Groups',
 'Moderate',
 'Good',
 'Good',
 'Very Unhealthy',
 'Hazardous',
 'Hazardous',
 'Moderate',
 'Hazardous',
 'Moderate',
 'Unhealthy',
 'Unhealthy',
 'Moderate',
 'Moderate',
 'Unhealthy',
 'Very Unhealthy',
 'Unhealthy for Sensitive Groups',
 'Moderate',
 'Unhealthy for Sensitive Groups',
 'Hazardous',
 'Unhealthy',
 'Very Unhealthy',
 'Very Unhealthy',
 'Unhealthy for Sensitive Groups',
 'Moderate',
 'Unhealt

In [12]:
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.8247422680412371

In [13]:
# One hand-entered observation, with values in the same order as X's columns.
new_data = [[4, 4, 4, 7, 300, 77, -0.7, 1023.0, -18.8, 2, 4.4]]
new_data_df = pd.DataFrame(new_data, columns=X.columns)

# Apply the scaler fitted on the training split so the sample is on the
# same scale as X_train.
new_data_scaled = scaler.transform(new_data_df)

# Classify the sample with the hand-written KNN (k=5, cosine similarity).
new_label_prediction = KNN(X_train, y_train, new_data_scaled, k=5, metric='cosine')
new_label_prediction

['Unhealthy for Sensitive Groups']

# Sử dụng hàm KNN của sklearn

In [14]:
from sklearn.neighbors import KNeighborsClassifier

# Create the scikit-learn KNN classifier with 5 neighbours.
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model on the standardized training split; the fitted estimator
# is the cell's displayed value.
knn.fit(X=X_train, y=y_train)

In [15]:
from sklearn.metrics import accuracy_score

# Score the scikit-learn model on the same held-out split.
# Note: this rebinds y_pred and accuracy, replacing the values computed for
# the hand-written KNN earlier in the notebook.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.8144329896907216

In [16]:
# One hand-entered observation, with values in the same order as X's columns.
new_data = [[4, 4, 4, 7, 300, 77, -0.7, 1023.0, -18.8, 2, 4.4]]
new_data_df = pd.DataFrame(new_data, columns=X.columns)

# Apply the scaler fitted on the training split before predicting.
new_data_scaled = scaler.transform(new_data_df)

# Classify the sample with the fitted scikit-learn model.
new_label_prediction = knn.predict(new_data_scaled)
new_label_prediction

array(['Unhealthy for Sensitive Groups'], dtype=object)