# k-Nearest Neighbors (kNN)

## Load Libraries


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier

## Load Processed Data


In [4]:
# Load the data
data = pd.read_csv(r'../resources/processed_data.csv')
data.head()

Unnamed: 0,Processed_Review,sentiment
0,one review mention watch oz episod hook right ...,positive
1,wonder littl product film techniqu unassum old...,positive
2,thought wonder way spend time hot summer weeke...,positive
3,basic famili littl boy jake think zombi closet...,negative
4,petter mattei love time money visual stun film...,positive


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [6]:
# Chuyển dữ liệu sang đặc trưng số sử dụng TF-IDF
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2)) 

In [7]:
# Chia data thành các tập train và test
X = tfidf.fit_transform(data['Processed_Review']).toarray()
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

## Distance Calculation Functions


In [8]:
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

def manhattan_distance(x1, x2):
    return np.sum(np.abs(x1 - x2))

def minkowski_distance(x1, x2, p=3):
    return np.power(np.sum(np.abs(x1 - x2) ** p), 1/p)


## Train-Test Split


## Grid Search for Optimal k


In [None]:
param_grid = {'n_neighbors': [3, 5, 7, 15],
              'metric': ['euclidean', 'manhattan', 'minkowski']
              }
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f'Best k: {grid_search.best_params_}')
