# K-Nearest Neighbors (K-NN)

### 參考課程實作並在datasets_483_982_spam.csv的資料集中獲得90% 以上的 accuracy (testset)

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import codecs
import re

## Importing the dataset

In [2]:
# Read only the label column (v1) and the message text column (v2) from the CSV.
dataset = pd.read_csv(
    r'D20_KNN實作_datasets.csv', usecols=['v1', 'v2'], encoding='latin-1'
)
# v3 is the numeric target: 1 where v1 is exactly 'spam', 0 for anything else.
dataset['v3'] = (dataset['v1'] == 'spam').astype('int64')
# One [text, flag] row per message.
all_data = dataset.loc[:, ['v2', 'v3']].to_numpy()

[['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
  0]
 ['Ok lar... Joking wif u oni...' 0]
 ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
  1]
 ...
 ['Pity, * was in mood for that. So...any other suggestions?' 0]
 ["The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free"
  0]
 ['Rofl. Its true to its name' 0]]


### 取出訓練內文與標註

In [3]:
# Column 0 carries the message text, column 1 the 0/1 spam flag.
text_column, label_column = 0, 1
X = all_data[:, text_column]
Y = np.asarray(all_data[:, label_column], dtype=np.uint8)
# Show a small preview of each so the reader can sanity-check the pairing.
print(f'Training Data Examples : \n{X[:5]}\n')
print(f'Labeling Data Examples : \n{Y[:5]}')

Training Data Examples : 
['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 'U dun say so early hor... U c already then say...'
 "Nah I don't think he goes to usf, he lives around here though"]

Labeling Data Examples : 
[0 0 1 0 0]


### 文字預處理

In [4]:
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords

import nltk

# Fetch every NLTK resource this notebook relies on, not only the stop-word
# list: word_tokenize needs the Punkt models, pos_tag needs the perceptron
# tagger and WordNetLemmatizer needs the WordNet corpus. Both the classic and
# the newer resource identifiers are requested so the cell runs on a fresh
# environment regardless of the installed NLTK release.
for nltk_resource in ('stopwords',
                      'punkt', 'punkt_tab',
                      'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
                      'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Lemmatize with POS Tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

# Create the WordNet lemmatizer once at module level; clean_content reuses it for every word.
lemmatizer = WordNetLemmatizer() 
def get_wordnet_pos(word):
    """Translate the NLTK POS tag of a single word into the lemmatizer's POS constant.

    The word is tagged on its own (no sentence context); only the first letter
    of the resulting tag is inspected. Anything that is not an adjective, verb
    or adverb tag is treated as a noun.
    """
    tagged = nltk.pos_tag([word])
    first_letter = tagged[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unmapped tag both resolve to NOUN.
    return wordnet.NOUN

def clean_content(X):
    """Normalize raw messages into space-joined, lemmatized tokens.

    Every message has its non-letter characters replaced by spaces, is
    lower-cased and tokenized with NLTK; English stop words are dropped and
    the remaining words are lemmatized using the POS from ``get_wordnet_pos``.

    Args:
        X: iterable of message strings.

    Returns:
        list[str]: one cleaned string per input message, in input order.
    """
    non_letters = re.compile('[^a-zA-Z]')
    stop_words = set(stopwords.words('english'))
    # The POS lookup is made per isolated word, so its result (and the lemma)
    # depends only on the word itself. Cache it so each distinct word is
    # tagged and lemmatized once instead of once per occurrence.
    lemma_cache = {}

    X_output = []
    for message in X:
        tokens = nltk.word_tokenize(non_letters.sub(' ', message).lower())
        lemmas = []
        for word in tokens:
            if word in stop_words:
                continue
            if word not in lemma_cache:
                lemma_cache[word] = lemmatizer.lemmatize(word, get_wordnet_pos(word))
            lemmas.append(lemma_cache[word])
        X_output.append(' '.join(lemmas))
    return X_output

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Replace the raw messages with their cleaned, lemmatized form.
# NOTE: this rebinds X, so re-running the cell feeds already-cleaned text back into clean_content.
X = clean_content(X)

### Bag of words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# max_features caps how many columns (vocabulary terms) are built; terms are
# selected by how frequently they occur.
# NOTE(review): the vocabulary is fitted on every message before the train/test
# split further down, so test-set words influence the features - consider
# fitting on the training portion only.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(X)
X = term_counts.toarray()
print(X.shape)

(5572, 1500)


## Splitting the dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

## Training the K-NN model on the Training set

In [8]:
from sklearn.neighbors import KNeighborsClassifier
# No hyperparameters are overridden, so the estimator runs with its library defaults.
classifier = KNeighborsClassifier()
# fit() is the cell's last expression, so the notebook displays the fitted estimator's repr.
classifier.fit(X_train, Y_train)

KNeighborsClassifier()

## Evaluating accuracy on the training and test sets

In [9]:
# Report mean accuracy on both splits. Each split name is paired with its
# arrays explicitly rather than looked up through globals() by string
# concatenation, which hides the cell's dependencies and breaks as soon as the
# code is moved into a function.
for split_name, features, labels in (('train', X_train, Y_train),
                                     ('test', X_test, Y_test)):
    print(f'{split_name.capitalize()}set Accuracy: {classifier.score(features, labels)}')

Trainset Accuracy: 0.943459726273278
Testset Accuracy: 0.9183856502242153


## Predicting the Test set results

In [10]:
# Predicted labels for the held-out test set; the confusion-matrix cell below consumes y_pred.
y_pred = classifier.predict(X_test)

## Making the Confusion Matrix

In [11]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Compare the true test labels with the predictions (label 1 = spam, 0 = not spam).
# NOTE(review): scikit-learn documents rows as true labels and columns as predictions - confirm when reading the printed matrix.
cm = confusion_matrix(Y_test, y_pred)
print(cm)
# Last expression of the cell: the overall test accuracy is shown as the cell output.
accuracy_score(Y_test, y_pred)

[[949   0]
 [ 91  75]]


0.9183856502242153