In [1]:
# !pip install scikit-learn==1.3.0 numpy==1.25.2

In [2]:
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np

In [3]:
# Category name -> integer class id used as the classifier target.
cate_2_label = dict(positive=0, negative=1)

# Training documents: the first three are labelled positive, the last three negative.
corpus = [
    "Góp gió gặt bão", "Ở hiền gặp lành", "Đất lành chim đậu",
    "Ăn cháo đá bát", "Cạp đất mà ăn", "Qua cầu rút ván",
]

# One class id per document, aligned index-by-index with `corpus`.
labels = [cate_2_label["positive"]] * 3 + [cate_2_label["negative"]] * 3

# Number of documents in the training corpus.
n_doc = len(corpus)

In [4]:
def label_2_cate(labels):
    """Translate integer class ids into their category names.

    Each id is looked up among the values of the module-level
    ``cate_2_label`` mapping and replaced by the corresponding key.

    Args:
        labels: Iterable of class ids; every id must be a value of
            ``cate_2_label``.

    Returns:
        np.ndarray of category-name strings, one per input id, in input order.

    Raises:
        ValueError: If an id is not among the values of ``cate_2_label``.
    """
    categories = np.array(list(cate_2_label))
    class_ids = list(cate_2_label.values())

    # Position of each id in the value list == position of its name in the key list.
    positions = []
    for label in labels:
        positions.append(class_ids.index(label))
    return categories[positions]

In [5]:
# Wrap the raw documents and their class ids in NumPy arrays (features, targets).
X, y = np.array(corpus), np.array(labels)

## Convert text to vectors using the TF-IDF transform

In [6]:
def caculate_tfidf(X_vectorized):
    """Compute smoothed IDF, log-scaled TF and their product for a count matrix.

    Args:
        X_vectorized: Array-like of shape (n_documents, n_terms) holding raw
            term counts, one row per document.

    Returns:
        Tuple ``(idf, tf, tfidf)``:
            idf   -- shape (n_terms,): ``ln((n + 1) / (df + 1)) + 1`` where
                     ``n`` is the number of rows and ``df`` is the number of
                     rows in which the term occurs at least once.
            tf    -- same shape as the input: ``ln(count + 1)``.
            tfidf -- ``tf * idf`` (not length-normalised).
    """
    counts = np.asarray(X_vectorized)

    tf = np.log(counts + 1)

    # Document frequency is the number of documents containing the term.
    # Summing raw counts would over-count a term repeated inside one document.
    df = np.count_nonzero(counts, axis=0)

    # Take the document count from the matrix itself rather than a notebook
    # global, so the result does not depend on cell order or on which corpus
    # happens to be loaded.
    n_docs = counts.shape[0]
    idf = np.log((n_docs + 1) / (df + 1)) + 1

    tfidf = tf * idf

    return idf, tf, tfidf

In [7]:
def compute_norm(tfidf_vec):
    """Scale every row of ``tfidf_vec`` to unit L2 length, in place.

    Args:
        tfidf_vec: 2-D float NumPy array, one row per document. The array is
            modified in place.

    Returns:
        None. The caller's array holds the normalised values afterwards.
    """
    norm = np.linalg.norm(tfidf_vec, axis=1, keepdims=True)

    # A document with no in-vocabulary term is an all-zero row with norm 0.
    # Dividing by 0 would fill the row with NaN, so leave such rows as zeros.
    norm[norm == 0] = 1.0

    tfidf_vec /= norm

In [8]:
# Bag-of-words encoder with scikit-learn's default settings.
# NOTE(review): the default tokenizer keeps only tokens of two or more word
# characters, so the one-letter word "Ở" never reaches the vocabulary printed
# below — confirm this is intended for Vietnamese text.
vectorizer = CountVectorizer()

# Learn the vocabulary from the corpus and encode each document as a dense
# row of raw term counts (one column per vocabulary entry).
X_vectorized = vectorizer.fit_transform(X).toarray()

print("Vocab: ", vectorizer.get_feature_names_out())

Vocab:  ['bát' 'bão' 'chim' 'cháo' 'cạp' 'cầu' 'gió' 'góp' 'gặp' 'gặt' 'hiền'
 'lành' 'mà' 'qua' 'rút' 'ván' 'ăn' 'đá' 'đất' 'đậu']


In [9]:
# Derive the per-term IDF weights, the log-scaled term frequencies and their
# product (TF-IDF, not yet length-normalised) from the raw count matrix.
# X_idf is kept so the same weights can be applied to unseen text later.
X_idf, x_tf, X_tfidf = caculate_tfidf(X_vectorized)

### Normalize TF-IDF values by the L2 norm

In [10]:
# Rescale every row of X_tfidf by its L2 norm. compute_norm edits the array in
# place and returns nothing, so X_tfidf itself now holds the normalised values.
compute_norm(X_tfidf)

# Train a KNN model with 1 neighbor

In [11]:
# Fit a 1-nearest-neighbour classifier on the hand-computed TF-IDF vectors
# (fit returns the estimator itself, so construction and fitting can be chained).
knn_cls = KNeighborsClassifier(n_neighbors=1).fit(X_tfidf, y)

# Sanity check only: these are predictions on the training data itself.
preds = knn_cls.predict(X_tfidf)
print(preds)

[0 0 0 1 1 1]


## Using a scikit-learn pipeline

In [12]:
# Same three stages as the manual version, chained in one estimator:
# raw text -> term counts -> TF-IDF weights -> 1-nearest-neighbour classifier.
text_clf_model = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", KNeighborsClassifier(n_neighbors=1)),
    ]
)

# The pipeline takes the raw strings directly; fit returns the pipeline itself.
# Predictions here are on the training data, as a sanity check.
preds = text_clf_model.fit(X, y).predict(X)
print(preds)

[0 0 0 1 1 1]


### Inference

In [13]:
# An unseen sentence to classify, wrapped in an array because the vectorizer
# expects a collection of documents.
test_text = np.array(["Đời cha ăn mặn, đời con khát nước"])
# Encode it with the CountVectorizer fitted on the training corpus (transform
# only, no re-fitting) so its columns line up with the training matrix.
test_vec = vectorizer.transform(test_text).toarray()

In [14]:
# Apply the same log scaling to the counts as caculate_tfidf does for training.
test_tf = np.log(test_vec + 1)
# Reuse the IDF weights learned from the training corpus; they must not be
# recomputed from the test sentence.
test_tfidf = test_tf * X_idf

In [15]:
# L2-normalise the test vector in place, matching the treatment of the
# training vectors (compute_norm returns nothing).
compute_norm(test_tfidf)

In [16]:
# Classify the sentence with the 1-NN model fitted on the hand-computed
# TF-IDF vectors (knn_cls), not with the sklearn pipeline above.
pred = knn_cls.predict(test_tfidf)

# Map the numeric class id back to its category name for display.
print(label_2_cate(pred))

['negative']
