In [1]:
import pandas as pd
import numpy as np

In [15]:
# Toy dataset: five short comments (the last three are identical) and a
# string label per comment. Every label is "0".
data = pd.DataFrame(
    {
        'komentar': ["keren sekali", "mantap sekali"] + ["uhuy sekali"] * 3,
        'label': ["0"] * 5,
    }
)

In [16]:
import numpy as np

class CustomTfidfVectorizer:
    """Minimal TF-IDF vectorizer for whitespace-tokenised documents.

    ``fit`` learns a vocabulary (term -> column index, assigned in order of
    first appearance) and one IDF weight per term. ``transform`` encodes
    documents as a dense matrix of L2-normalised TF-IDF weights.

    Attributes:
        vocab: dict mapping each term to its column index.
        idf: dict mapping each term to its IDF weight.
        doc_count: number of documents seen by the most recent ``fit``.
    """

    def __init__(self):
        self.vocab = {}
        self.idf = {}
        self.doc_count = 0

    def fit(self, corpus):
        """Learn the vocabulary and IDF weights from ``corpus``.

        Args:
            corpus: iterable of strings; each string is split on whitespace.

        Any state from a previous ``fit`` is discarded, so re-running the
        call (e.g. re-executing a notebook cell) gives the same result
        instead of accumulating document counts.
        """
        # Reset learned state so that fit is idempotent.
        self.vocab = {}
        self.idf = {}
        self.doc_count = 0

        doc_freq = {}  # term -> number of documents containing it
        for doc in corpus:
            self.doc_count += 1
            # dict.fromkeys de-duplicates while keeping first-appearance order,
            # so each document contributes at most 1 to a term's frequency.
            for term in dict.fromkeys(doc.split()):
                if term not in self.vocab:
                    self.vocab[term] = len(self.vocab)
                doc_freq[term] = doc_freq.get(term, 0) + 1

        for term, freq in doc_freq.items():
            # Smoothed IDF: log(N / (df + 1)) + 1. The "+ 1" in the denominator
            # damps the weight of rare terms (df is never 0 for a vocabulary
            # term); the trailing "+ 1" keeps every weight strictly positive.
            self.idf[term] = np.log(self.doc_count / (freq + 1)) + 1

    def transform(self, corpus):
        """Encode ``corpus`` as a dense, row-normalised TF-IDF matrix.

        Args:
            corpus: iterable of strings; each string is split on whitespace.

        Returns:
            ``numpy.ndarray`` of shape ``(n_documents, len(self.vocab))``.
            Term frequency is ``count / number_of_tokens_in_document``
            (out-of-vocabulary tokens still count towards the length). Each
            row is scaled to unit Euclidean length; a document with no
            in-vocabulary terms yields an all-zero row.
        """
        corpus = list(corpus)  # accept any iterable, index rows by position
        matrix = np.zeros((len(corpus), len(self.vocab)))

        for row, doc in enumerate(corpus):
            terms = doc.split()
            doc_length = len(terms)

            term_freqs = {}
            for term in terms:
                term_freqs[term] = term_freqs.get(term, 0) + 1

            for term, freq in term_freqs.items():
                col = self.vocab.get(term)
                if col is not None:  # silently skip terms unseen during fit
                    matrix[row, col] = (freq / doc_length) * self.idf[term]

        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        # An all-zero row has norm 0; dividing by 1 instead leaves it as zeros
        # rather than producing NaN.
        norms[norms == 0] = 1.0
        return matrix / norms

# Usage: learn the vocabulary and IDF weights from the comment column, then
# encode those same comments as a dense TF-IDF matrix.
vectorizer = CustomTfidfVectorizer()
documents = data['komentar']
vectorizer.fit(documents)
tfidf_matrix = vectorizer.transform(documents)
print(tfidf_matrix)


[[0.91976759 0.39246348 0.         0.        ]
 [0.         0.39246348 0.91976759 0.        ]
 [0.         0.5557582  0.         0.83134399]
 [0.         0.5557582  0.         0.83134399]
 [0.         0.5557582  0.         0.83134399]]


In [5]:
# Smallest and largest weight in the TF-IDF matrix.
# The results are deliberately NOT stored in variables called `min` / `max`:
# that would shadow Python's built-in min() and max() for every later cell.
tfidf_min = np.min(tfidf_matrix)
tfidf_max = np.max(tfidf_matrix)
print(tfidf_min)
print(tfidf_max)

0.0
0.9197675884009774


In [24]:
import numpy as np

class KNNClassifier:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Attributes:
        k: number of neighbours that vote on each prediction.
        X_train: training samples as a 2-D ``numpy.ndarray`` (set by ``fit``).
        y_train: training labels as a 1-D ``numpy.ndarray`` (set by ``fit``).
    """

    def __init__(self, k):
        """Store the neighbour count.

        Raises:
            ValueError: if ``k`` is smaller than 1 (no neighbour could vote).
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        self.k = k

    def fit(self, X_train, y_train):
        """Memorise the training set.

        Args:
            X_train: array-like of shape ``(n_samples, n_features)``.
            y_train: array-like of ``n_samples`` labels (a pandas Series is
                accepted).

        Raises:
            ValueError: if ``X_train`` and ``y_train`` differ in length.
        """
        X_train = np.asarray(X_train)
        # Convert to a plain array so labels are later selected by POSITION.
        # Indexing a pandas Series with an integer array selects by index
        # label, which picks the wrong rows once the index has been shuffled.
        y_train = np.asarray(y_train)
        if len(X_train) != len(y_train):
            raise ValueError(
                f"X_train has {len(X_train)} samples but y_train has {len(y_train)} labels"
            )
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Args:
            X_test: array-like of shape ``(n_queries, n_features)``.

        Returns:
            list with one predicted label per query row. When several labels
            tie for the most votes, the one that sorts first wins (the order
            returned by ``numpy.unique``).
        """
        y_pred = []
        for x in np.asarray(X_test):
            # axis=1 gives one distance per training sample; without it the
            # sum collapses to a single scalar and no ranking is possible.
            distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
            # Stable sort: equidistant neighbours keep their training order.
            nearest_neighbors = np.argsort(distances, kind="stable")[:self.k]
            nearest_labels = self.y_train[nearest_neighbors]
            unique_labels, counts = np.unique(nearest_labels, return_counts=True)
            y_pred.append(unique_labels[np.argmax(counts)])

        return y_pred

In [25]:
def _take_rows(values, positions):
    """Select rows of ``values`` by integer position (pandas object or array-like)."""
    if hasattr(values, "iloc"):
        return values.iloc[positions]
    return np.asarray(values)[positions]


def train_test_split(x, y, test_size=0.2, random_state=None):
    """Randomly split ``x`` and ``y`` into aligned train and test parts.

    Args:
        x: samples; a numpy array, pandas object or other array-like.
        y: labels with the same length as ``x``; same accepted types.
        test_size: fraction of the rows (between 0 and 1) held out for testing.
        random_state: optional seed. When given, the shuffle is reproducible
            and the global NumPy random state is left untouched. When
            ``None``, the global ``numpy.random`` state is used.

    Returns:
        ``(x_train, x_test, y_train, y_test)``. pandas inputs are sliced with
        ``.iloc`` and keep their type; anything else comes back as a
        ``numpy.ndarray``.

    Raises:
        ValueError: if ``x`` and ``y`` differ in length or ``test_size`` is
            outside ``[0, 1]``.
    """
    n_samples = len(x)
    if len(y) != n_samples:
        raise ValueError(f"x has {n_samples} rows but y has {len(y)}")
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    # A private RandomState seeded with `random_state` produces the same
    # permutation as np.random.seed(random_state) followed by
    # np.random.permutation, but without reseeding the global generator.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    indices = rng.permutation(n_samples)

    # Round away floating-point noise before truncating: 10 * (1 - 0.9) is
    # 0.9999999999999998, which a bare int() would turn into 0 training rows.
    split_index = int(round(n_samples * (1 - test_size), 8))
    train_idx, test_idx = indices[:split_index], indices[split_index:]
    return (
        _take_rows(x, train_idx),
        _take_rows(x, test_idx),
        _take_rows(y, train_idx),
        _take_rows(y, test_idx),
    )

In [26]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix,
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [28]:
# Fit a 3-neighbour classifier on the training split.
knn = KNNClassifier(k=3)
knn.fit(X_train=X_train, y_train=y_train)

# One predicted label per row of X_test; the bare name displays the result.
pred = knn.predict(X_test=X_test)
pred

['0']