In [1]:
print(0)

0


In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter

In [88]:
class Config:
    DATA_PATH = Path("./data/imdb_indonesian_movies_2.csv")
    PICKLE_PATH =  Path("./pickle")
    FEATURES = ["judul_film", "ringkasan_sinopsis"]
    LABEL = "genre"
    
class Metric:
    def accuracy(self, y_true, y_pred):
        accuracy = np.sum(y_true == y_pred) / len(y_true)
        return accuracy
    
    def precision(self, y_true, y_pred):
        tp = np.sum(y_true * y_pred)
        fp = np.sum((1 - y_true) * y_pred)
        return tp / (tp + fp)
    
    def recall(self, y_true, y_pred):
        tp = np.sum(y_true * y_pred)
        fn = np.sum(y_true * (1 - y_pred))
        return tp / (tp + fn)
    
    def f1_score(self, y_true, y_pred):
        precision = self.precision(y_true, y_pred)
        recall = self.recall(y_true, y_pred)
        return 2 * precision * recall / (precision + recall)
    
    def classification_metrics(self, y_true, y_pred):
        accuracy = self.accuracy(y_true, y_pred)
        precision = self.precision(y_true, y_pred)
        recall = self.recall(y_true, y_pred)
        f1_score = self.f1_score(y_true, y_pred)
        return accuracy, precision, recall, f1_score


class KFold:
    def __init__(self, n_splits=5, shuffle=True):
        self.n_splits = n_splits
        self.shuffle = shuffle
    
    def split(self, X):
        n = len(X)
        idx = np.arange(n)
        
        if self.shuffle:
            np.random.shuffle(idx)
        
        for i in range(self.n_splits):
            start = i * n // self.n_splits
            end = (i + 1) * n // self.n_splits
            val_idx = idx[start:end]
            train_idx = np.delete(idx, val_idx)
            yield train_idx, val_idx

class MKNNClassifier:
    def __init__(self, k=5):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def _euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def _S(self, a, b):
        return 1 if a == b else 0
    
    def _validity_score(self, i):
        distances = np.array([self._euclidean_distance(self.X_train[i], x_train) for x_train in self.X_train])
        k_indices = np.argsort(distances)[:self.k]
        return 1 / self.k * np.sum([self._S(self.y_train[i], self.y_train[k]) for k in k_indices])
        
    def _predict(self, x_test):
        distances = np.array([self._euclidean_distance(x_test, x_train) for x_train in self.X_train])
        k_indices = np.argsort(distances)[:self.k]
        validity_scores = [self._validity_score(i) for i in k_indices]
        k_labels=  [self.y_train[i] for i in k_indices]

        W = [
            validity_scores[i] / (distances[k_indices[i]] + 0.0001) for i in range(len(k_indices))
        ]

        sum_w = Counter()
        for i in range(len(k_indices)):
            sum_w[k_labels[i]] += W[i]
        
        return sum_w.most_common(1)[0][0]
        
    def predict(self, X_test):
        return np.array([self._predict(x_test) for x_test in X_test])
    
class Tfidfvectorizer:
    def __init__(self, min_df=1):
        self.min_df = min_df
    
    def preprocess(self, docs):
        self.word_set = []
        self.kalimats = []
        
        for kalimat in docs:
            kalimat = [kata.lower() for kata in kalimat.split()]
            self.kalimats.append(kalimat)
            for kata in kalimat:
                if kata not in self.word_set:
                    self.word_set.append(kata)
        
        self.word_set = set(self.word_set)
        self.n_docs = len(self.kalimats)
        self.index_dict = {kata: i for i, kata in enumerate(self.word_set)}
        
        self.word_count = {}
        for kata in self.word_set:
            self.word_count[kata] = 0
            for kalimat in self.kalimats:
                if kata in kalimat:
                    self.word_count[kata] += 1


    def tf(self,docs, kata):
        n = len(docs)
        cocok = len([k for k in docs if k == kata])
        return cocok / n
    
    def idf(self,kata):
        try:
            kata_cocok = self.word_count[kata] + 1
        except:
            kata_cocok = 1
        
        return np.log(self.n_docs / kata_cocok)
    
    def tf_idf(self, kalimat):
        vec = np.zeros((len(self.word_set),))
        for kata in kalimat:
            tf = self.tf(kalimat, kata)
            idf = self.idf(kata)
            
            vec[self.index_dict[kata]] = tf * idf
        
        return vec

    def fit_transform(self, docs):
        self.preprocess(docs)
        vecs = []
        for kalimat in self.kalimats:
            vecs.append(self.tf_idf(kalimat))
        
        return np.array(vecs)

In [89]:
data = pd.read_csv(Config.DATA_PATH)
data.head()

Unnamed: 0,judul_film,ringkasan_sinopsis,genre
0,Sunan Kalijaga,Raden Mas Said putra sulung Tumenggung Wilarik...,Drama
1,Gie,Soe Hok Gie adalah seorang aktivis yang hidup ...,Drama
2,Guru Bangsa Tjokroaminoto,Guru Bangsa Tjokroaminoto menceritakan tentang...,Drama
3,POL Movie,POL menceritakan kisah hidup yang luar biasa d...,Drama
4,Sang pencerah,Perjalanan pahlawan Indonesia KH Ahmad Dahlan ...,Drama


In [90]:
kf = KFold(n_splits=5)

for train_idx, test_idx in kf.split(data[Config.FEATURES]):
    print(len(train_idx), len(test_idx))
    # print(data[Config.LABEL][train_idx].value_counts())
    break

804 201


In [92]:
t = Tfidfvectorizer()

X = t.fit_transform(data["ringkasan_sinopsis"])
y = data[Config.LABEL]

In [None]:
kf = KFold(n_splits=5)
metrics = Metric()

for idx, (train_idx, test_idx) in enumerate(kf.split(X)):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    mkkn = MKNNClassifier(k=5)
    mkkn.fit(X_train, y_train)
    
    y_pred = mkkn.predict(X_test)
    f1_score = metrics.f1_score(y_test, y_pred)
    
    print(f"fold {idx + 1} f1_score: {f1_score}")
    

In [120]:
# sklearn tfidf

from sklearn.feature_extraction.text import TfidfVectorizer

t = TfidfVectorizer()

X = t.fit_transform(data["ringkasan_sinopsis"])

X = np.array(X.todense())

In [122]:
mknn = MKNNClassifier(k=5)
mknn.fit(pd.DataFrame(X[train_idx]), y_train)

y_pred = mknn.predict(pd.DataFrame(X[test_idx][:5]))
# f1_score = metrics.f1_score(y_test, y_pred)

  validity_scores[i] / (distances[k_indices[i]]) for i in range(len(k_indices))


KeyError: 8

In [None]:
f1_score = metrics.f1_score(y_test, y_pred)
f1_score