In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

import warnings

warnings.filterwarnings('ignore')

d:\Anaconda\envs\DataScience\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
d:\Anaconda\envs\DataScience\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [2]:
data = pd.read_csv("../data/keyword_jptiik.csv")

data_abs = data[["abstract"]]
data_abs.head()

Unnamed: 0,abstract
0,teknologi mixed reality akan membuat interaksi...
1,penjadwalan merupakan salah satu proses pentin...
2,untuk memenuhi kebutuhan masyarakat akan keter...
3,perusahaan mcdonald’s adalah perusahaan yang b...
4,perkembangan internet saat ini sangat pesat. d...


In [3]:
data_abs["abstract"] = data_abs["abstract"].str.strip()
data_abs["abstract"] = data_abs["abstract"].str.lower()
data_abs["abstract"] = data_abs["abstract"].str.replace('[^\w\s]','')
data_abs["abstract"] = data_abs["abstract"].str.replace('\d+', '')
data_abs["abstract"] = data_abs["abstract"].str.replace('\s+', ' ')
data_abs["abstract"] = data_abs["abstract"].str.replace('\n', ' ')
data_abs["abstract"] = data_abs["abstract"].str.replace('\t', ' ')
data_abs["abstract"] = data_abs["abstract"].str.replace('  ', ' ')

In [4]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [5]:
stopword = StopWordRemoverFactory().create_stop_word_remover()
stemmer = StemmerFactory().create_stemmer()

data_abs["abstract"] = data_abs["abstract"].apply(lambda x: stopword.remove(x))
data_abs["abstract"] = data_abs["abstract"].apply(lambda x: stemmer.stem(x))

In [6]:
import random

In [7]:
vocab = set()
max_tokens = 10000

for abstract in data_abs["abstract"]:
    for word in abstract.split():
        vocab.add(word)
    
vocab = list(vocab)
random.shuffle(vocab)
vocab = vocab[:max_tokens]
vocab += ["<UNK>"]

data_abs["abstract"] = data_abs["abstract"].apply(lambda x: x.split())
data_abs["abstract"] = data_abs["abstract"].apply(lambda x: [word if word in vocab else "<UNK>" for word in x])
data_abs["abstract"] = data_abs["abstract"].apply(lambda x: " ".join(x))

word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for idx, word in enumerate(vocab)}

In [8]:
data_abs["abstract"] = data_abs["abstract"].apply(lambda x: [word2idx[word] for word in x.split()])

In [9]:
# mean length of abstract
MAX_LENGTH = round(sum([len(abstract) for abstract in data_abs["abstract"]]) / len(data_abs["abstract"]))

# padding and truncate
data_abs["abstract"] = data_abs["abstract"].apply(lambda x: x + [0] * (MAX_LENGTH - len(x)) if len(x) < MAX_LENGTH else x[:MAX_LENGTH])
# data_abs["abstract"] = data_abs["abstract"].apply(lambda x: torch.tensor(x).float())

In [15]:
data_abs = data_abs["abstract"].apply(pd.Series)
data_abs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99,100,101,102,103,104,105,106,107,108
0,10000,10000,6803,3289,5772,10000,10000,1600,10000,3668,...,0,0,0,0,0,0,0,0,0,0
1,7441,10000,10000,4519,5969,10000,4519,4562,7441,4519,...,0,0,0,0,0,0,0,0,0,0
2,7144,381,8923,3928,10000,10000,10000,4725,8561,7843,...,0,0,0,0,0,0,0,0,0,0
3,1951,6417,1951,4032,10000,10000,10000,10000,10000,6417,...,10000,10000,6183,10000,583,6417,7058,6510,8820,10000
4,10000,9254,7180,8799,4190,6192,10000,10000,10000,10000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3474,10000,7567,8107,7059,955,10000,176,1951,7440,7567,...,1420,4562,6545,8135,8513,2344,10000,5975,3360,10000
3475,10000,10000,7363,1675,5070,10000,10000,4402,4593,10000,...,10000,10000,10000,10000,10000,5772,5040,4556,10000,10000
3476,9656,2374,6377,5085,7352,10000,421,1548,8527,2374,...,57,10000,7466,6671,10000,9703,10000,4808,5101,8924
3477,7630,5022,7069,8452,10000,10000,3680,8452,10000,1305,...,1273,10000,10000,10000,10000,10000,10000,10000,10000,8135


In [16]:
data_abs.shape

(3479, 109)

In [17]:
VOCAB_SIZE = len(vocab)

In [111]:
data_abs = data_abs.to_numpy()
data_abs = torch.tensor(data_abs).float()
data_abs

tensor([[10000., 10000.,  6803.,  ...,     0.,     0.,     0.],
        [ 7441., 10000., 10000.,  ...,     0.,     0.,     0.],
        [ 7144.,   381.,  8923.,  ...,     0.,     0.,     0.],
        ...,
        [ 9656.,  2374.,  6377.,  ...,  4808.,  5101.,  8924.],
        [ 7630.,  5022.,  7069.,  ..., 10000., 10000.,  8135.],
        [ 1345.,  5375., 10000.,  ...,     0.,     0.,     0.]])

In [120]:
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(109, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.tanh = nn.Tanh()
        self.gelu = nn.GELU()
        self.relu = nn.ReLU()

    
    def forward(self, x):
        x = self.fc1(x)
        x = self.gelu(x)
        x = self.fc2(x)
        x = self.gelu(x)
        x = self.fc3(x)
        x = self.gelu(x)

        return x
    
class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc3 = nn.Linear(16, 32)
        self.fc2 = nn.Linear(32, 64)
        self.fc1 = nn.Linear(64, 109)
        self.tanh = nn.Tanh()
        self.gelu = nn.GELU()
        self.selu = nn.SELU()
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.fc3(x)
        x = self.gelu(x)
        x = self.fc2(x)
        x = self.gelu(x)
        x = self.fc1(x)
        x = self.gelu(x)

        return x

class AutoEncoder(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [123]:
E = Encoder()
D = Decoder()
AE = AutoEncoder(E, D)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(AE.parameters(), lr=0.1)

for epoch in range(1000):
    
    abstract = data_abs
    optimizer.zero_grad()
    output = AE(abstract)
    loss = torch.sqrt(criterion(output, abstract))
    loss.backward()
    optimizer.step()
    print(loss)

tensor(7335.1680, grad_fn=<SqrtBackward0>)
tensor(6303807., grad_fn=<SqrtBackward0>)
tensor(8056.9053, grad_fn=<SqrtBackward0>)
tensor(7359.2202, grad_fn=<SqrtBackward0>)
tensor(7358.9541, grad_fn=<SqrtBackward0>)
tensor(7357.8853, grad_fn=<SqrtBackward0>)
tensor(7355.7061, grad_fn=<SqrtBackward0>)
tensor(7352.9712, grad_fn=<SqrtBackward0>)
tensor(7348.2197, grad_fn=<SqrtBackward0>)
tensor(7340.8696, grad_fn=<SqrtBackward0>)
tensor(7329.9800, grad_fn=<SqrtBackward0>)
tensor(7311.1743, grad_fn=<SqrtBackward0>)
tensor(7281.4663, grad_fn=<SqrtBackward0>)
tensor(7224.0073, grad_fn=<SqrtBackward0>)
tensor(7125.2974, grad_fn=<SqrtBackward0>)
tensor(6976.5171, grad_fn=<SqrtBackward0>)
tensor(6769.3794, grad_fn=<SqrtBackward0>)
tensor(6428.5088, grad_fn=<SqrtBackward0>)
tensor(5979.9312, grad_fn=<SqrtBackward0>)
tensor(5371.8550, grad_fn=<SqrtBackward0>)
tensor(4624.1274, grad_fn=<SqrtBackward0>)
tensor(3898.9507, grad_fn=<SqrtBackward0>)
tensor(4143.3037, grad_fn=<SqrtBackward0>)
tensor(4511.

In [124]:
latent_representation = AE.encoder(data_abs)
latent_representation.shape

torch.Size([3479, 16])

In [150]:
# to dataframe
latent_df = pd.DataFrame(latent_representation.detach().numpy())
latent_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,-0.056497,0.868776,6.451501,3.984873,7.360752,2.281356,0.415752,0.489278,0.934034,7.427495,2.789972,2.209615,5.957324,-0.169887,2.36208,4.092894
1,-0.056497,0.868776,6.451501,3.984873,7.360752,2.281356,0.415752,0.489278,0.934034,7.427495,2.789972,2.209615,5.957324,-0.169887,2.36208,4.092894
2,-0.056497,0.868776,6.451501,3.984873,7.360752,2.281356,0.415752,0.489278,0.934034,7.427495,2.789972,2.209615,5.957324,-0.169887,2.36208,4.092894
3,-0.056497,0.868776,6.451501,3.984873,7.360752,2.281356,0.415752,0.489278,0.934034,7.427495,2.789972,2.209615,5.957324,-0.169887,2.36208,4.092894
4,-0.056497,0.868776,6.451501,3.984873,7.360752,2.281356,0.415752,0.489278,0.934034,7.427495,2.789972,2.209615,5.957324,-0.169887,2.36208,4.092894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3474,-0.056497,0.868776,6.451501,3.984873,7.360752,2.281356,0.415752,0.489278,0.934034,7.427495,2.789972,2.209615,5.957324,-0.169887,2.36208,4.092894
3475,-0.056497,0.868776,6.451501,3.984873,7.360752,2.281356,0.415752,0.489278,0.934034,7.427495,2.789972,2.209615,5.957324,-0.169887,2.36208,4.092894
3476,-0.056497,0.868776,6.451501,3.984873,7.360752,2.281356,0.415752,0.489278,0.934034,7.427495,2.789972,2.209615,5.957324,-0.169887,2.36208,4.092894
3477,-0.056497,0.868776,6.451501,3.984873,7.360752,2.281356,0.415752,0.489278,0.934034,7.427495,2.789972,2.209615,5.957324,-0.169887,2.36208,4.092894


In [157]:
JUMLAH_PEMINATAN = 6
k_means = KMeans().fit(latent_representation.detach().numpy())
labels = k_means.labels_

In [158]:
data["labels"] = labels

In [159]:
data["labels"].value_counts()

3    3400
0      56
1      23
Name: labels, dtype: int64