In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

import warnings

warnings.filterwarnings('ignore')

In [217]:
data = pd.read_csv("../data/keyword_jptiik.csv")

data_abs = data[["abstract"]]
data_abs.head()

Unnamed: 0,abstract
0,teknologi mixed reality akan membuat interaksi...
1,penjadwalan merupakan salah satu proses pentin...
2,untuk memenuhi kebutuhan masyarakat akan keter...
3,perusahaan mcdonald’s adalah perusahaan yang b...
4,perkembangan internet saat ini sangat pesat. d...


In [218]:
data_abs["abstract"] = data_abs["abstract"].str.strip()
data_abs["abstract"] = data_abs["abstract"].str.lower()
data_abs["abstract"] = data_abs["abstract"].str.replace('[^\w\s]','')
data_abs["abstract"] = data_abs["abstract"].str.replace('\d+', '')
data_abs["abstract"] = data_abs["abstract"].str.replace('\s+', ' ')
data_abs["abstract"] = data_abs["abstract"].str.replace('\n', ' ')
data_abs["abstract"] = data_abs["abstract"].str.replace('\t', ' ')
data_abs["abstract"] = data_abs["abstract"].str.replace('  ', ' ')

In [219]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [220]:
stopword = StopWordRemoverFactory().create_stop_word_remover()
stemmer = StemmerFactory().create_stemmer()

data_abs["abstract"] = data_abs["abstract"].apply(lambda x: stopword.remove(x))
data_abs["abstract"] = data_abs["abstract"].apply(lambda x: stemmer.stem(x))

In [221]:
import random

In [222]:
vocab = set()
max_tokens = 10000

for abstract in data_abs["abstract"]:
    for word in abstract.split():
        vocab.add(word)
    
vocab = list(vocab)
random.shuffle(vocab)
vocab = vocab[:max_tokens]
vocab += ["<UNK>"]

data_abs["abstract"] = data_abs["abstract"].apply(lambda x: x.split())
data_abs["abstract"] = data_abs["abstract"].apply(lambda x: [word if word in vocab else "<UNK>" for word in x])
data_abs["abstract"] = data_abs["abstract"].apply(lambda x: " ".join(x))

word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for idx, word in enumerate(vocab)}

In [226]:
data_abs["abstract"] = data_abs["abstract"].apply(lambda x: [word2idx[word] for word in x.split()])

In [228]:
# mean length of abstract
MAX_LENGTH = round(sum([len(abstract) for abstract in data_abs["abstract"]]) / len(data_abs["abstract"]))

# padding and truncate
data_abs["abstract"] = data_abs["abstract"].apply(lambda x: x + [0] * (MAX_LENGTH - len(x)) if len(x) < MAX_LENGTH else x[:MAX_LENGTH])
# data_abs["abstract"] = data_abs["abstract"].apply(lambda x: torch.tensor(x).float())

In [229]:
VOCAB_SIZE = len(vocab)

In [264]:
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(VOCAB_SIZE, 512)
        self.fc1 = nn.Linear(512, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.tanh = nn.Tanh()
        self.gelu = nn.GELU()
        self.relu = nn.ReLU()

    
    def forward(self, x):
        x = self.embedding(x)
        x = self.fc1(x)
        x = self.gelu(x)
        x = self.fc2(x)
        x = self.gelu(x)
        x = self.fc3(x)
        x = self.relu(x)

        return x
    
class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc3 = nn.Linear(64, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc1 = nn.Linear(256, 512)
        self.embedding = nn.Embedding(512, VOCAB_SIZE)
        self.tanh = nn.Tanh()
        self.selu = nn.SELU()
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.fc3(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = torch.tensor(x).long()
        x = self.embedding(x)
        

        return x

class AutoEncoder(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [280]:
E = Encoder()
D = Decoder()

forward = E(torch.tensor(data_abs["abstract"][0]).type(torch.LongTensor))
D(forward)

tensor([[[ 1.6013,  0.6687, -1.2794,  ..., -1.6663, -1.9133,  0.1446],
         [ 1.6013,  0.6687, -1.2794,  ..., -1.6663, -1.9133,  0.1446],
         [ 1.6013,  0.6687, -1.2794,  ..., -1.6663, -1.9133,  0.1446],
         ...,
         [ 1.6013,  0.6687, -1.2794,  ..., -1.6663, -1.9133,  0.1446],
         [ 1.6013,  0.6687, -1.2794,  ..., -1.6663, -1.9133,  0.1446],
         [ 1.6013,  0.6687, -1.2794,  ..., -1.6663, -1.9133,  0.1446]],

        [[ 1.6013,  0.6687, -1.2794,  ..., -1.6663, -1.9133,  0.1446],
         [ 1.6013,  0.6687, -1.2794,  ..., -1.6663, -1.9133,  0.1446],
         [ 1.6013,  0.6687, -1.2794,  ..., -1.6663, -1.9133,  0.1446],
         ...,
         [ 1.6013,  0.6687, -1.2794,  ..., -1.6663, -1.9133,  0.1446],
         [ 1.6013,  0.6687, -1.2794,  ..., -1.6663, -1.9133,  0.1446],
         [ 1.6013,  0.6687, -1.2794,  ..., -1.6663, -1.9133,  0.1446]],

        [[ 1.6013,  0.6687, -1.2794,  ..., -1.6663, -1.9133,  0.1446],
         [ 1.6013,  0.6687, -1.2794,  ..., -1

In [266]:
E = Encoder()
D = Decoder()
AE = AutoEncoder(E, D)
forward = AE(torch.tensor(data_abs["abstract"][0]).type(torch.LongTensor))
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(AE.parameters(), lr=1e-3)

criterion(forward, torch.tensor(data_abs["abstract"][0]).long())

RuntimeError: The size of tensor a (10001) must match the size of tensor b (109) at non-singleton dimension 2

In [245]:
E = Encoder(109, 64, 32, 16)
D = Decoder(109, 64, 32, 16)
AE = AutoEncoder(E, D)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(AE.parameters(), lr=0.01)

for epoch in range(10):
    for abstract in data_abs["abstract"]:
    # abstract = torch.tensor(data_abs["abstract"][0], dtype=torch.float32)
        abstract = torch.tensor(abstract, dtype=torch.float32)
        optimizer.zero_grad()
        output = AE(abstract)
        loss = torch.sqrt(criterion(output, abstract))
        loss.backward()
        optimizer.step()
    print(loss)

tensor(6918.1338, grad_fn=<SqrtBackward0>)
tensor(6918.1338, grad_fn=<SqrtBackward0>)


KeyboardInterrupt: 