In [1]:
import json
import string
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to C:\Users\Judan Syamsul
[nltk_data]     Hadad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load json file
f = open('intent/intent.json', 'r')
intent_json = json.load(f)

In [3]:
# Create list from json
input = []
intent = []

for i in range(len(intent_json['intents'])):
    for user_input in intent_json['intents'][i]['input']:
        input.append(user_input)
        intent.append(intent_json['intents'][i]['intent'])

In [4]:
# Create dataframe from json
df = pd.DataFrame({
    'input': input,
    'intent' : intent
    # 'response' : response
})
df.head()

Unnamed: 0,input,intent
0,halo,sapa
1,hei,sapa
2,hey,sapa
3,hello,sapa
4,hallo,sapa


In [5]:
# Define stemmer and stopword remover
factory = StemmerFactory()
stemmer = factory.create_stemmer()

factory = StopWordRemoverFactory()
stopwords = factory.get_stop_words()

In [6]:
slang = pd.read_csv('lexicon/slang ke semi baku.csv')

slang_replace = {}
for i, row in enumerate(slang['slang']):
    slang_replace[row] = slang['formal'].iloc[i]

In [7]:
baku = pd.read_csv('lexicon/slang ke baku.csv')

baku_replace = {}
for i, row in enumerate(baku['slang']):
    baku_replace[row] = baku['baku'].iloc[i]

In [8]:
# Create text cleaning function
def clean_text(text):
    new_text = []

    text = text.lower()

    for kata in text.split():
        if kata not in (slang_replace|baku_replace):
            new_text.append(kata)
        elif kata in baku_replace:
            new_text+=baku_replace[kata].split()
        elif kata in slang_replace:
            new_text+=slang_replace[kata].split()
    new_text = ' '.join(
        stemmer.stem(
            baku_replace.get(
                word,
                word
            )
        ) for word in new_text if word not in stopwords
    )

    new_text = new_text.translate(
        str.maketrans(
            '',
            '',
            string.punctuation
        )
    )
    
    return new_text

In [9]:
kalimat = 'Aku laper banget gatau maunya makan jeruk apa lagi meniru-niru... daah ngelamar'
clean_text(kalimat)

'aku lapar banget tahu mau makan jeruk apa tiru deh lamar'

In [10]:
df['clean_input'] = df['input'].apply(clean_text)
df['clean_input'].head()

0     halo
1      hei
2      hai
3    hello
4     halo
Name: clean_input, dtype: object

In [11]:
# Create corpus
words = set([
    word for word in df['clean_input'] for word in word_tokenize(word)
    ]
)

In [12]:
corpus_size = len(words)

In [13]:
# Calculate word length of each row
df['length'] = df['clean_input'].apply(word_tokenize).apply(len)

In [14]:
sequence_length = int(round(df['length'].max(),0))

In [15]:
corpus_size, sequence_length

(273, 8)

In [16]:
# Label encoding
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

le = LabelEncoder()
y_train = le.fit_transform(df['intent'])
y_train = to_categorical(y_train)

In [17]:
le.classes_

array(['biaya', 'daftar', 'dokumen', 'error upload', 'ktp', 'link sosmed',
       'lowongan', 'lupa password', 'nama', 'pas foto', 'pengantar',
       'penutup', 'qualification', 'responsibilities', 'salary', 'sapa',
       'sertifikat', 'skck', 'timeline', 'training', 'transkrip-ijazah'],
      dtype=object)

In [18]:
len(le.classes_)

21

In [19]:
textvect = tf.keras.layers.TextVectorization(
    max_tokens=corpus_size,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    output_mode='int',
    output_sequence_length=sequence_length
)
textvect.adapt(df['clean_input'])

In [20]:
embedding = tf.keras.layers.Embedding(
    input_dim=corpus_size,
    output_dim=16,
    input_length=sequence_length,
    embeddings_initializer='uniform'
)

In [21]:
tes = 'saya mau daftar rekrutmen'
textvect(clean_text(tes))

<tf.Tensor: shape=(8,), dtype=int64, numpy=array([20, 11,  7,  0,  0,  0,  0,  0], dtype=int64)>

In [22]:
embedding(textvect(clean_text(tes)))

<tf.Tensor: shape=(8, 16), dtype=float32, numpy=
array([[-0.03436317,  0.03928449,  0.02017006, -0.02457641,  0.01162897,
         0.03741631, -0.00135561,  0.03635332, -0.00161115, -0.04203521,
         0.01314846, -0.02541434, -0.01312237, -0.00419213, -0.02114015,
        -0.03910436],
       [ 0.01110385, -0.02799114,  0.01052301, -0.02551323,  0.02929792,
         0.04521887,  0.02336918,  0.0429708 , -0.01190245, -0.00029536,
        -0.04396411,  0.03737772, -0.00811651,  0.01045377,  0.03049422,
        -0.02706285],
       [-0.03663881, -0.04771641, -0.02090973, -0.03309351, -0.03991692,
        -0.02790138, -0.0454954 , -0.01770438, -0.03191983, -0.02283714,
        -0.01921368, -0.01275697,  0.00446593,  0.02818007, -0.03166701,
        -0.0173602 ],
       [-0.02534754, -0.0019244 , -0.0275056 ,  0.02017946,  0.04213474,
         0.00129261,  0.01603736, -0.01392318,  0.01804024, -0.00492246,
        -0.00227969, -0.0392528 , -0.0365636 ,  0.04154776, -0.03142873,
        -

In [23]:
# Create model
input = tf.keras.layers.Input(shape=(1,), dtype='string')
hidden_layer = textvect(input)
hidden_layer = embedding(hidden_layer)
hidden_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16))(hidden_layer)
output = tf.keras.layers.Dense(len(le.classes_), activation='softmax')(hidden_layer)
model = tf.keras.Model(inputs=input, outputs=output)

# Compile model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy']
)

In [24]:
hist = model.fit(df['clean_input'], y_train, epochs=100, verbose=0)
pd.DataFrame(hist.history).tail()

Unnamed: 0,loss,categorical_accuracy
95,0.02037,1.0
96,0.019977,1.0
97,0.019746,1.0
98,0.019094,1.0
99,0.018647,1.0


In [25]:
model.evaluate(df['clean_input'], y_train)



[0.018221931532025337, 1.0]

In [26]:
y_train_df = pd.DataFrame(y_train, columns=le.classes_)
y_train_df['intent'] = y_train_df.idxmax(axis=1)

model_pred = model.predict(df['clean_input'])
model_pred = pd.DataFrame(model_pred, columns=le.classes_)
model_pred['intent'] = model_pred.idxmax(axis=1)

from sklearn.metrics import classification_report
print(classification_report(y_train_df['intent'], model_pred['intent']))

                  precision    recall  f1-score   support

           biaya       1.00      1.00      1.00        27
          daftar       1.00      1.00      1.00        22
         dokumen       1.00      1.00      1.00        22
    error upload       1.00      1.00      1.00         9
             ktp       1.00      1.00      1.00        15
     link sosmed       1.00      1.00      1.00        10
        lowongan       1.00      1.00      1.00        12
   lupa password       1.00      1.00      1.00        21
            nama       1.00      1.00      1.00         8
        pas foto       1.00      1.00      1.00        15
       pengantar       1.00      1.00      1.00        10
         penutup       1.00      1.00      1.00        21
   qualification       1.00      1.00      1.00        16
responsibilities       1.00      1.00      1.00        21
          salary       1.00      1.00      1.00        35
            sapa       1.00      1.00      1.00        19
      sertifi

In [27]:
def bot_response(text):
    """Take text as function input then predict using model. Return response based on highest probability using numpy argmax    
    """
    text = clean_text(text)
    pred = model.predict([text])
    res = le.classes_[pred.argmax()]
    if textvect(text).numpy().max() > 1:
        for label_pred in intent_json['intents']:
            if label_pred['intent'] == res:
                response = label_pred['response']
    else:
        response = ['Maaf, saya tidak mengerti']
    
    dict_temp = []
    for i in range(len(pred[0])):
        temp = {le.classes_[i]: pred[0][i]}
        dict_temp.append(temp)
    print(dict_temp)
    print(le.classes_[pred.argmax()])
    return print(np.random.choice(response))

In [28]:
# import pickle
# pickle.dump(le, open('saved_model/encoder.pkl', 'wb'))

In [29]:
# pickle.dump({'config': textvect.get_config(),
#              'weights': textvect.get_weights()}
#             , open("saved_model/textvect.pkl", "wb"))

In [30]:
# model.save('saved_model/model', save_traces=True)



INFO:tensorflow:Assets written to: saved_model/model\assets


INFO:tensorflow:Assets written to: saved_model/model\assets


In [31]:
bot_response('laper pengen makan jeruk')

[{'biaya': 0.019276958}, {'daftar': 0.010527545}, {'dokumen': 0.00040877648}, {'error upload': 0.00030921155}, {'ktp': 5.707197e-05}, {'link sosmed': 0.00015870223}, {'lowongan': 2.9218145e-05}, {'lupa password': 0.5926877}, {'nama': 0.0008345293}, {'pas foto': 0.0025887717}, {'pengantar': 4.8192658e-05}, {'penutup': 0.0013467034}, {'qualification': 0.0025026738}, {'responsibilities': 0.0012988395}, {'salary': 8.8033805e-05}, {'sapa': 0.001593638}, {'sertifikat': 0.0962988}, {'skck': 0.0012394362}, {'timeline': 0.008503553}, {'training': 0.24781552}, {'transkrip-ijazah': 0.012386126}]
lupa password
Maaf, saya tidak mengerti


In [32]:
bot_response('pas foto hilang gimana ya')

[{'biaya': 0.00028879038}, {'daftar': 3.446055e-05}, {'dokumen': 0.0013081614}, {'error upload': 2.2282444e-05}, {'ktp': 0.0004544905}, {'link sosmed': 8.479538e-05}, {'lowongan': 3.4720672e-06}, {'lupa password': 0.00045636593}, {'nama': 0.00019217581}, {'pas foto': 0.9855884}, {'pengantar': 2.5535815e-05}, {'penutup': 0.0035906234}, {'qualification': 2.462237e-05}, {'responsibilities': 0.00014229809}, {'salary': 7.912264e-06}, {'sapa': 0.00042383056}, {'sertifikat': 0.0009924851}, {'skck': 0.004791357}, {'timeline': 3.6891788e-06}, {'training': 5.272255e-05}, {'transkrip-ijazah': 0.0015113606}]
pas foto
Pas foto merupakan dokumen wajib pendaftaran ya, Kak. Berikut ketentuan pas foto yang harus dipenuhi:
- Background berwarna biru dengan pakaian formal (kemeja dan/atau jas)
- Foto berukuran 3x4 dan berwarna
- Disarankan untuk menggunakan foto terbaru
- Upload foto di https://rekrutmen.fiktif.id/dokumen dengan ukuran file tidak lebih dari 1 MB dengan format file jpg/png/jpeg


ktp, pas foto, password masih ketuker2

In [33]:
le.classes_

array(['biaya', 'daftar', 'dokumen', 'error upload', 'ktp', 'link sosmed',
       'lowongan', 'lupa password', 'nama', 'pas foto', 'pengantar',
       'penutup', 'qualification', 'responsibilities', 'salary', 'sapa',
       'sertifikat', 'skck', 'timeline', 'training', 'transkrip-ijazah'],
      dtype=object)