In [341]:
import json
import string

import numpy as np
import pandas as pd
import tensorflow as tf

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [342]:
# Load json file
f = open('intent/intent.json', 'r')
intent_json = json.load(f)

In [343]:
# Create list from json
input = []
intent = []

for i in range(len(intent_json['intents'])):
    for user_input in intent_json['intents'][i]['input']:
        input.append(user_input)
        intent.append(intent_json['intents'][i]['intent'])

In [344]:
# Create dataframe from json
df = pd.DataFrame({
    'input': input,
    'intent' : intent
    # 'response' : response
})
df.head()

Unnamed: 0,input,intent
0,dimana cari skck?,skck
1,bagaimana cara buat skck?,skck
2,apa itu skck?,skck
3,apakah skck saya masih berlaku?,skck
4,berapa lama skck saya bisa digunakan?,skck


In [345]:
# Define stemmer and stopword remover
factory = StemmerFactory()
stemmer = factory.create_stemmer()

factory = StopWordRemoverFactory()
stopwords = factory.get_stop_words()

In [346]:
slang = pd.read_csv('lexicon/slang ke semi baku.csv')

slang_replace = {}
for i, row in enumerate(slang['slang']):
    slang_replace[row] = slang['formal'].iloc[i]

In [347]:
baku = pd.read_csv('lexicon/slang ke baku.csv')

baku_replace = {}
for i, row in enumerate(baku['slang']):
    baku_replace[row] = baku['baku'].iloc[i]

In [348]:
# Create text cleaning function
def clean_text(text):
    new_text = []

    text = text.lower()

    for kata in text.split():
        if kata not in (slang_replace|baku_replace):
            new_text.append(kata)
        elif kata in baku_replace:
            new_text+=baku_replace[kata].split()
        elif kata in slang_replace:
            new_text+=slang_replace[kata].split()
    new_text = ' '.join(
        stemmer.stem(
            baku_replace.get(
                word,
                word
            )
        ) for word in new_text if word not in stopwords
    )

    new_text = new_text.translate(
        str.maketrans(
            '',
            '',
            string.punctuation
        )
    )
    
    return new_text

In [349]:
kalimat = 'Aku laper banget gatau maunya makan jeruk apa lagi meniru-niru... daah ngelamar'
clean_text(kalimat)

'aku lapar banget tahu mau makan jeruk apa tiru deh lamar'

In [350]:
df['clean_input'] = df['input'].apply(clean_text)
df['clean_input'].head()

0                   cari skck
1    bagaimana cara buat skck
2                    apa skck
3                   skck laku
4       berapa lama skck guna
Name: clean_input, dtype: object

In [351]:
# Create corpus
from nltk.tokenize import word_tokenize
words = set([
    word for word in df['clean_input'] for word in word_tokenize(word)
    ]
)

In [352]:
corpus_size = len(words)

In [353]:
# Calculate word length of each row
df['length'] = df['clean_input'].apply(word_tokenize).apply(len)

In [354]:
sequence_length = int(round(df['length'].max(),0))

In [355]:
corpus_size, sequence_length

(248, 8)

In [356]:
# Label encoding
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

le = LabelEncoder()
y_train = le.fit_transform(df['intent'])
y_train = to_categorical(y_train)

In [357]:
le.classes_

array(['biaya', 'daftar', 'dokumen', 'error upload', 'ktp', 'link sosmed',
       'lowongan', 'lupa password', 'nama_bot', 'pas foto', 'pengantar',
       'penutup', 'qualification', 'responsibilities-DS', 'salary',
       'sertifikat', 'skck', 'timeline', 'training', 'transkrip-ijazah'],
      dtype=object)

In [358]:
len(le.classes_)

20

In [359]:
textvect = tf.keras.layers.TextVectorization(
    max_tokens=corpus_size,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    output_mode='int',
    output_sequence_length=sequence_length
)
textvect.adapt(df['clean_input'])

In [360]:
embedding = tf.keras.layers.Embedding(
    input_dim=corpus_size,
    output_dim=16,
    input_length=sequence_length,
    embeddings_initializer='uniform'
)

In [361]:
tes = 'saya mau daftar rekrutmen'
textvect(clean_text(tes))

<tf.Tensor: shape=(8,), dtype=int64, numpy=array([23, 12,  9,  0,  0,  0,  0,  0], dtype=int64)>

In [362]:
embedding(textvect(clean_text(tes)))

<tf.Tensor: shape=(8, 16), dtype=float32, numpy=
array([[ 0.04846788,  0.02628523, -0.00433416, -0.01324021,  0.01427785,
        -0.0045706 ,  0.00649945,  0.02057404, -0.04407346,  0.01841583,
         0.00017424, -0.00500776,  0.01837561, -0.00908165, -0.03814398,
         0.04713846],
       [ 0.00986294, -0.0269418 , -0.04586191,  0.04803355,  0.04992703,
         0.00619525,  0.03795925,  0.03714288, -0.01629553,  0.02620966,
        -0.03784236, -0.00958794,  0.02586884,  0.0065267 , -0.02424387,
         0.03395048],
       [ 0.02879191, -0.03145681, -0.00020306, -0.01343238,  0.03986249,
        -0.00692382,  0.00958387,  0.04946334, -0.03756637,  0.00265688,
        -0.03132159,  0.04292189,  0.03831853, -0.01791401, -0.01970152,
        -0.02132745],
       [ 0.02458557, -0.04589987, -0.01925696, -0.04408726,  0.02650769,
        -0.01396913, -0.03519198,  0.02135784,  0.02408452,  0.00863268,
        -0.0016515 ,  0.03531349,  0.00786199, -0.04921626, -0.01357733,
         

In [363]:
# Create model
input = tf.keras.layers.Input(shape=(1,), dtype='string')
hidden_layer = textvect(input)
hidden_layer = embedding(hidden_layer)
hidden_layer = tf.keras.layers.LSTM(16)(hidden_layer)
output = tf.keras.layers.Dense(len(le.classes_), activation='softmax')(hidden_layer)
model = tf.keras.Model(inputs=input, outputs=output)

# Compile model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy']
)

In [364]:
hist = model.fit(df['clean_input'], y_train, epochs=100, verbose=0)
pd.DataFrame(hist.history).tail()

Unnamed: 0,loss,categorical_accuracy
95,0.104281,1.0
96,0.102008,1.0
97,0.099924,1.0
98,0.097772,1.0
99,0.095782,1.0


In [365]:
model.evaluate(df['clean_input'], y_train)



[0.09455347806215286, 1.0]

In [366]:
y_train_df = pd.DataFrame(y_train, columns=le.classes_)
y_train_df['intent'] = y_train_df.idxmax(axis=1)

model_pred = model.predict(df['clean_input'])
model_pred = pd.DataFrame(model_pred, columns=le.classes_)
model_pred['intent'] = model_pred.idxmax(axis=1)

from sklearn.metrics import classification_report
print(classification_report(y_train_df['intent'], model_pred['intent']))

                     precision    recall  f1-score   support

              biaya       1.00      1.00      1.00        27
             daftar       1.00      1.00      1.00        22
            dokumen       1.00      1.00      1.00        22
       error upload       1.00      1.00      1.00         9
                ktp       1.00      1.00      1.00        15
        link sosmed       1.00      1.00      1.00        10
           lowongan       1.00      1.00      1.00        12
      lupa password       1.00      1.00      1.00        21
           nama_bot       1.00      1.00      1.00         8
           pas foto       1.00      1.00      1.00        15
          pengantar       1.00      1.00      1.00        10
            penutup       1.00      1.00      1.00        21
      qualification       1.00      1.00      1.00        17
responsibilities-DS       1.00      1.00      1.00        24
             salary       1.00      1.00      1.00        35
         sertifikat    

In [373]:
def bot_response(text):
    """Take text as function input then predict using model. Return response based on highest probability using numpy argmax    
    """
    text = clean_text(text)
    pred = model.predict([text])
    res = le.classes_[pred.argmax()]
    if textvect(text).numpy().max() > 1:
        for label_pred in intent_json['intents']:
            if label_pred['intent'] == res:
                response = label_pred['response']
    else:
        response = ['Maaf, saya tidak mengerti']
    
    dict_temp = []
    for i in range(len(pred[0])):
        temp = {le.classes_[i]: pred[0][i]}
        dict_temp.append(temp)
    print(dict_temp)
    print(le.classes_[pred.argmax()])
    return print(np.random.choice(response))

In [368]:
# import pickle
# pickle.dump(le, open('encoder.pkl', 'wb'))

In [369]:
# tf.keras.models.save_model(model, 'model')

In [370]:
bot_response('laper pengen makan jeruk')

[{'biaya': 7.790398e-05}, {'daftar': 0.011751859}, {'dokumen': 0.003143824}, {'error upload': 0.0014080789}, {'ktp': 7.284601e-07}, {'link sosmed': 0.00070906815}, {'lowongan': 0.014835751}, {'lupa password': 0.00072696636}, {'nama_bot': 6.016228e-05}, {'pas foto': 4.7655867e-05}, {'pengantar': 5.7353565e-05}, {'penutup': 0.0005667135}, {'qualification': 0.0008448163}, {'responsibilities-DS': 0.017472424}, {'salary': 0.0062029967}, {'sertifikat': 0.91824764}, {'skck': 0.00037973726}, {'timeline': 0.022906782}, {'training': 0.0005183554}, {'transkrip-ijazah': 4.112171e-05}]
15
sertifikat
Maaf, saya tidak mengerti


In [371]:
bot_response('pas foto hilang gimana ya')

[{'biaya': 0.0018515236}, {'daftar': 0.00040320226}, {'dokumen': 4.5594323e-05}, {'error upload': 0.02319791}, {'ktp': 0.012349861}, {'link sosmed': 0.029984482}, {'lowongan': 0.00340934}, {'lupa password': 4.912934e-06}, {'nama_bot': 0.00013604735}, {'pas foto': 0.9051894}, {'pengantar': 0.015934302}, {'penutup': 0.0002913658}, {'qualification': 0.0008499332}, {'responsibilities-DS': 0.0015388916}, {'salary': 1.977861e-05}, {'sertifikat': 6.785532e-05}, {'skck': 8.483039e-05}, {'timeline': 0.000546083}, {'training': 0.0027302864}, {'transkrip-ijazah': 0.0013644788}]
9
pas foto
Pas foto merupakan dokumen wajib pendaftaran. Berikut ketentuan pas foto yang harus dipenuhi ya, Kak:
- Background berwarna biru dengan pakaian formal (kemeja dan/atau jas)
- Foto berukuran 3x4 dan berwarna
- Disarankan untuk menggunakan foto terbaru
- Upload foto di https://rekrutmen.fiktif.id/dokumen dengan ukuran file tidak lebih dari 1 MB dengan format file jpg/png/jpeg


ktp, pas foto, password masih ketuker2

In [372]:
le.classes_

array(['biaya', 'daftar', 'dokumen', 'error upload', 'ktp', 'link sosmed',
       'lowongan', 'lupa password', 'nama_bot', 'pas foto', 'pengantar',
       'penutup', 'qualification', 'responsibilities-DS', 'salary',
       'sertifikat', 'skck', 'timeline', 'training', 'transkrip-ijazah'],
      dtype=object)