In [193]:
import tensorflow as tf 
import pandas as pd 
import json
# from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tensorflow.keras.utils import pad_sequences

In [194]:
# get the data

def first_data(path="./content.json", num_class=5):
    """Load the questions and their class labels from an intents JSON file.

    The file must hold a top-level ``"intents"`` list whose entries each
    provide a ``"tag"`` (the class name) and an ``"input"`` list (the
    questions belonging to that class).

    Args:
        path: Location of the JSON file. Defaults to ``"./content.json"``.
        num_class: Number of leading intents to read. Defaults to 5; pass
            ``None`` to read every intent found in the file.

    Returns:
        A ``(questions, classes)`` tuple. ``questions`` contains one list of
        questions per intent. ``classes`` is a flat list in which each
        intent's tag is repeated once per question of that intent.

    Raises:
        IndexError: If ``num_class`` is larger than the number of intents.
    """
    with open(path, encoding='utf-8') as f:
        intents = json.load(f)['intents']

    if num_class is None:
        num_class = len(intents)

    questions = []
    classes = []
    for type_class in range(num_class):
        # Index (rather than slice) so a too-large num_class fails loudly.
        intent = intents[type_class]
        inputs = intent['input']
        questions.append(inputs)
        # One label per question keeps classes aligned with the flattened questions.
        classes.extend([intent['tag']] * len(inputs))
    return questions, classes

def final_data(questionsm, tags):
    """Flatten grouped questions and pair them with their tags.

    Args:
        questionsm: A sequence of question groups (one inner sequence per
            class).
        tags: One label per flattened question, in the same order.

    Returns:
        A DataFrame with a ``'question'`` column (the flattened questions)
        and a ``'tag'`` column (``tags``).
    """
    # Walk the groups in order so each question keeps its original position.
    flat_questions = [question for group in questionsm for question in group]

    frame = pd.DataFrame()
    frame['question'] = flat_questions
    frame['tag'] = tags
    return frame


def encode_label(df: pd.DataFrame, tags: pd.Series):
    """Return a copy of ``df`` whose ``'tag'`` column is integer-encoded.

    The input frame is left untouched, so the call can be repeated on the
    same frame and always yields the same encoding and class names.

    Args:
        df: Frame containing a ``'tag'`` column of class labels.
        tags: Accepted for backward compatibility only. The encoder is
            fitted on ``df['tag']``, so this argument does not influence
            the result.

    Returns:
        A ``(encoded_df, classes)`` tuple. ``encoded_df`` is a copy of ``df``
        with ``'tag'`` replaced by integer codes. ``classes`` is the
        encoder's ``classes_`` array, where position ``i`` holds the label
        that was encoded as ``i``.
    """
    le = LabelEncoder()
    # Work on a copy: writing the codes back into the caller's frame would
    # destroy the original labels and make a second call encode integers.
    encoded = df.copy()
    encoded['tag'] = le.fit_transform(df['tag'])
    return encoded, le.classes_

def get_train_test_data(df: pd.DataFrame):
    """Split a frame into a feature array and a label array.

    Args:
        df: Frame containing a ``'tag'`` column plus the feature column(s).

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays. ``features`` is the
        2-D array of every column except ``'tag'``; ``labels`` is the 1-D
        array of the ``'tag'`` column.
    """
    labels = np.array(df['tag'])
    # Everything that is not the label is treated as a feature.
    features = np.array(df.drop(columns=['tag']))
    return features, labels

def text_preprocesing(text):
    """Normalise and stem every string of a two-level container, in place.

    Each entry ``text[row][column]`` is lower-cased, has its non-word
    characters replaced by spaces, its digits removed, its whitespace runs
    collapsed to a single space, and is finally stemmed with the Sastrawi
    stemmer.

    Args:
        text: A 2-D indexable container of strings (for example a NumPy
            array of shape ``(rows, columns)`` or a list of lists).

    Returns:
        The same ``text`` object, with its entries overwritten.
    """
    stemmer = StemmerFactory().create_stemmer()
    # Applied in order: non-word chars -> space, drop digits, squeeze spaces.
    substitutions = ((r'\W', ' '), (r'\d+', ''), (r'\s+', ' '))

    for row_index in range(len(text)):
        entries = text[row_index]
        for position in range(len(entries)):
            cleaned = entries[position].lower()
            for pattern, replacement in substitutions:
                cleaned = re.sub(pattern, replacement, cleaned)
            entries[position] = stemmer.stem(cleaned)

    return text
        

def create_model_and_train(x_train, y_train, num_classes=5, vocab_size=1000,
                           embedding_dim=2, epochs=200, target_accuracy=0.85):
    """Tokenize the training text, build an LSTM classifier and fit it.

    Args:
        x_train: Training texts, either a sequence of strings or a 2-D
            array/list with one row per sample. The cells of a row are
            joined with a space into a single string.
        y_train: Integer class label for each sample.
        num_classes: Size of the softmax output layer. Defaults to 5.
        vocab_size: Tokenizer ``num_words`` and embedding input size.
            Defaults to 1000.
        embedding_dim: Width of the embedding vectors. Defaults to 2.
        epochs: Maximum number of training epochs. Defaults to 200.
        target_accuracy: Training stops after the first epoch whose
            ``accuracy`` exceeds this value. Defaults to 0.85.

    Returns:
        A ``(model, history, tokenizer)`` tuple: the fitted Keras model, the
        object returned by ``model.fit`` and the fitted tokenizer.
    """
    class CustomCallbaks(tf.keras.callbacks.Callback):
        """Stop training once the epoch accuracy exceeds ``target_accuracy``."""

        def on_epoch_end(self, epoch, logs=None):
            # logs may be missing, or may lack 'accuracy'; never compare None.
            accuracy = (logs or {}).get('accuracy')
            if accuracy is not None and accuracy > target_accuracy:
                print("\nTrain was stopped")
                self.model.stop_training = True

    # The tokenizer must receive one plain string per sample. When an entry is
    # a list, the Keras Tokenizer treats it as an already-tokenized sequence,
    # so a row such as ['apa kabar'] would become the single token
    # 'apa kabar' and never match the word-level tokens used at inference.
    x_train_list = [
        " ".join(str(cell) for cell in np.atleast_1d(row)) for row in x_train
    ]

    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token= "<OOV>")
    tokenizer.fit_on_texts(x_train_list)

    x_train_sequence = tokenizer.texts_to_sequences(x_train_list)
    x_train_padded = pad_sequences(x_train_sequence)

    # Variable-length integer sequences in, class probabilities out.
    inputs = tf.keras.Input(shape=(None, ))
    x = tf.keras.layers.Embedding(vocab_size, embedding_dim)(inputs)
    x = tf.keras.layers.LSTM(units = 200)(x)
    # Kept from the original architecture; the LSTM above already emits one
    # vector per sample, so this layer should not alter the shape.
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(128, activation="relu")(x)
    predictions = tf.keras.layers.Dense(num_classes, activation="softmax", name="predictions")(x)
    model = tf.keras.Model(inputs, predictions)
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        optimizer='adam',
        metrics=['accuracy']
    )

    history = model.fit(
        x_train_padded,
        y_train,
        epochs=epochs,
        callbacks=[CustomCallbaks()],
    )
    return model, history, tokenizer

def predict_class(text: str, tokenizer, model):
    """Preprocess one sentence and return the model's prediction for it.

    The sentence is lower-cased, stripped of non-word characters and digits,
    whitespace-normalised and stemmed with Sastrawi. The cleaned sentence is
    printed, converted to a padded token sequence and passed to the model.

    Args:
        text: The raw sentence to classify.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        model: Trained model providing ``predict``.

    Returns:
        Whatever ``model.predict`` returns for the single padded sequence.
    """
    stemmer = StemmerFactory().create_stemmer()

    normalized = text.lower()
    # Same cleaning order as the training text: punctuation, digits, spaces.
    for pattern, replacement in ((r'\W', ' '), (r'\d+', ''), (r'\s+', ' ')):
        normalized = re.sub(pattern, replacement, normalized)
    final_text = stemmer.stem(normalized)

    # Echo the cleaned sentence so the model's actual input is visible.
    print(final_text)

    padded = pad_sequences(tokenizer.texts_to_sequences([final_text]))
    return model.predict(padded)


In [195]:
# Load the grouped questions and their labels, then flatten them into a
# single question/tag DataFrame.
questions, classes = first_data()
final =  final_data(questions,  classes)




In [196]:
# Preview the first ten question/tag rows.
final.head(10)

Unnamed: 0,question,tag
0,Hai,greeting
1,Halo,greeting
2,Hei,greeting
3,Assalamualaikum,greeting
4,Hi,greeting
5,Apa kabar,greeting
6,Yo,greeting
7,Wassup,greeting
8,Hey,greeting
9,Selamat datang,greeting


In [197]:
# Integer-encode the tag column; class_encode receives the encoder's classes_
# array, which maps each integer code back to its label name.
df_final, class_encode = encode_label(final, final['tag'])
df_final.head(20)

Unnamed: 0,question,tag
0,Hai,2
1,Halo,2
2,Hei,2
3,Assalamualaikum,2
4,Hi,2
5,Apa kabar,2
6,Yo,2
7,Wassup,2
8,Hey,2
9,Selamat datang,2


In [198]:
# Number of distinct classes found by the encoder.
class_encode.shape

(5,)

In [199]:
# Label names, ordered by their integer code.
class_encode

array(['general', 'goodbye', 'greeting', 'izin', 'sakit'], dtype=object)

In [200]:
# Split the encoded frame into the feature array and the label array.
X_train, y_train = get_train_test_data(df_final)
# Author's note: X_train holds 250 rows.

In [201]:
# Normalise and stem every training question.
X_train = text_preprocesing(X_train)
# Disabled: the cells shown here never create an X_test split.
# X_test = text_preprocesing(X_test)


In [202]:
# Fit the tokenizer and train the classifier; keep all three results because
# predict_class needs both the tokenizer and the model.
model, history, tokenizer  = create_model_and_train(X_train, y_train)
# Earlier call form, kept for reference only (the function takes x_train and y_train):
# create_model_and_train(df=df_final)

Epoch 1/200
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.1698 - loss: 1.6097
Epoch 2/200
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2282 - loss: 1.6095 
Epoch 3/200
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2103 - loss: 1.6091 
Epoch 4/200
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2154 - loss: 1.6087 
Epoch 5/200
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.1945 - loss: 1.6082 
Epoch 6/200
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2474 - loss: 1.6068 
Epoch 7/200
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2862 - loss: 1.6031 
Epoch 8/200
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2950 - loss: 1.5986 
Epoch 9/200
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [208]:
# Classify one sample sentence and show the index of the highest-scoring class.
pred = predict_class("saya izin tidak ke kantor", tokenizer, model)
# To show the label name instead of the index, use:
# class_encode[np.argmax(pred)]
np.argmax(pred)


saya izin tidak ke kantor
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step


3

In [209]:
# Look up which label name corresponds to the predicted index above.
class_encode

array(['general', 'goodbye', 'greeting', 'izin', 'sakit'], dtype=object)