In [261]:
import tensorflow as tf 
import pandas as pd 
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [262]:
# get the data

def first_data(path="./content.json", num_class=5):
    """Load the example questions and their intent tags from a JSON file.

    The file must contain a top-level ``"intents"`` list whose items each
    carry an ``"input"`` list (the example questions) and a ``"tag"`` (the
    class name).

    Args:
        path: Location of the JSON file. Defaults to ``"./content.json"``.
        num_class: Number of intents to read, counted from the start of the
            ``"intents"`` list. Defaults to 5. Pass ``None`` to read every
            intent in the file.

    Returns:
        A ``(questions, classes)`` tuple. ``questions`` has one entry per
        intent (that intent's ``"input"`` list). ``classes`` is flat and
        repeats each intent's tag once per question, so its length equals
        the total number of questions.

    Raises:
        IndexError: If the file holds fewer than ``num_class`` intents.
    """
    # Read everything up front so the file handle is released immediately.
    with open(path, encoding='utf-8') as f:
        data = json.load(f)

    intents = data['intents']
    if num_class is not None:
        if num_class > len(intents):
            raise IndexError(
                f"requested {num_class} intents but {path!r} "
                f"only contains {len(intents)}"
            )
        # max(..., 0) keeps a negative count meaning "nothing", not
        # "everything except the last few".
        intents = intents[:max(num_class, 0)]

    questions = []
    classes = []
    for intent in intents:
        inputs = intent['input']
        questions.append(inputs)
        # One label per question so the flat label list lines up with the
        # flattened questions later on.
        classes.extend([intent['tag']] * len(inputs))
    return questions, classes

def final_data(questionsm, tags):
    """Assemble the question/tag table from grouped questions.

    ``questionsm`` is a sequence of question groups. The groups are
    concatenated in order into one flat list, which becomes the
    ``question`` column; ``tags`` becomes the ``tag`` column.

    Returns:
        A DataFrame with the columns ``question`` and ``tag``.
    """
    # Flatten the per-intent groups, preserving their order.
    flat_questions = [
        question
        for group in questionsm
        for question in group
    ]
    table = pd.DataFrame()
    table['question'] = flat_questions
    table['tag'] = tags
    return table


def encode_label(df: pd.DataFrame, tags: pd.Series):
    """Replace the string labels in ``df['tag']`` with integer class ids.

    Args:
        df: Frame with a ``tag`` column holding the class labels. It is not
            modified; the encoded result is a copy.
        tags: Not used for the encoding. Accepted so that existing
            ``encode_label(df, df['tag'])`` calls keep working; the label
            set is always derived from ``df['tag']``.

    Returns:
        ``(encoded_df, classes)``. ``encoded_df`` is a copy of ``df`` whose
        ``tag`` column holds the integer ids, and ``classes`` is the
        encoder's ``classes_`` array (``classes[i]`` is the label that was
        mapped to id ``i``).
    """
    le = LabelEncoder()
    # fit_transform refits the encoder from df['tag'], so pre-fitting on
    # ``tags`` would have no effect on the result; one call is enough.
    ids = le.fit_transform(df['tag'])
    # Work on a copy: writing into the caller's frame would make a second
    # run of the same notebook cell encode the already-encoded ids and
    # return the ids themselves instead of the label names.
    encoded = df.copy()
    encoded['tag'] = ids
    return encoded, le.classes_

def get_train_test_data(df: pd.DataFrame, test_size=0.1, random_state=42):
    """Split the encoded frame into shuffled train and test arrays.

    Args:
        df: Frame with a ``tag`` column (the target). Every other column is
            used as a feature.
        test_size: Passed to ``train_test_split``; the share of rows held
            out for testing. Defaults to 0.1.
        random_state: Passed to ``train_test_split`` so the shuffle is
            reproducible. Defaults to 42.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``. The feature arrays
        keep one column per non-``tag`` column of ``df`` (so they are
        two-dimensional); the target arrays are one-dimensional.
    """
    feature = np.array(df.drop('tag', axis=1))
    target = np.array(df['tag'])
    X_train, X_test, y_train, y_test = train_test_split(
        feature,
        target,
        random_state=random_state,
        test_size=test_size,
        shuffle=True,
    )
    return X_train, X_test, y_train, y_test

def text_preprocesing(text, verbose=True):
    """Normalise and stem every string in a two-level collection.

    Each element ``text[row][column]`` is processed in this order:
    lower-cased, every non-word character (regex ``\\W``) replaced by a
    space, digit runs removed, whitespace runs collapsed to a single space,
    and finally stemmed with the Sastrawi stemmer.

    Args:
        text: A collection indexed as ``text[row][column]`` whose elements
            are strings, e.g. a 2-D NumPy array or a list of lists. It is
            not modified.
        verbose: When true (the default), print each string before and
            after stemming, followed by a separator line. Pass ``False``
            to keep notebook output short.

    Returns:
        A copy of ``text`` with every string replaced by its processed
        form: a NumPy array if a NumPy array was given, otherwise a list of
        lists.
    """
    stemmer = StemmerFactory().create_stemmer()

    # Copy first so the caller's data is left untouched and calling the
    # function again on the same input starts from the raw text.
    if isinstance(text, np.ndarray):
        cleaned = text.copy()
    else:
        cleaned = [list(row) for row in text]

    for row in range(len(cleaned)):
        for column in range(len(cleaned[row])):
            value = cleaned[row][column].lower()
            # replace non-alphanumeric characters (punctuation, symbols)
            value = re.sub(r'\W', ' ', value)
            # delete numbers
            value = re.sub(r'\d+', '', value)
            # collapse excessive whitespace
            value = re.sub(r'\s+', ' ', value)
            if verbose:
                print(f'before text preprocessing : {value}')
            value = stemmer.stem(value)
            if verbose:
                print(f'after text preprocessing : {value}')
                print(f'======================================================')
            cleaned[row][column] = value
    return cleaned
        

def create_model_and_train(x_train, y_train, x_test, y_test, epochs=200, num_classes=5):
    """Build the LSTM text classifier, train it, and return it.

    The model is a ``tf.keras.Sequential`` stack: a ``TextVectorization``
    layer adapted on ``x_train``, an ``Embedding`` of width 16, two LSTM
    layers (200 units returning sequences, then 100 units), and a softmax
    ``Dense`` layer with ``num_classes`` outputs. It is compiled with
    sparse categorical cross-entropy, the Adam optimizer and an accuracy
    metric.

    Args:
        x_train: Training texts; also used to adapt the vectorization layer.
        y_train: Training labels as integer class ids (required by the
            sparse categorical cross-entropy loss).
        x_test: Texts passed to ``fit`` as validation data.
        y_test: Labels passed to ``fit`` as validation data.
        epochs: Number of training epochs. Defaults to 200.
        num_classes: Size of the softmax output layer. Defaults to 5.

    Returns:
        ``(model, history)``: the trained model and the object returned by
        ``model.fit``.
    """
    max_features = 10000
    sequence_length = 250
    vectorize_layer = tf.keras.layers.TextVectorization(
        max_tokens=max_features,
        output_mode='int',
        output_sequence_length=sequence_length
    )
    # The vocabulary is learned from the training texts only.
    vectorize_layer.adapt(x_train)
    model = tf.keras.Sequential([
        vectorize_layer,
        tf.keras.layers.Embedding(max_features, 16),
        tf.keras.layers.LSTM(units=200, return_sequences=True),
        tf.keras.layers.LSTM(units=100),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        optimizer='adam',
        metrics=['accuracy']
    )

    # NOTE: the test split doubles as the validation set here, so the
    # reported validation accuracy is not an independent estimate.
    history = model.fit(
        x_train,
        y_train,
        epochs=epochs,
        validation_data=(x_test, y_test),
    )
    # Hand the trained model back; without this the caller has nothing to
    # predict with or save once training finishes.
    return model, history

# def predict()

In [263]:
# NOTE(review): X_test is first assigned by the get_train_test_data(...) cell
# further down, so on a fresh kernel (Restart & Run All) this line raises
# NameError. `get` is not read by any cell visible here.
get = text_preprocesing(X_test)

before text preprocessing : siapa anda
after text preprocessing : siapa anda
before text preprocessing : saya dalam kondisi yang tidak sehat hari ini
after text preprocessing : saya dalam kondisi yang tidak sehat hari ini
before text preprocessing : saya absen sakit bisa anda bantu saya
after text preprocessing : saya absen sakit bisa anda bantu saya
before text preprocessing : anda bisa bantu apa
after text preprocessing : anda bisa bantu apa
before text preprocessing : tolong ikan saya absen izin
after text preprocessing : tolong ikan saya absen izin


In [264]:
# Load the grouped questions and their per-question tags from the intents
# file, then flatten them into a two-column (question, tag) DataFrame.
questions, classes = first_data()
final =  final_data(questions,  classes)

In [265]:
# Preview the first rows of the question/tag table.
final.head(20)

Unnamed: 0,question,tag
0,hai,greeting
1,hello,greeting
2,hai,greeting
3,selamat pagi,greeting
4,selamat siang,greeting
5,selamat sore,greeting
6,selamat malam,greeting
7,pagi,greeting
8,siang,greeting
9,sore,greeting


In [266]:
# Encode the string tags as integer ids; class_encode receives the label
# names returned by the encoder. Then preview the encoded table.
df_final, class_encode = encode_label(final, final['tag'])
df_final.head(20)

Unnamed: 0,question,tag
0,hai,2
1,hello,2
2,hai,2
3,selamat pagi,2
4,selamat siang,2
5,selamat sore,2
6,selamat malam,2
7,pagi,2
8,siang,2
9,sore,2


In [267]:
# Show the label names returned by encode_label.
class_encode

array(['general', 'goodbye', 'greeting', 'izin', 'sakit'], dtype=object)

In [268]:
# Split the encoded table into train and test feature/target arrays.
X_train, X_test, y_train, y_test = get_train_test_data(df_final)

In [269]:
# Apply the same normalisation and stemming to both splits.
X_train = text_preprocesing(X_train)
X_test = text_preprocesing(X_test)


before text preprocessing : kenapa anda dibuat
after text preprocessing : kenapa anda buat
before text preprocessing : selamat siang
after text preprocessing : selamat siang
before text preprocessing : thanks
after text preprocessing : thanks
before text preprocessing : siang
after text preprocessing : siang
before text preprocessing : selamat pagi
after text preprocessing : selamat pagi
before text preprocessing : selamat malam
after text preprocessing : selamat malam
before text preprocessing : hari ini saya sakit jadi saya tidak ke kantor
after text preprocessing : hari ini saya sakit jadi saya tidak ke kantor
before text preprocessing : saya ada kepentingan lain hari ini
after text preprocessing : saya ada penting lain hari ini
before text preprocessing : terima kasih banyak
after text preprocessing : terima kasih banyak
before text preprocessing : goodbye
after text preprocessing : goodbye
before text preprocessing : thanks for the info
after text preprocessing : thanks for the in