# PraPemrosesan Data

In [29]:
import pandas as pd
import numpy as np
import json
import nltk
from nltk.stem import WordNetLemmatizer
import string
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import pickle

nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
# Importing the dataset
with open('kampus_merdeka.json') as content:
  data1 = json.load(content)

# Mendapatkan semua data ke dalam list
tags = [] # data tag
inputs = [] # data input atau pattern
responses = {} # data respon
words = [] # Data kata
classes = [] # Data Kelas atau Tag
documents = [] # Data Kalimat Dokumen
ignore_words = ['?', '!'] # Mengabaikan tanda spesial karakter
# Tambahkan data intents dalam json
for intent in data1['intents']:
  responses[intent['tag']]=intent['responses']
  for lines in intent['patterns']:
    inputs.append(lines)
    tags.append(intent['tag'])
    # digunakan untuk pattern atau teks pertanyaan dalam json
    for pattern in intent['patterns']:
      w = nltk.word_tokenize(pattern)
      words.extend(w)
      documents.append((w, intent['tag']))
      # tambahkan ke dalam list kelas dalam data
      if intent['tag'] not in classes:
        classes.append(intent['tag'])

# Konversi data json ke dalam dataframe
data = pd.DataFrame({"patterns":inputs, "tags":tags})

In [33]:
data

Unnamed: 0,patterns,tags
0,hallo,greeting
1,hai,greeting
2,halo,greeting
3,hei,greeting
4,hi,greeting
...,...,...
70,Apa itu PMMB?,penjelasan_PMMB
71,Apa saja persyaratan IISMA?,persyaratan_IISMA
72,Bagaimana periode pelaksanaan dan cara mendaft...,periode_dan_pendaftaran_PMMB
73,Apa saja manfaat apabila mengikuti PMMB?,manfaat_PMMB


## Remove Punctuaction

In [34]:
# Removing Punctuations (Menghilangkan Punktuasi) 
data['patterns'] = data['patterns'].apply(lambda wrd:[ltrs.lower() for ltrs in wrd if ltrs not in string.punctuation])
data['patterns'] = data['patterns'].apply(lambda wrd: ''.join(wrd))

In [35]:
data

Unnamed: 0,patterns,tags
0,hallo,greeting
1,hai,greeting
2,halo,greeting
3,hei,greeting
4,hi,greeting
...,...,...
70,apa itu pmmb,penjelasan_PMMB
71,apa saja persyaratan iisma,persyaratan_IISMA
72,bagaimana periode pelaksanaan dan cara mendaft...,periode_dan_pendaftaran_PMMB
73,apa saja manfaat apabila mengikuti pmmb,manfaat_PMMB


In [36]:
data.to_csv("clean_data_chatbot.csv")

## Lemmatization

In [16]:
lemmatizer = WordNetLemmatizer()
words = [lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words]
words = sorted(list(set(words)))

print (len(words), "kata-kata yang di lematisasi : ", words)

69 kata-kata yang di lematisasi :  ['afternoon', 'apa', 'apabila', 'bagaimana', 'bai', 'banyak', 'bersertifikat', 'bro', 'bumn', 'bye', 'byee', 'cara', 'dadah', 'dah', 'dalam', 'dan', 'good', 'hai', 'hallo', 'halo', 'hei', 'hi', 'hy', 'iisma', 'independen', 'itu', 'jumpa', 'kampus', 'kasih', 'kawan', 'kemendikbud', 'kemensos', 'magang', 'mahasiswa', 'makasih', 'malam', 'manfaat', 'mendaftar', 'mengajar', 'mengikuti', 'merdeka', 'mitra', 'morning', 'muda', 'pagi', 'pejuang', 'pelaksanaan', 'periode', 'persyaratan', 'pertukaran', 'pmmb', 'program', 'saja', 'sampai', 'see', 'selamat', 'si', 'siang', 'sore', 'studi', 'tergabung', 'terima', 'thank', 'thanks', 'tinggal', 'tujuan', 'universitas', 'yang', 'you']


### Menyortir Data Kelas Tags

In [17]:
# sorting pada data class
classes = sorted(list(set(classes)))
print (len(classes), "classes", classes)

38 classes ['goodbye', 'greeting', 'manfaat_IISMA', 'manfaat_PMMB', 'manfaat_kampus_mengajar', 'manfaat_magang', 'manfaat_pejuang_muda', 'manfaat_pertukaran_mahasiswa', 'manfaat_studi_independen', 'mitra_IISMA', 'mitra_PMMB', 'mitra_magang', 'mitra_studi_independen', 'penjelasan_IISMA', 'penjelasan_PMMB', 'penjelasan_kampus_mengajar', 'penjelasan_kampus_merdeka', 'penjelasan_magang', 'penjelasan_pejuang_muda', 'penjelasan_pertukaran_mahasiswa', 'penjelasan_studi_independen', 'periode_dan_pendaftaran_IISMA', 'periode_dan_pendaftaran_PMMB', 'periode_dan_pendaftaran_kampus_mengajar', 'periode_dan_pendaftaran_magang', 'periode_dan_pendaftaran_pejuang_muda', 'periode_dan_pendaftaran_pertukaran_mahasiswa', 'periode_dan_pendaftaran_studi_independen', 'persyaratan_IISMA', 'persyaratan_kampus_mengajar', 'persyaratan_kampus_merdeka', 'persyaratan_magang', 'persyaratan_pejuang_muda', 'persyaratan_pertukaran_mahasiswa', 'persyaratan_studi_independen', 'program_kampus_merdeka', 'terimakasih', 'tuju

### Mencari Jumlah Keseluruhan Data Teks

In [18]:
# documents = kombinasi antara data pattern dengan data tag dalam intents json
print (len(documents), "documents")

425 documents


## Tokenisasi

In [20]:
# Tokenize the data (Tokenisasi Data)
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(data['patterns'])
train = tokenizer.texts_to_sequences(data['patterns'])
train

[[37],
 [38],
 [39],
 [40],
 [41],
 [42],
 [43],
 [44],
 [45],
 [46],
 [47],
 [48],
 [49],
 [50],
 [51],
 [52],
 [53],
 [31],
 [54],
 [55, 31],
 [56, 57],
 [58, 59],
 [60],
 [61, 32],
 [62],
 [63, 32],
 [33, 34],
 [64],
 [33, 34, 65],
 [1, 6, 4, 5],
 [1, 66, 4, 5],
 [1, 7, 8, 4, 5],
 [1, 2, 3, 4, 5],
 [3, 4, 5, 67],
 [3, 4, 5, 68],
 [3, 4, 5, 69],
 [18, 19],
 [20],
 [4, 21],
 [9],
 [22, 23, 5],
 [24, 25],
 [26],
 [1, 6, 18, 19],
 [1, 2, 7, 18, 19],
 [10, 11, 12, 13, 14, 15, 3, 18, 19],
 [1, 2, 16, 17, 8, 18, 19],
 [1, 2, 35, 28, 29, 30, 18, 19],
 [1, 6, 20, 27],
 [1, 2, 7, 20, 27],
 [10, 11, 12, 13, 14, 15, 3, 20, 27],
 [1, 2, 16, 17, 8, 20, 27],
 [1, 2, 35, 28, 29, 30, 20, 27],
 [1, 6, 4, 21],
 [1, 2, 7, 4, 21],
 [10, 11, 12, 13, 14, 15, 3, 4, 21],
 [1, 2, 16, 17, 8, 4, 21],
 [1, 6, 9],
 [1, 2, 7, 9],
 [10, 11, 12, 13, 14, 15, 3, 9],
 [1, 2, 16, 17, 8, 9],
 [1, 2, 36, 28, 29, 30, 3, 9],
 [1, 6, 22, 23, 5],
 [1, 2, 7, 22, 23, 5],
 [10, 11, 12, 13, 14, 15, 3, 22, 23, 5],
 [1, 2, 16, 17,

## Padding

In [22]:
# Melakukan proses padding pada data
x_train = pad_sequences(train)
# Menampilkan hasil padding
print(x_train)

[[ 0  0  0  0  0  0  0  0  0 37]
 [ 0  0  0  0  0  0  0  0  0 38]
 [ 0  0  0  0  0  0  0  0  0 39]
 [ 0  0  0  0  0  0  0  0  0 40]
 [ 0  0  0  0  0  0  0  0  0 41]
 [ 0  0  0  0  0  0  0  0  0 42]
 [ 0  0  0  0  0  0  0  0  0 43]
 [ 0  0  0  0  0  0  0  0  0 44]
 [ 0  0  0  0  0  0  0  0  0 45]
 [ 0  0  0  0  0  0  0  0  0 46]
 [ 0  0  0  0  0  0  0  0  0 47]
 [ 0  0  0  0  0  0  0  0  0 48]
 [ 0  0  0  0  0  0  0  0  0 49]
 [ 0  0  0  0  0  0  0  0  0 50]
 [ 0  0  0  0  0  0  0  0  0 51]
 [ 0  0  0  0  0  0  0  0  0 52]
 [ 0  0  0  0  0  0  0  0  0 53]
 [ 0  0  0  0  0  0  0  0  0 31]
 [ 0  0  0  0  0  0  0  0  0 54]
 [ 0  0  0  0  0  0  0  0 55 31]
 [ 0  0  0  0  0  0  0  0 56 57]
 [ 0  0  0  0  0  0  0  0 58 59]
 [ 0  0  0  0  0  0  0  0  0 60]
 [ 0  0  0  0  0  0  0  0 61 32]
 [ 0  0  0  0  0  0  0  0  0 62]
 [ 0  0  0  0  0  0  0  0 63 32]
 [ 0  0  0  0  0  0  0  0 33 34]
 [ 0  0  0  0  0  0  0  0  0 64]
 [ 0  0  0  0  0  0  0 33 34 65]
 [ 0  0  0  0  0  0  1  6  4  5]
 [ 0  0  0

## Encoding Text

In [25]:
# Melakukan konversi data label tags dengan encoding
le = LabelEncoder()
y_train = le.fit_transform(data['tags'])
print(y_train)

[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  0  0  0  0  0  0  0  0  0
 36 36 36 36 36 16 37 30 35 35 35 35 35 35 35 35 35 35 35 20 34 27  8 12
 17 31 24  5 11 15 29 23  4 13 28 21  2  9 19 33 26  7 18 32 25  6 14 28
 22  3 10]


## Input Length, Output Length and Vocabulary

In [26]:
# Melihat hasil input pada data teks
input_shape = x_train.shape[1]
print(input_shape)

10


In [27]:
# Melakukan definisi tiap kalimat dan kata pada data teks
vocabulary = len(tokenizer.word_index)
print("jumlah kata unik : ", vocabulary)

# Melakukan pemeriksaan pada data output label teks
output_length = le.classes_.shape[0]
print("panjang output: ", output_length)

jumlah kata unik :  69
panjang output:  38


Input length dan output length terlihat sangat jelas hasilnya. Mereka adalah untuk bentuk input dan bentuk output dari data train atau latih yang akan diproses pada algoritma LSTM yang akan dilatih.

Vocabulary Size adalah untuk lapisan penyematan untuk membuat representasi vektor unik untuk setiap kata.

## Save Model Words & Classes

In [30]:
# Simpan hasil pemrosesan teks dengan menggunakan pickle
pickle.dump(words, open('words.pkl','wb'))
pickle.dump(classes, open('classes.pkl','wb'))

## Save Label Encoder & Tokenizer

In [31]:
pickle.dump(le, open('le.pkl','wb'))
pickle.dump(tokenizer, open('tokenizers.pkl','wb'))