In [1]:
import random
from random import shuffle
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.tag import CRFTagger
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Cleaning Text

In [2]:
import re
def get_only_chars(line):

    clean_line = ""

    line = line.replace("’", "")
    line = line.replace("'", "")
    line = line.replace("-", " ")
    line = line.replace("\t", " ")
    line = line.replace("\n", " ")
    line = line.lower()

    for char in line:
        if char in 'qwertyuiopasdfghjklzxcvbnm ':
            clean_line += char
        else:
            clean_line += ' '

    clean_line = re.sub(' +',' ',clean_line) #delete extra spaces
    if clean_line[0] == ' ':
        clean_line = clean_line[1:]
    return clean_line

In [3]:
from nltk.corpus import stopwords

# get Indonesian stopword
stop_words = set(stopwords.words('indonesian'))
len(stop_words)

757

## Example Token

In [5]:
original = 'Saya login siam tetapi tidak bisa, disitu teretera tulisan "Silahkan klik <a href="http://bais.ub.ac.id/">disini</a> (atau icon AKTVASI di atas) untuk melakukan aktivasi account!" padahal saya sudah aktivasi akun @student'
original_http_cleaned = preprocess_text(original)

In [6]:
og_tokens = nltk.tokenize.word_tokenize(get_only_chars(original_http_cleaned))

In [7]:
text = "Saya login siam, tetapi tidak bisa disitu teretera tulisan Silahkan klik disini atau icon AKTVASI di atas untuk melakukan aktivasi account padahal saya sudah aktivasi akun student"
text2 = "Yth Admin Helpdesk TIK UB saya mendapat labtop baru pengadaan kantor PJM Saya memerlukan Office 365 untuk laptop baru tersebut Mohon dapat dibantu"
tokens = nltk.tokenize.word_tokenize(get_only_chars(text))
tokens

['saya',
 'login',
 'siam',
 'tetapi',
 'tidak',
 'bisa',
 'disitu',
 'teretera',
 'tulisan',
 'silahkan',
 'klik',
 'disini',
 'atau',
 'icon',
 'aktvasi',
 'di',
 'atas',
 'untuk',
 'melakukan',
 'aktivasi',
 'account',
 'padahal',
 'saya',
 'sudah',
 'aktivasi',
 'akun',
 'student']

## Synonym Dictionary
https://github.com/victoriasovereigne/tesaurus
Indonesian Thesaurus in json format, taken from the Central Indonesian Language Thesaurus by the Department of National Education in 2008. The online version is taken from the following link:

https://theindonesianwriters.files.wordpress.com/2011/04/kamus-tesaurus_bahasa-indonesia.pdf

The dict.json file is a dump of the Python dictionary, with

key: word entry you want to search for
value: dictionary contains tags, synonyms, and antonyms
Example:

unik --> {'tag': 'a', 'synonym': ['distingtif', 'eksklusif', 'idiosinkratis', 'individual', 'istimewa', 'khas', 'khusus', 'partikular', 'singularis', 'solo', 'spesial', 'spesifik', 'tersendiri', 'tunggal'], 'antonym': ['biasa']}

Daftar tag:

a = adjective
adv = adverb
ki = figure of speech
n = noun
num = numeralia
p = particle
pron = pronoun
v = verb

In [8]:
import json

# ==================================================
# Read dictionary from json
# ==================================================
def load(filename):
	with open(filename) as data_file:
		data = json.load(data_file)

	return data

# load dictionary
mydict = load('dict.json')

In [9]:
# ==================================================
# Look for synonyms for a word from the dictionary
# ==================================================
def getSinonim(word):
	if word in mydict.keys():
		return mydict[word]['sinonim']
	else:
		return []

# ==================================================
# Look for antonym for a word from the dictionary
# ==================================================
def getAntonim(word):
	if word in mydict.keys():
		if 'antonim' in mydict[word].keys():
			return mydict[word]['antonim']

	return []

In [10]:
print(getSinonim('saya'))
print(getAntonim('gagal'))

['abdi', 'aku', 'ana ', 'beta', 'ego', 'gua ', 'hamba', 'kami', 'kawula', 'kita ', 'patik']
['berhasil']


## POS Tagging

POS tagging in this study uses a pretrained model (1.6 MB) which is trained using Fam Rashel data (200K+ tokens)
https://yudiwbs.wordpress.com/2018/02/20/pos-tagger-bahasa-indonesia-dengan-pytho/
https://github.com/famrashel/idn-tagged-corpus

In [11]:
ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

def get_pos_tag(word):
	result = ct.tag_sents([word])
	result = [tag for word, tag in result[0]]
	return result
def get_pos_tag_with_word(word):
	result = ct.tag_sents([word])
	return result

def compare_pos_tag(word1, word2):
	pos1 = get_pos_tag([word1])
	pos2 = get_pos_tag([word2])
	return (pos1 == pos2)

## FastText

In [12]:
from gensim.models import KeyedVectors

# Path to the downloaded .vec.gz file
model_path = 'cc.id.300.vec.gz'

# Load the model (this may take some time)
fasttext_model = KeyedVectors.load_word2vec_format(model_path, binary=False)

## Synonym Replacement

In [55]:
########################################################################
# Synonym replacement
# Replace n words in the sentence with synonyms from dictionary
########################################################################
from nltk.corpus import wordnet

def synonym_replacement(sentences, n):
    new_sentences = []
    words = nltk.tokenize.word_tokenize(get_only_chars(sentences))
    random_word_list = words
    random.shuffle(random_word_list)
    words_replaced = 0
    while words_replaced < n and random_word_list:
        random_word = random_word_list.pop()
        synonyms = get_synonyms(random_word)
        if not synonyms:
            continue
        valid_sentences = []
        for synonym in synonyms:
            if compare_pos_tag(random_word, synonym):
                valid_sentences.append(replace_word(sentences, random_word, synonym))
        if valid_sentences:
            new_sentences.extend(valid_sentences)
            words_replaced += 1
    return new_sentences


def get_synonyms(word):
	synonyms = set(getSinonim(word))
	if word in synonyms:
		synonyms.remove(word)
	return list(synonyms)

def replace_word(sentences, word_old, word_new):
    # Use regular expressions to search for old words without paying attention to spaces
    new_sentences = re.sub(r'\b' + re.escape(word_old) + r'\b', word_new, sentences)
    return new_sentences

## Random Insertion

In [14]:
########################################################################
# Random insertion
# Randomly insert n words into the sentence
########################################################################

def random_insertion(sentences, n):
	new_sentences = []
	words = nltk.tokenize.word_tokenize(get_only_chars(sentences))
	new_words = words.copy()
	for _ in range(n):
		synonyms, target_idx = add_word(new_words)
		if synonyms == "":
			return sentences
		for synonym in synonyms:
			new_sentence = insertion(sentences, target_idx, synonym)
			new_sentences.append(new_sentence)
	return new_sentences

def insertion(sentences, target, synonym):
    # Mencari posisi target dalam sentences
    target_position = sentences.find(target)

    # Memastikan target ditemukan dalam sentences
    if target_position != -1:
        # Menambahkan synonym setelah target
        formated_sentences = sentences[:target_position + len(target)] + " " + synonym + sentences[target_position + len(target):]
        return formated_sentences
    else:
        # Jika target tidak ditemukan, mengembalikan sentences asli
        return sentences
	
def add_word(new_words):
	temp = new_words.copy()
	synonyms = []
	counter = 0
	inserted = False
	while not inserted:
		if not temp:
			return "", ""
		random_word_idx = random.randint(0, len(temp) - 1)
		random_word = temp[random_word_idx]
		synonyms = get_synonyms(random_word)
		# Filter synonyms to make sure they have the same POS tag as the original word
		valid_synonyms = [syn for syn in synonyms if compare_pos_tag(random_word, syn)]
		counter += 1
		if valid_synonyms:
			# random_synonym = random.choice()
			# new_words.insert(random_word_idx + 1, random_synonym)  # Memasukkan sinonim setelah kata acak
			# print("memasukkan ", random_synonym, " setelah ", random_word)
			inserted = True
			return valid_synonyms, random_word
		# if counter >= 10:
		# 	return "", ""
		else:
			del temp[random_word_idx]


## Random Swap

In [15]:
import numpy as np
from scipy.spatial.distance import cosine
# Updated aspect terms specific to IT helpdesk context in Indonesian
aspect_terms = ["login", "vpn", "email", "lisensi office", "webhosting", "nilai it", "terima kasih"]

########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################
def random_swap(sentence):
    words = sentence.split()  # Simple tokenization based on whitespace, you can replace this with a proper tokenizer for Indonesian.
    if len(words) < 2:  # Need at least 2 words to perform a swap
        return sentence
    
    w1, w2 = np.random.choice(words, 2, replace=False)
    
    for _ in range(10):  # Attempt 10 times
        pos_w1 = get_pos_tag(w1)
        pos_w2 = get_pos_tag(w2)
        
        # Check if any is an adjective or similar to aspect terms
        if pos_w1 == 'ADJ' or pos_w2 == 'ADJ' or \
           is_similar_to_aspect_terms(w1, aspect_terms) or \
           is_similar_to_aspect_terms(w2, aspect_terms):
            w1, w2 = np.random.choice(words, 2, replace=False)  # Pick two different words
        else:
            break
    
    # Swap the words if they are different
    if w1 != w2:
        index_w1, index_w2 = words.index(w1), words.index(w2)
        words[index_w1], words[index_w2] = w2, w1
    
    return ' '.join(words)

def get_vector(word):
    # Splits the word into tokens if there are spaces and then averages their vectors
    return np.mean([fasttext_model[token] for token in word.split() if token in fasttext_model], axis=0)

def is_similar_to_aspect_terms(word, aspect_terms, threshold=0.5):
    if word not in fasttext_model:
        return False
    
    word_vector = get_vector(word)
    
    for aspect_term in aspect_terms:
        aspect_term_vector = get_vector(aspect_term)
        similarity = 1 - cosine(word_vector, aspect_term_vector)  # Cosine similarity
        if similarity > threshold:
            return True
            
    return False

## Random Deletion

In [16]:
import random
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from scipy.spatial.distance import cosine

aspect_terms = ["login", "vpn", "email", "lisensi office", "webhosting", "nilai it", "terima kasih"]

def is_similar_to_aspect_terms(word, aspect_terms, threshold=0.3):
    if word not in fasttext_model:
        return False
    
    word_vector = get_vector(word)
    
    for aspect_term in aspect_terms:
        aspect_term_vector = get_vector(aspect_term)
        similarity = 1 - cosine(word_vector, aspect_term_vector)  # Cosine similarity
        if similarity > threshold:
            return True
            
    return False

def get_vector(word):
    # Splits the word into tokens if there are spaces and then averages their vectors
    return np.mean([fasttext_model[token] for token in word.split() if token in fasttext_model], axis=0)

def random_deletion(sentence, p=0.3):
    tokens = word_tokenize(sentence)
    pos_tags = get_pos_tag_with_word(tokens)
    deletion_occurred = False
    new_sentence = []

    for word, pos in pos_tags[0]:
        random_number = random.uniform(0, 1)  # Generate a new random number for each word
        if pos.startswith('JJ'):
            new_sentence.append(word)  # Always keep adjectives
        elif (pos.startswith('NN') or pos.startswith('VB')) and is_similar_to_aspect_terms(word, aspect_terms):
            new_sentence.append(word)  # Always keep important nouns and verbs
        elif random_number < p:
            deletion_occurred = True  # Update flag if a word is deleted
            continue  # Delete the word
        else:
            new_sentence.append(word)  # Keep the word

    return ' '.join(new_sentence) if deletion_occurred else sentence




## Testing

In [54]:
x = synonym_replacement(original_http_cleaned, 1)
print(original_http_cleaned)
for i in range (len(x)):
  print(x[i])
print(type(x))

Saya login siam tetapi tidak bisa, disitu teretera tulisan "Silahkan klik  disini  (atau icon AKTVASI di atas) untuk melakukan aktivasi account!" padahal saya sudah aktivasi akun @student
Saya login siam tetapi tidak bisa, disitu teretera tulisan "Silahkan klik  disini  (atau icon AKTVASI pada atas) untuk melakukan aktivasi account!" padahal saya sudah aktivasi akun @student
Saya login siam tetapi tidak bisa, disitu teretera tulisan "Silahkan klik  disini  (atau icon AKTVASI dalam atas) untuk melakukan aktivasi account!" padahal saya sudah aktivasi akun @student
<class 'list'>


In [18]:
y = random_insertion(original_http_cleaned, 1)
print(original_http_cleaned)
for i in range (len(y)):
  print(y[i])
print(type(y))

Saya login siam tetapi tidak bisa, disitu teretera tulisan "Silahkan klik  disini  (atau icon AKTVASI di atas) untuk melakukan aktivasi account!" padahal saya sudah aktivasi akun @student
Saya login siam tetapi tidak bisa, disitu teretera tulisan "Silahkan klik  disini  (atau maupun icon AKTVASI di atas) untuk melakukan aktivasi account!" padahal saya sudah aktivasi akun @student
Saya login siam tetapi tidak bisa, disitu teretera tulisan "Silahkan klik  disini  (atau ataupun icon AKTVASI di atas) untuk melakukan aktivasi account!" padahal saya sudah aktivasi akun @student
<class 'list'>


In [19]:
print(original_http_cleaned)
print(random_deletion(original_http_cleaned))

Saya login siam tetapi tidak bisa, disitu teretera tulisan "Silahkan klik  disini  (atau icon AKTVASI di atas) untuk melakukan aktivasi account!" padahal saya sudah aktivasi akun @student
Saya siam tetapi bisa disitu teretera tulisan `` Silahkan klik disini ( atau AKTVASI di atas ) untuk melakukan aktivasi account ! '' padahal saya sudah aktivasi akun


In [20]:
swapped_sentence = random_swap(original_http_cleaned)
print(original_http_cleaned)
print(swapped_sentence)

Saya login siam tetapi tidak bisa, disitu teretera tulisan "Silahkan klik  disini  (atau icon AKTVASI di atas) untuk melakukan aktivasi account!" padahal saya sudah aktivasi akun @student
Saya login siam tetapi tidak bisa, disitu teretera tulisan "Silahkan klik disini (atau icon AKTVASI di atas) untuk melakukan @student account!" padahal saya sudah aktivasi akun aktivasi


## Data Preparation

In [21]:
import pandas as pd
# Read Excel file into a Pandas DataFrame
df = pd.read_excel("Rekap Dataset-v0.3.xlsx")
df

Unnamed: 0,no,intensi,chat,thread_id,entitas
0,1.0,gagal_login,"Saya login siam tetapi tidak bisa, disitu tere...",11.0,aplikasi:siam
1,2.0,masalah_vpn,Saya tidak bisa terhubung/gagal terhubung deng...,48.0,aplikasi:VPN UB
2,3.0,mengajukan_email,"Selamat sore Pak,<br />Saya Dini Adriani, sala...",54.0,orang:Dini Adriani|organisasi:BIPA;LIH FIB UB
3,4.0,masalah_vpn,assalamualaikum <br />saya mau tanya kenapa sa...,104.0,aplikasi:vpn ub
4,5.0,masalah_vpn,"Vpn nya selalu gabisa min, muncul seperti di g...",197.0,aplikasi:vpn
...,...,...,...,...,...
702,663.0,nilai_it_tidak_keluar,"<p>Selamat Sore Min, Saya Izhar Yusuf Sadhana ...",,orang:Izhar Yusuf Sadhana|organisasi:Prodi IK
703,664.0,nilai_it_tidak_keluar,"<p>Assalamualaikum, permisi, Saya:</p> <p>Nama...",,orang:Mohamad Iqbal Septian Hamdany
704,665.0,nilai_it_tidak_keluar,<p>Ingin memperoleh informasi terkait nilai da...,,
705,666.0,nilai_it_tidak_keluar,<p>Selamat siang saya Siska Silvia Iindarwati ...,,orang:Siska Silvia Iindarwati


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 707 entries, 0 to 706
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   no         635 non-null    float64
 1   intensi    707 non-null    object 
 2   chat       707 non-null    object 
 3   thread_id  609 non-null    float64
 4   entitas    478 non-null    object 
dtypes: float64(2), object(3)
memory usage: 27.7+ KB


### Cleaning Dataset

In [23]:
import re

def preprocessing(df):
    # Assuming 'chat' is the column name in your DataFrame
    df['chat'] = df['chat'].apply(lambda text: preprocess_text(text))
    return df

def preprocess_text(text):
    text = re.sub(r'<[^>]*>', ' ', text) #HTML Tag remover
    text = text.replace('/', ' atau ') #Replace . with atau
    text = re.sub(r'[^\x00-\x7F]+', ' ', text) #Replace non-ascii characters
    # email_pattern = r'\b[\w\.-]+@([\w-]+\.)+[\w-]{2,4}\b'
    # words = text.split()
    # for i in range(len(words)):
    #     if re.match(email_pattern, words[i]):
    #         words[i] = words[i]
    #     else:
    #         # words[i] = re.sub(r'[^\w]', ' ', words[i])
    #         text = ' '.join(words)
    #         text = ' '.join(text.split())
    # text = ' '.join(words)
    return text


In [24]:
df = preprocessing(df)

In [25]:
df

Unnamed: 0,no,intensi,chat,thread_id,entitas
0,1.0,gagal_login,"Saya login siam tetapi tidak bisa, disitu tere...",11.0,aplikasi:siam
1,2.0,masalah_vpn,Saya tidak bisa terhubung atau gagal terhubung...,48.0,aplikasi:VPN UB
2,3.0,mengajukan_email,"Selamat sore Pak, Saya Dini Adriani, salah sat...",54.0,orang:Dini Adriani|organisasi:BIPA;LIH FIB UB
3,4.0,masalah_vpn,assalamualaikum saya mau tanya kenapa saat me...,104.0,aplikasi:vpn ub
4,5.0,masalah_vpn,"Vpn nya selalu gabisa min, muncul seperti di g...",197.0,aplikasi:vpn
...,...,...,...,...,...
702,663.0,nilai_it_tidak_keluar,"Selamat Sore Min, Saya Izhar Yusuf Sadhana NI...",,orang:Izhar Yusuf Sadhana|organisasi:Prodi IK
703,664.0,nilai_it_tidak_keluar,"Assalamualaikum, permisi, Saya: Nama : Moha...",,orang:Mohamad Iqbal Septian Hamdany
704,665.0,nilai_it_tidak_keluar,Ingin memperoleh informasi terkait nilai dari...,,
705,666.0,nilai_it_tidak_keluar,Selamat siang saya Siska Silvia Iindarwati te...,,orang:Siska Silvia Iindarwati


## Implementing Modified Easy Data Augmentation

### Synonym Replacement

In [103]:
df_synonym_replaced = pd.DataFrame(columns=['chat', 'intensi'])

In [134]:
count = 0

# Iterate through each row in the "chat" and "intent" columns of the DataFrame df
for index, row in df.iterrows():
    original_sentence = row['chat']
    intent = row['intensi']

    replaced_sentences = synonym_replacement(original_sentence, n=1)

    count += len(replaced_sentences)

    # Create a temporary DataFrame for synonym results
    df_temp = pd.DataFrame({'chat': replaced_sentences, 'intensi': intent})

    # Combine temporary DataFrame with df synonym replaced
    df_synonym_replaced = pd.concat([df_synonym_replaced, df_temp], ignore_index=True)

print(count)

2782


In [135]:
df_synonym_replaced

Unnamed: 0,chat,intensi
0,"Saya login siam tetapi tidak bisa, disitu tere...",gagal_login
1,Saya tidak bisa terhubung atau gagal terhubung...,masalah_vpn
2,Saya tidak bisa terhubung atau gagal terhubung...,masalah_vpn
3,Saya tidak bisa terhubung atau gagal terhubung...,masalah_vpn
4,Saya tidak bisa terhubung atau gagal terhubung...,masalah_vpn
...,...,...
6080,Ingin memperoleh informasi tergantung nilai d...,nilai_it_tidak_keluar
6081,Selamat siang saya Siska Silvia Iindarwati te...,nilai_it_tidak_keluar
6082,Selamat siang saya Siska Silvia Iindarwati te...,nilai_it_tidak_keluar
6083,"Selamat siang bapak atau ibu, maaf mengganggu...",nilai_it_tidak_keluar


In [107]:
df_synonym_replaced.to_csv('df_synonym_replaced.csv', index=False)

### Random Insertion

In [116]:
df_random_insertion = pd.DataFrame(columns=['chat', 'intensi'])
df_list = []  # List to store individual DataFrames

In [117]:
count = 0
data_to_concat = []

for index, row in df.iterrows():
    original_sentence = row['chat']
    intent = row['intensi']

    # Replace the number of words with synonyms using the random_insertion function
    # For example, replace 1 word with a synonym
    replaced_sentence = random_insertion(original_sentence, n=1)
    

    # Add the count to the number of sentences resulting from random insertion
    count += 1

    # Save random insertion results in a list for concatenation
    # Create a temporary DataFrame for synonym results
    df_temp = pd.DataFrame({'chat': [replaced_sentence], 'intensi': [intent]})

    # Combine temporary DataFrame with df synonym replaced
    df_random_insertion = pd.concat([df_random_insertion, df_temp], ignore_index=True)
print(count)

707


In [118]:
df_random_insertion

Unnamed: 0,chat,intensi
0,"[Saya login siam tetapi tidak tak bisa, disitu...",gagal_login
1,[Saya tidak bisa terhubung atau gagal terhubun...,masalah_vpn
2,"[Selamat sore Pak, Saya Dini Adriani, salah sa...",mengajukan_email
3,[assalamualaikum saya mau tanya kenapa saat m...,masalah_vpn
4,"[Vpn nya selalu terus-menerus gabisa min, munc...",masalah_vpn
...,...,...
702,"[ Selamat Sore Min, Saya Izhar Yusuf Sadhana N...",nilai_it_tidak_keluar
703,"[ Assalamualaikum, permisi, Saya: Nama : Moh...",nilai_it_tidak_keluar
704,[ Ingin memperoleh informasi terkait terikat n...,nilai_it_tidak_keluar
705,[ Selamat siang saya Siska Silvia Iindarwati t...,nilai_it_tidak_keluar


In [119]:
df_random_insertion.to_csv('df_random_insertion.csv', index=False)

### Random Deletion

In [120]:
df_random_deletion = pd.DataFrame(columns=['chat', 'intensi'])

In [121]:
count = 0
data_to_concat = []

for index, row in df.iterrows():
    original_sentence = row['chat']
    intent = row['intensi']

    deleted_sentence = random_deletion(original_sentence)

    count += 1 

    data_to_concat.append({'chat': deleted_sentence, 'intensi': intent})

# Concatenate the list of dictionaries into a DataFrame
df_random_deletion = pd.concat([df_random_deletion, pd.DataFrame(data_to_concat)], ignore_index=True)

print(count)

707


In [122]:
df_random_deletion

Unnamed: 0,chat,intensi
0,"Saya siam tetapi tidak , disitu teretera tulis...",gagal_login
1,Saya tidak terhubung atau gagal terhubung deng...,masalah_vpn
2,"Selamat sore Pak , Saya salah Dosen BIPA , LIH...",mengajukan_email
3,assalamualaikum mau tanya kenapa saat menghubu...,masalah_vpn
4,"Vpn nya gabisa min , muncul seperti di gambar",masalah_vpn
...,...,...
702,"Selamat Sore , Saya Izhar Yusuf Sadhana NIM 17...",nilai_it_tidak_keluar
703,"Assalamualaikum , permisi , : Nama : Iqbal Sep...",nilai_it_tidak_keluar
704,Ingin memperoleh informasi terkait nilai dari...,nilai_it_tidak_keluar
705,Selamat Siska Silvia Iindarwati telah melakuka...,nilai_it_tidak_keluar


In [123]:
df_random_deletion.to_csv('df_random_deletion.csv', index=False)

### Random Swap

In [124]:
df_random_swap = pd.DataFrame(columns=['chat', 'intensi'])

In [125]:
count = 0
data_to_concat = []

for index, row in df.iterrows():
    original_sentence = row['chat']
    intent = row['intensi']

    inserted_sentence = random_swap(original_sentence)

    count += 1

    data_to_concat.append({'chat': inserted_sentence, 'intensi': intent})

# Concatenate the list of dictionaries into a DataFrame
df_random_swap = pd.concat([df_random_swap, pd.DataFrame(data_to_concat)], ignore_index=True)

print(count)

707


In [126]:
df_random_swap

Unnamed: 0,chat,intensi
0,Saya login siam tetapi tidak untuk disitu tere...,gagal_login
1,Saya tidak bisa terhubung atau gagal terhubung...,masalah_vpn
2,"Selamat ini, Pak, Saya Dini Adriani, salah sat...",mengajukan_email
3,assalamualaikum saya mau tanya kenapa saat men...,masalah_vpn
4,"Vpn nya gabisa selalu min, muncul seperti di g...",masalah_vpn
...,...,...
702,"Selamat Sore Min, Saya Izhar Yusuf Sadhana NIM...",nilai_it_tidak_keluar
703,"Assalamualaikum, permisi, Saya: Nama : Mohamad...",nilai_it_tidak_keluar
704,dilakukan memperoleh informasi terkait nilai d...,nilai_it_tidak_keluar
705,Selamat siang saya Siska Silvia Iindarwati tel...,nilai_it_tidak_keluar


In [127]:
df_random_swap.to_csv('df_random_swap.csv', index=False)

## Combine All Augmented Dataset

In [128]:
# Combine DataFrames sequentially (vertical/row-wise)
df_augmented = pd.concat([df_random_swap, df_random_deletion, df_random_insertion, df_synonym_replaced, df[['chat', 'intensi']]], ignore_index=True)

In [129]:
df_augmented

Unnamed: 0,chat,intensi
0,Saya login siam tetapi tidak untuk disitu tere...,gagal_login
1,Saya tidak bisa terhubung atau gagal terhubung...,masalah_vpn
2,"Selamat ini, Pak, Saya Dini Adriani, salah sat...",mengajukan_email
3,assalamualaikum saya mau tanya kenapa saat men...,masalah_vpn
4,"Vpn nya gabisa selalu min, muncul seperti di g...",masalah_vpn
...,...,...
6126,"Selamat Sore Min, Saya Izhar Yusuf Sadhana NI...",nilai_it_tidak_keluar
6127,"Assalamualaikum, permisi, Saya: Nama : Moha...",nilai_it_tidak_keluar
6128,Ingin memperoleh informasi terkait nilai dari...,nilai_it_tidak_keluar
6129,Selamat siang saya Siska Silvia Iindarwati te...,nilai_it_tidak_keluar


In [130]:
# Remove duplicate value
df_augmented = df_augmented.drop_duplicates(subset='chat', keep='first')

In [131]:
df_augmented.reset_index(drop=True, inplace=True)

In [132]:
df_augmented

Unnamed: 0,chat,intensi
0,Saya login siam tetapi tidak untuk disitu tere...,gagal_login
1,Saya tidak bisa terhubung atau gagal terhubung...,masalah_vpn
2,"Selamat ini, Pak, Saya Dini Adriani, salah sat...",mengajukan_email
3,assalamualaikum saya mau tanya kenapa saat men...,masalah_vpn
4,"Vpn nya gabisa selalu min, muncul seperti di g...",masalah_vpn
...,...,...
5567,Assalamualaikum Selamat Siang Admin Kebetul...,nilai_it_tidak_keluar
5568,"Selamat Sore Min, Saya Izhar Yusuf Sadhana NI...",nilai_it_tidak_keluar
5569,"Assalamualaikum, permisi, Saya: Nama : Moha...",nilai_it_tidak_keluar
5570,Selamat siang saya Siska Silvia Iindarwati te...,nilai_it_tidak_keluar


In [133]:
df_augmented.to_csv('df_augmented.csv', index=False)