In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K
from sklearn.metrics import balanced_accuracy_score

from sklearn.model_selection import train_test_split
from category_encoders import OrdinalEncoder
from deep_translator import GoogleTranslator
from nlp_id.lemmatizer import Lemmatizer 
import re

from tqdm import tqdm
import sys
import os
import re

In [2]:
def create_submission(predicted, path = "submission.csv"):
    folder_loc = '/'.join([i for i in path.split("/")][:-1])
    if not os.path.exists(folder_loc) and folder_loc != '':
        os.makedirs(folder_loc)
    df = pd.read_excel("Data/Submission_Format.xlsx")
    df["label"] = predicted
    df.to_csv(path, index=False)

In [3]:
# Was run on
print(f'PY version   : {sys.version}\nHardware     : {tf.config.list_physical_devices()[0]}')

PY version   : 3.11.9 (tags/v3.11.9:de54cf5, Apr  2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)]
Hardware     : PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')


# Wrangling

In [4]:
# input Data
train_data = pd.read_excel("Data/train.xlsx")
test_data = pd.read_excel("Data/Test.xlsx")

## Exploration

In [5]:
# Check for Dupllicate
sum(train_data.duplicated())        # 381

# Further examination
temp = train_data[train_data.duplicated()].copy()
for counter, text in enumerate(temp.sort_values("text").values):
    print(text)
    if counter==10: break

# Conclusion: Ada duplikat pada train data, ada yang fitur dan targetnya sama. Namun, ada juga yang fiturnya sama namun targetnya berbeda

['@gQ+QGmYJ209N7Py+H3gRyakQiic4NLEklTOIIuALnZA= @gqAL2HIcdWKR2U/VFUq3R0TFxXxtxCsKyUXAKn9R5o0= Iya nih penting suara Batak sebagai populasi etnis ketiga terbesar setelah Jawa Sunda. Mayoritas Batak Kristen cukup anti sama Anies karena diframing radikal sejak'
 'Demografi']
['Adi menginformasikan bahwa isu utama di kampung mereka adalah sulitnya mendapatkan air bersih dan masalah naiknya air laut. #IndonesiaSentris #IndonesiaHijau #02Melanjutkan #AnakMudaIndonesiaEmas Prabowo Subianto'
 'Sumber Daya Alam']
['Adi menginformasikan bahwa isu utama di kampung mereka adalah sulitnya mendapatkan air bersih dan masalah naiknya air laut. #IndonesiaSentris #IndonesiaHijau #02Melanjutkan #AnakMudaIndonesiaEmas Prabowo Subianto'
 'Sumber Daya Alam']
['Adi menginformasikan bahwa isu utama di kampung mereka adalah sulitnya mendapatkan air bersih dan masalah naiknya air laut. #IndonesiaSentris #IndonesiaHijau #02Melanjutkan #AnakMudaIndonesiaEmas Prabowo Subianto'
 'Sumber Daya Alam']
['Adi menginform

In [6]:
# Check imbalanced Data
temp = {}
targets = list(np.unique(train_data["label"].values))

for i in targets:
    temp[i] = len(train_data[train_data["label"]==i])
temp

# Conclusion: Data's unbalanced, needs handling

{'Demografi': 62,
 'Ekonomi': 367,
 'Geografi': 20,
 'Ideologi': 400,
 'Pertahanan dan Keamanan': 400,
 'Politik': 2972,
 'Sosial Budaya': 587,
 'Sumber Daya Alam': 192}

In [7]:
# Find the most length text
counter=[]
for i in train_data["text"].values:
    counter.append(len(i))
sorted(list(zip([i for i in range(len(train_data))], train_data["text"].values, train_data["label"].values,counter)), key = lambda x: x[3], reverse=True)[:1000]

# Conclusion: Data needs to be cleaned

[(688,
  'RT ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚â€™ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å“ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¨ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â«ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¾ ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¤ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¢ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â§ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¾ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â«ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â£ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¡ ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â©ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â«ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¡ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚â€ºÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¨ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â°ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¨ ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\x9dÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¢ ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¸ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¨ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¨ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\x9d ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¾ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¬ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\xadÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¡ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\xadÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¾ 0,7 ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\x9dÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¡ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â«ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¢ 100 Food estate mulai dicanangkan pada 3 tahun lalu, tepatnya pada 6 Juli 2020 ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\x8fÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â«ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¾ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¬ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã

What needs to be cleaned:
1. RT
2. TAG
3. LINK
4. Reply
5. UTF-8 encoded and emoji
6. Number, comma, dot, etc etc
7. Kata dengan 1 huruf

## Cleaning

In [8]:
# def cleantextv1(text_list,
#               case1:  bool = True,
#               case2: bool = False,
#               case3: bool = False,
#               case4: bool = False):
#     '''
#     returns cleaned string from messy raw string that's unreadable for the model.
    
#     Case1: Removes UTF-8 encoded, number, comma, dot, etc. But keeps number on hastag
#     Case2: This removes hastag entirely
#     Case3: removes words with 2-3 characters
#     Case4: Forcefully translate to indonesian
#     '''
#     new_text = []
#     for text in tqdm(text_list):
#         if case1:
#             # Removes '&amp;' from text, html stuff.
#             text = re.sub(r'&amp;', '', text)
#             text = re.sub(r' +'," ", ''.join(re.findall(r'[ a-zA-Z@0-9[\]#]',text)))
                    
#             # Split input string into parts: hashtags and non-hashtag parts
#             parts = re.split(r'(#\w*)', text)

#             # Pattern to match hashtags
#             hashtag_pattern = re.compile(r'#\w*')
#             # Apply digit replacement to non-hashtag parts
#             processed_parts = [
#                 part if hashtag_pattern.match(part) else re.sub(r'\d', '', part)
#                 for part in parts
#             ]
#             # Reconstruct the string
#             text = ''.join(processed_parts)
        
#         # This removes 'RT ' from the text as the first word.
#         text =re.sub(r'RT ', '', text)
#         # Change text into lowercase
#         text = text = text.lower()
#         # This removes link https
#         text = re.sub(r'http\S+', "", text)
#         # Removes Tags  @xxxx
#         text = re.sub(r'@\S+ ',"", text)
#         # Removes word with 1 char
#         text = re.sub(r' \w ', " ", text)
#         # Removes Reply [re xxxx]
#         text = re.sub(r' \[re \w+]',"", text)
        
#         if case2: 
#             text = re.sub(r'#\S+ ', '', text)
        
#         if case3:
#             text = re.sub(r'\b\w{2,3}\b', text)
        
#         if case4:
#             text = GoogleTranslator(source='en', target='id').translate(text)
        
#         new_text.append(re.sub(" +", " ", text).strip(" "))
            
#     return new_text

# # a = train_data["text"][4]
# # cleantext(a)

# lemmatizer = Lemmatizer()
# def lemarization(text_list):
#     new_text = []
#     lemmatizer = Lemmatizer()
#     for text in tqdm(text_list):
#         text = lemmatizer.lemmatize(text) 
#         new_text.append(text)
#     return new_text


def cleantextv2(list_text, translate = False):
    new_text = []
    index = 0
    for text in tqdm(list_text):
        # This removes 'RT ' from the text as the first word.
        text =re.sub(r'RT ', '', text)
        # Removes Reply [re xxxx]
        text = re.sub(r'\[re \w+]',"", text, flags=re.IGNORECASE)
        # This removes link https
        text = re.sub(r'http\S+', "", text)
        # Removes Tags  @xxxx
        text = re.sub(r'@\S+',"", text)
        # Removes '&amp;' from text, html stuff.
        text = re.sub(r'&amp;', '', text)
        # Removes \xad from text
        text = re.sub(r'\xad', '', text)
        # Change these specific chars into space
        text = re.sub(r'[,.?!\'\"()-]', " ", text)

        text = ''.join(re.findall(r'[ a-zA-Z0-9#]', text))
        text = re.sub(r' +', " ", text)

        # Split input string into parts: hashtags and non-hashtag parts
        parts = re.split(r'(#\w*)', text)

        # Pattern to match hashtags
        hashtag_pattern = re.compile(r'#\w*')
        # Apply digit replacement to non-hashtag parts
        processed_parts = [
            part if hashtag_pattern.match(part) else re.sub(r'\d', '', part)
            for part in parts
        ]
        # Reconstruct the string
        text = (''.join(processed_parts)).lower()
        # Removes word with 1 char
        text = re.sub(r'\b\w\b', " ", text)
        
        if translate:
            try:
                text = GoogleTranslator(source='en', target='id').translate(text)
            except:
                print(f'Failed, index: {index}')
            
        # Append, remove double space, remove space in the start and the end of the string
        new_text.append(re.sub(" +", " ", text).strip(" "))
        index+=1
        
    return new_text

thelemarization = Lemmatizer()
def lemarization(list_text):
    new_text = []
    for text in tqdm(list_text):
        text = thelemarization.lemmatize(text)
        new_text.append(text)
    return new_text

### English

In [11]:
# # Get cleaned data
# train_data_cleaned = train_data.copy()
# train_data_cleaned["text"] = cleantextv2(train_data_cleaned["text"], translate=True)
# train_data_cleaned["text_uncleaned"] = train_data["text"]

# # For further examination
# #train_data_cleaned.to_csv("Temp/cleaned_dataV2_translated.csv")
# train_data_cleaned = train_data_cleaned.drop("text_uncleaned", axis=1)

100%|██████████| 5000/5000 [52:13<00:00,  1.60it/s]  


In [9]:
train_data_cleaned = pd.read_csv("Temp/cleaned_dataV2_translated.csv").iloc[:,1:3]
train_data_cleaned.head()

Unnamed: 0,text,label
0,kunjungan prabowo ini untuk meresmikan dan men...,Sumber Daya Alam
1,anies dapat bertepuk tangan meriah saat jadi r...,Politik
2,emng bener sih pendukung ada yg goblok begitu ...,Demografi
3,sewaktu anies menyatakan kritis ke kinerja pak...,Politik
4,anies baswedan harap asn termasuk TNI dan Polr...,Politik


In [10]:
train_data_cleaned.sort_values("text", ascending=True)["text"].values[-10:]
# There's nan value, original text is only made up with tags only
train_data_cleaned = train_data_cleaned.dropna()

In [11]:
# Get word made with 2 or 3 characters
makeshift_stopwords={}
for text in tqdm(train_data_cleaned["text"]):
    text = re.sub("#", "1", text)
    for word in re.findall(r"\b[a-z]{2,3}\b", text):
       if word not in list(makeshift_stopwords.keys()):
           makeshift_stopwords[word]=1
    else: makeshift_stopwords[word]+=1

# Get word made from 2 or 3 characters from indonesian dictionary
good_words = []
for i in pd.read_table("Indo_dict.txt", header=None)[0].values:
    i = str(i).split(" ")[0]
    if a:=re.search(r'^\w{2,3}$', i):
        good_words.append(a[0])

# 2-3 lenghth Words from dataset that aren't in the dictionary
temp = {}
lengthwords2_3notindict = [i for i in list(makeshift_stopwords.keys()) if i not in good_words]
for i in lengthwords2_3notindict:
    temp[i] = makeshift_stopwords[i]
np.array(sorted(list(temp.items()), key= lambda x:x[1], reverse=True))[:200]
# Conclusion: 2 - 3 length words are too valuable to be removed, but some can be filtered

100%|██████████| 4991/4991 [00:00<00:00, 35714.20it/s]


array([['md', '246'],
       ['yg', '152'],
       ['no', '93'],
       ['gak', '51'],
       ['utk', '40'],
       ['ga', '28'],
       ['dki', '19'],
       ['jnd', '17'],
       ['dg', '16'],
       ['bro', '15'],
       ['tpn', '15'],
       ['app', '12'],
       ['jnk', '12'],
       ['ahn', '12'],
       ['jna', '11'],
       ['tkn', '11'],
       ['gor', '10'],
       ['pks', '10'],
       ['in', '9'],
       ['kpk', '9'],
       ['pd', '9'],
       ['tdk', '8'],
       ['bgt', '8'],
       ['jnc', '8'],
       ['kpu', '8'],
       ['ck', '8'],
       ['jg', '7'],
       ['ite', '7'],
       ['dll', '7'],
       ['rmh', '7'],
       ['jk', '7'],
       ['the', '7'],
       ['sdh', '7'],
       ['ipb', '7'],
       ['pa', '7'],
       ['jnb', '7'],
       ['jis', '6'],
       ['jd', '6'],
       ['jt', '6'],
       ['bs', '6'],
       ['ahy', '6'],
       ['cc', '6'],
       ['dpt', '6'],
       ['psi', '6'],
       ['cnn', '6'],
       ['dlm', '5'],
       ['new', '5'],
       [

In [12]:
# Lemarization
train_data_cleaned_lemarization = train_data_cleaned.copy()
train_data_cleaned_lemarization["text"] = lemarization(train_data_cleaned_lemarization["text"].values)

# train_data_cleaned_lemarization.to_csv("Temp/cleaned_datav2_translated_lemarized.csv")

100%|██████████| 4991/4991 [00:00<00:00, 17110.15it/s]


In [17]:
# Stop words
stopwords = pd.read_table("src/stopwords_id.txt", header=None)[0].values
new_text = []
for text in tqdm(train_data_cleaned_lemarization["text"]):
    new_text.append(' '.join([i for i in text.split(" ")if i not in stopwords]))
    
train_data_cleaned_lemarization_stopwords = train_data_cleaned_lemarization.copy()
train_data_cleaned_lemarization_stopwords["text"] = new_text

  0%|          | 0/4991 [00:00<?, ?it/s]

100%|██████████| 4991/4991 [00:02<00:00, 1884.50it/s]


## Handling

In [180]:
# Same value diff target

same_val = []
for i in np.unique(train_data_cleaned["text"].values):
    if (count:= len(train_data_cleaned[train_data_cleaned["text"] == i])) > 1:
        df = train_data_cleaned[train_data_cleaned["text"]== i]
        if len(unique_label:= df["label"].unique()) >1:
            for j in unique_label:
                same_val.append([df[df["label"]==j].index, i, j, len(df[df["label"]==j])])

pd.DataFrame(same_val, columns=["index", "text", "label", "occurence"]).to_csv("Temp/samevalue_difftarget.csv", index=False)

# Conclusion: There's way too much duplicated data

In [92]:
# Check same feature same target
temp=[]
for i, j in enumerate(train_data_cleaned[train_data_cleaned.duplicated()].sort_values("text").values):
    temp.append(f'{j[0]} [{j[1]}]')
temp = set(temp)

sameval_target = []
for i in list(temp):
    pattern = r"\[(.*?)\]"
    target = re.search(pattern, i)[1]
    text = re.sub(pattern, "", i).strip(" ")
    the_index = train_data_cleaned[(train_data_cleaned["label"]== target) &
                   (train_data_cleaned["text"] == text)].index
    for j in range(len(the_index)):
        sameval_target.append([the_index[j], text, target])
              
# pd.DataFrame(sameval_target, columns=["index", "text", "label"]).to_csv("Temp/Same_Value_Target.csv")

# Modelling (Without Handling)

In [245]:
class encoder:
    def __init__(self, list_labels):
        self.encoder = {}
        for count, label in enumerate(np.unique(list_labels)):
            self.encoder[label] = count
    def encode(self, list_labels):
        return [self.encoder[label] for label in list_labels]

In [288]:
train_data_cleaned_encoded = train_data_cleaned_lemarization_stopwords.copy()
encodething = encoder(train_data_cleaned_encoded["label"])
train_data_cleaned_encoded["label"] = encodething.encode(train_data_cleaned_encoded["label"])

In [289]:
# DO NOT CHANGE THIS CODE OR THE TESTS MAY NOT WORK
vocab_size = 1000
embedding_dim = 16
max_length = 120
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 20000

In [290]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_data_cleaned_encoded["text"], train_data_cleaned_encoded["label"], test_size=0.2, random_state=42)

In [291]:
#the token
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)

def get_paddedsequences(sentences):
    sequences = tokenizer.texts_to_sequences(sentences)
    return pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_pad_sequences=get_paddedsequences(train_sentences)
validation_pad_sequences=get_paddedsequences(val_sentences)

In [292]:
def get_ds(padsequences, labels):
    ds = tf.data.Dataset.from_tensor_slices((padsequences, labels))
    ds = ds.cache()
    ds = ds.batch(32)
    return ds.prefetch(tf.data.AUTOTUNE)

train_ds=get_ds(training_pad_sequences, train_labels)
val_ds=get_ds(validation_pad_sequences, val_labels)

In [293]:
# def balanced_accuracy(y_true, y_pred):
#     # Convert predictions to class labels
#     y_pred_labels = K.argmax(y_pred, axis=-1)
    
#     # Ensure y_true is the same dtype as y_pred_labels
#     y_true = K.cast(y_true, y_pred_labels.dtype)
    
#     # Initialize variables to accumulate total recall and class counts
#     total_recall = 0.0
#     num_classes = K.max(y_true) + 1  # Assuming class labels are [0, ..., num_classes-1]

#     for i in range(num_classes):
#         true_positives = K.sum(K.cast(K.equal(y_true, i) & K.equal(y_pred_labels, i), dtype=tf.float32))
#         possible_positives = K.sum(K.cast(K.equal(y_true, i), dtype=tf.float32))
        
#         recall = true_positives / (possible_positives + K.epsilon())
#         total_recall += recall
    
#     balanced_accuracy_value = total_recall / tf.cast(num_classes, tf.float32)
#     return balanced_accuracy_value
    
def get_model():
    model = tf.keras.Sequential([
        keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
        keras.layers.BatchNormalization(),
        
        keras.layers.Bidirectional(tf.keras.layers.LSTM(64,return_sequences=True)),
        
        keras.layers.GlobalMaxPool1D(),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.3),
        
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.4),
        
        keras.layers.Dense(8, activation='softmax')
    ])
    
    model.compile(optimizer = tf.keras.optimizers.Adam(0.001), loss=keras.losses.SparseCategoricalCrossentropy(), metrics=["accuracy"])
    return model

In [294]:
model=get_model()
model.fit(train_ds, validation_data=val_ds, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x22b89c91bd0>

In [295]:
predicted_class_labels = model.predict(validation_pad_sequences)
predicted_class_labels = np.argmax(predicted_class_labels, axis=1)
balanced_accuracy_score(val_labels, predicted_class_labels)



In [296]:
balanced_accuracy_score(val_labels, predicted_class_labels)

0.4522376519306736