In [213]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from category_encoders import OrdinalEncoder
import os
from tqdm import tqdm
import sys

from sklearn.model_selection import train_test_split
import re

In [2]:
def create_submission(predicted, path = "submission.csv"):
    """Write predictions into the submission template and save it as CSV.

    Parameters
    ----------
    predicted : sequence
        Values assigned to the "Kelas" column of the template
        (presumably one prediction per template row - confirm against the file).
    path : str
        Destination CSV path. Missing parent folders are created first.
    """
    # os.path.dirname understands the platform's path separators, whereas
    # splitting on "/" misses Windows-style paths such as "out\\run1\\sub.csv".
    folder_loc = os.path.dirname(path)
    if folder_loc:
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(folder_loc, exist_ok=True)
    df = pd.read_excel("Data/Submission_Format.xlsx")
    df["Kelas"] = predicted
    df.to_csv(path, index=False)

In [3]:
# Environment this notebook was run on: the Python version and the first
# physical device that TensorFlow reports.
print(f'PY version   : {sys.version}\nHardware     : {tf.config.list_physical_devices()[0]}')

PY version   : 3.11.9 (tags/v3.11.9:de54cf5, Apr  2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)]
Hardware     : PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')


# Wrangling

In [4]:
# Load the raw training and test spreadsheets (paths are relative to the
# notebook's working directory).
train_data = pd.read_excel("Data/train.xlsx")
test_data = pd.read_excel("Data/Test.xlsx")

## Exploration

In [5]:
# Check for duplicates: count the rows that are exact copies of an earlier row.
# (Not the last expression of the cell, so the value is not displayed.)
sum(train_data.duplicated())        # 381

# Further examination: print the first 11 duplicated rows, ordered by text.
# Each `text` below is a whole row (text and label), not just the text column.
temp = train_data[train_data.duplicated()].copy()
for counter, text in enumerate(temp.sort_values("text").values):
    print(text)
    if counter==10: break

# Conclusion: The training data contains duplicates. Some share both the feature and the target, but others share the same feature with a different target.

['@gQ+QGmYJ209N7Py+H3gRyakQiic4NLEklTOIIuALnZA= @gqAL2HIcdWKR2U/VFUq3R0TFxXxtxCsKyUXAKn9R5o0= Iya nih penting suara Batak sebagai populasi etnis ketiga terbesar setelah Jawa Sunda. Mayoritas Batak Kristen cukup anti sama Anies karena diframing radikal sejak'
 'Demografi']
['Adi menginformasikan bahwa isu utama di kampung mereka adalah sulitnya mendapatkan air bersih dan masalah naiknya air laut. #IndonesiaSentris #IndonesiaHijau #02Melanjutkan #AnakMudaIndonesiaEmas Prabowo Subianto'
 'Sumber Daya Alam']
['Adi menginformasikan bahwa isu utama di kampung mereka adalah sulitnya mendapatkan air bersih dan masalah naiknya air laut. #IndonesiaSentris #IndonesiaHijau #02Melanjutkan #AnakMudaIndonesiaEmas Prabowo Subianto'
 'Sumber Daya Alam']
['Adi menginformasikan bahwa isu utama di kampung mereka adalah sulitnya mendapatkan air bersih dan masalah naiknya air laut. #IndonesiaSentris #IndonesiaHijau #02Melanjutkan #AnakMudaIndonesiaEmas Prabowo Subianto'
 'Sumber Daya Alam']
['Adi menginform

In [6]:
# Check for class imbalance: count the rows that belong to each label.
targets = list(np.unique(train_data["label"].values))
temp = {label: len(train_data[train_data["label"] == label]) for label in targets}
temp

# Conclusion: Data's unbalanced

{'Demografi': 62,
 'Ekonomi': 367,
 'Geografi': 20,
 'Ideologi': 400,
 'Pertahanan dan Keamanan': 400,
 'Politik': 2972,
 'Sosial Budaya': 587,
 'Sumber Daya Alam': 192}

In [7]:
# Find the longest texts: show the ten rows with the highest character count
# as (position, text, label, length) tuples.
counter = [len(text_value) for text_value in train_data["text"].values]
rows_with_length = zip(
    range(len(train_data)),
    train_data["text"].values,
    train_data["label"].values,
    counter,
)
sorted(rows_with_length, key=lambda row: row[3], reverse=True)[:10]

# Conclusion: Data needs to be cleaned

[(688,
  'RT ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚â€™ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å“ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¨ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â«ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¾ ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¤ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¢ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â§ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¾ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â«ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â£ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¡ ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â©ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â«ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¡ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚â€ºÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¨ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â°ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¨ ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\x9dÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¢ ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¸ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¨ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¨ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\x9d ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¾ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¬ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\xadÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¡ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\xadÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¾ 0,7 ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\x9dÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¡ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â«ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¢ 100 Food estate mulai dicanangkan pada 3 tahun lalu, tepatnya pada 6 Juli 2020 ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\x8fÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â«ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¾ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¬ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã

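What needs to be cleaned:
1. Retweet markers (`RT`)
2. Mentions / tags (`@user`)
3. Links
4. Reply markers (`[RE user]`)
5. Mis-encoded UTF-8 characters (mojibake) and emoji
6. Numbers, commas, dots and other punctuation
7. Single-letter words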

## Cleaning

In [18]:
def cleantext(text: str,
              case1:  bool = True,
              case2: bool = False) -> str:
    '''
    Return a cleaned, lower-cased version of a raw tweet-like string.

    Parameters
    ----------
    text : str
        Raw text to clean.
    case1 : bool
        When True, drop '&amp;' and every character that is not an ASCII letter,
        digit, space, '@', '#', '[' or ']', then remove digits everywhere except
        inside hashtags.
    case2 : bool
        When True, remove hashtags entirely.

    Returns
    -------
    str
        The cleaned text with single spaces between tokens and no leading or
        trailing spaces.

    Always applied (regardless of the flags): removal of the standalone retweet
    marker 'RT', links starting with 'http', '@' mentions, one-character words
    and '[re xxx]' reply markers.
    '''
    if case1:
        # Removes '&amp;' from text, html stuff.
        text = re.sub(r'&amp;', '', text)
        # Turn tabs/newlines into spaces first; otherwise the whitelist below
        # deletes them and glues the neighbouring words together.
        text = re.sub(r'\s+', ' ', text)
        # Keep only letters, digits, spaces and the marker characters @ # [ ].
        text = re.sub(r'[^ a-zA-Z@0-9\[\]#]', '', text)
        # Remove digits except inside hashtags: the alternation tries the hashtag
        # first, so a hashtag is consumed (and kept) as a whole.
        text = re.sub(r'#\w*|\d',
                      lambda m: m.group() if m.group().startswith('#') else '',
                      text)
        text = re.sub(r' +', ' ', text)

    # Remove the retweet marker only as a standalone token; a bare 'RT ' pattern
    # also hit words that merely end in "RT" (e.g. "SMART people").
    text = re.sub(r'\bRT\s+', '', text)
    # Change text into lowercase
    text = text.lower()
    # This removes link https
    text = re.sub(r'http\S+', '', text)
    # Removes tags @xxxx, including one at the very end of the text.
    text = re.sub(r'@\S+', '', text)
    # Removes words with 1 char. Lookarounds do not consume the surrounding
    # spaces, so consecutive one-letter words and ones at the text edges match.
    text = re.sub(r'(?<!\S)\w(?!\S)', '', text)
    # Removes reply markers [re xxxx], also when they open the text.
    text = re.sub(r'\s*\[re \w+\]', '', text)

    if case2:
        # Removes hashtags, including a trailing one with no space after it.
        text = re.sub(r'#\S+', '', text)

    return re.sub(r' +', ' ', text).strip(' ')

# a = train_data["text"][4]
# cleantext(a)

In [22]:
# Build the cleaned training frame: same rows as train_data, with every text
# passed through cleantext.
train_data_cleaned = train_data.copy()
train_data_cleaned["text"] = train_data_cleaned["text"].map(cleantext)
train_data_cleaned["text_uncleaned"] = train_data["text"]

# For further examination (raw and cleaned text side by side)
#train_data_cleaned.to_csv("Temp/cleaned_data.csv")
train_data_cleaned = train_data_cleaned.drop(columns="text_uncleaned")

## Handling

In [180]:
# Same cleaned text, different target: list every text that appears under more
# than one label, with one record per (text, label) combination.

same_val = []
# A single groupby pass replaces one full-frame scan per unique text.
for text_value, group in train_data_cleaned.groupby("text", sort=True):
    unique_label = group["label"].unique()
    # More than one distinct label implies the text occurs more than once.
    if len(unique_label) > 1:
        for label in unique_label:
            matches = group[group["label"] == label]
            same_val.append([matches.index, text_value, label, len(matches)])

# Make sure the output folder exists so the cell also runs on a fresh checkout.
os.makedirs("Temp", exist_ok=True)
pd.DataFrame(same_val, columns=["index", "text", "label", "occurence"]).to_csv("Temp/samevalue_difftarget.csv", index=False)

# Conclusion: There's way too much duplicated data

In [92]:
# Check same feature, same target: collect [row index, text, label] for every
# row whose (text, label) pair occurs more than once.
#
# The pairs are kept as real columns. Packing them into "text [label]" strings
# and parsing them back with a regex breaks when the cleaned text itself
# contains "[...]" (cleantext keeps square brackets).
duplicated_pairs = (
    train_data_cleaned.loc[train_data_cleaned.duplicated(), ["text", "label"]]
    .drop_duplicates()
    .sort_values("text")  # deterministic order instead of set iteration order
)

sameval_target = []
for text, target in duplicated_pairs.itertuples(index=False, name=None):
    the_index = train_data_cleaned[(train_data_cleaned["label"] == target) &
                                   (train_data_cleaned["text"] == text)].index
    sameval_target.extend([row_index, text, target] for row_index in the_index)

# pd.DataFrame(sameval_target, columns=["index", "text", "label"]).to_csv("Temp/Same_Value_Target.csv")

# Modelling (Without Handling)

In [327]:
# Work on a copy so train_data_cleaned keeps its string labels.
train_data_cleaned_encoded = train_data_cleaned.copy()

# Replace each label string with an integer code. Per the mapping displayed by
# this cell, the codes run from 1 to 8 (there is no class 0) and NaN maps to -2.
encoder = OrdinalEncoder(cols="label")
encoder.fit(train_data_cleaned_encoded["label"])
train_data_cleaned_encoded["label"] = encoder.transform(train_data_cleaned_encoded["label"])
encoder.mapping

[{'col': 'label',
  'mapping': Sumber Daya Alam           1
  Politik                    2
  Demografi                  3
  Pertahanan dan Keamanan    4
  Ideologi                   5
  Ekonomi                    6
  Sosial Budaya              7
  Geografi                   8
  NaN                       -2
  dtype: int64,
  'data_type': dtype('O')}]

In [259]:
# DO NOT CHANGE THIS CODE OR THE TESTS MAY NOT WORK
# Shared settings for the tokenizer, the padding step and the embedding layer.
vocab_size = 1000        # passed as Tokenizer `num_words` and Embedding `input_dim`
embedding_dim = 16       # passed as Embedding `output_dim`
max_length = 120         # passed as `maxlen` to pad_sequences and `input_length` to Embedding
trunc_type='post'        # passed as `truncating` to pad_sequences
padding_type='post'      # passed as `padding` to pad_sequences
oov_tok = "<OOV>"        # passed as Tokenizer `oov_token`
training_size = 20000    # NOTE(review): not referenced in the visible cells - confirm it is still needed

In [316]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_data_cleaned_encoded["text"],
    train_data_cleaned_encoded["label"],
    test_size=0.2,
    random_state=42,
)

In [317]:
# Tokenizer: fitted on the training sentences only, configured with
# vocab_size as `num_words` and oov_tok as the out-of-vocabulary token.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)

def get_paddedsequences(sentences):
    """Convert sentences to token-id sequences and pad/truncate them to max_length.

    Uses the module-level `tokenizer` together with the `max_length`,
    `padding_type` and `trunc_type` settings.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(sentences),
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )

# Encode both splits with the same fitted tokenizer and padding settings.
training_pad_sequences=get_paddedsequences(train_sentences)
validation_pad_sequences=get_paddedsequences(val_sentences)

In [324]:
def get_ds(padsequences, labels, batch_size=32):
    """Build a cached, batched and prefetched tf.data pipeline.

    Parameters
    ----------
    padsequences : array-like
        Padded token-id sequences used as features.
    labels : array-like
        One target per sequence, sliced in step with `padsequences`.
    batch_size : int
        Number of examples per batch. The default of 32 is the value that was
        previously hard-coded, so existing calls behave the same.

    Returns
    -------
    tf.data.Dataset
        Dataset yielding (features, labels) batches.
    """
    return (
        tf.data.Dataset.from_tensor_slices((padsequences, labels))
        .cache()
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

# Wrap the padded sequences and their labels into tf.data pipelines.
train_ds=get_ds(training_pad_sequences, train_labels)
val_ds=get_ds(validation_pad_sequences, val_labels)

In [329]:
def get_model(num_classes=9, learning_rate=0.001):
    """Build and compile the bidirectional-LSTM text classifier.

    Parameters
    ----------
    num_classes : int
        Width of the softmax output layer. The default of 9 is the previously
        hard-coded value: the label codes shown by the encoder run from 1 to 8,
        so the output needs 9 units and index 0 is never a target.
    learning_rate : float
        Learning rate for the Adam optimizer (previously hard-coded to 0.001).

    Returns
    -------
    keras.Model
        A compiled Sequential model that reads `vocab_size`, `embedding_dim`
        and `max_length` from the module-level settings.
    """
    model = keras.Sequential([
        keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
        keras.layers.BatchNormalization(),

        # return_sequences=True keeps one output per time step so the pooling
        # layer below can take the maximum over the sequence.
        keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),

        keras.layers.GlobalMaxPool1D(),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.3),

        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.4),

        keras.layers.Dense(num_classes, activation='softmax')
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        # Sparse loss: the targets are integer class codes, not one-hot vectors.
        loss=keras.losses.SparseCategoricalCrossentropy(),
        # `metrics` is documented as a list; a bare string is not accepted by
        # every Keras version.
        metrics=["accuracy"],
    )
    return model

In [330]:
model=get_model()

# Stop automatically once the validation loss has not improved for a few
# epochs, and keep the best weights seen. The recorded run of this cell had to
# be ended by a manual KeyboardInterrupt.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
)
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=100,
    callbacks=[early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
 20/125 [===>..........................] - ETA: 7s - loss: 0.1145 - accuracy: 0.9656

KeyboardInterrupt: 