In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import os
from tqdm import tqdm
import sys

import re

In [2]:
def create_submission(predicted, path = "submission.csv"):
    """Fill the submission template with predictions and save it as CSV.

    Reads "Data/Submission_Format.xlsx", stores `predicted` in its "Kelas"
    column and writes the result to `path` without the index column.

    Args:
        predicted: Values assigned to the "Kelas" column
            (presumably one prediction per template row -- confirm with caller).
        path: Destination CSV file. Missing parent folders are created.
    """
    # os.path.dirname understands the platform's separators (including
    # backslashes on Windows), unlike splitting the path on "/" by hand.
    folder_loc = os.path.dirname(path)
    if folder_loc:
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(folder_loc, exist_ok=True)
    df = pd.read_excel("Data/Submission_Format.xlsx")
    df["Kelas"] = predicted
    df.to_csv(path, index=False)

In [3]:
# Environment this notebook was run on: interpreter version and the first
# physical device TensorFlow reports.
print(
    f'PY version   : {sys.version}',
    f'Hardware     : {tf.config.list_physical_devices()[0]}',
    sep='\n',
)

PY version   : 3.11.9 (tags/v3.11.9:de54cf5, Apr  2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)]
Hardware     : PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')


# Wrangling

In [4]:
# Load the raw train and test spreadsheets
train_data, test_data = (
    pd.read_excel(workbook) for workbook in ("Data/train.xlsx", "Data/Test.xlsx")
)

## Exploration

In [5]:
# Check for duplicates
# Number of rows that are exact copies of an earlier row. It is printed
# explicitly because a bare expression in the middle of a cell is never shown.
print(train_data.duplicated().sum())        # 381 when originally run

# Further examination: show the first 11 duplicated rows, ordered by text
temp = train_data[train_data.duplicated()].copy()
for row in temp.sort_values("text").values[:11]:
    print(row)

# Conclusion: the training data contains duplicates. Some rows share both the
# feature and the target, while others share the feature but have a different target.

['@gQ+QGmYJ209N7Py+H3gRyakQiic4NLEklTOIIuALnZA= @gqAL2HIcdWKR2U/VFUq3R0TFxXxtxCsKyUXAKn9R5o0= Iya nih penting suara Batak sebagai populasi etnis ketiga terbesar setelah Jawa Sunda. Mayoritas Batak Kristen cukup anti sama Anies karena diframing radikal sejak'
 'Demografi']
['Adi menginformasikan bahwa isu utama di kampung mereka adalah sulitnya mendapatkan air bersih dan masalah naiknya air laut. #IndonesiaSentris #IndonesiaHijau #02Melanjutkan #AnakMudaIndonesiaEmas Prabowo Subianto'
 'Sumber Daya Alam']
['Adi menginformasikan bahwa isu utama di kampung mereka adalah sulitnya mendapatkan air bersih dan masalah naiknya air laut. #IndonesiaSentris #IndonesiaHijau #02Melanjutkan #AnakMudaIndonesiaEmas Prabowo Subianto'
 'Sumber Daya Alam']
['Adi menginformasikan bahwa isu utama di kampung mereka adalah sulitnya mendapatkan air bersih dan masalah naiknya air laut. #IndonesiaSentris #IndonesiaHijau #02Melanjutkan #AnakMudaIndonesiaEmas Prabowo Subianto'
 'Sumber Daya Alam']
['Adi menginform

In [6]:
# Check for imbalanced data: count the training rows of every label
targets = list(np.unique(train_data["label"].values))
temp = {
    label: len(train_data[train_data["label"] == label])
    for label in targets
}
temp

# Conclusion: the data is imbalanced

{'Demografi': 62,
 'Ekonomi': 367,
 'Geografi': 20,
 'Ideologi': 400,
 'Pertahanan dan Keamanan': 400,
 'Politik': 2972,
 'Sosial Budaya': 587,
 'Sumber Daya Alam': 192}

In [7]:
# Find the longest texts
# Character length of every training text, in row order.
counter = [len(raw_text) for raw_text in train_data["text"].values]

# Ten longest entries as (row position, text, label, length) tuples.
sorted(
    zip(
        range(len(train_data)),
        train_data["text"].values,
        train_data["label"].values,
        counter,
    ),
    key=lambda entry: entry[3],
    reverse=True,
)[:10]

# Conclusion: the data needs to be cleaned

[(688,
  'RT ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚â€™ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å“ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¨ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â«ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¾ ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¤ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¢ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â§ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¾ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â«ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â£ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¡ ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â©ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â«ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¡ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚â€ºÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¨ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â°ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¨ ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\x9dÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¢ ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¸ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¨ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¨ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\x9d ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¾ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¬ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\xadÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¡ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\xadÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¾ 0,7 ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\x9dÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¡ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â«ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¢ 100 Food estate mulai dicanangkan pada 3 tahun lalu, tepatnya pada 6 Juli 2020 ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â\x8fÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â«ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Å¾ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã‚Â¬ÃƒÂ°Ã‚Â\x9dÃ‚Â\x90Ã

What needs to be cleaned:
1. Retweet markers (`RT`)
2. User tags (`@xxxx`)
3. Links
4. Reply markers (`[re xxxx]`)
5. Mis-encoded UTF-8 characters and emoji
6. Digits and punctuation (commas, dots, etc.)
7. Single-letter words

## Cleaning

In [18]:
def cleantext(text: str,
              case1:  bool = True,
              case2: bool = False) -> str:
    '''
    Return a cleaned, lowercased version of a raw text so it is usable by the model.

    Always applied: removal of a leading "RT" retweet marker, lowercasing,
    removal of links (http...), user tags (@xxxx), single-character words
    and reply markers ([re xxxx]), then collapsing of repeated spaces.

    Case1: Also removes "&amp;", every character that is not an ASCII letter,
           digit, space, '@', '#', '[' or ']' (this drops mis-encoded UTF-8
           characters, emoji and punctuation), and all digits except those
           inside a hashtag.
    Case2: Also removes hashtags entirely.

    Args:
        text: Raw text to clean.
        case1: Enable the character/digit filtering described above.
        case2: Enable hashtag removal.

    Returns:
        The cleaned text without leading/trailing spaces.
    '''
    if case1:
        # Removes '&amp;' from text, html stuff.
        text = re.sub(r'&amp;', '', text)
        # Turn newlines/tabs into spaces first: the whitelist below would
        # otherwise delete them and glue the neighbouring words together.
        text = re.sub(r'\s+', ' ', text)
        # Keep only spaces, ASCII letters, digits and the markers that later
        # steps rely on ('@' tags, '#' hashtags, '[' ']' reply markers).
        text = re.sub(r'[^ a-zA-Z0-9@#\[\]]', '', text)
        text = re.sub(r' +', ' ', text)

        # Split into alternating non-hashtag / hashtag parts. Every '#' starts
        # a captured part, so a part is a hashtag exactly when it starts with '#'.
        parts = re.split(r'(#\w*)', text)
        # Strip digits everywhere except inside hashtags.
        text = ''.join(
            part if part.startswith('#') else re.sub(r'\d', '', part)
            for part in parts
        )

    # Remove the retweet marker only when it is the first word; an unanchored
    # 'RT ' would also eat the tail of words such as "SMART ".
    text = re.sub(r'^\s*RT\s+', '', text)
    # Change text into lowercase
    text = text.lower()
    # This removes link https
    text = re.sub(r'http\S+', "", text)
    # Removes tags @xxxx, including a tag that ends the text
    text = re.sub(r'@\S+', "", text)
    # Removes words with 1 char. Lookarounds do not consume the surrounding
    # spaces, so consecutive single-character words and words at the start or
    # end of the text are removed as well.
    text = re.sub(r'(?<!\S)\w(?!\S)', "", text)
    # Removes Reply [re xxxx]
    text = re.sub(r' \[re \w+]', "", text)

    if case2:
        # Removes hashtags, including a hashtag that ends the text
        text = re.sub(r'#\S+', '', text)

    return re.sub(" +", " ", text).strip(" ")

# a = train_data["text"][4]
# cleantext(a)

In [22]:
# Get cleaned data
# Same rows and columns as train_data, with "text" replaced by its cleaned form.
train_data_cleaned = train_data.assign(text=train_data["text"].map(cleantext))

# For further examination, export the raw and cleaned text side by side:
#train_data_cleaned.assign(text_uncleaned=train_data["text"]).to_csv("Temp/cleaned_data.csv")

## Handling

In [92]:
# Check same feature same target
# (text, label) pairs of the rows that exactly repeat an earlier row.
# NOTE(review): assumes the frame's only columns are "text" and "label" -- confirm.
repeated_rows = train_data_cleaned[train_data_cleaned.duplicated()]
repeated_pairs = set(zip(repeated_rows["text"], repeated_rows["label"]))

# Every row (first occurrence included) that belongs to one of those pairs, as
# [index, text, label]. The pairs are compared as tuples rather than being
# serialised to "text [label]" and parsed back with a regex, which broke
# whenever the cleaned text itself contained brackets. Sorting by text and
# label keeps each group together and makes the order reproducible.
sameval_target = sorted(
    (
        [row_index, row_text, row_label]
        for row_index, row_text, row_label in zip(
            train_data_cleaned.index,
            train_data_cleaned["text"],
            train_data_cleaned["label"],
        )
        if (row_text, row_label) in repeated_pairs
    ),
    key=lambda entry: (entry[1], entry[2]),
)
              
# pd.DataFrame(sameval_target, columns=["index", "text", "label"]).to_csv("Temp/Same_Value_Target.csv")