In [None]:
import ast
import re

import pandas as pd

# Data Preparation

In [None]:
# Location of the raw tweets CSV that the load cell passes to pd.read_csv.
# Left empty here; it has to be filled in before the load cell can run.
src = ""

In [None]:
# Load the raw tweets, using the first CSV column as the row index.
total = pd.read_csv(src, index_col=0)

In [None]:
# Keep only the tweet text: a one-column frame that carries the same row index as `total`.
new_df = total[["full_text"]].copy()

new_df.head()

In [None]:
# Number of rows in the working frame.
len(new_df)

33

# Data Cleaning

In [None]:
def remove_urls(text):
    """Return ``text`` with every URL deleted.

    A URL is any run of non-whitespace characters that begins with
    ``http://``, ``https://`` or ``www.``. Nothing is put in its place, so
    the whitespace around a removed URL is left as it was.
    """
    return re.compile(r'https?://\S+|www\.\S+').sub('', text)

In [None]:
def remove_tags(text):
    """Return ``text`` with every hashtag deleted.

    A hashtag is a ``#`` followed by one or more non-whitespace characters;
    the match runs up to the next whitespace, so trailing punctuation is
    removed together with the tag. A lone ``#`` is left untouched.
    """
    hashtag = re.compile(r'#\S+')
    return hashtag.sub('', text)

In [None]:
def remove_indonesian_phone_numbers(text):
    """Return ``text`` without Indonesian-style phone numbers, stripped of outer whitespace.

    Two shapes are recognised: numbers starting with ``+62`` and numbers
    starting with ``08``. Between the digit groups a single whitespace, dot
    or hyphen separator is allowed but not required.
    """
    phone_number = re.compile(
        r"(\+62[\s.-]?\d{2,3}[\s.-]?\d{3,4}[\s.-]?\d{3,4}|08[\s.-]?\d{1,2}[\s.-]?\d{3,5}[\s.-]?\d{3,5})"
    )
    without_numbers = phone_number.sub("", text)
    # Only the ends are trimmed; gaps left inside the text are kept.
    return without_numbers.strip()

In [None]:
def remove_end_hashtags(tweet):
    """Return ``tweet`` without the run of hashtags that closes it.

    Only hashtags at the very end are removed, together with the whitespace
    in front of each one; hashtags earlier in the tweet are kept.
    """
    trailing_hashtags = re.compile(r"(?:\s+#\w+)+$")
    return trailing_hashtags.sub("", tweet)

In [None]:
def remove_mentions(text):
    """Return ``text`` with every ``@mention`` deleted.

    A mention is an ``@`` followed by one or more non-whitespace characters,
    wherever it occurs, so the part of an e-mail address from ``@`` onwards
    is removed as well.
    """
    mention = re.compile(r'@\S+')
    return mention.sub('', text)

In [None]:
# Clean each tweet in three passes: URLs first, then hashtags, then
# Indonesian phone numbers.
# NOTE(review): remove_end_hashtags and remove_mentions are defined above but are
# not part of this chain — confirm that is intentional.
new_df['full_text'] = (
    new_df['full_text']
    .apply(remove_urls)
    .apply(remove_tags)
    .apply(remove_indonesian_phone_numbers)
)

# Renumber the rows from 0, discarding the index that came with the source CSV.
new_df = new_df.reset_index(drop=True)

# HTML Parsing

In [None]:
# Start from an empty frame; the following cell appends the cleaned tweets to it.
dataset = pd.DataFrame()

In [None]:
# Append the cleaned tweets to whatever `dataset` currently holds (row labels of
# `new_df` are kept, since ignore_index is not passed).
# NOTE(review): this cell is not idempotent — running it again without first
# re-running the cell that resets `dataset` appends the same rows a second time.
dataset = pd.concat([dataset, new_df])

In [None]:
import html

# Convert HTML entities (e.g. `&amp;`) back to the characters they stand for.
# NOTE(review): hashtags were already stripped with `#\S+` during cleaning, so a
# numeric entity such as `&#39;` has lost everything from its `#` onwards by the
# time it gets here and cannot be restored — consider unescaping before cleaning.
dataset['full_text'] = dataset['full_text'].apply(html.unescape)

# Tokenization

In [None]:
# Lower-case every tweet before it is tokenized.
dataset["full_text"] = dataset["full_text"].str.lower()

In [None]:
import nltk
# Fetch the 'punkt' and 'punkt_tab' tokenizer packages into the local nltk data directory.
# NOTE(review): the only tokenizer used in this notebook is TweetTokenizer; it may
# not need these packages — confirm before removing the downloads.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import TweetTokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Build the tokenizer from the class on the `nltk.tokenize` module rather than from
# the notebook-level name `TweetTokenizer`: this cell rebinds that name to an
# *instance*, so calling `TweetTokenizer()` again when the cell is re-run would fail
# with "TypeError: 'TweetTokenizer' object is not callable".
# The name itself is kept because the tokenization cell calls `TweetTokenizer.tokenize(...)`.
TweetTokenizer = nltk.tokenize.TweetTokenizer()

In [None]:
# Tokenize every tweet: one list of tokens per row of `dataset`, in row order.
tokenized_tweets = list(map(TweetTokenizer.tokenize, dataset['full_text']))
tokenized_tweets

[['minsan',
  'talaga',
  'mak',
  ',',
  'dahil',
  'sayo',
  'nagiging',
  'straight',
  'ea',
  'na',
  'aq',
  'hahahahahaha',
  '😭'],
 ['presuntos',
  'integrantes',
  'de',
  'los',
  'chimalis',
  ',',
  'antes',
  'de',
  'que',
  'los',
  'renunciara',
  'la',
  'gente',
  'del',
  'mayito',
  'flaco'],
 ['kini', 'pero', 'daghan', 'lang', 'pil', 'unon', '😌', '🤸\u200d♀', '️', '✨'],
 ['putabginang',
  'ampogi',
  'gago',
  'wag',
  'na',
  'yung',
  'yumburger',
  'ikaw',
  'na',
  'lang',
  'pil'],
 ['det',
  'kan',
  'du',
  'nok',
  'skyte',
  'en',
  'lang',
  'pil',
  'etter',
  '.',
  'jeg',
  'har',
  'ikke',
  'sett',
  'frode',
  'prestere',
  'å',
  'legge',
  'frem',
  'ett',
  'eneste',
  'grunnlag',
  'for',
  'påstandene',
  'sine',
  'noen',
  'gang',
  '.'],
 ['aufmachen',
  ',',
  'je',
  'hebt',
  'iemand',
  'online',
  '‘',
  'beledigd',
  '’',
  '.',
  'wat',
  'glijdt',
  'dit',
  'land',
  'af',
  'zeg',
  '.',
  'idioten',
  'bij',
  'het',
  'om',
  'en'

In [None]:
# One row per tweet, with that tweet's token list in the 'tokens' column.
tokenized_df = pd.DataFrame({'tokens': tokenized_tweets})
# tokenized_df.to_csv('/content/drive/MyDrive/Datasets/Skripsi/Indonesian_Language_Tweets/Weekly/Dataset_v2/locations_tokenized.csv', index=False)

In [None]:
# Load a previously tokenized dataset from disk.
# The file's first column is a saved row index (it shows up as 'Unnamed: 0' when read
# as data), so it is read back as the index — the same way the raw tweets are loaded —
# instead of being carried along as an extra, always-unique data column.
# NOTE(review): this rebinds `tokenized_df`, discarding the frame that was just built
# from `tokenized_tweets` — confirm that is intended.
# NOTE(review): read_csv returns each 'tokens' cell as a plain string (the printed
# form of the list), not as a list object.
tokenized_df = pd.read_csv('/content/drive/MyDrive/Datasets/Skripsi/Indonesian_Language_Tweets/Weekly/Dataset_v2/Tokenized/indonesian_language_tweets(top)_tokenized.csv', index_col=0)

In [None]:
# Preview the loaded frame.
tokenized_df

Unnamed: 0,tokens
0,"[kes, di, family, mart, angsana, mall, johor, ..."
1,"[ngw, di, toilet, mall]"
2,"[muscat, massage, full, service, muscat, ., sa..."
3,"[muscat, massage, full, service, muscat, ., sa..."
4,"[[, absen, |, 17, mar, ], sudah, dapat, promo,..."
...,...
607,"[tiket, final, piala, malaysia, 2025, sudah, m..."
608,"[@mwabilimwagodi, kesho, tunawaomba, msijaribu..."
609,"[siap, meng-abracadabra, kan, singapore, stadi..."
610,"[dr, ysr, aca, -, vdca, international, stadium..."


# Slang words

In [None]:
import json

# Slang lookup table used by normalize_text (token -> replacement).
# JSON text is UTF-8, so the encoding is stated explicitly instead of being left to
# the platform's locale default.
with open('/content/drive/MyDrive/Datasets/Skripsi/slang_words.json', 'r', encoding='utf-8') as f:
    slang_dict = json.load(f)

def _parse_token_string(text):
    """Turn the printed form of a token list back into a list of tokens."""
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # Not a Python literal at all: treat it as raw text.
        return text.split()
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    # A literal that is not a sequence (e.g. "123"): treat it as raw text as well.
    return text.split()


def normalize_text(tokens, slang_map=None):
    """Replace slang tokens with their normalized form.

    Parameters
    ----------
    tokens : list or str
        The tokens of one tweet. A string is accepted too: a token list that
        has been written to CSV and read back arrives as its printed form
        (e.g. ``"['aq', 'sayo']"``), and iterating over that string would
        yield single characters instead of tokens. Such a string is parsed
        back into a list; any other string is split on whitespace.
    slang_map : dict, optional
        Mapping from slang token to replacement. When omitted, the
        notebook-level ``slang_dict`` is used.

    Returns
    -------
    list
        The normalized tokens. A string replacement is split on whitespace,
        so a multi-word replacement contributes several tokens; a non-string
        replacement is appended unchanged. Tokens without an entry in the
        mapping are kept as they are.
    """
    mapping = slang_dict if slang_map is None else slang_map
    if isinstance(tokens, str):
        tokens = _parse_token_string(tokens)

    normalized_tokens = []
    for token in tokens:
        # Fall back to the token itself when the mapping has no entry for it.
        replacement = mapping.get(token, token)
        if isinstance(replacement, str):
            normalized_tokens.extend(replacement.split())
        else:
            normalized_tokens.append(replacement)
    return normalized_tokens


In [None]:
# Run normalize_text on each cell of the 'tokens' column and store the result back in place.
tokenized_df['tokens'] = tokenized_df['tokens'].apply(normalize_text)

# Drop Duplicates

In [None]:
# Duplicates are detected on the token sequence alone:
#   * normalize_text returns lists, and DataFrame.drop_duplicates() cannot hash list
#     cells (TypeError: unhashable type: 'list'), so rows are compared through the
#     string form of their token list instead;
#   * any other column, such as a saved row index read back as data, would make
#     every row look unique.
# The first occurrence of each token sequence is kept, as drop_duplicates() does by default.
nondupe = tokenized_df[~tokenized_df['tokens'].astype(str).duplicated()].copy()

In [None]:
# Write the de-duplicated frame to CSV without the row index.
# NOTE(review): '/content/added_data.csv' is an absolute, environment-specific path
# (looks like a Colab session directory — confirm) and is overwritten on every run.
nondupe.to_csv('/content/added_data.csv', index=False)