In [1]:
import pandas as pd
import re
import string
import nltk
# nltk.download ('all')
from nltk.corpus import stopwords
# nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from googletrans.client import Translator
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [2]:
data_cleaned = pd.read_csv("output/IKN.csv")
data_cleaned = data_cleaned.dropna(subset=['full_text'])
data_cleaned = data_cleaned.drop_duplicates(subset=['full_text'])
data_cleaned.shape

(3435, 3)

In [3]:
data_cleaned.head(10)
data_cleaned = data_cleaned.drop(columns=['Unnamed: 0'])

# DATA CLEANING

In [4]:
def data_clean(data):
    # Hapus mention (@username) menggunakan regex
    data = re.sub(r'@\w+', '', data)
    # Hapus hashtag (#hashtag) menggunakan rege
    data = re.sub(r'#\w+', '', data)
    # Hapus URL menggunakan regex
    data = re.sub(r'http\S+|www\S+|https\S+', '', data)
    # Hapus emoji menggunakan regex
    data = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F]+', '', data)
    # Hapus semua tanda baca menggunakan regex
    data = data.translate(str.maketrans('', '', string.punctuation))
    return data

data_cleaned['cleaned'] = data_cleaned['full_text'].apply(lambda x: data_clean(x))
data_cleaned.to_csv('cleaned6.csv', index=False)
data_cleaned.head(10)

Unnamed: 0,full_text,created_at,cleaned
0,Inilah 2 unit Rumah Tapak Jabatan Menteri yang...,Fri Mar 29 11:48:49 +0000 2024,Inilah 2 unit Rumah Tapak Jabatan Menteri yang...
1,Alhamdulillah terima kasih berkenan mengunjung...,Fri Mar 29 02:34:41 +0000 2024,Alhamdulillah terima kasih berkenan mengunjung...
2,Pengamat Kritik 2 Proyek Milik Konglomerat Pen...,Thu Mar 28 23:39:00 +0000 2024,Pengamat Kritik 2 Proyek Milik Konglomerat Pen...
3,Wow‼️Terlihat dari pinggir jalan Gedung-gedung...,Thu Mar 28 06:04:01 +0000 2024,Wow‼️Terlihat dari pinggir jalan Gedunggedung ...
4,Waktu ngedrone roll off trafo PLN di pelabuhan...,Thu Mar 28 03:03:32 +0000 2024,Waktu ngedrone roll off trafo PLN di pelabuhan...
5,Usulan DPR tak Ikut Pindah ke IKN Nusantara Di...,Wed Mar 27 03:18:10 +0000 2024,Usulan DPR tak Ikut Pindah ke IKN Nusantara Di...
6,Suasana Proyek Pembangunan Bandara VVIP IKN ap...,Wed Mar 27 02:48:24 +0000 2024,Suasana Proyek Pembangunan Bandara VVIP IKN ap...
7,Investasi di IKN Nusantara Dinilai Tidak Mengu...,Tue Mar 26 23:39:00 +0000 2024,Investasi di IKN Nusantara Dinilai Tidak Mengu...
8,Inilah Hunian ASN 4 yang mana pembangunan nya ...,Tue Mar 26 13:56:24 +0000 2024,Inilah Hunian ASN 4 yang mana pembangunan nya ...
9,Investasi di IKN Nusantara Dinilai Tidak Mengu...,Tue Mar 26 11:47:44 +0000 2024,Investasi di IKN Nusantara Dinilai Tidak Mengu...


# TRANSLATE

In [None]:
translator = Translator()

data_cleaned['translated'] = data_cleaned['cleaned'].str.encode('ascii', 'ignore').apply(translator.translate, src='id', dest='en')
def clean_tweet2(tweet):
  return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

data_cleaned['translated'] = data_cleaned['translated'].apply(getattr, args=('text',))
data_cleaned['translated'] = data_cleaned.apply(lambda x: clean_tweet2(x['translated']), axis=1)
data_cleaned.to_csv('output/data_translated6.csv', index=False)

# TEXT PREPROCESSING

In [2]:
data_cleaned = pd.read_csv('output/data_translated4.csv')
data_cleaned = data_cleaned.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
data_cleaned.head()

Unnamed: 0,full_text,created_at,cleaned,translated
0,Inilah 2 unit Rumah Tapak Jabatan Menteri yang...,Fri Mar 29 11:48:49 +0000 2024,Inilah 2 unit Rumah Tapak Jabatan Menteri yang...,b These are the 2 units of Landed Houses for M...
1,Alhamdulillah terima kasih berkenan mengunjung...,Fri Mar 29 02:34:41 +0000 2024,Alhamdulillah terima kasih berkenan mengunjung...,b Alhamdulillah thank you for visiting the Ind...
2,Pengamat Kritik 2 Proyek Milik Konglomerat Pen...,Thu Mar 28 23:39:00 +0000 2024,Pengamat Kritik 2 Proyek Milik Konglomerat Pen...,b Observers Criticize 2 Projects Owned by Cong...
3,Wow‼️Terlihat dari pinggir jalan Gedung-gedung...,Thu Mar 28 06:04:01 +0000 2024,Wow‼️Terlihat dari pinggir jalan Gedunggedung ...,b Wow Seen from the side of the road towering ...
4,Waktu ngedrone roll off trafo PLN di pelabuhan...,Thu Mar 28 03:03:32 +0000 2024,Waktu ngedrone roll off trafo PLN di pelabuhan...,b When droned to roll off a PLN transformer at...


# CASE FOLDING

In [3]:
def preprocess_text(text):
  #ubah kalimat jadi huruf kecil
  lower_case = text.lower()
  #hapus angka dari kalimat
  result = re.sub(r"\d+", "", lower_case)
  #hapus tanda baca dari kalimat
  result = result.translate(str.maketrans("","",string.punctuation + "!!"))
  #hapus spasi awal & akhir kalimat
  result = result.strip()
  return result

def clean_tweet2(tweet):
  return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

data_cleaned['case_fold'] = data_cleaned['translated'].apply(lambda x: preprocess_text(x))
data_cleaned['case_fold'] = data_cleaned['case_fold'].apply(lambda x: clean_tweet2(x))
data_cleaned.head()

Unnamed: 0,full_text,created_at,cleaned,translated,case_fold
0,Inilah 2 unit Rumah Tapak Jabatan Menteri yang...,Fri Mar 29 11:48:49 +0000 2024,Inilah 2 unit Rumah Tapak Jabatan Menteri yang...,b These are the 2 units of Landed Houses for M...,b these are the units of landed houses for min...
1,Alhamdulillah terima kasih berkenan mengunjung...,Fri Mar 29 02:34:41 +0000 2024,Alhamdulillah terima kasih berkenan mengunjung...,b Alhamdulillah thank you for visiting the Ind...,b alhamdulillah thank you for visiting the ind...
2,Pengamat Kritik 2 Proyek Milik Konglomerat Pen...,Thu Mar 28 23:39:00 +0000 2024,Pengamat Kritik 2 Proyek Milik Konglomerat Pen...,b Observers Criticize 2 Projects Owned by Cong...,b observers criticize projects owned by conglo...
3,Wow‼️Terlihat dari pinggir jalan Gedung-gedung...,Thu Mar 28 06:04:01 +0000 2024,Wow‼️Terlihat dari pinggir jalan Gedunggedung ...,b Wow Seen from the side of the road towering ...,b wow seen from the side of the road towering ...
4,Waktu ngedrone roll off trafo PLN di pelabuhan...,Thu Mar 28 03:03:32 +0000 2024,Waktu ngedrone roll off trafo PLN di pelabuhan...,b When droned to roll off a PLN transformer at...,b when droned to roll off a pln transformer at...


# TOKENIZING

In [4]:
def tokenize_text(text):
  tokens = word_tokenize(text)
  return tokens

data_cleaned['token'] = data_cleaned['case_fold'].apply(lambda x: tokenize_text(x))
data_cleaned.head()

Unnamed: 0,full_text,created_at,cleaned,translated,case_fold,token
0,Inilah 2 unit Rumah Tapak Jabatan Menteri yang...,Fri Mar 29 11:48:49 +0000 2024,Inilah 2 unit Rumah Tapak Jabatan Menteri yang...,b These are the 2 units of Landed Houses for M...,b these are the units of landed houses for min...,"[b, these, are, the, units, of, landed, houses..."
1,Alhamdulillah terima kasih berkenan mengunjung...,Fri Mar 29 02:34:41 +0000 2024,Alhamdulillah terima kasih berkenan mengunjung...,b Alhamdulillah thank you for visiting the Ind...,b alhamdulillah thank you for visiting the ind...,"[b, alhamdulillah, thank, you, for, visiting, ..."
2,Pengamat Kritik 2 Proyek Milik Konglomerat Pen...,Thu Mar 28 23:39:00 +0000 2024,Pengamat Kritik 2 Proyek Milik Konglomerat Pen...,b Observers Criticize 2 Projects Owned by Cong...,b observers criticize projects owned by conglo...,"[b, observers, criticize, projects, owned, by,..."
3,Wow‼️Terlihat dari pinggir jalan Gedung-gedung...,Thu Mar 28 06:04:01 +0000 2024,Wow‼️Terlihat dari pinggir jalan Gedunggedung ...,b Wow Seen from the side of the road towering ...,b wow seen from the side of the road towering ...,"[b, wow, seen, from, the, side, of, the, road,..."
4,Waktu ngedrone roll off trafo PLN di pelabuhan...,Thu Mar 28 03:03:32 +0000 2024,Waktu ngedrone roll off trafo PLN di pelabuhan...,b When droned to roll off a PLN transformer at...,b when droned to roll off a pln transformer at...,"[b, when, droned, to, roll, off, a, pln, trans..."


# FILTERING (STOPWORD REMOVAL)

In [5]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
stopwords_set = set(stopwords.words('english'))
def stopword_text(tokens):
  cleaned_tokens = []
  for token in tokens:
    if token not in stopwords_set:
      cleaned_tokens.append(token)
  return cleaned_tokens

data_cleaned['stop'] = data_cleaned['token'].apply(lambda x: stopword_text(x))
data_cleaned.head()

Unnamed: 0,full_text,created_at,cleaned,translated,case_fold,token,stop
0,Inilah 2 unit Rumah Tapak Jabatan Menteri yang...,Fri Mar 29 11:48:49 +0000 2024,Inilah 2 unit Rumah Tapak Jabatan Menteri yang...,b These are the 2 units of Landed Houses for M...,b these are the units of landed houses for min...,"[b, these, are, the, units, of, landed, houses...","[b, units, landed, houses, ministerial, positi..."
1,Alhamdulillah terima kasih berkenan mengunjung...,Fri Mar 29 02:34:41 +0000 2024,Alhamdulillah terima kasih berkenan mengunjung...,b Alhamdulillah thank you for visiting the Ind...,b alhamdulillah thank you for visiting the ind...,"[b, alhamdulillah, thank, you, for, visiting, ...","[b, alhamdulillah, thank, visiting, indonesian..."
2,Pengamat Kritik 2 Proyek Milik Konglomerat Pen...,Thu Mar 28 23:39:00 +0000 2024,Pengamat Kritik 2 Proyek Milik Konglomerat Pen...,b Observers Criticize 2 Projects Owned by Cong...,b observers criticize projects owned by conglo...,"[b, observers, criticize, projects, owned, by,...","[b, observers, criticize, projects, owned, con..."
3,Wow‼️Terlihat dari pinggir jalan Gedung-gedung...,Thu Mar 28 06:04:01 +0000 2024,Wow‼️Terlihat dari pinggir jalan Gedunggedung ...,b Wow Seen from the side of the road towering ...,b wow seen from the side of the road towering ...,"[b, wow, seen, from, the, side, of, the, road,...","[b, wow, seen, side, road, towering, buildings..."
4,Waktu ngedrone roll off trafo PLN di pelabuhan...,Thu Mar 28 03:03:32 +0000 2024,Waktu ngedrone roll off trafo PLN di pelabuhan...,b When droned to roll off a PLN transformer at...,b when droned to roll off a pln transformer at...,"[b, when, droned, to, roll, off, a, pln, trans...","[b, droned, roll, pln, transformer, east, kali..."


# LEMMATIZATION

In [7]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(tokens):
    result = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    return result

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

data_cleaned['lemmatized'] = data_cleaned['stop'].apply(lambda x: lemmatize_text(x))
data_cleaned.head()

Unnamed: 0,full_text,created_at,cleaned,translated,case_fold,token,stop,lemmatized
0,Inilah 2 unit Rumah Tapak Jabatan Menteri yang...,Fri Mar 29 11:48:49 +0000 2024,Inilah 2 unit Rumah Tapak Jabatan Menteri yang...,b These are the 2 units of Landed Houses for M...,b these are the units of landed houses for min...,"[b, these, are, the, units, of, landed, houses...","[b, units, landed, houses, ministerial, positi...","[b, unit, land, house, ministerial, position, ..."
1,Alhamdulillah terima kasih berkenan mengunjung...,Fri Mar 29 02:34:41 +0000 2024,Alhamdulillah terima kasih berkenan mengunjung...,b Alhamdulillah thank you for visiting the Ind...,b alhamdulillah thank you for visiting the ind...,"[b, alhamdulillah, thank, you, for, visiting, ...","[b, alhamdulillah, thank, visiting, indonesian...","[b, alhamdulillah, thank, visit, indonesian, o..."
2,Pengamat Kritik 2 Proyek Milik Konglomerat Pen...,Thu Mar 28 23:39:00 +0000 2024,Pengamat Kritik 2 Proyek Milik Konglomerat Pen...,b Observers Criticize 2 Projects Owned by Cong...,b observers criticize projects owned by conglo...,"[b, observers, criticize, projects, owned, by,...","[b, observers, criticize, projects, owned, con...","[b, observer, criticize, project, own, conglom..."
3,Wow‼️Terlihat dari pinggir jalan Gedung-gedung...,Thu Mar 28 06:04:01 +0000 2024,Wow‼️Terlihat dari pinggir jalan Gedunggedung ...,b Wow Seen from the side of the road towering ...,b wow seen from the side of the road towering ...,"[b, wow, seen, from, the, side, of, the, road,...","[b, wow, seen, side, road, towering, buildings...","[b, wow, see, side, road, tower, building, are..."
4,Waktu ngedrone roll off trafo PLN di pelabuhan...,Thu Mar 28 03:03:32 +0000 2024,Waktu ngedrone roll off trafo PLN di pelabuhan...,b When droned to roll off a PLN transformer at...,b when droned to roll off a pln transformer at...,"[b, when, droned, to, roll, off, a, pln, trans...","[b, droned, roll, pln, transformer, east, kali...","[b, drone, roll, pln, transformer, east, kalim..."


In [8]:
data_cleaned.to_csv('output/preprocessed_data7.csv', index=False)

In [9]:
data_cleaned = pd.read_csv('output/preprocessed_data7.csv')
data_cleaned.head()

Unnamed: 0,full_text,created_at,cleaned,translated,case_fold,token,stop,lemmatized
0,Inilah 2 unit Rumah Tapak Jabatan Menteri yang...,Fri Mar 29 11:48:49 +0000 2024,Inilah 2 unit Rumah Tapak Jabatan Menteri yang...,b These are the 2 units of Landed Houses for M...,b these are the units of landed houses for min...,"['b', 'these', 'are', 'the', 'units', 'of', 'l...","['b', 'units', 'landed', 'houses', 'ministeria...","['b', 'unit', 'land', 'house', 'ministerial', ..."
1,Alhamdulillah terima kasih berkenan mengunjung...,Fri Mar 29 02:34:41 +0000 2024,Alhamdulillah terima kasih berkenan mengunjung...,b Alhamdulillah thank you for visiting the Ind...,b alhamdulillah thank you for visiting the ind...,"['b', 'alhamdulillah', 'thank', 'you', 'for', ...","['b', 'alhamdulillah', 'thank', 'visiting', 'i...","['b', 'alhamdulillah', 'thank', 'visit', 'indo..."
2,Pengamat Kritik 2 Proyek Milik Konglomerat Pen...,Thu Mar 28 23:39:00 +0000 2024,Pengamat Kritik 2 Proyek Milik Konglomerat Pen...,b Observers Criticize 2 Projects Owned by Cong...,b observers criticize projects owned by conglo...,"['b', 'observers', 'criticize', 'projects', 'o...","['b', 'observers', 'criticize', 'projects', 'o...","['b', 'observer', 'criticize', 'project', 'own..."
3,Wow‼️Terlihat dari pinggir jalan Gedung-gedung...,Thu Mar 28 06:04:01 +0000 2024,Wow‼️Terlihat dari pinggir jalan Gedunggedung ...,b Wow Seen from the side of the road towering ...,b wow seen from the side of the road towering ...,"['b', 'wow', 'seen', 'from', 'the', 'side', 'o...","['b', 'wow', 'seen', 'side', 'road', 'towering...","['b', 'wow', 'see', 'side', 'road', 'tower', '..."
4,Waktu ngedrone roll off trafo PLN di pelabuhan...,Thu Mar 28 03:03:32 +0000 2024,Waktu ngedrone roll off trafo PLN di pelabuhan...,b When droned to roll off a PLN transformer at...,b when droned to roll off a pln transformer at...,"['b', 'when', 'droned', 'to', 'roll', 'off', '...","['b', 'droned', 'roll', 'pln', 'transformer', ...","['b', 'drone', 'roll', 'pln', 'transformer', '..."
