### Import libraries

In [1]:
!pip install textblob wordcloud Sastrawi deep_translator



In [2]:
import pandas as pd
import re
from textblob import TextBlob
from wordcloud import WordCloud
import numpy as np
import string
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from deep_translator import GoogleTranslator
import time
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fadhinotgr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/fadhinotgr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/fadhinotgr/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Get the data

In [3]:
df_articles = pd.read_csv("data/articles_raw.csv")
df_articles.head()

Unnamed: 0,Kategori,Link,article_title,article_text
0,Finance,https://industri.kontan.co.id/news/pemerintah-...,"Pemerintah Resmi Turunkan Harga Tiket Pesawat,...",Reporter: Leni Wandira | Editor: Wahyu T.Rahma...
1,Incident,https://news.okezone.com/read/2023/10/29/337/2...,Peristiwa 29 Oktober : Pesawat Lion Air Jatuh ...,SEJUMLAH peristiwa terjadi pada 29 Oktober. Sa...
2,Incident,https://www.cnnindonesia.com/ekonomi/202306081...,Deret Masalah Penerbangan Lion Air Group Sepan...,Sekretaris Umum PP Muhammadiyah Abdul Mu'ti me...
3,Incident,https://www.antaranews.com/video/3366141/ini-d...,Ini dugaan penyebab kecelakaan pesawat Lion Ai...,Copyright © ANTARA 2023\nDilarang keras mengam...
4,Operational,https://www.cnbcindonesia.com/news/20220109071...,"Mulai Januari, Lion Air Terbang Jakarta-Madina...","Jakarta, CNBC Indonesia - Maskapai Lion Air me..."


In [5]:
get_links = pd.read_csv("https://docs.google.com/spreadsheets/d/1nE_v4AFkLM76TdWcAbU1HRWgQ-gGKYa9ZMBbm7MM0V0/export?gid=0&format=csv")
df_articles['sentiment_label'] = get_links['Label Sentimen']

### Clear up article text

In [8]:
def get_domain(text:str):
    x = text.replace("https://", "")
    x = x.split("/")[0]
    x = re.sub(r"www\.|\.com|\.co\.id|\.id|\.co|\.go\.id|\.asia", "", x)
    x = x if len(x.split(".")) < 2 else x.split(".")[1]
    return x


df_articles['domain'] = df_articles['Link'].apply(get_domain)

In [9]:
def cleanup_article(text):
    contains = ["Cek Berita dan Artikel yang",
                "Baca Juga\s?:",
                "Reporter:",
                "Sumber:",
                "Nyaman tanpa iklan. ",
                "Ringkasan ini dibantu dengan menggunakan AI",
                "SCROLL TO CONTINUE WITH CONTENT",
                "Sumber gambar, ",
                "BELUM ADA KOMENTAR",
                "DOKUMENTASI GAMBAR BELUM TERSEDIA",
                "Telp. ",
                "Hak Cipta ©",
                "Cobain For You Page",
                "ADVERTISEMENT",
                "Comment *",
                "Name *",
                "Email *",
                "Baca berita dengan sedikit iklan",
                "All Rights Reserved",
                "Sekam Api Reformasi Polri",
                "Scroll ke bawah",
                "Jangan lupa klik di sini",
                "Gambas:Video",
                "Copyright ©",
                "(Lihat|Simak) juga Video:",
                "Simak juga '",
                "Editor : ",
                "News dan WA Channel",
                "CLICK HERE!",
                "© 2025 BBC",
                "Gabung Tempo Circle",
                "Home » Investasi",
                "(evs)",
                "Lihat Video ",
                "Baca:\xa0",
                "Mau notif berita penting & breaking news",
                "Jelajahi info seputar haji",
                "Jurnalis :",
                "PODCAST REKOMENDASI TEMPO ",
                "© 2023 ",
                "Copyright RRI.co.id.",
                "Komentar menjadi tanggung-jawab Anda sesuai UU ITE.",
                "Penulis: "
                ]
    for c in contains: text = re.sub(rf"(\n|\b).*{c}.*(\n|\b)", "\n", text, flags=re.IGNORECASE)
    return text
df_articles['cleaned_article_text'] = df_articles['article_text'].apply(cleanup_article)


  "Baca Juga\s?:",


In [10]:
def clean_space(text:str):
    text_split = text.split("\n")
    without_space_split = list(filter(lambda x : x.count(" ") > 2, text_split))
    without_space_split = list(map(lambda x : re.sub(r"\s+", " ", x), without_space_split))
    cleaned_join = "\n".join(without_space_split)
    return cleaned_join
df_articles['cleaned_article_text'] = df_articles['cleaned_article_text'].apply(clean_space)

In [11]:
#Evaluation

import random
itg = random.randint(0, 152)
df_articles.loc[itg, "article_text"].split("\n"), df_articles.loc[itg, "cleaned_article_text"].split("\n")

with open("article_text.txt", "w") as file:
    file.write(df_articles.loc[itg, "article_text"])

with open("cleaned_article_text.txt", "w")as file:
    file.write(df_articles.loc[itg, "cleaned_article_text"])

### Data Preparation

In [12]:
df_articles.dropna(subset=['cleaned_article_text'], inplace=True)
df_articles['cleaned_article_text'] = df_articles['cleaned_article_text'].astype(str)

print(df_articles.head())

      Kategori                                               Link  \
0      Finance  https://industri.kontan.co.id/news/pemerintah-...   
1     Incident  https://news.okezone.com/read/2023/10/29/337/2...   
2     Incident  https://www.cnnindonesia.com/ekonomi/202306081...   
3     Incident  https://www.antaranews.com/video/3366141/ini-d...   
4  Operational  https://www.cnbcindonesia.com/news/20220109071...   

                                       article_title  \
0  Pemerintah Resmi Turunkan Harga Tiket Pesawat,...   
1  Peristiwa 29 Oktober : Pesawat Lion Air Jatuh ...   
2  Deret Masalah Penerbangan Lion Air Group Sepan...   
3  Ini dugaan penyebab kecelakaan pesawat Lion Ai...   
4  Mulai Januari, Lion Air Terbang Jakarta-Madina...   

                                        article_text sentiment_label  \
0  Reporter: Leni Wandira | Editor: Wahyu T.Rahma...         Positif   
1  SEJUMLAH peristiwa terjadi pada 29 Oktober. Sa...         Negatif   
2  Sekretaris Umum PP Muhammadiy

### Data Cleaning

In [13]:
def clean_text(text):
    # Lowercase
    text = text.lower()
    # Hapus URL
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Hapus Mention
    text = re.sub(r'@\w+', '', text)
    # Hapus Angka
    text = re.sub(r'\d+', '', text)
    # Hapus Tanda Baca
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

In [14]:
df_articles['cleaned_article_text'] = df_articles['cleaned_article_text'].apply(clean_text)

### Sentiment Polarity

In [17]:
def analyze_long_text_sentiment(text):
    # Jika teksnya pendek, langsung proses
    if len(text) < 5000:
        try:
            translated_text = GoogleTranslator(source='id', target='en').translate(text)
            blob = TextBlob(translated_text)
            return pd.Series([blob.sentiment.polarity, blob.sentiment.subjectivity])
        except Exception as e:
            print(f"Terjadi error pada teks pendek: {e}")
            return pd.Series([None, None])

    # Jika teks panjang, pecah menjadi beberapa bagian
    else:
        try:
            max_chunk_size = 4500
            chunks = [text[i:i+max_chunk_size] for i in range(0, len(text), max_chunk_size)]

            translated_chunks = []
            for i, chunk in enumerate(chunks):
                time.sleep(0.5)
                translated_chunks.append(
                    GoogleTranslator(source='id', target='en').translate(chunk)
                )

            # Gabungkan kembali semua hasil terjemahan
            full_translated_text = ' '.join(translated_chunks)

            # Analisis sentimen dari teks lengkap yang sudah diterjemahkan
            blob = TextBlob(full_translated_text)
            return pd.Series([blob.sentiment.polarity, blob.sentiment.subjectivity])

        except Exception as e:
            print(f"Terjadi error pada saat memproses teks panjang: {e}")
            return pd.Series([None, None])
df_articles[['polarity', 'subjectivity']] = df_articles['cleaned_article_text'].apply(analyze_long_text_sentiment)

print(df_articles[['cleaned_article_text', 'polarity', 'subjectivity']].head())

                                cleaned_article_text  polarity  subjectivity
0  kontancoid  jakarta lion group mendukung penuh...  0.089297      0.302210
1  sejumlah peristiwa terjadi pada  oktober salah...  0.163492      0.307937
2  sekretaris umum pp muhammadiyah abdul muti men...  0.073193      0.327077
3  dilarang keras mengambil konten melakukan craw...  0.000000      0.000000
4  jakarta cnbc indonesia  maskapai lion air memu...  0.038542      0.263141


### tokenize

In [15]:
def tokenize_text(text):
    return word_tokenize(text)

In [16]:
df_articles['tokens'] = df_articles['cleaned_article_text'].apply(tokenize_text)

### Stopwords Removal

In [18]:
list_stopwords = stopwords.words('indonesian')

def remove_stopwords(tokens, stopwords_list):
    return [word for word in tokens if word not in stopwords_list]

In [19]:
df_articles['tokens_no_stop'] = df_articles['tokens'].apply(lambda x: remove_stopwords(x, list_stopwords))

In [20]:
# Hapus stopwords default
print("\nDataFrame setelah cleaning, tokenisasi, dan penghapusan stopword awal:")
print(df_articles[['cleaned_article_text', 'tokens_no_stop']].head())

all_tokens = [word for tokens in df_articles['tokens_no_stop'] for word in tokens]

from nltk.probability import FreqDist
freq_dist = FreqDist(all_tokens)

print("\n20 Kata Paling Umum:")
print(freq_dist.most_common(20))


DataFrame setelah cleaning, tokenisasi, dan penghapusan stopword awal:
                                cleaned_article_text  \
0  kontancoid  jakarta lion group mendukung penuh...   
1  sejumlah peristiwa terjadi pada  oktober salah...   
2  sekretaris umum pp muhammadiyah abdul muti men...   
3  dilarang keras mengambil konten melakukan craw...   
4  jakarta cnbc indonesia  maskapai lion air memu...   

                                      tokens_no_stop  
0  [kontancoid, jakarta, lion, group, mendukung, ...  
1  [peristiwa, oktober, salah, satunya, jatuhnya,...  
2  [sekretaris, pp, muhammadiyah, abdul, muti, me...  
3  [dilarang, keras, mengambil, konten, crawling,...  
4  [jakarta, cnbc, indonesia, maskapai, lion, air...  

20 Kata Paling Umum:
[('pesawat', 1046), ('air', 947), ('lion', 929), ('penerbangan', 615), ('penumpang', 551), ('bandara', 442), ('danang', 241), ('jt', 206), ('udara', 201), ('maskapai', 201), ('rute', 169), ('desember', 164), ('pilot', 146), ('bagasi', 145)

### Stemming

In [21]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_text(tokens):
    return [stemmer.stem(word) for word in tokens]

In [22]:
custom_stopwords = ["dan",
                "yang",
                "di",
                "untuk",
                "dengan"]


In [23]:

final_stopwords = list_stopwords + custom_stopwords

df_articles['tokens_final'] = df_articles['tokens'].apply(lambda x: remove_stopwords(x, final_stopwords))

df_articles['tokens_stemmed'] = df_articles['tokens_final'].apply(stem_text)

print("\nDataFrame Hasil Akhir Setelah Custom Stopword & Stemming:")
# Tampilkan kolom-kolom relevan untuk perbandingan
print(df_articles[['cleaned_article_text', 'tokens_stemmed']].head())

# Anda juga bisa menyimpan hasil akhir sebagai string jika diperlukan
df_articles['text_final'] = df_articles['tokens_stemmed'].apply(lambda x: ' '.join(x))

print("\nKolom Teks Final (String):")
print(df_articles[['cleaned_article_text', 'text_final']].head())


DataFrame Hasil Akhir Setelah Custom Stopword & Stemming:
                                cleaned_article_text  \
0  kontancoid  jakarta lion group mendukung penuh...   
1  sejumlah peristiwa terjadi pada  oktober salah...   
2  sekretaris umum pp muhammadiyah abdul muti men...   
3  dilarang keras mengambil konten melakukan craw...   
4  jakarta cnbc indonesia  maskapai lion air memu...   

                                      tokens_stemmed  
0  [kontancoid, jakarta, lion, group, dukung, pen...  
1  [peristiwa, oktober, salah, satu, jatuh, pesaw...  
2  [sekretaris, pp, muhammadiyah, abdul, muti, kr...  
3  [larang, keras, ambil, konten, crawling, indek...  
4  [jakarta, cnbc, indonesia, maskapai, lion, air...  

Kolom Teks Final (String):
                                cleaned_article_text  \
0  kontancoid  jakarta lion group mendukung penuh...   
1  sejumlah peristiwa terjadi pada  oktober salah...   
2  sekretaris umum pp muhammadiyah abdul muti men...   
3  dilarang keras meng

### Sentiment Label

In [25]:
def get_sentiment_label_from_polarity(polarity):
    if polarity > 0.05:
        return 'Positive'
    elif polarity < -0.05:
        return 'Negative'
    else:
        return 'Netral'

df_articles['polarity'].fillna(0, inplace=True)
df_articles['sentiment_label'] = df_articles['polarity'].apply(get_sentiment_label_from_polarity)

print("DataFrame dengan kolom label sentimen baru:")
print(df_articles[['article_text', 'polarity', 'sentiment_label']].head())

DataFrame dengan kolom label sentimen baru:
                                        article_text  polarity sentiment_label
0  Reporter: Leni Wandira | Editor: Wahyu T.Rahma...  0.089297        Positive
1  SEJUMLAH peristiwa terjadi pada 29 Oktober. Sa...  0.163492        Positive
2  Sekretaris Umum PP Muhammadiyah Abdul Mu'ti me...  0.073193        Positive
3  Copyright © ANTARA 2023\nDilarang keras mengam...  0.000000          Netral
4  21 Feb 2025 oleh Husni Anggoro | dilihat 42259...  0.140405        Positive


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_articles['polarity'].fillna(0, inplace=True)


In [26]:
df_articles.to_csv("data/cleaned_article.csv", index=False)