<a href="https://colab.research.google.com/github/SalikFillah/Topic-Modelling/blob/main/Latent_Dirichlet_Allocation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Instalasi Modul

In [None]:
!pip install nltk
!pip install Sastrawi
!pip install regex
!pip install Unidecode
!pip install html
!pip install textblob
!pip install tqdm

## Import Modul

In [1]:
import pandas as pd
import numpy as np
import string
import nltk

# modul preprocessing
nltk.download('punkt')
import re
from unidecode import unidecode
from html import unescape
from textblob import TextBlob
from tqdm import tqdm
# modul stopword & stemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

## Load Data

In [2]:
# Load the dataset CSV directly from the project's GitHub repository.
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/SalikFillah/Topic-Modelling/main/anies_baswedan_2024.csv'
)

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,title,href,body,date
0,"ANIES BASWEDAN INDONESIA on Instagram: ""Imam B...",https://www.instagram.com/p/CHKVWZrnYOT/,"940 likes, 43 comments - ANIES BASWEDAN INDONE...",2023-01-02
1,"BuddyKu Headlines on Instagram: ""Buddies! Baka...",https://www.instagram.com/p/Cn0MrvFPJdB/,Bakal Calon Presiden (Bacapres) 2024 dari Part...,2023-01-02
2,"SINDOnews on Instagram: ""Meski memiliki elekta...",https://www.instagram.com/p/CeAjzwnP0wi/,"309 likes, 29 comments - SINDOnews (@sindonews...",2023-01-02
3,"ANIES BASWEDAN INDONESIA on Instagram: ""Denger...",https://www.instagram.com/p/ClEVrd9IOEe/,"1,286 likes, 70 comments - ANIES BASWEDAN INDO...",2023-01-02
4,"ANIES BASWEDAN on Instagram: ""Calon Presiden I...",https://www.instagram.com/p/Cm6c9HFr7Cu/,"9 likes, 0 comments - ANIES BASWEDAN (@aniesra...",2023-01-02
5,"djawanewscom on Instagram: ""Nama Gubernur DKI ...",https://www.instagram.com/p/CXLJQeqNF8C/,"19 likes, 2 comments - @djawanewscom on Instag...",2023-01-02
6,"inilahcom on Instagram: ""Bakal calon presiden ...",https://www.instagram.com/p/CrU91LWvPpV/,"882 likes, 98 comments - inilahcom (@inilah_co...",2023-01-02
7,"Anies Untuk Perubahan on Instagram: ""Tidak ada...",https://www.instagram.com/p/Cra_aqzPFb2/,"11 likes, 2 comments - Anies Untuk Perubahan (...",2023-01-02
8,"inilahcom on Instagram: ""Gubernur DKI Jakarta ...",https://www.instagram.com/p/CXAlS8TPSQE/,"103 likes, 9 comments - inilahcom (@inilah_com...",2023-01-02
9,"BERGERAK FOR ANIES BASWEDAN on Instagram: ""BER...",https://www.instagram.com/p/CkI5vYZpeKB/,"29 likes, 0 comments - BERGERAK FOR ANIES BASW...",2023-01-02


## Preprocessing Data

### Load Slang atau Singkatan
modifikasi sesuka hati jika sekiranya masih terdapat singkatan yang perlu diubah.

In [None]:
# Slang words / abbreviations mapped to the replacement applied during cleaning.
# Extend this mapping when other abbreviations show up in the data.
slang = dict(
    tdk='tidak',
    ketum='ketua umum',
    menjadi='jadi',
    timnas='tim nasional',
    membatalkan='batal',
    alas='alasan',
    kelem='kelemahan',
)

### Load Stopword
modifikasi sesuka hati jika sekiranya masih terdapat kata yang seharusnya dihilangkan.

In [None]:
# Sastrawi helpers: the stop-word factory and a stemmer instance.
factory = StopWordRemoverFactory()
stemmer = StemmerFactory().create_stemmer()

# Stop words returned by the Sastrawi factory.
Sastrawi_StopWords_id = set(factory.get_stop_words())

# Additional stop words: every single letter a-z plus a handful of extra words.
tambahan = set(string.ascii_lowercase) | {
    'akan',
    'bisa', 'bahwa',
    'comment', 'comments',
    'dari', 'di',
    'instagram',
    'like', 'likes',
    'menjadi',
    'sebagai', 'saja',
}

# Merge both collections into the final stop-word set and show it.
Sastrawi_StopWords_id = Sastrawi_StopWords_id | tambahan
print(Sastrawi_StopWords_id)

### NLP (Natural Language Processing)
Membersihkan data teks dari karakter-karakter yang tidak diperlukan, serta menangani stopword dan slang atau singkatan yang sudah dimuat sebelumnya, dan lain sebagainya.

Note : setiap platform media sosial memiliki cara pembersihan data nya masing-masing (modifikasi sesuka hati).

In [None]:
# Patterns are compiled once when the cell runs instead of on every call.
_URL_RE = re.compile(r'(\w+:\/\/\S+)')
_HASHTAG_RE = re.compile(r'#\w+\b')
_USERNAME_RE = re.compile(r'@\w+\b\s*')
_ENTITY_RE = re.compile(r'&\w+;')
_NUMBER_RE = re.compile(r'\b\d+\b')
_SYMBOL_RE = re.compile(r'[^\w\s]+')
_WHITESPACE_RE = re.compile(r'\s+')


def cleanbody(text):
    """Clean one post body and return it as a space-separated string of stems.

    Steps, in order: decode HTML entities, remove URLs, hashtags and
    ``@username`` mentions, remove stand-alone numbers, replace every remaining
    symbol with a space, collapse whitespace, lower-case, transliterate with
    ``unidecode``, expand tokens found in ``slang``, drop tokens that are in
    ``Sastrawi_StopWords_id`` or shorter than three characters, and finally
    stem the result with ``stemmer``.

    Parameters
    ----------
    text : str
        Raw text of a post. Any non-string value (for example the ``NaN`` or
        ``None`` pandas uses for a missing cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned, stemmed text; ``''`` when nothing is left after cleaning.
    """
    # Missing cells arrive as float NaN / None; there is nothing to clean.
    if not isinstance(text, str):
        return ''

    # Decode entities first: once '&' and ';' are stripped below they can no
    # longer be recognised.
    text = unescape(text)

    # Remove URLs, hashtags and Instagram user names (tokens starting with "@").
    text = _URL_RE.sub(' ', text)
    text = _HASHTAG_RE.sub(' ', text)
    text = _USERNAME_RE.sub(' ', text)

    # Remove entity-shaped leftovers that unescape() did not decode.
    text = _ENTITY_RE.sub(' ', text)

    # Remove stand-alone numbers, then turn every symbol into a space so that
    # words separated only by punctuation ("a,b") stay separate tokens.
    text = _NUMBER_RE.sub(' ', text)
    text = _SYMBOL_RE.sub(' ', text)

    # Collapse all whitespace (including \n and \r) and normalise case.
    text = _WHITESPACE_RE.sub(' ', text).strip().lower()
    if not text:
        return ''

    # Transliterate the remaining characters with unidecode.
    text = unidecode(text)

    # Expand slang / abbreviations token by token.
    tokens = [slang.get(str(tok), str(tok)) for tok in TextBlob(text).words]

    # Drop stop words and tokens shorter than three characters.
    kept = [tok for tok in tokens if tok not in Sastrawi_StopWords_id and len(tok) > 2]

    # Stemming.
    return stemmer.stem(' '.join(kept))

In [None]:
# Apply the cleaning function to every post body and store the result in a new column.
# Iterating over the column itself (rather than df.iterrows()) gives tqdm a length,
# so the bar can show a total, and replaces the per-row df.at writes with one assignment.
df['clean_body'] = [cleanbody(body) for body in tqdm(df['body'])]

In [None]:
# Preview the first rows, now including the new `clean_body` column.
df.head()

### Lemmatisasi & Pos-Tag
Bagian preprocessing yang paling vital atau krusial untuk proses LDA dibandingkan yang lainnya, yaitu identifikasi kata dasar serta pengelompokan kata berdasarkan kategori kata tersebut.

In [None]:
def NLPfilter(t, filters):
  """Return the words of ``t`` whose POS tag is contained in ``filters``.

  The text is passed through ``nlp_id``; tokens of length two or less are
  dropped, the remaining token strings are tagged as a single sentence with
  ``ct.tag_sents`` and only the words whose tag is in ``filters`` are kept.

  NOTE(review): ``nlp_id`` and ``ct`` are not defined in the visible part of
  the notebook - presumably a language pipeline and a POS tagger; confirm they
  are created before this cell runs.
  """
  # Keep the string form of every token longer than two characters.
  words = [str(tok) for tok in nlp_id(t) if len(tok) > 2]

  # Tag the words as one sentence; take the tagged result for that sentence.
  tagged_sentence = ct.tag_sents([words])[0]

  # Each tagged item is indexed as (word, tag); keep the word when the tag matches.
  selected = []
  for pair in tagged_sentence:
    if pair[1] in filters:
      selected.append(pair[0])
  return selected

Note:
- NN : kata benda tunggal (meja, buku, kucing, cinta, ...)
- NNP : kata benda tunggal khusus (indonesia, google, nike, tokyo, ...)
- NNS : kata benda jamak (buku-buku, meja-meja, ...)
- NNPS : kata benda jamak khusus (beatles, avengers, simpsons, ...)
- JJ : kata sifat (marah, tinggi, besar, indah, ...)

In [None]:
# Take the column produced by the preprocessing step.
data = df['clean_body'].values

# POS tags (word categories) to keep.
filters = {'NN', 'NNP', 'NNS', 'NNPS', 'JJ'}

# Apply the POS filter to every document. tqdm wraps the array directly so it
# can read its length and show a total; the index from enumerate() was never used.
data_postTag = [NLPfilter(doc, filters) for doc in tqdm(data)]

# Show the filtered words of the first document.
' '.join(data_postTag[0])

In [None]:
# Keep only the documents that still hold at least one token after POS filtering.
data = list(filter(None, data_postTag))