<a href="https://colab.research.google.com/github/SalikFillah/Topic-Modelling/blob/main/Latent_Dirichlet_Allocation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Instalasi Modul

In [1]:
!pip install duckduckgo-search
!pip install nltk
!pip install Sastrawi
!pip install regex
!pip install unidecode
!pip install textblob
!pip install tqdm
!pip install scapy
!pip install python-crfsuite
!pip install gensim
!pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting duckduckgo-search
  Downloading duckduckgo_search-2.9.3-py3-none-any.whl (30 kB)
Collecting diskcache>=5.6.1
  Downloading diskcache-5.6.1-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.6/45.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests>=2.29.0
  Downloading requests-2.30.0-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: requests, diskcache, duckduckgo-search
  Attempting uninstall: requests
    Found existing installation: requests 2.27.1
    Uninstalling requests-2.27.1:
      Successfully uninstalled requests-2.27.1
Successfully installed diskcache-5.6.1 duckduckgo-search-2.9.3 requests-2.30.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/p

## Import Modul

In [2]:
import pandas as pd
import numpy as np
import string
import nltk

# modul stemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
stemmer = StemmerFactory().create_stemmer()

# modul preprocessing
nltk.download('punkt')
import re
from unidecode import unidecode
from html import unescape
from textblob import TextBlob
from tqdm import tqdm

# modul lemma & pos-tag
import spacy
from spacy.lang.id import Indonesian
from nltk.tag import CRFTagger
ct = CRFTagger()

# modul LDA
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

# modul DRT
from sklearn.manifold import TSNE

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Import Data

In [3]:
from datetime import datetime, timedelta
from duckduckgo_search import ddg

In [4]:
def duckweb(keywords='', N=30, lang='Id-Id', time=''):
    return ddg(keywords, region=lang, safesearch='off', time=time, max_results=N)

In [5]:
delta = timedelta(days=1)

In [6]:
start_date = datetime(2023, 4, 1)  # Tanggal mulai
end_date = datetime(2023, 4, 5)  # Tanggal akhir


df_list = []
while start_date < end_date:
    next_date = start_date + delta  # Tanggal berikutnya
    time_range = f"{start_date.strftime('%Y-%m-%d')}..{next_date.strftime('%Y-%m-%d')}"  # Rentang waktu untuk pencarian
    query = 'Anies Baswedan 2024 +site:www.instagram.com'
    print(time_range)
    results = duckweb(keywords=query, time=time_range)

    df = pd.DataFrame(results)
    
    df['date'] = start_date.strftime('%Y-%m-%d')
    df_list.append(df)

    start_date = next_date  # Setel tanggal berikutnya sebagai tanggal mulai

df_1 = pd.concat(df_list)

2023-04-01..2023-04-02
2023-04-02..2023-04-03
2023-04-03..2023-04-04
2023-04-04..2023-04-05


## Load Data

In [7]:
df = pd.read_csv('https://raw.githubusercontent.com/SalikFillah/Topic-Modelling/main/anies_twitter.csv')
df.head()

Unnamed: 0,title,href,body,date
0,Anies Rasyid Baswedan on Twitter,https://twitter.com/aniesbaswedan/status/16308...,Anies Rasyid Baswedan on Twitter-Anies Rasyid ...,2023-03-01
1,"on Twitter: ""Cuma Anies kepala daerah yg mampu...",https://twitter.com/ekowboy2/status/1630949801...,"on Twitter: ""Cuma Anies kepala daerah yg mampu...",2023-03-01
2,PKS memunculkan wacana duet 'CLBK' antara Anie...,https://twitter.com/detikcom/status/1631170165...,PKS memunculkan wacana duet 'CLBK' antara Anie...,2023-03-02
3,"detikcom on Twitter: ""Sampai saat ini cawapres...",https://twitter.com/detikcom/status/1631117058...,"detikcom on Twitter: ""Sampai saat ini cawapres...",2023-03-02
4,"tvOnenews on Twitter: ""Anies Baswedan menyampa...",https://twitter.com/tvOneNews/status/163123839...,"tvOnenews on Twitter: ""Anies Baswedan menyampa...",2023-03-02


## Preprocessing Data

### Load Slang atau Singkatan
Modifikasi sesuka hati jika sekiranya masih terdapat singkatan yang perlu diubah.

In [8]:
import urllib.request

# membaca data slang dari URL Github
url = 'https://raw.githubusercontent.com/taudataid/eLearning/master/data/slang.dic'
response = urllib.request.urlopen(url)
slang_id = eval(response.read())

# mencetak dictionary dalam format yang diinginkan
print("{")
for key, value in slang_id.items():
    print(f"    \"{key}\": \"{value}\",")
print("}")

{
    "1pun": "satupun",
    "7an": "tujuan",
    "Dr.": "doktor",
    "dr.": "dokter",
    "drg.": "dokter gigi",
    "Jkt": "Jakarta",
    "Jkw": "jokowi",
    "Napza": "narkoba psikotropika dan zat adiktif",
    "Nasihat": "nasehat",
    "ababil": "abg labil",
    "abis": "habis",
    "acc": "accord",
    "accord": "sesuai",
    "ad": "ada",
    "adl": "adalah",
    "adlah": "adalah",
    "adlh": "adalah",
    "administ": "administrasi",
    "adoh": "aduh",
    "afaik": "as far as i know",
    "agma": "agama",
    "aha": "tertawa",
    "ahaha": "haha",
    "ahiok": "ahok",
    "mehong": "mahal",
    "ahoax": "ahok",
    "ahokncc": "ahok",
    "aing": "saya",
    "aj": "saja",
    "aja": "saja",
    "ajak2": "ajak-ajak",
    "ajep-ajep": "dunia gemerlap",
    "ajj": "saja",
    "ak": "aku",
    "aka": "dikenal juga sebagai",
    "akherat": "akhirat",
    "akhirx": "akhirnya",
    "akika": "aku",
    "akko": "aku",
    "akkoh": "aku",
    "akku": "aku",
    "akn": "akan",
    "aktifis

### Load Stopword
Modifikasi sesuka hati jika sekiranya masih terdapat kata yang seharusnya dihilangkan.

In [9]:
import requests
import re

url = "https://raw.githubusercontent.com/taudataid/eLearning/master/data/stopwords_id.txt"
response = requests.get(url)
stopwords = set(re.findall(r'\w+', response.text))

# Tambahkan set tambahan ke dalam set stopwords
tambahan = set(['a', 'akan', 'atas', 
                'b', 'bisa', 'bahwa', 'bacapres', 'banyak',
                'c', 'comment', 'comments',
                'd', 'dari', 'di', 'diri',
                'e', 
                'f', 'followers', 'following', 'front', 
                'g', 
                'h',
                'i', 'instagram', 'inilahcom',
                'j', 'jateng', 'jalan', 
                'k',
                'l', 'like', 'likes', 'lalu',
                'm', 'menjadi', 'mulai', 'makin', 'meski',
                'n', 
                'o', 
                'p', 'photos', 'posts', 'punya',
                'q', 
                'r', 'rizieq',
                's', 'sebagai', 'saja', 'sama', 'orang', 'salah', 'selalu', 'satu', 'sindonews', 'saling', 'see',
                't', 'tribun',
                'u', 
                'v', 
                'w', 
                'x', 
                'y',
                'z',])
stopwords = stopwords.union(tambahan)


# Meletakkan setiap kata dalam kurung kurawal dan memberikan tanda kutip
stopwords_id = "{" + ", ".join([f'"{word}"' for word in stopwords]) + "}"

print(len(stopwords))

791


### NLP (Natural Language Preprocessing)
Membersihkan data teks dari karakter-karakter yang tidak diperlukan serta menangani stopword dan slang atau singkatan yang sudah di load sebelumnya dan lain sebagainya.

Note : setiap platform media sosial memiliki cara pembersihan data nya masing-masing (modifikasi sesuka hati).

In [10]:
def cleanbody(text):
    
    # menghapus url
    url_pattern = re.compile(r'(\w+:\/\/\S+)')
    text = url_pattern.sub(' ', text)
    
    # menghapus hashtag
    hashtag_pattern = re.compile(r'#\w+\b')
    text = hashtag_pattern.sub(' ', text)
    
    # menghapus nama pengguna instagram (memuat "@")
    username_pattern = re.compile(r'@\w+\b\s*')
    text = username_pattern.sub(' ', text)
    
    # menghapus angka
    text = re.sub(r'\b\d+\b|[^\w\s]', '', text)
    
    # mengahpus simbol
    symbol_pattern = re.compile(r'[^\w\s]+')
    text = symbol_pattern.sub(' ', text)
    
    # menghapus karakter yang tidak diperlukan (tergantung media sosial)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\r', ' ', text)
    text = re.sub(r'&\w+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    # menangani huruf kapital dan spasi
    text = unidecode(unescape(text.lower().strip()))
    
    # menangani slang atau singkatan 
    Tokens = TextBlob(text).words
    for i,t in enumerate(Tokens):
      if t in slang_id.keys():
        Tokens[i] = slang_id[t]
        
    # menangani stopword
    text = ' '.join([t for t in Tokens if str(t) not in stopwords_id and len(t)>2])
    
    # stemming
    text = stemmer.stem(text)
    
    
    return text

In [11]:
# aplikasikan fungsi ke dalam kolom variabel baru
df['clean_body'] = ''
for idx, post in tqdm(df.iterrows()):
    df.at[idx, 'clean_body'] = cleanbody(post.body)

58it [01:00,  1.04s/it]


In [12]:
df.head()

Unnamed: 0,title,href,body,date,clean_body
0,Anies Rasyid Baswedan on Twitter,https://twitter.com/aniesbaswedan/status/16308...,Anies Rasyid Baswedan on Twitter-Anies Rasyid ...,2023-03-01,anies rasyid baswedan twitteranies rasyid basw...
1,"on Twitter: ""Cuma Anies kepala daerah yg mampu...",https://twitter.com/ekowboy2/status/1630949801...,"on Twitter: ""Cuma Anies kepala daerah yg mampu...",2023-03-01,twitter anies kepala daerah selenggara event a...
2,PKS memunculkan wacana duet 'CLBK' antara Anie...,https://twitter.com/detikcom/status/1631170165...,PKS memunculkan wacana duet 'CLBK' antara Anie...,2023-03-02,partai adil sejahtera muncul wacana duet cinta...
3,"detikcom on Twitter: ""Sampai saat ini cawapres...",https://twitter.com/detikcom/status/1631117058...,"detikcom on Twitter: ""Sampai saat ini cawapres...",2023-03-02,detikcom twitter cawapres anies newsdetikcom p...
4,"tvOnenews on Twitter: ""Anies Baswedan menyampa...",https://twitter.com/tvOneNews/status/163123839...,"tvOnenews on Twitter: ""Anies Baswedan menyampa...",2023-03-02,tvonenews twitter anies baswedan anies basweda...


### Lemmatisasi & Pos-Tag
Bagian preprocessing yang paling vital atau krusial untuk proses LDA dari yang lainnya, yaitu identifikasi kata dasar serta pengelompokkan kata berdasarkan kategori kata tersebut.

In [13]:
# lemma & pos-tag bahasa indonesia
nlp_id = Indonesian()
!wget -P data/ https://raw.githubusercontent.com/taudata-indonesia/eLearning/master/data/all_indo_man_tag_corpus_model.crf.tagger
ct.set_model_file('data/all_indo_man_tag_corpus_model.crf.tagger')

--2023-05-09 03:58:16--  https://raw.githubusercontent.com/taudata-indonesia/eLearning/master/data/all_indo_man_tag_corpus_model.crf.tagger
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1722780 (1.6M) [application/octet-stream]
Saving to: ‘data/all_indo_man_tag_corpus_model.crf.tagger’


2023-05-09 03:58:16 (23.9 MB/s) - ‘data/all_indo_man_tag_corpus_model.crf.tagger’ saved [1722780/1722780]



In [14]:
def NLPfilter(t, filters):

  # lemmatisasi
  tokens = nlp_id(t)

  # tokenisasi
  tokens = [str(k) for k in tokens if len(k)>2]

  # pos-tag
  hasil = ct.tag_sents([tokens])
  
  return [k[0] for k in hasil[0] if k[1] in filters]

  Note :
- NN : kata benda tunggal (meja, buku, kucing, cinta, ...)
- NNP : kata benda tunggal khusus (indonesia, google, nike, tokyo, ...)
- NNS : kata benda jamak (buku-buku, meja-meja, ...)
- NNPS : kata benda jamak khusus (beatles, avengers, simpsons, ...)
- JJ : kata sifat (marah, tinggi, besar, indah, ...)

In [15]:
# ambil variabel kolom hasil preprocessing
data = df['clean_body'].values

# pilih kategori kata 
filters = set(['NN', 'NNP', 'NNS', 'NNPS', 'JJ'])

# aplikasikan fungsi ke dalam dataframe baru
data_postTag = []
for i, d in tqdm(enumerate(data)):
    data_postTag.append(NLPfilter(d,filters))

' '.join(data_postTag[0])

58it [00:00, 1118.53it/s]


'baswedan alam young global dunia cerita transformasi jakarta kota global'

In [16]:
# tokenisasi data kembali
data = [d for d in data_postTag if d]

## LDA (Latent Dirichlet Allocation)


In [17]:
# membuat representasi dictionary dari dokumen

# membuang token yang langka dan umum
dictionary_t = Dictionary(data)
dictionary_t.filter_extremes(no_below=2, no_above=0.90)

# membuat dictionary dan corpus yang diperlukan topic modelling
corpus_t = [dictionary_t.doc2bow(doc) for doc in data]
corpus_t = [t for t in corpus_t if t] # membuang corpus atau dokumen yang kosong

print('Number of unique tokens: %d' % len(dictionary_t))
print('Number of documents: %d' % len(corpus_t))
print(corpus_t[:1])

Number of unique tokens: 112
Number of documents: 56
[[(0, 1), (1, 1), (2, 1), (3, 1)]]


Pembuatan dataframe hasil keluaran dari algoritma LDA (Latent Dhiriclet Allocation).

In [18]:
def format_topics_sentences(ldamodel, corpus, texts, dates):
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = pd.concat([
                    sent_topics_df,                   
                pd.DataFrame([[int(topic_num), round(prop_topic, 4), topic_keywords]],
                             columns=["Dominant_Topic", "Perc_Contribution", "Topic_Keywords"])],
                    ignore_index=True,
                )
            else:
                break
    sent_topics_df.columns = ["Dominant_Topic", "Perc_Contribution", "Topic_Keywords"]

    # Add original text to the end of the output
    contents = pd.Series(texts)

    sent_topics_df = pd.concat([sent_topics_df, contents, pd.Series(dates)], axis=1)
    return sent_topics_df

Model DRT (Dimensional Reduction Technique).

In [19]:
def tsne_analysis(ldamodel, corpus):
    topic_weights = []
    for i, row_list in enumerate(ldamodel[corpus]):
        topic_weights.append([w for i, w in row_list])

    # Array of topic weights
    df_topics = pd.DataFrame(topic_weights).fillna(0).values

    # Keep the well separated points (optional)
    # arr = arr[np.amax(arr, axis=1) > 0.35]

    # Dominant topic number in each doc
    topic_nums = np.argmax(df_topics, axis=1)

    # tSNE Dimension Reduction
    try:
        tsne_model = TSNE(
            n_components=2, verbose=1, random_state=0, angle=0.99, init="pca"
        )
        tsne_lda = tsne_model.fit_transform(df_topics)
    except:
        print("TSNE_ANALYSIS WENT WRONG, PLEASE RE-CHECK YOUR BANK DATASET")
        return (topic_nums, None)

    return (topic_nums, tsne_lda)

Penggabungan dataframe sedemikian sehingga data ini lah yang nantinya di aplikasikan ke dalah dashboard LDA.

In [20]:
def lda_analysis(df):
    
    docs = list(df["clean_body"].values)

    punctuations = string.punctuation

    processed_docs = data
    print("Jumlah corpus atau dokumen", len(processed_docs))
    if len(processed_docs) < 11:
        print("INSUFFICIENT DOCS TO RUN LINEAR DISCRIMINANT ANALYSIS")
        return (None, None, None, None)

    print("Jumlah BoW (Bag of Words) corpus", len(corpus_t))
    print("Jumlah dictionary", len(list(dictionary_t.keys())))
    if len(list(dictionary_t.keys())) < 1:
        print("INSUFFICIENT DICTS TO RUN LINEAR DISCRIMINANT ANALYSIS")
        return (None, None, None, None)

    lda_model = LdaModel(
        corpus_t, num_topics=5, id2word=dictionary_t, passes=10
    )

    df_topic_sents_keywords = format_topics_sentences(
        ldamodel=lda_model,
        corpus=corpus_t,
        texts=docs,
        dates=list(df["date"].values),
    )
    print("Jumlah data", len(df_topic_sents_keywords))
    print("Data", df_topic_sents_keywords.head())
    df_dominant_topic = df_topic_sents_keywords.reset_index()
    df_dominant_topic.columns = [
        "Document_No",
        "Dominant_Topic",
        "Topic_Perc_Contrib",
        "Keywords",
        "Text",
        "Date",
    ]

    print("Hasil DRT")
    topic_num, tsne_lda = tsne_analysis(lda_model, corpus_t)

    return (tsne_lda, lda_model, topic_num, df_dominant_topic)

In [21]:
# aplikasikan fungsi dan simpan hasilnya ke dalam beberapa variabel berikut
tsne_lda, lda_model, topic_num, df_dominant_topic = lda_analysis(df)

Jumlah corpus atau dokumen 56
Jumlah BoW (Bag of Words) corpus 56
Jumlah dictionary 112
Jumlah data 58
Data    Dominant_Topic  Perc_Contribution  \
0             1.0             0.8369   
1             3.0             0.9259   
2             2.0             0.9722   
3             2.0             0.9526   
4             2.0             0.8381   

                                      Topic_Keywords  \
0  baswedan, twitter, anies, advokat, presiden, r...   
1  anies, twitter, demokrat, baswedan, daerah, pe...   
2  partai, adil, anies, baswedan, twitter, presid...   
3  partai, adil, anies, baswedan, twitter, presid...   
4  partai, adil, anies, baswedan, twitter, presid...   

                                                   0           1  
0  anies rasyid baswedan twitteranies rasyid basw...  2023-03-01  
1  twitter anies kepala daerah selenggara event a...  2023-03-01  
2  partai adil sejahtera muncul wacana duet cinta...  2023-03-02  
3  detikcom twitter cawapres anies newsdetikco

In [22]:
# data untuk pembuatan dashboard
df_dominant_topic = df_dominant_topic.dropna()
df_dominant_topic

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,Date
0,0,1.0,0.8369,"baswedan, twitter, anies, advokat, presiden, r...",anies rasyid baswedan twitteranies rasyid basw...,2023-03-01
1,1,3.0,0.9259,"anies, twitter, demokrat, baswedan, daerah, pe...",twitter anies kepala daerah selenggara event a...,2023-03-01
2,2,2.0,0.9722,"partai, adil, anies, baswedan, twitter, presid...",partai adil sejahtera muncul wacana duet cinta...,2023-03-02
3,3,2.0,0.9526,"partai, adil, anies, baswedan, twitter, presid...",detikcom twitter cawapres anies newsdetikcom p...,2023-03-02
4,4,2.0,0.8381,"partai, adil, anies, baswedan, twitter, presid...",tvonenews twitter anies baswedan anies basweda...,2023-03-02
5,5,3.0,0.7097,"anies, twitter, demokrat, baswedan, daerah, pe...",detikcom twitter majelis demokrat gelar pakat ...,2023-03-02
6,6,1.0,0.9463,"baswedan, twitter, anies, advokat, presiden, r...",gelora twitterusai gaya hedon jabat pajak bong...,2023-03-02
7,7,2.0,0.8979,"partai, adil, anies, baswedan, twitter, presid...",moga anies baswedan presiden https twittersemo...,2023-03-07
8,8,3.0,0.7979,"anies, twitter, demokrat, baswedan, daerah, pe...",anies rasyid baswedan twitteranies rasyid basw...,2023-03-07
9,9,1.0,0.8625,"baswedan, twitter, anies, advokat, presiden, r...",twitter citra turun citra turun tiran pki otak...,2023-03-07


In [23]:
topic_top3words = [
    (i, topic)
    for i, topics in lda_model.show_topics(formatted=False)
    for j, (topic, wt) in enumerate(topics)
    if j < 3
]

df_top3words_stacked = pd.DataFrame(
    topic_top3words, columns=['topic_id', 'words']
)
df_top3words = df_top3words_stacked.groupby("topic_id").agg(", ".join)
df_top3words.reset_index(level=0, inplace=True)

df_top3words

Unnamed: 0,topic_id,words
0,0,"anies, indonesia, insyaallah"
1,1,"baswedan, twitter, anies"
2,2,"partai, adil, anies"
3,3,"anies, twitter, demokrat"
4,4,"baswedan, anies, nama"


In [24]:
tsne_df = pd.DataFrame({
    "tsne_x": tsne_lda[:, 0],
    "tsne_y": tsne_lda[:, 1],
    "topic_num": topic_num,
    "doc_num": df_dominant_topic["Document_No"],
}
)

In [25]:
tsne_df

Unnamed: 0,tsne_x,tsne_y,topic_num,doc_num
0,-29.978558,-2.859974,1,0
1,-29.755043,-6.481581,3,1
2,-26.331724,-5.846019,0,2
3,-31.710438,-4.17339,2,3
4,-31.419201,-4.605425,2,4
5,-29.658388,-5.828054,3,5
6,-30.001173,-2.605744,1,6
7,-31.590034,-4.341528,2,7
8,-29.45406,-6.025149,3,8
9,-30.014652,-2.757243,1,9


In [26]:
from google.colab import files

df_dominant_topic.to_csv('tsne_df.csv', index=False)
files.download('tsne_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Referensi
1.   https://taudata.blogspot.com/2022/05/nlptm-07.html
2.   https://github.com/plotly/dash-sample-apps/blob/main/apps/dash-nlp/ldacomplaints.py
3. https://github.com/taudataid/eLearning