# Tugas 9 (LSA Topic Modelling)

In [None]:
# Mount Google Drive when running on Colab. The dataset used below is read from
# a public URL, so outside Colab the mount is skipped instead of aborting the run.
try:
    from google.colab import drive
except ImportError:
    print("google.colab not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

# TOPIK MODELLING

Topik modelling digunakan untuk mengelompokkan data berdasarkan topik tertentu.

Data : komentar YouTube yang dikumpulkan dengan kata kunci "capres2024".

Tugas : LSA : TF-IDF & SVD (Singular Value Decomposition)

Output : 
- Bobot kata terhadap masing-masing topik
- Bobot setiap topik terhadap dokumen

### Import Library dan Memuat Data

In [None]:
import pandas as pd
# Source CSV hosted on GitHub; it provides the raw `comment` column and the
# pre-cleaned `comment (clean)` column used for modelling.
DATA_URL = 'https://raw.githubusercontent.com/RBellaApriliaDamayanti22/Datasets/main/commentyoutube.csv'

df = pd.read_csv(filepath_or_buffer=DATA_URL)
df  # last expression: renders the (truncated) frame as the cell output

Unnamed: 0,comment,comment (clean)
0,ID\nSkip navigation\nSign in\nWorship Piano: B...,id skip navigation sign ini worship piano begi...
1,"Pak Prabowo Subianto saya mohon,\nAnda jangan ...",pak prabowo subianto saya mohon anda jangan be...
2,gw dan sekeluarga adalah loyalis PDI Perjuanga...,gue dan sekeluarga adalah loyalis pdi perjuang...
3,"ganjar salah satu yang menolak israel, berakib...",ganjar salah satu yang menolak israel berakiba...
4,"Pak Prabowo sekarang auranya adem,beda waktu n...",pak prabowo sekarang auranya adem beda waktu n...
...,...,...
1160,Di,di
1161,Siapa pula yg mau pilih prabowo....history mas...,siapa pula yang mau pilih prabowo history masa...
1162,Lembaga survei bayaran\nAslinya pak Anies yg t...,lembaga survei bayaran aslinya pak anies yang ...
1163,Siapapun calonya presiden 2024\nKami rakyat t...,siapapun calonya presiden kami rakyat tetap an...


### Modelling

In [None]:
import nltk
# Download the NLTK 'stopwords' corpus; the modelling cell reads its Indonesian
# list via stopwords.words('indonesian').
nltk.download('stopwords', quiet=True)

True

In [None]:
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
# Indonesian stop-word list. Note: this rebinds `stopwords` from the NLTK corpus
# reader imported just above to a plain list of words.
stopwords = stopwords.words('indonesian')

N_TOPICS = 4        # number of latent topics (SVD components) to extract
RANDOM_SEED = 42    # fixes the randomized SVD solver so re-runs give the same topics

# Build the document x term TF-IDF matrix.
tokenizer = RegexpTokenizer(r'\w+')
vectorizer = TfidfVectorizer(lowercase=True,
                        stop_words=stopwords,
                        tokenizer=tokenizer.tokenize,
                        # A custom tokenizer overrides token_pattern; passing None
                        # avoids scikit-learn's "token_pattern will not be used" warning.
                        token_pattern=None)

# Missing cleaned comments are treated as empty documents instead of aborting the fit.
tfidf_matrix = vectorizer.fit_transform(df['comment (clean)'].fillna(''))

# Decompose the TF-IDF matrix with truncated SVD (LSA). Each row of lsa_matrix
# holds one document's weight on every topic.
svd_model = TruncatedSVD(n_components=N_TOPICS, random_state=RANDOM_SEED)
lsa_matrix = svd_model.fit_transform(tfidf_matrix)



### Bobot kata terhadap masing-masing topik

In [None]:
# Term weights per topic: print the two highest-weighted terms of every SVD component.
terms = vectorizer.get_feature_names_out()

for topic_no, weights in enumerate(svd_model.components_):
    # Pair every term with its weight, then order the pairs by weight, largest first.
    ranked = sorted(zip(terms, weights), key=lambda pair: pair[1], reverse=True)
    top_terms_key = ranked[:2]
    label = "Topic " + str(topic_no) + ": "
    print(label, top_terms_key)

Topic 0:  [('prabowo', 0.7521155299878107), ('ganjar', 0.3372049759541212)]
Topic 1:  [('ganjar', 0.7503087350836983), ('pilih', 0.18707433096166365)]
Topic 2:  [('mahfud', 0.5854867878603073), ('md', 0.5186006537063022)]
Topic 3:  [('anis', 0.5705283022773839), ('pilih', 0.37813805344717316)]


### Bobot setiap topik terhadap dokumen

In [None]:
# Topic weights per document.
# Column names follow the number of SVD components instead of a hard-coded list,
# and the frame reuses df's index so the column-wise concat cannot misalign rows.
topic_columns = [f"Topik {i}" for i in range(lsa_matrix.shape[1])]
df_lsa = pd.DataFrame(lsa_matrix, columns=topic_columns, index=df.index)
df_lsa = pd.concat([df["comment (clean)"], df_lsa], axis=1)
# Dominant topic = position of the largest weight in each row. Ties, including
# all-zero rows, resolve to the first such position (topic 0 for all-zero rows).
df_lsa['Topik'] = df_lsa[topic_columns].to_numpy().argmax(axis=1)

df_lsa

Unnamed: 0,comment (clean),Topik 0,Topik 1,Topik 2,Topik 3,Topik
0,id skip navigation sign ini worship piano begi...,0.152406,0.051516,-0.010353,0.048775,0
1,pak prabowo subianto saya mohon anda jangan be...,0.307097,0.188707,0.024481,-0.116215,0
2,gue dan sekeluarga adalah loyalis pdi perjuang...,0.054225,0.070504,-0.006355,0.016698,1
3,ganjar salah satu yang menolak israel berakiba...,0.071637,0.146175,0.047347,-0.040282,1
4,pak prabowo sekarang auranya adem beda waktu n...,0.098516,-0.021400,-0.033330,-0.031598,0
...,...,...,...,...,...,...
1160,di,0.000000,0.000000,0.000000,0.000000,0
1161,siapa pula yang mau pilih prabowo history masa...,0.107990,0.008479,-0.050384,0.057005,0
1162,lembaga survei bayaran aslinya pak anies yang ...,0.030155,0.034397,-0.036700,0.113350,3
1163,siapapun calonya presiden kami rakyat tetap an...,0.118928,0.048969,-0.128882,0.278921,3


In [None]:
# Count how many documents were assigned to each dominant topic.
df_lsa['Topik'].value_counts()


0    558
3    327
1    218
2     62
Name: Topik, dtype: int64