# Topic Modelling

In [None]:
!pip install Sastrawi




"Sastrawi" merujuk pada sebuah proyek open-source di Indonesia yang dikembangkan untuk pemrosesan bahasa alami (Natural Language Processing atau NLP) dalam bahasa Indonesia. Sastrawi memiliki fokus khusus pada pemrosesan bahasa untuk Bahasa Indonesia, dan salah satu aspek utamanya adalah stemming atau pereduksian kata.



In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import warnings
import pandas as pd
import numpy as np
import nltk
import re
import csv

# Fetch the NLTK resources used below: the stopword lists and the Punkt
# sentence/word tokenizer data that word_tokenize relies on. Recent NLTK
# releases (3.9+) load the tokenizer from 'punkt_tab' instead of the pickled
# 'punkt' package, so both are requested to keep this working across versions.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
  nltk.download(resource)

# Silence every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation warnings from pandas/scikit-learn.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the dataset (titles, authors, supervisors and abstracts) from GitHub.
DATASET_URL = 'https://raw.githubusercontent.com/SalmatulFarida/datasett/main/dataPTA.csv'
df = pd.read_csv(DATASET_URL)
df

Unnamed: 0.1,Unnamed: 0,Judul,Nama Penulis,Pembimbing I,Pembimbing II,Abstrak
0,0,Pengembangan Game Edukasi 2 Dimensi Untuk Mate...,Nurrohmat Hidayatullah Akbar,"Arik Kurniawati, S.Kom. M.T.","Puji Rahayu Ningsih, S.Pd., M.Pd.",Materi struktur dasar algoritma pemrograman me...
1,1,Pengembangan Media Pembelajaran Sistem Bilanga...,Cholilah,"Arik Kurniawati, S.kom., MT.","Wanda Ramansyah, S.Pd., M.Pd","Pada Mata Pelajaran Sistem Komputer, siswa har..."
2,2,PENGARUH MEDIA PEMBELAJARAN E-LEARNING BERBASI...,TAUFIKUR RAHMAN,"MEDIKA RISNASARI, S.ST.,M.T.","MUCHAMAD ARIF, S.PD.,M.PD.",Penelitian ini bertujuan untuk mengetahui peng...
3,3,PROFIL BERPIKIR KRITIS SISWA KELAS X TKJ DITIN...,Yuliana Wardani,"Puji Rahayu Ningsih,S.Pd.,M.Pd","Sigit Dwi Saputo, S.Pd.,M.Pd",Abstrak\nPenelitian ini bertujuan untuk menget...
4,4,PENGEMBANGAN GAME EDUKASI 3D STRUKTUR ALGORITM...,Deny Prasetyo,"Arik Kurniawati, S. Kom., M. T.","Sigit Dwi Saputro, S.Pd., M. Pd.",Mata pelajaran pemrograman dasar merupakan sal...
...,...,...,...,...,...,...
345,345,PENGEMBANGAN PERANGKAT PEMBELAJARAN MODEL PEMB...,Nasiruddin,"Puji Rahayu Ningsih, S.Pd.,M.Pd","Muhamad Afif Effindi, S.Kom.,M.T",Abstrak\nPenelitian ini bertujuan untuk mengem...
346,346,Analisis Kesalahan Siswa Berdasarkan Prosedur ...,Samaaul Badi'ah,"Puji Rahayu Ningsih, S.Pd., M.Pd.","Medika Risnasari, S.ST., M.T",Siswa mengalami kesulitan dalam materi routing...
347,347,Pengembangan Media Pembelajaran Trainer Komput...,Mahmudy,"Wanda Ramansyah, S.Pd., M.Pd","Muhamad Afif Effindi, S.Kom., M.T",ABSTRAK\n\nPenelitian ini bertujuan untuk menu...
348,348,Pengembangan Perangkat Rencana Pembelajaran De...,Yudhistira PIP,"Ariesta Kartika Sari, S.Si., M.Pd","Wanda Ramansyah, S.Pd., M.Pd",Penelitian ini di latar belakangi oleh perangk...


#### Cleaning data

Pembersihan data (data cleaning) dalam pemrosesan bahasa alami (NLP) adalah serangkaian langkah untuk mengolah dan membersihkan data teks sehingga dapat diolah dengan lebih baik oleh model atau algoritma pemrosesan bahasa alami. Pembersihan data merupakan tahap penting dalam pengolahan teks karena dapat membantu meningkatkan kualitas analisis atau model yang dibangun.

In [None]:
def cleaning(text):
  """Strip everything except ASCII letters and whitespace from *text*.

  Digits, punctuation and non-ASCII characters are deleted (not replaced by a
  space), so the fragments around them are joined: ``"e-learning"`` becomes
  ``"elearning"``. Leading and trailing whitespace is trimmed; inner
  whitespace, including newlines, is kept as is.

  Args:
    text: Raw text. Non-string values (``None``, or ``NaN`` from an empty CSV
      cell) are treated as empty text instead of raising ``TypeError``.

  Returns:
    The cleaned string, or ``''`` for non-string input.
  """
  if not isinstance(text, str):
    return ''
  return re.sub(r'[^a-zA-Z\s]', '', text).strip()

# Clean every abstract and keep the result in a new 'data_clean' column.
df['data_clean'] = df['Abstrak'].map(cleaning)
df['data_clean']

0      Materi struktur dasar algoritma pemrograman me...
1      Pada Mata Pelajaran Sistem Komputer siswa haru...
2      Penelitian ini bertujuan untuk mengetahui peng...
3      Abstrak\nPenelitian ini bertujuan untuk menget...
4      Mata pelajaran pemrograman dasar merupakan sal...
                             ...                        
345    Abstrak\nPenelitian ini bertujuan untuk mengem...
346    Siswa mengalami kesulitan dalam materi routing...
347    ABSTRAK\n\nPenelitian ini bertujuan untuk menu...
348    Penelitian ini di latar belakangi oleh perangk...
349    Penelitian ini bertujuan menghasilkan perangka...
Name: data_clean, Length: 350, dtype: object

#### Tokenizing

Tokenisasi adalah proses memecah teks atau kalimat menjadi unit-unit yang lebih kecil, yang disebut token. Token bisa berupa kata, frasa, atau entitas lainnya, tergantung pada konteks dan tingkat detail yang diinginkan. Tujuan dari tokenisasi adalah untuk mempermudah analisis atau pengolahan lebih lanjut pada teks.

In [None]:
def tokenizer(text):
  """Lower-case *text* and split it into tokens with NLTK's word_tokenize."""
  lowered = text.lower()
  tokens = word_tokenize(lowered)
  return tokens


# Tokenize each cleaned abstract; the result is stored in a 'Tokenizing' column.
df['Tokenizing'] = df['data_clean'].map(tokenizer)
df['Tokenizing']

0      [materi, struktur, dasar, algoritma, pemrogram...
1      [pada, mata, pelajaran, sistem, komputer, sisw...
2      [penelitian, ini, bertujuan, untuk, mengetahui...
3      [abstrak, penelitian, ini, bertujuan, untuk, m...
4      [mata, pelajaran, pemrograman, dasar, merupaka...
                             ...                        
345    [abstrak, penelitian, ini, bertujuan, untuk, m...
346    [siswa, mengalami, kesulitan, dalam, materi, r...
347    [abstrak, penelitian, ini, bertujuan, untuk, m...
348    [penelitian, ini, di, latar, belakangi, oleh, ...
349    [penelitian, ini, bertujuan, menghasilkan, per...
Name: Tokenizing, Length: 350, dtype: object

#### Stopword


Stopword adalah kata-kata umum yang biasanya diabaikan atau dihapus dalam proses analisis teks karena dianggap tidak memberikan kontribusi signifikan terhadap makna suatu kalimat. Dalam bahasa Inggris contohnya "the", "and", "is", dan "in"; dalam bahasa Indonesia contohnya "dan", "oleh", "ini", dan "untuk". Penghapusan stopword membantu mengurangi kompleksitas data dan memfokuskan analisis pada kata-kata yang lebih bermakna.

In [None]:
# Custom stopwords

from nltk.corpus import stopwords

# Load NLTK's Indonesian stopword list and store it as a set, so that more
# words can be merged into it.
indonesian_stopword_list = stopwords.words("indonesian")
stopwords_indonesia = set(indonesian_stopword_list)

# Show the stopword set as loaded (no custom words have been added yet).
print(stopwords_indonesia)


{'tadi', 'setidak-tidaknya', 'dipersoalkan', 'setengah', 'tidak', 'semuanya', 'tandas', 'dimungkinkan', 'atau', 'cukupkah', 'teringat-ingat', 'sendiri', 'diri', 'terdahulu', 'selamanya', 'ibarat', 'lalu', 'tahun', 'meyakini', 'tanya', 'tentunya', 'tadinya', 'tengah', 'seringnya', 'mendatangi', 'menurut', 'sekadarnya', 'katakan', 'tanyakan', 'sudah', 'tegasnya', 'jawabnya', 'bagaimanakah', 'sebagian', 'sebuah', 'datang', 'boleh', 'sekaligus', 'menginginkan', 'dimisalkan', 'masing', 'khususnya', 'mengungkapkan', 'merasa', 'tersampaikan', 'keadaan', 'sayalah', 'belum', 'sekitar', 'amatlah', 'sejenak', 'malah', 'dialah', 'soal', 'dimaksud', 'maka', 'justru', 'bermula', 'sering', 'berjumlah', 'melihat', 'ataukah', 'oleh', 'meminta', 'setiap', 'segera', 'dilakukan', 'adanya', 'keinginan', 'jadi', 'dan', 'yakin', 'menggunakan', 'sepantasnyalah', 'menyiapkan', 'biasa', 'kurang', 'selanjutnya', 'dimulailah', 'sesegera', 'memerlukan', 'sebagai', 'kenapa', 'ia', 'mulanya', 'begitupun', 'setidakny

In [None]:
# Extra stopwords to add on top of the NLTK list.
custom_stopwords = {"bisa-bisanya", "sebisanya", "mungkin"}

# Merge them into the existing stopword set in place.
stopwords_indonesia |= custom_stopwords

# Show the stopword set after the custom words have been merged in.
print(stopwords_indonesia)

{'tadi', 'setidak-tidaknya', 'dipersoalkan', 'setengah', 'tidak', 'semuanya', 'tandas', 'dimungkinkan', 'atau', 'cukupkah', 'teringat-ingat', 'sendiri', 'diri', 'terdahulu', 'selamanya', 'ibarat', 'lalu', 'tahun', 'meyakini', 'tanya', 'tentunya', 'tadinya', 'tengah', 'seringnya', 'mendatangi', 'menurut', 'sekadarnya', 'katakan', 'tanyakan', 'sudah', 'tegasnya', 'jawabnya', 'bagaimanakah', 'sebagian', 'sebuah', 'datang', 'boleh', 'sekaligus', 'menginginkan', 'dimisalkan', 'masing', 'khususnya', 'mengungkapkan', 'merasa', 'tersampaikan', 'keadaan', 'sayalah', 'belum', 'sekitar', 'amatlah', 'sejenak', 'malah', 'dialah', 'soal', 'dimaksud', 'maka', 'justru', 'bermula', 'sering', 'berjumlah', 'melihat', 'ataukah', 'oleh', 'meminta', 'setiap', 'segera', 'dilakukan', 'adanya', 'keinginan', 'jadi', 'dan', 'yakin', 'menggunakan', 'sepantasnyalah', 'menyiapkan', 'biasa', 'kurang', 'selanjutnya', 'dimulailah', 'sesegera', 'memerlukan', 'sebagai', 'kenapa', 'ia', 'mulanya', 'begitupun', 'setidakny

In [None]:
# Plain NLTK stopword list, kept under its original name and type.
corpus = stopwords.words('indonesian')

# Set used for the actual filtering: constant-time membership tests, and it
# also contains the custom stopwords merged into `stopwords_indonesia`, which
# a filter based on `corpus` alone would ignore.
stopword_lookup = set(corpus) | set(stopwords_indonesia)


def stopwordText(words):
  """Return the tokens in *words* that are not stopwords.

  Args:
    words: Iterable of tokens for one document. Matching is exact
      (case-sensitive) against the stopword set.

  Returns:
    A new list with the stopwords removed, in the original order.
  """
  return [word for word in words if word not in stopword_lookup]


df['Stopword Removal'] = df['Tokenizing'].apply(stopwordText)

# Join the remaining tokens back into one space-separated string per document.
df['stopword'] = df['Stopword Removal'].apply(lambda x: ' '.join(x))
df['stopword']

0      materi struktur dasar algoritma pemrograman ma...
1      mata pelajaran sistem komputer siswa memahami ...
2      penelitian bertujuan pengaruh media pembelajar...
3      abstrak penelitian bertujuan profil berpikir k...
4      mata pelajaran pemrograman dasar salah mata pe...
                             ...                        
345    abstrak penelitian bertujuan mengembangkan per...
346    siswa mengalami kesulitan materi routing mata ...
347    abstrak penelitian bertujuan menumbuhkan daya ...
348    penelitian latar belakangi perangkat pembelaja...
349    penelitian bertujuan menghasilkan perangkat pe...
Name: stopword, Length: 350, dtype: object

#### TF-IDF


TF-IDF (Term Frequency-Inverse Document Frequency) adalah suatu metode dalam pemrosesan bahasa alami (NLP) yang digunakan untuk memberikan bobot (weight) pada kata-kata dalam suatu dokumen berdasarkan seberapa sering kata tersebut muncul dalam dokumen tersebut dan seberapa unik kata tersebut terhadap seluruh korpus dokumen.

In [None]:
def tfidf(dokumen):
  """Build a TF-IDF table for a collection of documents.

  Args:
    dokumen: Iterable of strings, one per document (e.g. a pandas Series).

  Returns:
    A ``(vectorizer, final_tfidf)`` tuple: the fitted ``TfidfVectorizer`` and
    a DataFrame with one row per document, an ``'Abstrak'`` column holding the
    document text, followed by one weight column per vocabulary term.
  """
  # Materialise once: the rows of the matrix are positional, so the text
  # column must be positional too. Inserting a Series directly would align on
  # its index and give NaN/misplaced text when that index is not 0..n-1.
  documents = list(dokumen)

  vectorizer = TfidfVectorizer()
  # Dense array: convenient for display and CSV export.
  weights = vectorizer.fit_transform(documents).toarray()
  terms = vectorizer.get_feature_names_out()

  final_tfidf = pd.DataFrame(weights, columns=terms)
  final_tfidf.insert(0, 'Abstrak', documents)

  return (vectorizer, final_tfidf)

tfidf_vectorizer, final_tfidf = tfidf(df['stopword'])
final_tfidf

Unnamed: 0,Abstrak,absensi,abstra,abstrak,acak,acccess,accelerated,acceptance,acception,access,...,yangrendah,yoga,yslow,yudhistira,yx,zaman,zhitung,zona,ztabel,zulfatun
0,materi struktur dasar algoritma pemrograman ma...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,mata pelajaran sistem komputer siswa memahami ...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,penelitian bertujuan pengaruh media pembelajar...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,abstrak penelitian bertujuan profil berpikir k...,0.0,0.0,0.047875,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,mata pelajaran pemrograman dasar salah mata pe...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,abstrak penelitian bertujuan mengembangkan per...,0.0,0.0,0.042221,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
346,siswa mengalami kesulitan materi routing mata ...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
347,abstrak penelitian bertujuan menumbuhkan daya ...,0.0,0.0,0.045387,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
348,penelitian latar belakangi perangkat pembelaja...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Save the TF-IDF table as CSV, without writing the row index.
final_tfidf.to_csv('HasilTF-IDF.csv', index=False)

#### Term frequency

Term frequency (TF) merujuk pada seberapa sering sebuah istilah atau kata tertentu muncul dalam kumpulan data tertentu, seperti dokumen, kumpulan dokumen, atau korpus.

In [None]:
def term_freq(dokumens):
  """Build a raw term-count table for a collection of documents.

  Args:
    dokumens: Iterable of strings, one per document (e.g. a pandas Series).

  Returns:
    A ``(vectorizer, final_tf, tf_matrix, terms)`` tuple: the fitted
    ``CountVectorizer``; a DataFrame with an ``'Abstrak'`` text column followed
    by one count column per term; the dense count matrix; and the term names
    in column order.
  """
  # Materialise once so the text column is inserted by position, matching the
  # positional rows of the count matrix. Inserting a Series directly would
  # align on its index and misplace text when that index is not 0..n-1.
  documents = list(dokumens)

  vectorizer = CountVectorizer()
  tf_matrix = vectorizer.fit_transform(documents).toarray()
  terms = vectorizer.get_feature_names_out()

  final_tf = pd.DataFrame(tf_matrix, columns=terms)
  final_tf.insert(0, 'Abstrak', documents)

  return (vectorizer, final_tf, tf_matrix, terms)

tf_vectorizer, final_tf, tf_matrix, tf_terms = term_freq(df['stopword'])
final_tf

Unnamed: 0,Abstrak,absensi,abstra,abstrak,acak,acccess,accelerated,acceptance,acception,access,...,yangrendah,yoga,yslow,yudhistira,yx,zaman,zhitung,zona,ztabel,zulfatun
0,materi struktur dasar algoritma pemrograman ma...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,mata pelajaran sistem komputer siswa memahami ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,penelitian bertujuan pengaruh media pembelajar...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,abstrak penelitian bertujuan profil berpikir k...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,mata pelajaran pemrograman dasar salah mata pe...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,abstrak penelitian bertujuan mengembangkan per...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
346,siswa mengalami kesulitan materi routing mata ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
347,abstrak penelitian bertujuan menumbuhkan daya ...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
348,penelitian latar belakangi perangkat pembelaja...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Save the term-count table as CSV, without writing the row index.
final_tf.to_csv('HasilTermFrequensi.csv', index=False)


#### Logarithm frequency

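Logaritma frekuensi sering digunakan untuk mengatasi masalah ketidakseimbangan dalam representasi frekuensi kata-kata dalam dokumen. Di sini bobot dihitung sebagai $\log_{10}(tf + 1)$, sehingga kata dengan frekuensi 0 tetap berbobot 0.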

In [None]:
def logarithm_freq(dokumens):
  """Return ``log10(dokumens + 1)``, computed element-wise by NumPy.

  The ``+ 1`` shift maps a count of 0 to a weight of 0 rather than ``-inf``.
  """
  shifted = dokumens + 1
  return np.log10(shifted)

# Apply the log scaling column by column to the raw term counts, then put the
# document text in front as the first column.
tf_counts = pd.DataFrame(tf_matrix, columns=tf_terms)
df_logarithm_freq = tf_counts.apply(logarithm_freq)
df_logarithm_freq.insert(0, 'Abstrak', df['stopword'])
df_logarithm_freq

Unnamed: 0,Abstrak,absensi,abstra,abstrak,acak,acccess,accelerated,acceptance,acception,access,...,yangrendah,yoga,yslow,yudhistira,yx,zaman,zhitung,zona,ztabel,zulfatun
0,materi struktur dasar algoritma pemrograman ma...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,mata pelajaran sistem komputer siswa memahami ...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,penelitian bertujuan pengaruh media pembelajar...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,abstrak penelitian bertujuan profil berpikir k...,0.0,0.0,0.30103,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,mata pelajaran pemrograman dasar salah mata pe...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,abstrak penelitian bertujuan mengembangkan per...,0.0,0.0,0.30103,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
346,siswa mengalami kesulitan materi routing mata ...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
347,abstrak penelitian bertujuan menumbuhkan daya ...,0.0,0.0,0.30103,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
348,penelitian latar belakangi perangkat pembelaja...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
