Sumber text clustering using NLTK library: https://github.com/lucas-de-sa/national-anthems-clustering/blob/master/Cluster_Anthems.ipynb

Sumber text clustering using Spacy library: https://github.com/kirralabs/text-clustering/blob/master/script/core/Clustering.ipynb

# A. Import Library

### a). Library if use NLTK

In [1]:
# Data Structures
import numpy  as np
import pandas as pd
import geopandas as gpd
import json

# Corpus Processing
import re
import nltk.corpus
from unidecode                        import unidecode
from nltk.tokenize                    import word_tokenize
from nltk                             import SnowballStemmer
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.preprocessing            import normalize

# K-Means
from sklearn import cluster

# Visualization and Analysis
import matplotlib.pyplot  as plt
import matplotlib.cm      as cm
import seaborn            as sns
from sklearn.metrics                  import silhouette_samples, silhouette_score
from wordcloud                        import WordCloud

# Map Viz
import folium
#import branca.colormap as cm
from branca.element import Figure

### b). Library if use Spacy

In [31]:
import os, mpld3, nltk
import numpy as np
import pandas as pd

In [32]:
from spacy.lang.id import Indonesian
import fnmatch
def getAllFileinFolder(folderpath):
    filelist = []
    for dirpath, dirs, files in os.walk(folderpath):
        for filename in fnmatch.filter(files, '*.txt'):
            filelist.append(dirpath + "/" + filename)
    return filelist

def writedataa(list, thname):
    file = open("sentence_rep_{}.txt".format(thname), "w");
    for x in sorted(set(list)):
        # for x in list:
        # hasil = x.replace('"','').replace("#","").replace("&nbsp;","" )
        file.write(x + "\n")
    file.close()

nlp = Indonesian()
def tokenize_and_stem(text):
    text = u'{}'.format(text)
    doc = nlp(text)
    stems = [t.lemma_ for t in doc]
    stems = [t.lower() for t in stems]
    return stems


def tokenize_only(text):
    text = u'{}'.format(text)
    doc = nlp(text)
    stems = [t.text for t in doc]
    stems = [t.lower() for t in stems]
    return stems

# B. Import Dataset

In [254]:
data_raw = pd.read_csv('~/Downloads/data-ticket-karyawan.csv', encoding='utf-8')
data_raw.columns = map(str.lower, data_raw.columns) #membuat huruf awal nama kolom menjadi tidak kapital

sumber = ['Lapor PPDB']
data_raw = data_raw.loc[data_raw['sumber'].isin(sumber)]
data_raw.head(6)

Unnamed: 0,sumber,tahun bulan,issue id,tracker,created on,closed on,start date,due date,project,subject,state,poin,id resolver,nama resolver,grup resolver,id creator,nama creator,ts menit,wkt resolved,st lembur
0,Lapor PPDB,2024-06,21639,Support,2024-06-29 09:09:00,2024-07-01 05:11:11,2024-06-29,2024-06-29,Kota Denpasar,[Closing Engine] Kota Denpasar 2024 - Jalur Zo...,Closed,,69.0,Febrian Hilmi Firdaus,DSO,69,Febrian Hilmi Firdaus,,2024-06-29 14:30:41,Lembur
1,Lapor PPDB,2024-06,21662,Support,2024-06-29 12:51:50,2024-07-01 05:10:57,2024-06-29,2024-06-29,Prov. Bali,[Kendala] Siswa Masih Bisa Langsung Pengajuan ...,Closed,,69.0,Febrian Hilmi Firdaus,DSO,69,Febrian Hilmi Firdaus,,2024-07-01 05:10:57,Normal
2,Lapor PPDB,2024-06,21088,Support,2024-06-22 05:30:34,2024-06-22 10:18:51,2024-06-22,2024-06-29,Prov. Nusa Tenggara Timur,[Closing Engine] Prov NTT 2024 - SMA All Jalur...,Closed,,69.0,Febrian Hilmi Firdaus,DSO,69,Febrian Hilmi Firdaus,,2024-06-22 08:54:49,Normal
3,Lapor PPDB,2024-06,21689,Support,2024-07-01 02:28:58,2024-07-01 02:46:11,2024-06-30,2024-07-01,Kanwil Prov. DKI Jakarta,[DEV] Pengecekan dan Penyesuaian hasil seleksi...,Closed,5.0,4.0,Azhar Mashuri,DIP,4,Azhar Mashuri,,2024-07-01 02:46:06,Lembur
4,Lapor PPDB,2024-06,21479,Support,2024-06-26 14:13:47,2024-07-01 01:27:33,2024-06-26,2024-06-26,Kota Sukabumi,Pengecekan File Ajuan Pendaftaran di Operator ...,Closed,,74.0,Hayan .,DSO,74,Hayan .,,2024-06-26 14:24:31,Normal
5,Lapor PPDB,2024-06,21680,Support,2024-06-30 19:56:25,2024-06-30 22:43:27,2024-06-30,2024-06-30,Kota Serang,[DEV] Backend - Tolak Pilihan Provinsi Dan Kot...,Closed,1.0,62.0,Villa Nanda,DIP,4,Azhar Mashuri,,2024-06-30 21:41:11,Lembur


In [255]:
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2036 entries, 0 to 2035
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   sumber         2036 non-null   object 
 1   tahun bulan    2036 non-null   object 
 2   issue id       2036 non-null   int64  
 3   tracker        2036 non-null   object 
 4   created on     2036 non-null   object 
 5   closed on      2036 non-null   object 
 6   start date     1630 non-null   object 
 7   due date       2036 non-null   object 
 8   project        2036 non-null   object 
 9   subject        2036 non-null   object 
 10  state          2036 non-null   object 
 11  poin           1712 non-null   float64
 12  id resolver    2036 non-null   float64
 13  nama resolver  2036 non-null   object 
 14  grup resolver  1952 non-null   object 
 15  id creator     2036 non-null   int64  
 16  nama creator   2036 non-null   object 
 17  ts menit       371 non-null    float64
 18  wkt resolved 

# Carpus Processing

### Corpus Loading

In [169]:
corpus = data_raw['subject'].tolist()
corpus[18][0:447]

'Override - Batasan Ajuan Pendaftaran Jalur Zonasi, PTO dan Prestasi'

## 1. Stop Words and Stemming

In [5]:
# removes a list of words (ie. stopwords) from a tokenized list.
def removeWords(listOfTokens, listOfWords):
    return [token for token in listOfTokens if token not in listOfWords]

# applies stemming to a list of tokenized words
def applyStemming(listOfTokens, stemmer):
    return [stemmer.stem(token) for token in listOfTokens]

# removes any words composed of less than 2 or more than 21 letters
def twoLetters(listOfTokens):
    twoLetterWord = []
    for token in listOfTokens:
        if len(token) <= 2 or len(token) >= 21:
            twoLetterWord.append(token)
    return twoLetterWord

## 2. The main corpus processing function

STOPWORDS UNTUK BAHASA INDONESIA: https://github.com/stopwords-iso/stopwords-id/blob/132f51cb383abf95a98beb33c4e1afb6933ff884/raw/indonesian-stopwords-complete.txt


Tutorial install nltk: https://www.google.com/search?sca_esv=67eac9aa2b9499b8&sxsrf=ADLYWIKFMpDR9XEWQVPMLfUljnd_t7hZSw:1721699518919&q=nltk+for+mac&tbm=vid&source=lnms&fbs=AEQNm0DVrIRjdA3gRKfJJ-deMT8ZtYOjoIt1NWOMRkEKym4u5PkAZgxJOmIgPx6WieMhF6q1Hq7W6nME2Vp0eHuijF3ZElaTgD0zbj1gkQrti2r6HpgEQJ__FI2P2zVbzOTQnx-xQGuWfPA7_LjHL8X54xCjPigLtLX638JLYGhCvRlpvvGBo-fNpc7q_rU8dgffCadMYeMgxPqmupqDpgcFpVxKo2EBMA&sa=X&ved=2ahUKEwjcuvDBhryHAxXGVmwGHXduBboQ0pQJegQIDBAB&biw=1393&bih=701&dpr=2#fpstate=ive&vld=cid:1e6e69e9,vid:wuQeKgXUZks,st:0

In [24]:
def processCorpus(corpus, language):   
    #stopwords = nltk.corpus.stopwords.words(language)
    param_stemmer = SnowballStemmer(language)
    #countries_list = [line.rstrip('\n') for line in open('lists/countries.txt')] # Load .txt file line by line
    #nationalities_list = [line.rstrip('\n') for line in open('lists/nationalities.txt')] # Load .txt file line by line
    #other_words = [line.rstrip('\n') for line in open('lists/stopwords_scrapmaker.txt')] # Load .txt file line by line
    
    for document in corpus:
        index = corpus.index(document)
        #corpus[index] = corpus[index].replace(u'\ufffd', '8')   # Replaces the ASCII '�' symbol with '8'
        corpus[index] = corpus[index].replace(',', '')          # Removes commas
        corpus[index] = corpus[index].rstrip('\n')              # Removes line breaks
        corpus[index] = corpus[index].casefold()                # Makes all letters lowercase
        
        #corpus[index] = re.sub('\W_',' ', corpus[index])        # removes specials characters and leaves only words
        #corpus[index] = re.sub("\S*\d\S*"," ", corpus[index])   # removes numbers and words concatenated with numbers IE h4ck3r. Removes road names such as BR-381.
        #corpus[index] = re.sub("\S*@\S*\s?"," ", corpus[index]) # removes emails and mentions (words with @)
        #corpus[index] = re.sub(r'http\S+', '', corpus[index])   # removes URLs with http
        #corpus[index] = re.sub(r'www\S+', '', corpus[index])    # removes URLs with www

        listOfTokens = word_tokenize(corpus[index])
        twoLetterWord = twoLetters(listOfTokens)

        #listOfTokens = removeWords(listOfTokens, stopwords)
        listOfTokens = removeWords(listOfTokens, twoLetterWord)
        #listOfTokens = removeWords(listOfTokens, countries_list)
        #listOfTokens = removeWords(listOfTokens, nationalities_list)
        #listOfTokens = removeWords(listOfTokens, other_words)
        
        listOfTokens = applyStemming(listOfTokens, param_stemmer)
        #listOfTokens = removeWords(listOfTokens, other_words)

        corpus[index]   = " ".join(listOfTokens)
        corpus[index] = unidecode(corpus[index])

    return corpus

## Install NLTK for Mac

In [10]:
import nltk
import zipfile
import requests

In [11]:
# Step 1: Download the punkt.zip file
url = 'https://github.com/nltk/nltk_data/blob/gh-pages/packages/tokenizers/punkt.zip?raw=true'
r = requests.get(url, allow_redirects=True)
open('punkt.zip', 'wb').write(r.content)


13905355

In [12]:
# Step 2: Extract the contents of the zip file
with zipfile.ZipFile('punkt.zip', 'r') as zip_ref:
    zip_ref.extractall('/Users/mac/nltk_data/tokenizers')

In [13]:
# Step 3: Verify if the data is available
from nltk.data import find
find('tokenizers/punkt')

FileSystemPathPointer('/Users/mac/nltk_data/tokenizers/punkt/PY3')

In [14]:
import nltk
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


False

In [15]:
import nltk
from nltk.tokenize import word_tokenize

In [16]:
# Ensure the NLTK data path includes the directory where 'punkt' is located
nltk.data.path.append('/Users/mac/nltk_data')

In [17]:
# Sample text
text = "Hello, how are you?"

In [18]:
# Tokenize the text
tokens = word_tokenize(text)
print(tokens)

['Hello', ',', 'how', 'are', 'you', '?']


## 2. Lanjutan Corpus

In [26]:
language = 'indonesia'
corpus = processCorpus(corpus, language)
corpus[18][0:460]

ValueError: The language 'indonesia' is not supported.

the language 'indonesia' or 'indonesian' is not supported by NLTK's punkt tokenizer. NLTK's punkt tokenizer primarily supports a limited number of languages.

For Indonesian language support, you can use other libraries like Sastrawi or spacy for tokenization.

# Use Spacy Library

### Pilih kolom yang akan digunakan

In [256]:
data_subj = data_raw['subject']
data = data_raw['subject']
data

0       [Closing Engine] Kota Denpasar 2024 - Jalur Zo...
1       [Kendala] Siswa Masih Bisa Langsung Pengajuan ...
2       [Closing Engine] Prov NTT 2024 - SMA All Jalur...
3       [DEV] Pengecekan dan Penyesuaian hasil seleksi...
4       Pengecekan File Ajuan Pendaftaran di Operator ...
                              ...                        
2031    Reset daftar Ajuan Pendaftaran Gabungan zona Demo
2032                    Override Engine Seleksi Kota Batu
2033       Dokumen SPH PPDB Online Kota Palangkaraya 2023
2034    Override Info Dari Formatter Kapasitas Jika Ya...
2035                 Konfirmasi daerah 3 Kota/Kab di TR I
Name: subject, Length: 2036, dtype: object

In [257]:
data.values #Menjadikan array

array(['[Closing Engine] Kota Denpasar 2024 - Jalur Zonasi Kategori Umum - Tahap 1',
       '[Kendala] Siswa Masih Bisa Langsung Pengajuan Pendaftaran di 2 Jenjang Sekaligus',
       '[Closing Engine] Prov NTT 2024 - SMA All Jalur - Tahap 1', ...,
       'Dokumen SPH PPDB Online Kota Palangkaraya 2023',
       'Override Info Dari Formatter Kapasitas Jika Yang Diakses Adalah Kota Batu',
       'Konfirmasi daerah 3 Kota/Kab di TR I'], dtype=object)

### Cleaning data

In [258]:
#Mengubah huruf kapital menjadi lower case
data = data.str.lower()
data

0       [closing engine] kota denpasar 2024 - jalur zo...
1       [kendala] siswa masih bisa langsung pengajuan ...
2       [closing engine] prov ntt 2024 - sma all jalur...
3       [dev] pengecekan dan penyesuaian hasil seleksi...
4       pengecekan file ajuan pendaftaran di operator ...
                              ...                        
2031    reset daftar ajuan pendaftaran gabungan zona demo
2032                    override engine seleksi kota batu
2033       dokumen sph ppdb online kota palangkaraya 2023
2034    override info dari formatter kapasitas jika ya...
2035                 konfirmasi daerah 3 kota/kab di tr i
Name: subject, Length: 2036, dtype: object

In [259]:
def clean_text(text):
    text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
    text = re.sub(r"2024|2023|ppdb", "", text)
    text = re.sub(r"kota|kab|prov|kotakab", "", text)
    text = re.sub(r"balikpapan|bontang|samarinda|bangka|bantul|barito selatan|batang|berau|bungo|demak|gresik|grobogan|jepara|karimun|klaten|kudus|kulon progo| kutai timur|malang|pati|purbalingga|sleman|jakarta|banjarbaru|batu|bengkulu|bontang|denpasar|jambi|mataram|mojokerto|padang sidempuan|palangkaraya|pasuruan|pontianak|serang|sukabumi|yogyakarta|bali|banten|yogyakarta|jambi|jawa tengah|kalimantan tengah|kalimantan selatan|lampung|ntt|nusa tenggara timur", "", text)
    return text
    

# Apply the function to each element in the 'text' column
data = data.apply(clean_text)

data

0       closing engine     jalur zonasi kategori umum ...
1       kendala siswa masih bisa langsung pengajuan pe...
2               closing engine     sma all jalur  tahap 1
3       dev pengecekan dan penyesuaian hasil seleksi j...
4       pengecekan file ajuan pendaftaran di operator ...
                              ...                        
2031    reset daftar ajuan pendaftaran gabungan zona demo
2032                            override engine seleksi  
2033                               dokumen sph  online   
2034    override info dari formatter kapasitas jika ya...
2035                         konfirmasi daerah 3  di tr i
Name: subject, Length: 2036, dtype: object

In [260]:
data.values

array(['closing engine     jalur zonasi kategori umum  tahap 1',
       'kendala siswa masih bisa langsung pengajuan pendaftaran di 2 jenjang sekaligus',
       'closing engine     sma all jalur  tahap 1', ...,
       'dokumen sph  online   ',
       'override info dari formatter kapasitas jika yang diakses adalah  ',
       'konfirmasi daerah 3  di tr i'], dtype=object)

## Tokenization

In [261]:
#not super pythonic, no, not at all.
#use extend so it's a big flat list of vocab
from __future__ import print_function
totalvocab_stemmed = []
totalvocab_tokenized = []
dataarticle = list(data.values)
count = 0
for i in dataarticle:
    print("\rArticle process: {} from {}".format(count+1, len(dataarticle)), end="")
    count += 1
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)
print ("")
print ("")

print ("total vacab stem: ", len(totalvocab_stemmed))
print ("total vacab tokenize: ",len(totalvocab_tokenized))

Article process: 2036 from 2036

total vacab stem:  15758
total vacab tokenize:  15758


In [262]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print ('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')
# vocab_frame[:10] #print 10 word

there are 15758 items in vocab_frame


In [263]:
from spacy.lang.id import stop_words
from string import punctuation, digits
from sklearn.feature_extraction.text import TfidfVectorizer

STWRD = list(set(punctuation))
STWRD += list(set(digits))
STWRD += stop_words.STOP_WORDS

#tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 #min_df=2, stop_words=STWRD,
                                 #use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(list(data.values)) #fit the vectorizer to synopses

print("TF-idf matrix: ",tfidf_matrix.shape)

terms = tfidf_vectorizer.get_feature_names_out()

TF-idf matrix:  (2036, 1220)


In [264]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)
print
dist

array([[ 0.00000000e+00,  1.00000000e+00,  5.21789195e-01, ...,
         1.00000000e+00,  1.00000000e+00,  1.00000000e+00],
       [ 1.00000000e+00, -2.22044605e-16,  1.00000000e+00, ...,
         1.00000000e+00,  1.00000000e+00,  9.37113882e-01],
       [ 5.21789195e-01,  1.00000000e+00, -2.22044605e-16, ...,
         1.00000000e+00,  1.00000000e+00,  1.00000000e+00],
       ...,
       [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
        -2.22044605e-16,  1.00000000e+00,  1.00000000e+00],
       [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
         1.00000000e+00, -2.22044605e-16,  1.00000000e+00],
       [ 1.00000000e+00,  9.37113882e-01,  1.00000000e+00, ...,
         1.00000000e+00,  1.00000000e+00,  0.00000000e+00]])

## Elbow methods

In [210]:
from tqdm import tqdm

In [211]:
# elbow score plot with Yellowbrick
def elbowplot(df, elbowmetric, model):
    print("Elbow Score Plot (" + str(elbowmetric) + " metric):")
    vis = KElbowVisualizer(
        model, 
        k=(2,nK), 
        metric=elbowmetric,
        locate_elbow=True, 
        timings=False)
    vis.fit(df)      
    print("elbow value = optimal k:", f'{vis.elbow_value_:.0f}', \
            " | elbow score:", f'{vis.elbow_score_:,.3f}')
    vis.show() 

## KMeans

In [265]:
from sklearn.cluster import KMeans

num_clusters = 10

km = KMeans(n_clusters=num_clusters)

km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [267]:
output = { 'subject': list(data_subj.values), 'cluster': clusters }

data_hsl = pd.DataFrame(films, index = [clusters] , columns = ['subject','cluster'])
data_hsl

Unnamed: 0,subject,cluster
9,[Closing Engine] Kota Denpasar 2024 - Jalur Zo...,8
1,[Kendala] Siswa Masih Bisa Langsung Pengajuan ...,4
9,[Closing Engine] Prov NTT 2024 - SMA All Jalur...,8
9,[DEV] Pengecekan dan Penyesuaian hasil seleksi...,8
6,Pengecekan File Ajuan Pendaftaran di Operator ...,5
...,...,...
6,Reset daftar Ajuan Pendaftaran Gabungan zona Demo,5
4,Override Engine Seleksi Kota Batu,4
8,Dokumen SPH PPDB Online Kota Palangkaraya 2023,6
4,Override Info Dari Formatter Kapasitas Jika Ya...,4


In [268]:
data_hsl['cluster'].value_counts() #number of article per cluster (clusters from 0 to 4)

cluster
4    640
2    187
5    183
7    180
1    175
9    171
6    153
8    152
3    135
0     60
Name: count, dtype: int64

In [269]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print(f"Cluster {i} topic:", end='')
    for ind in order_centroids[i, :1]:  # Replace 6 with n_words_per_cluster
        word = terms[ind].split(' ')
        if word[0] in vocab_frame.index:
            print(f" {vocab_frame.loc[word[0]].values.tolist()[0][0].encode('utf-8', 'ignore').decode('utf-8')},", end='')
    print()  # Add whitespace
    print()  # Add whitespace

    print(f"Cluster {i} subject:", end='')
    if i in frame.index:
        for title in frame.loc[i]['subject'].values.tolist():
            print(f" {title},", end='')
    print()  # Add whitespace
    print()  # Add whitespace
    
print()
print()

Top terms per cluster:

Cluster 0 topic:

Cluster 0 subject: ST - koreksi alur, [Kab. Barito Selatan] Penambahan Gambar Banner, Mengganti Gambar Banner, ST - [Req] Pembuatan Poster/banner Pena Berkah Kalteng, ST - Alur Prov. Bali, ST - Pengecekan SSL Custom Domain PPDB Kota Serang 2024, ST - Menambah Storage pada VM database slave, ST - Merekayasa banner bengkulu, ST - custom domain ppdb.jepara.go.id dari https://jepara.siap-ppdb.com/#/, ST - Cek kesesuaian SSL untuk domain PPDB Jepara, ST - Upgrade spek 10.23.2.245, ST - Banner Situs Publik PPDB Online Prov. Kalteng 2024, mangganti Background Banner, ST - ganti foto banner kab bangka, pasang Gambar Alur, Update Banner [2], Pengecekan SSL Custom Domain PPDB Prov. Banten 2024, Banner Situs Publik PPDB Online Prov. Banten 2024, Banner Baru PPDB Online Prov. Banten 2024, Pemasangan URL pada tombol Berkas Alur Gambar, Perubahan Gambar Alur Pelaksanaan SD dan SMP di situs Publik, [Kab. Barito Selatan] Penambahan Gambar Alur Model B+, [Banne

### Menamakan Cluster

In [None]:
cluster_names = {0: 'Penambahan Poster/Banner', 
                 1: 'Migrasi Data', 
                 2: 'Father', 
                 3: 'Up Situs Publik', 
                 4: 'Killed'}