In [1]:
# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
#configure
# sets matplotlib to inline and displays graphs below the corressponding cell.
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#import nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize

#preprocessing
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet
# for named entity recognition (NER)
from nltk import ne_chunk

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

#stop-words
stop_words=set(nltk.corpus.stopwords.words('english'))

# LOADING THE DATASET

In [2]:
df=pd.read_csv('techno.csv')

In [3]:
df.head()

Unnamed: 0,nama,tanggal,deskripsi,kategori
0,"Banjir Bandang di Kutai Timur Meluas, 15.000 J...","Minggu, 20 Maret 2022 - 21:54 WIB","Ratusan rumah di Sanggata, Kutai Timur, Kalima...",Nusantara
1,"Tak Kebagian Bus, Penonton MotoGP Tertinggal d...","Minggu, 20 Maret 2022 - 21:50 WIB",Para penonton MotoGP di Mandalika yang awalnya...,Nusantara
2,"Pro Kontra Daerah Otonomi Baru, Masyarakat Ada...","Minggu, 20 Maret 2022 - 21:39 WIB",Setelah beberapa wilayah adat La Pago menolak ...,Nusantara
3,"Pulihkan Pariwisata, Denpasar Gratiskan Dokar ...","Minggu, 20 Maret 2022 - 21:35 WIB",Bali terus melakukan upaya untuk memulihkan pa...,Nusantara
4,"Puncak HUT, Bupati Tulang Bawang Winarti Resmi...","Minggu, 20 Maret 2022 - 21:07 WIB",Pada perhelatan Puncak Acara HUT Kabupaten Tul...,Sponsored


In [4]:
# drop the nama.
df.drop(['nama'],axis=1,inplace=True)

In [5]:
# drop the tanggal.
df.drop(['tanggal'],axis=1,inplace=True)

In [6]:
# drop the kategori.
df.drop(['kategori'],axis=1,inplace=True)

In [7]:
df.head(10)

Unnamed: 0,deskripsi
0,"Ratusan rumah di Sanggata, Kutai Timur, Kalima..."
1,Para penonton MotoGP di Mandalika yang awalnya...
2,Setelah beberapa wilayah adat La Pago menolak ...
3,Bali terus melakukan upaya untuk memulihkan pa...
4,Pada perhelatan Puncak Acara HUT Kabupaten Tul...
5,Indonesia International Taekwon Do Federation ...
6,Indonesia International Taekwon Do Federation ...
7,Kapolres Malang AKBP Ferli Hidayat terus berup...
8,Seorang remaja perempuan berjilbab yang diduga...
9,Hujan deras disertai angin kencang mengakibatk...


# DATA CLEANING & PRE-PROCESSING

In [8]:
def clean_text(deskripsi):
  le=WordNetLemmatizer()
  word_tokens=word_tokenize(deskripsi)
  tokens=[le.lemmatize(w) for w in word_tokens if w not in stop_words and len(w)>3]
  cleaned_text=" ".join(tokens)
  return cleaned_text

In [9]:
# time taking
df['deskripsi_text']=df['deskripsi'].apply(clean_text)

In [10]:
df.head()

Unnamed: 0,deskripsi,deskripsi_text
0,"Ratusan rumah di Sanggata, Kutai Timur, Kalima...",Ratusan rumah Sanggata Kutai Timur Kalimantan ...
1,Para penonton MotoGP di Mandalika yang awalnya...,Para penonton MotoGP Mandalika yang awalnya ke...
2,Setelah beberapa wilayah adat La Pago menolak ...,Setelah beberapa wilayah adat Pago menolak kin...
3,Bali terus melakukan upaya untuk memulihkan pa...,Bali terus melakukan upaya untuk memulihkan pa...
4,Pada perhelatan Puncak Acara HUT Kabupaten Tul...,Pada perhelatan Puncak Acara Kabupaten Tulang ...


In [11]:
df.drop(['deskripsi_text'],axis=1,inplace=True)

In [12]:
df.head()

Unnamed: 0,deskripsi
0,"Ratusan rumah di Sanggata, Kutai Timur, Kalima..."
1,Para penonton MotoGP di Mandalika yang awalnya...
2,Setelah beberapa wilayah adat La Pago menolak ...
3,Bali terus melakukan upaya untuk memulihkan pa...
4,Pada perhelatan Puncak Acara HUT Kabupaten Tul...


In [13]:
df['deskripsi'][0]

'Ratusan rumah di Sanggata, Kutai Timur, Kalimantan Timur terendam banjir hingga kedalaman setengah meter dan menyebabkan warga kesulitan beraktivitas.'

# EXTRACTING THE FEATURES AND CREATING THE DOCUMENT-TERM-MATRIX ( DTM )

In [14]:
vect =TfidfVectorizer(stop_words=stop_words,max_features=1000)

In [15]:
vect_text=vect.fit_transform(df['deskripsi'])

In [16]:
print(vect_text.shape)
print(vect_text)

(27, 393)
  (0, 54)	0.2381987790450015
  (0, 175)	0.2381987790450015
  (0, 386)	0.16628781911729804
  (0, 239)	0.21165858334357096
  (0, 86)	0.132851126901508
  (0, 243)	0.2381987790450015
  (0, 328)	0.2381987790450015
  (0, 166)	0.2381987790450015
  (0, 126)	0.21165858334357096
  (0, 40)	0.21165858334357096
  (0, 350)	0.2381987790450015
  (0, 154)	0.2381987790450015
  (0, 369)	0.476397558090003
  (0, 186)	0.2381987790450015
  (0, 310)	0.2381987790450015
  (0, 94)	0.07833208030007383
  (0, 305)	0.1782218911277809
  (0, 301)	0.21165858334357096
  (1, 36)	0.19817363898526325
  (1, 379)	0.24480218234663798
  (1, 163)	0.24480218234663798
  (1, 266)	0.24480218234663798
  (1, 64)	0.24480218234663798
  (1, 333)	0.1831626008535631
  (1, 58)	0.24480218234663798
  :	:
  (25, 239)	0.187305575686449
  (25, 126)	0.187305575686449
  (25, 94)	0.06931934988672944
  (25, 305)	0.1577160415149807
  (26, 375)	0.23528139867347048
  (26, 47)	0.23528139867347048
  (26, 216)	0.23528139867347048
  (26, 155)	0.

In [17]:
idf=vect.idf_

In [19]:
dd=dict(zip(vect.get_feature_names(), idf))
l=sorted(dd, key=(dd).get)
# print(l)
print(l[0],l[-1])
print(dd['setelah'])
print(dd['acara'])  # police is most common and forecast is least common among the news headlines.

di zainul
3.6390573296152584
3.6390573296152584


# TOPIC MODELLING Latent Semantic Analysis (LSA)

In [20]:
from sklearn.decomposition import TruncatedSVD
lsa_model = TruncatedSVD(n_components=10, algorithm='randomized', n_iter=10, random_state=42)

lsa_top=lsa_model.fit_transform(vect_text)

In [21]:
print(lsa_top)
print(lsa_top.shape)  # (no_of_doc*no_of_topics)

[[ 0.17827668  0.16444362  0.03513002  0.20425876 -0.25355449 -0.1119941
   0.49362253  0.12285648 -0.02200517  0.11271836]
 [ 0.55210584 -0.34590515 -0.24673958 -0.02254697 -0.17004537 -0.06049983
   0.04914273 -0.0654532   0.13706473 -0.04697636]
 [ 0.17477307  0.1813299  -0.04917827 -0.14869329  0.06412035  0.3004274
   0.19881324  0.18911249  0.43325654 -0.07036787]
 [ 0.12871282  0.06999455  0.14026957  0.15480653 -0.2304953   0.21874772
  -0.46327492  0.20987009 -0.17883513  0.19315305]
 [ 0.16982813  0.31138202 -0.08405119 -0.04722013 -0.27055507 -0.1046518
  -0.38181804 -0.06103384  0.21666076 -0.02684675]
 [ 0.24457475 -0.01143844  0.67467228 -0.306168    0.01380625 -0.24411452
   0.0142407  -0.05732031  0.01580008 -0.05999001]
 [ 0.32432618 -0.00992885  0.65078222 -0.31399171 -0.06241167 -0.04102204
  -0.02593243 -0.08429066  0.0775658  -0.08850706]
 [ 0.16181664 -0.00771368  0.16565851  0.03097217 -0.19286611  0.57882569
  -0.03389186 -0.19681352  0.1617974  -0.0354692 ]
 [ 

In [22]:
l=lsa_top[0]
print("Document 0 :")
for i,topic in enumerate(l):
  print("Topic ",i," : ",topic*100)

Document 0 :
Topic  0  :  17.82766832374236
Topic  1  :  16.444362488274937
Topic  2  :  3.513002109980618
Topic  3  :  20.425875981865758
Topic  4  :  -25.35544853362419
Topic  5  :  -11.199410223991109
Topic  6  :  49.362252987809796
Topic  7  :  12.285647687952347
Topic  8  :  -2.200516740432844
Topic  9  :  11.271835630006418


In [23]:
print(lsa_model.components_.shape) # (no_of_topics*no_of_words)
print(lsa_model.components_)

(10, 393)
[[ 0.04816074  0.04716419  0.07326114 ...  0.07145266  0.20946864
   0.02516141]
 [ 0.08609482  0.11664807  0.17210955 ... -0.06195812  0.08889178
  -0.02721938]
 [-0.02351955 -0.0529399  -0.08347315 ... -0.04621108 -0.04344811
   0.00890173]
 ...
 [-0.09433628 -0.01180147 -0.0175854  ... -0.00190588  0.06455839
  -0.0049408 ]
 [-0.06167605  0.01901508 -0.02933119 ...  0.01366432  0.00401618
  -0.02693797]
 [ 0.00679673 -0.01261042 -0.00832885 ...  0.00720567  0.10728007
   0.05228102]]


In [24]:
# most important words for each topic
vocab = vect.get_feature_names()

for i, comp in enumerate(lsa_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

Topic 0: 
di mandalika sirkuit dan yang hujan motogp indonesia penonton pawang 

Topic 1: 
papua kabupaten 2022 warga baya bersenjata biru distrik kelompok membakar 

Topic 2: 
indonesia diri federation international itf taekwon terus dari dengan bela 

Topic 3: 
seorang warga pasar perempuan diduga anak tega kota salah tengah 

Topic 4: 
hujan tengah yang kini wanita terlihat pasar tidak pawang migor 

Topic 5: 
masyarakat terus dengan tersebut bahaya akbp berupaya daerah ferli hidayat 

Topic 6: 
timur rumah banjir hingga menyebabkan warga ratusan beraktivitas kalimantan kedalaman 

Topic 7: 
hal wilayah banjir itu akibat bersih bolango bone gorontalo ini 

Topic 8: 
adat kini di wilayah dan masyarakat bakal barat beberapa cikal 

Topic 9: 
halal padjadjaran ampel sunan surabaya ketua rumah center diselesaikan masih 



In [25]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model=LatentDirichletAllocation(n_components=10,learning_method='online',random_state=42,max_iter=1) 
# n_components is the number of topics

In [26]:
lda_top=lda_model.fit_transform(vect_text)

In [27]:
print(lda_top.shape)  # (no_of_doc,no_of_topics)
print(lda_top)

(27, 10)
[[0.01991645 0.01991626 0.01991639 0.82075496 0.01991609 0.01991579
  0.01991624 0.01991609 0.01991578 0.01991594]
 [0.01898274 0.82914957 0.01898297 0.01898288 0.01898321 0.01898637
  0.01898309 0.01898307 0.01898286 0.01898324]
 [0.01824013 0.01824005 0.01824053 0.01824003 0.01824045 0.01823978
  0.01824029 0.0182399  0.83583887 0.01823998]
 [0.01889548 0.01889471 0.01889475 0.01889471 0.01889507 0.01889503
  0.82994598 0.01889487 0.01889488 0.01889453]
 [0.84270471 0.01747819 0.01747729 0.01747716 0.01747733 0.01747711
  0.01747705 0.01747696 0.01747726 0.01747693]
 [0.82997781 0.01889169 0.01889162 0.01889138 0.01889133 0.01889125
  0.018891   0.01889113 0.01889147 0.01889131]
 [0.0198695  0.01986963 0.0198697  0.01986993 0.01987026 0.01986955
  0.01986952 0.0198694  0.01986921 0.8211733 ]
 [0.01934933 0.01934918 0.01934898 0.01934922 0.01934919 0.82585827
  0.01934882 0.01934929 0.019349   0.01934871]
 [0.01942574 0.01942548 0.82516846 0.01942556 0.01942615 0.01942574
  0

In [28]:
sum=0
for i in lda_top[0]:
  sum=sum+i
print(sum)

1.0000000000000002


In [29]:
# composition of doc 0 for eg
print("Document 0: ")
for i,topic in enumerate(lda_top[0]):
  print("Topic ",i,": ",topic*100,"%")

Document 0: 
Topic  0 :  1.9916451307379586 %
Topic  1 :  1.991626232042231 %
Topic  2 :  1.9916393100208152 %
Topic  3 :  82.07549581560774 %
Topic  4 :  1.991609423362764 %
Topic  5 :  1.9915786100045656 %
Topic  6 :  1.9916244033450665 %
Topic  7 :  1.9916092420778224 %
Topic  8 :  1.9915776164659966 %
Topic  9 :  1.9915942163350586 %


In [30]:
print(lda_model.components_)
print(lda_model.components_.shape)  # (no_of_topics*no_of_words)

[[0.87031103 0.84876837 0.83835315 ... 0.93475491 0.69712398 0.72030291]
 [0.76777316 0.94598101 0.9718942  ... 0.8165008  0.83891585 0.79355052]
 [0.72293254 0.70521696 0.77618183 ... 0.83652559 0.87058736 0.88034058]
 ...
 [0.9448376  0.78093103 0.74968905 ... 0.89388721 0.77964696 0.89148505]
 [0.83186524 0.88664002 0.85168555 ... 0.78152503 0.84069983 0.91040302]
 [0.87008139 0.80023746 0.89104552 ... 0.87373621 0.91251683 0.89827736]]
(10, 393)


In [31]:
# most important words for each topic
vocab = vect.get_feature_names()

for i, comp in enumerate(lda_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

Topic 0: 
hujan taekwon kabupaten menerjang vaksinasi biru terjadi immunity menganiaya terutama 

Topic 1: 
hal oiaa motogp immunity menyambut tengah al kegirangan nonton mandalika 

Topic 2: 
menggegerkan diduga baya kapolres anggota bolango salah dari acara hujan 

Topic 3: 
menyebabkan pembalap pandemi timur oleh alumni menerjang desa umur cikal 

Topic 4: 
bersenjata aksi kini papua menggenjot terutama tahun dilancarkan sindangkasih polres 

Topic 5: 
hidayat bahaya eri pembalap mandalika hingga berprikemanuasian tersendiri ribu menjadi 

Topic 6: 
zainul tahun pariwisata bali doa rider surabaya dan angin al 

Topic 7: 
bisa masih universitas deras berupaya harus dalam kapolres desa cukup 

Topic 8: 
bersenjata adat immunity pasang vaksinasi mendukung terjun padjadjaran muhtadun pemerintahan 

Topic 9: 
tengah wanita baru seminar seorang padjadjaran hujan heboh seluruh antusias 

