<a href="https://colab.research.google.com/github/SeohyeonSunny/Topic-Modeling/blob/main/BERTopic_%26_Topic_Coherence_Score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/gdrive; the input file read later lives under /content/gdrive/MyDrive.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!pip install bertopic



In [None]:
pip install kiwipiepy



In [None]:
from kiwipiepy import Kiwi
from kiwipiepy.utils import Stopwords
import re
import csv
from sklearn.datasets import fetch_20newsgroups
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

In [None]:
# Kiwi analyzer instance; user words are registered on it and CustomTokenizer calls its tokenize().
kiwi = Kiwi()
# kiwipiepy Stopwords helper. NOTE(review): no cell in the visible notebook reads `stopwords` — confirm it is still needed.
stopwords = Stopwords()

In [None]:
# Domain-specific terms registered in Kiwi's user dictionary with tag NNP and score 0.
# Kept in one list so that adding or removing a term is a one-line change instead of a new cell.
USER_WORDS = [
    '난민법',
    '영주권',
    '헬조선',
    '제노포비아',
    '차별금지법',
    '무사증',
    '무비자',
    '인도적',
    '법제사법위원회',
    '출입국청',
    '일베',
    '물타기',
    '무대책',
    '무대응',
    '고용허가제',
    '감성팔이',
    '개구멍',
    '건강보험',
]

# add_user_word returns a bool; print it per word so words that were not inserted
# (False) stay visible, as they were in the individual cell outputs.
for user_word in USER_WORDS:
    was_added = kiwi.add_user_word(user_word, 'NNP', 0)
    print(user_word, was_added)

In [None]:
file_path = '/content/gdrive/MyDrive/Colab Notebooks/Twitter_refugee_after.txt'

# One document per line of the input file. Blank or whitespace-only lines are
# dropped so that empty strings are not passed to the topic model as documents.
with open(file_path, 'r', encoding='utf-8') as file:
    text = [line for line in file.read().splitlines() if line.strip()]

In [None]:
# Custom stopwords, one per line.
stop_words_2 = '''
이거
그거
그건
그걸
이걸
이건
이것
이걸
그것
거기
누구
무엇
관련
우선
때문'''
# split() without an argument splits on any whitespace and never yields empty
# strings; split('\n') produced a leading '' because the literal starts with a newline.
stop_words_list2 = stop_words_2.split()

In [None]:
# Alias of the custom stopword list; CustomTokenizer looks the list up under this name.
user_stop_word = stop_words_list2

In [None]:
# Part-of-speech tags that CustomTokenizer keeps; tokens with any other tag are discarded.
# NOTE(review): presumably these are the noun-like tags of Kiwi's tag set (general, proper
# and dependent nouns, numerals, pronouns) — confirm against the kiwipiepy tag table.
extract_pos_list = ["NNG", "NNP", "NNB", "NR", "NP"]

In [None]:
class CustomTokenizer:
    """Callable that converts a text into a list of filtered token forms.

    The filter reads the module-level ``extract_pos_list`` (tags to keep) and
    ``user_stop_word`` (forms to drop) each time the tokenizer is called.
    """

    def __init__(self, kiwi):
        # Analyzer object that provides ``tokenize(text)``.
        self.kiwi = kiwi

    def __call__(self, text):
        """Return the forms of all tokens that pass the tag, length and stopword checks."""
        # Keep a token when its tag is in extract_pos_list, its form is at least
        # two characters long, and the form is not in the user stopword list.
        return [
            token[0]
            for token in self.kiwi.tokenize(text)
            if token[1] in extract_pos_list
            and len(token[0]) > 1
            and token[0] not in user_stop_word
        ]

In [None]:
# Tokenizer instance bound to the shared Kiwi analyzer; used by the vectorizer and by the coherence cells.
custom_tokenizer = CustomTokenizer(kiwi)

In [None]:
# Count vectorizer that tokenizes with custom_tokenizer and limits the vocabulary to 300 features;
# it is handed to BERTopic as vectorizer_model.
vectorizer = CountVectorizer(tokenizer=custom_tokenizer, max_features=300)

In [None]:
# Topic model configuration.
# nr_topics is passed as an integer: BERTopic documents this parameter as an int
# (target number of topics after reduction) or the literal "auto", so the former
# string "96" was not a valid way to request 96 topics.
model = BERTopic(
    embedding_model="sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens",
    vectorizer_model=vectorizer,
    nr_topics=96,  # number of topics used to represent the documents
    top_n_words=100,  # words kept per topic
    calculate_probabilities=True,
)

In [None]:
# Fit the topic model on the loaded lines. Later cells reuse both results:
# probs[0] is plotted and topics is used to determine how many topics exist.
# NOTE(review): no random seed is set in the visible notebook, so results may differ between runs — confirm.
topics, probs = model.fit_transform(text)

In [None]:
# Display BERTopic's built-in visualization of the fitted topics.
model.visualize_topics()

In [None]:
# Plot the topic probabilities stored in the first entry of probs
# (presumably the first document, given calculate_probabilities=True — confirm).
model.visualize_distribution(probs[0])

In [None]:
# Print the (word, score) pairs of topics 0 through 49, one topic per line.
for topic_idx in range(50):
    print(topic_idx, '번째 토픽 :', model.get_topic(topic_idx))

0 번째 토픽 : [('뉴스', 0.01950253416600743), ('네이버', 0.01746178196095664), ('출처', 0.0173941834113786), ('다음', 0.014921368282522057), ('신청', 0.014557621372188878), ('허가', 0.013686172461023287), ('개헌', 0.01313517313271967), ('출입국', 0.012631667652453778), ('입국', 0.01216843393002804), ('제주', 0.012081690636859346), ('종합', 0.01202063657008961), ('무사증', 0.01113186611655469), ('불법', 0.01097597982393408), ('연합뉴스', 0.010373365387880135), ('인정', 0.010019481476177252), ('청원', 0.009970099390185663), ('신청자', 0.009906261780908138), ('논란', 0.009782113272807336), ('폐지', 0.009641004097753062), ('르포', 0.00963862865440067), ('제주도', 0.00918379340128404), ('난민', 0.009001852192168343), ('예멘', 0.008850831991150021), ('외국인', 0.008850589623953069), ('난민법', 0.008766983671229436), ('체류', 0.008755235104521407), ('심사', 0.00822305206458936), ('마약', 0.008160521789971526), ('내전', 0.007764816593243389), ('브로커', 0.007647366883950909), ('대통령', 0.007601001066769316), ('인도적', 0.007425751862423011), ('가짜', 0.006839996787324692),

In [None]:
# Display BERTopic's bar-chart visualization of topic words.
model.visualize_barchart()

In [None]:
# Summary table with one row per topic; the output shows Topic, Count, Name,
# Representation and Representative_Docs columns, with topic -1 listed first.
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2974,-1_난민_예멘_국민_제주,"[난민, 예멘, 국민, 제주, 한국, 반대, 난민법, 문제, 사람, 우리, 나라, ...","[제주 예멘 난민에 국민청원 ' 봇물' …"" 난민 수용 반대 "" (출처: SBS ..."
1,0,1062,0_뉴스_네이버_출처_다음,"[뉴스, 네이버, 출처, 다음, 신청, 허가, 개헌, 출입국, 입국, 제주, 종합,...","[제주도 불법 난민 신청 문제에 따른 난민법, 무사증 입국, 난민신청 허가 폐지/개..."
2,1,304,1_여자_여성_남자_혐오,"[여자, 여성, 남자, 혐오, 폭행, 남성, 흉기, 차별, 일부, 새끼, 예멘, 생...","[와 이런 글이 알티나는 게 신기하네. 난민 혐오? 결혼, 임신이 디폴트인 이슬람 ..."
3,2,226,2_르포_난민법_제주_시작,"[르포, 난민법, 제주, 시작, 난민, 단독, 다음, 답변, 어디, 얘기, 공개, ...","[[르포]거의 실명인데 큰 병원 좀. . 제주에 ' 갇힌 ' 난민들 | 다음뉴스, ..."
4,3,191,3_사람_생각_자기_불안,"[사람, 생각, 자기, 불안, 문제, 여자, 대통령, 도민, 사건, 답변, 수용, ...",[이 나라는 자국민을 보호할 생각이 없다 하긴 자기들은 안 당할 거라고 생각하니까 ...
...,...,...,...,...,...
92,91,12,91_일본_사건_외국_한국인,"[일본, 사건, 외국, 한국인, 전쟁, 피해, 지금, 강제, 중국, 우리, 제주도,...","[4·3 직후에 일본으로 탈출한 제주도민들은 난민이 됐다, . . ' 일본은 그 난..."
93,92,12,92_범죄_정부_경제_시민,"[범죄, 정부, 경제, 시민, 인종, 나라, 우리, 국제, 차별, 외국인, 기간, ...",[베트남 정부처럼 나라의 세금 처 먹는 것들이 죄를 지으면 자손까지 영향받게 범죄인...
94,93,11,93_동영상_유튜브_공유_답변,"[동영상, 유튜브, 공유, 답변, 청원, 청와대, 논란, 집회, 취재, 체크, 팩트...",[유튜브 동영상 제주 예멘 난민 논란에 대해 예멘인 이 직접 입을 열다[렛츠 길잇]...
95,94,11,94_청와대_답변_박상기_장관,"[청와대, 답변, 박상기, 장관, 법무부, 청원, 페이스북, 위원장, 오늘, 난민법...",[[청와대 Live] 11:50 청와대입니다[ ‘난민법 관련’청원에 박상기 법무부 ...


In [None]:
from scipy.cluster import hierarchy as sch

In [None]:
def linkage_function(x):
    """Cluster ``x`` with SciPy single linkage and optimal leaf ordering."""
    return sch.linkage(x, 'single', optimal_ordering=True)


# Build the topic hierarchy over the same documents the model was fitted on.
hierarchical_topics = model.hierarchical_topics(text, linkage_function=linkage_function)

100%|██████████| 95/95 [00:01<00:00, 91.08it/s]


In [None]:
# Visualize the topic hierarchy produced by model.hierarchical_topics.
model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
# Text rendering of the topic hierarchy; print() is used so the multi-line tree
# layout is shown as-is rather than as an escaped string repr.
tree = model.get_topic_tree(hierarchical_topics)
print(tree)

.
├─■──페미니즘_경계_센터_공유_제주 ── Topic: 83
└─예멘_제주_난민_허가_난민법
     ├─예멘_제주_난민_허가_폐지
     │    ├─■──적극_조치_강화_인도_신청자 ── Topic: 47
     │    └─예멘_제주_난민_허가_폐지
     │         ├─예멘_제주_난민_허가_폐지
     │         │    ├─■──센터_호소_단체_아시아_문화 ── Topic: 81
     │         │    └─예멘_제주_난민_허가_폐지
     │         │         ├─예멘_제주_난민_허가_폐지
     │         │         │    ├─■──카페_세계_대한민국_이유_다음 ── Topic: 77
     │         │         │    └─예멘_제주_난민_허가_폐지
     │         │         │         ├─■──독일_마음_주장_수용_문화 ── Topic: 44
     │         │         │         └─예멘_제주_난민_허가_폐지
     │         │         │              ├─예멘_제주_난민_허가_폐지
     │         │         │              │    ├─예멘_제주_난민_허가_폐지
     │         │         │              │    │    ├─■──인터뷰_당신_이야기_제주_중동 ── Topic: 72
     │         │         │              │    │    └─예멘_제주_난민_허가_폐지
     │         │         │              │    │         ├─예멘_제주_난민_허가_신청
     │         │         │              │    │         │    ├─■──찬성_수용_반대_조사_지지 ── Topic: 40
     │         │   

In [None]:
!pip install gensim



In [None]:
# Import necessary modules
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

# Topics known to the fitted model, keyed by topic id.
available_topics = model.get_topics()

# Collect the word list of every real topic for the coherence computation.
# Topic -1 is BERTopic's outlier bucket, not a topic, and the other metric cells
# in this notebook already leave it out, so it is excluded here as well.
# Empty-string words are skipped (NOTE(review): BERTopic appears to pad short
# topics with '' entries — confirm) because they are not real vocabulary terms.
topic_words = []
for topic_id in available_topics:
    if topic_id == -1:
        continue
    topic = model.get_topic(topic_id)
    if topic and isinstance(topic, list):
        words = [word for word, _ in topic if word]
        if words:
            topic_words.append(words)


In [None]:
# Tokenize every document with the same tokenizer the vectorizer uses.
processed_texts = [custom_tokenizer(document) for document in text]

# Gensim vocabulary built from the tokenized documents.
dictionary = Dictionary(processed_texts)

# Bag-of-words form of each tokenized document.
corpus = list(map(dictionary.doc2bow, processed_texts))

# c_v coherence of the topic word lists, evaluated against the tokenized documents.
coherence_model = CoherenceModel(
    topics=topic_words,
    texts=processed_texts,
    corpus=corpus,
    dictionary=dictionary,
    coherence='c_v',
)

# Aggregate coherence over all supplied topics.
coherence_score = coherence_model.get_coherence()
print('Coherence Score:', coherence_score)

Coherence Score: 0.3434977803008295


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the same embedding model used in BERTopic (identical model name to the one passed to BERTopic).
# NOTE(review): this presumably creates a second in-memory copy next to the one BERTopic holds — confirm.
embedding_model = SentenceTransformer("sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens")

def calculate_embedding_coherence(topic_words, embedding_model):
    """Return the mean pairwise cosine similarity between a topic's word embeddings.

    Args:
        topic_words: Sequence of words belonging to one topic.
        embedding_model: Object whose ``encode(words)`` returns one embedding per word.

    Returns:
        The average cosine similarity over all pairs of distinct words, or
        ``nan`` when fewer than two words are given (no pair exists).
    """
    n_words = len(topic_words)
    if n_words < 2:
        # No word pair to compare; avoids encoding an empty batch and dividing by zero.
        return float("nan")

    word_embeddings = embedding_model.encode(topic_words)
    similarity_matrix = cosine_similarity(word_embeddings)
    # Exclude self-similarity by zeroing the diagonal before summing.
    np.fill_diagonal(similarity_matrix, 0)
    # There are n * (n - 1) off-diagonal entries. The previous
    # np.mean(matrix) / (n - 1) divided the sum by n * n * (n - 1), which made
    # every score too small by a factor of n.
    return similarity_matrix.sum() / (n_words * (n_words - 1))

# Score every real topic. Topic ids are taken from the assignments and the
# outlier id -1 is removed explicitly; the previous len(set(topics)) - 1 assumed
# -1 was always present and dropped the last topic when it was not.
topic_ids = sorted(topic_id for topic_id in set(topics) if topic_id != -1)
num_topics = len(topic_ids)
topic_coherence_scores = []

for topic_id in topic_ids:
    topic = model.get_topic(topic_id)
    if not topic:  # get_topic gave nothing usable for this id
        continue
    # Empty-string entries are not words and would distort the average similarity.
    top_words = [word for word, _ in topic if word]
    if len(top_words) < 2:
        # Pairwise similarity needs at least two words.
        continue
    coherence_score = calculate_embedding_coherence(top_words, embedding_model)
    topic_coherence_scores.append((topic_id, coherence_score))

# Displaying the coherence scores
for topic_id, coherence_score in topic_coherence_scores:
    print(f"Topic {topic_id}: Coherence Score = {coherence_score}")


Topic 0: Coherence Score = 0.005523541358986286
Topic 1: Coherence Score = 0.005310084482636115
Topic 2: Coherence Score = 0.0060798700409706195
Topic 3: Coherence Score = 0.006110287675953874
Topic 4: Coherence Score = 0.005926823977268104
Topic 5: Coherence Score = 0.0055412066103232025
Topic 6: Coherence Score = 0.005896065572295526
Topic 7: Coherence Score = 0.006163638047497682
Topic 8: Coherence Score = 0.0055202432353087146
Topic 9: Coherence Score = 0.005879691152861624
Topic 10: Coherence Score = 0.006629138281851104
Topic 11: Coherence Score = 0.005139762705022638
Topic 12: Coherence Score = 0.005553368366125858
Topic 13: Coherence Score = 0.006186376316378815
Topic 14: Coherence Score = 0.00613810739131889
Topic 15: Coherence Score = 0.006010817156897651
Topic 16: Coherence Score = 0.005654312745489255
Topic 17: Coherence Score = 0.0054672330316871106
Topic 18: Coherence Score = 0.005905248902060769
Topic 19: Coherence Score = 0.006580444297405204
Topic 20: Coherence Score =

In [None]:
def jaccard_similarity(set1, set2):
    """Calculate the Jaccard similarity |A ∩ B| / |A ∪ B| between two sets.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        A float in [0, 1]. Two empty sets are treated as identical and give 1.0
        instead of raising ZeroDivisionError.
    """
    union = len(set1.union(set2))
    if union == 0:
        # Both sets are empty: the ratio is undefined, so use the identical-sets value.
        return 1.0
    return len(set1.intersection(set2)) / union

# Word set of every real topic, keyed by topic id.
# Stored under its own name: the coherence cell uses `topic_words` as a list of
# word lists, and rebinding that name to a dict here broke re-running that cell.
# The outlier id -1 is removed explicitly instead of assuming it is always present.
topic_word_sets = {}
for topic_id in sorted(set(topics)):
    if topic_id == -1:
        continue
    topic = model.get_topic(topic_id)
    if not topic:  # get_topic gave nothing usable for this id
        continue
    # Empty-string entries are not words; shared '' entries would inflate the overlap.
    words = {word for word, _ in topic if word}
    if words:
        topic_word_sets[topic_id] = words

# Calculate Jaccard Similarities for every unordered pair of topics
jaccard_similarities = []
for topic_id1 in topic_word_sets:
    for topic_id2 in topic_word_sets:
        if topic_id1 < topic_id2:
            similarity = jaccard_similarity(topic_word_sets[topic_id1], topic_word_sets[topic_id2])
            jaccard_similarities.append(similarity)

# Calculate Topic Diversity: one minus the mean pairwise overlap.
# With fewer than two topics there is no pair, so the value is undefined (nan).
if jaccard_similarities:
    topic_diversity = 1 - np.mean(jaccard_similarities)
else:
    topic_diversity = float("nan")
print(f"Topic Diversity: {topic_diversity}")

Topic Diversity: 0.8340820025847506
