In [9]:
import pandas as pd
import os
import glob
from konlpy.tag import Okt, Kkma, Komoran
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [10]:
def createDirectory(directory): 
    """ 새로운 폴더를 생성하는 함수 """
    try: 
        if not os.path.exists(directory): 
            os.makedirs(directory) 
    except OSError: 
        print("Error: Failed to create the directory.")

In [20]:
def get_topics(components, feature_names, n=10):
    topic_all = []
    for idx, topic in enumerate(components):
        topic_all.append([[idx, feature_names[i], topic[i].round(2)] for i in topic.argsort()[:-n - 1:-1]])
    return topic_all

In [25]:
class LDA:
    """ Latent Dirichlet Allocation 이용. """
    def __init__(self, f, stop):
        self.df = f
        self.stopword = stop
    def preprocessing(self):
        """
        Okt 이용하여 명사만 추출한 뒤, 불용어사전에 없고 단어 길이 >= 2 인 경우만 추가.
        """
        okt = Okt()
        text = self.df.copy()[['발언내용']]
        text['발언내용'] = text.apply(lambda row: okt.nouns(row['발언내용']), axis = 1)
        tokenized_doc = text['발언내용'].apply(lambda x: [w for w in x if w not in (self.stopword) and len(w) >= 2])
        
        return tokenized_doc
    def topic_modeling(self, t):
        """ 
        역토큰화 시킨 다음, LDA 모델을 이용하여 토픽 모델링 실시. 
        """
        re_tokenized = [' '.join(t[i]) for i in range(len(self.df))]
    
        vectorizer = TfidfVectorizer(stop_words = self.stopword, max_features = 1000)
        X = vectorizer.fit_transform(re_tokenized)
        lda_model = LatentDirichletAllocation(n_components = 10, learning_method = 'online', random_state = 777, max_iter = 25)
        lda_top = lda_model.fit_transform(X)
        terms = vectorizer.get_feature_names_out()
        result = get_topics(lda_model.components_,terms)
        print(result)
        

In [26]:
location = os.getcwd()

fname_list = os.listdir(os.path.join(location, "회의록_모음"))
file_list = glob.glob(os.path.join(location, "회의록_모음/*"))

# 원하는 폴더 이름 설정.
dir_name = "LDA"
createDirectory(dir_name)
stopwords_1_2 = pd.read_excel("불용어사전_통합본.xlsx")

for i in range(len(fname_list)):
    df = pd.read_csv(file_list[i], encoding = 'euc-kr')
    lda = LDA(df, stopwords_1_2)
    tokenized = lda.preprocessing()
    lda.topic_modeling(tokenized)
    break

[[[0, '다시', 41.55], [0, '징계', 34.71], [0, '계약', 32.96], [0, '업체', 32.67], [0, '교장', 29.9], [0, '건가', 28.46], [0, '선생님', 28.28], [0, '성언', 27.25], [0, '대해', 24.1], [0, '방과후', 23.81]], [[1, '시설', 56.25], [1, '안전', 35.38], [1, '집행', 31.7], [1, '시오', 27.28], [1, '이지', 26.57], [1, '기관', 24.93], [1, '예산', 24.66], [1, '사업', 24.38], [1, '별로', 23.96], [1, '사고', 23.9]], [[2, '그게', 40.13], [2, '형주', 25.48], [2, '여러', 22.82], [2, '별도', 21.25], [2, '지원', 20.96], [2, '제출', 20.81], [2, '체크', 20.79], [2, '자리', 19.36], [2, '이해', 18.91], [2, '지침', 18.6]], [[3, '원장', 50.24], [3, '거지', 44.43], [3, '인원', 36.15], [3, '부위', 34.55], [3, '정원', 34.16], [3, '장인', 30.39], [3, '미달', 20.49], [3, '설명', 19.92], [3, '차이', 18.51], [3, '서울시', 17.8]], [[4, '때문', 71.57], [4, '정도', 68.39], [4, '대해', 56.15], [4, '실장', 55.65], [4, '담당', 49.86], [4, '행정', 46.02], [4, '거기', 37.47], [4, '인력', 35.13], [4, '협력', 31.61], [4, '예산', 31.01]], [[5, '예산', 71.85], [5, '운영', 38.12], [5, '편성', 30.99], [5, '위탁', 29.7], [5, '절차', 27.57], [5