In [None]:
!pip install konlpy
!pip install pyLDAvis
!pip install wordcloud
!pip install gensim
!pip install pickles

In [None]:
import csv
import re
from konlpy.tag import Okt
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
import pandas as pd

# Load the patent CSV into a DataFrame.
data = pd.read_csv('final.csv', encoding='UTF-8')
# Keep (title, abstract) order: the later loops over `patents` unpack each entry
# as `title, abstract`, so the columns are selected in that order.
patents = data[['inventionTitle', 'astrtCont']].values.tolist()

# Number of patent records that were loaded.
num_patents = len(patents)

print(f"특허 데이터의 개수: {num_patents}개")

In [None]:
# Re-read the CSV with the csv module and rebuild `patents` as (title, abstract) tuples.
# NOTE(review): columns are taken by position (0 = title, 1 = abstract); confirm this
# matches the column order of final.csv.
with open('final.csv', 'r', encoding='UTF-8') as csv_handle:  # set the CSV path and encoding here
    records = csv.reader(csv_handle)
    next(records)  # skip the header row
    patents = [(record[0], record[1]) for record in records]

In [None]:
def clean_text(text):
    """Return `text` with every character removed except Hangul syllables (가-힣) and whitespace."""
    non_hangul = re.compile(r'[^가-힣\s]')
    return non_hangul.sub('', text)

# Strip non-Hangul characters from the title and abstract of every patent.
cleaned_patents = [
    (clean_text(raw_title), clean_text(raw_abstract))
    for raw_title, raw_abstract in patents
]

okt = Okt()

# Tag "title + ' ' + abstract" with Okt and keep only the tokens whose POS tag
# starts with 'N', 'V' or 'A'.
kept_tag_prefixes = ('N', 'V', 'A')
tokenized_patents = []
for cleaned_title, cleaned_abstract in cleaned_patents:
    tagged = okt.pos(cleaned_title + ' ' + cleaned_abstract)
    tokenized_patents.append(
        [word for word, tag in tagged if tag.startswith(kept_tag_prefixes)]
    )

In [None]:
import pickle

In [None]:
# Persist the tokenised patents to a pickle file so tokenisation does not have to be repeated.
tokenized_patents_file = '메모리_tokenized_patents.pkl'
with open(tokenized_patents_file, 'wb') as pickle_out:
    pickle.dump(tokenized_patents, pickle_out)
print("tokenized_patents 파일을 저장했습니다.")

In [None]:
# Read the tokenised patents back from the pickle file written above.
# pickle.load can execute arbitrary code, so only load files this notebook produced.
with open(tokenized_patents_file, 'rb') as pickle_in:
    tokenized_patents = pickle.load(pickle_in)
print("저장된 tokenized_patents 파일을 불러왔습니다.")

In [None]:
# (Disabled) Alternative loader: read the token lists from 'tokenized_patents.pkl' instead.
# with open('tokenized_patents.pkl', 'rb') as file:
#     tokenized_patents = pickle.load(file)
# print("저장된 tokenized_patents 파일을 불러왔습니다.")

In [None]:
from collections import defaultdict

# Count how often each token occurs across the whole corpus.
word_counts = Counter(token for tokens in tokenized_patents for token in tokens)

# Print the top_n most frequent tokens (highest count first).
top_n = 100
frequent_words = [word for word, _ in word_counts.most_common(top_n)]
print(frequent_words)

In [None]:
# General stop words, extended by hand after inspecting the high-frequency word list.
# '본' and '것' each appear twice in the literal; this is harmless for membership tests.
stopwords = ['본', '발명', '은', '는', '이', '가', '의', '에', '을', '를', '것', '에서', '수', '등', '된', '및',
             '종', '와', '한', '하기', '위한', '경우', '있는', '있다', '대한', '하다', '되다', '하는',
             '상기', '제', '하여', '포함', '단계', '본', '것', '한다', '할', '생', '하나', '된다', '위', '관', '복수', '부', '또는', '적어도', '로부터',
             '될', '중', '통해', '예', '되어', '이상', '다',  '그', '함', '일', '층', '상', '출', '검', '점', '의해', '되고', '성하는', '사용자', '트']

In [None]:
# Domain-specific stop words (patent boilerplate vocabulary).
domain_stopwords = ['특허', '기술', '장치', '방법', '시스템', '구성', '이용', '관련', '기반', '위해', '순서도', '도면', '예시', '실시', '다양', '적용', '대해', '되는',
                    '데이터', '값', '하우', '징', '획득', '결과', '각각', '제공', '정보']

# Apply BOTH stop-word lists. Previously only `stopwords` was consulted, so the
# domain list above had no effect. A set keeps each membership test O(1).
all_stopwords = set(stopwords) | set(domain_stopwords)

filtered_patents = [
    [token for token in tokens if token not in all_stopwords]
    for tokens in tokenized_patents
]

# TF-IDF matrix over the space-joined documents. The joined strings get their own
# name so that `corpus` refers only to the gensim bag-of-words corpus below.
# NOTE(review): `tfidf` / `vectorizer` are not consumed by the cells visible here.
tfidf_documents = [' '.join(tokens) for tokens in filtered_patents]
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(tfidf_documents)

# gensim dictionary and bag-of-words corpus used by the LDA cells.
dictionary = corpora.Dictionary(filtered_patents)
corpus = [dictionary.doc2bow(text) for text in filtered_patents]

In [None]:
# LDA topic modelling: train one model per candidate topic count and score it with
# c_v coherence.
topic_range = range(9, 15)  # candidate topic counts: 9 through 14 inclusive
coherence_scores = []

for num_topics in topic_range:
    # Fixed seed so the sweep gives the same scores after a kernel restart.
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, random_state=42)

    # Coherence score of this model against the filtered token lists.
    coherence_model = CoherenceModel(model=lda_model, texts=filtered_patents, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    coherence_scores.append(coherence_score)

    print(f"Number of Topics: {num_topics}, Coherence Score: {coherence_score:.4f}")

In [None]:
# Pick the topic count with the highest coherence score (the first one wins on ties).
best_topic_num, _ = max(zip(topic_range, coherence_scores), key=lambda pair: pair[1])
print(f"Best Number of Topics: {best_topic_num}")


In [None]:
# Retrain LDA with the selected topic count. The seed is fixed so the final model
# (and every figure derived from it) is reproducible from a fresh kernel.
lda_model = LdaModel(corpus, num_topics=best_topic_num, id2word=dictionary, random_state=42)

In [None]:
top_n = 5  # number of terms kept per topic
# For every topic, map the term ids returned by get_topic_terms back to token strings.
topic_top_words = [
    [dictionary[term_id] for term_id, _ in lda_model.get_topic_terms(topic_index, topn=top_n)]
    for topic_index in range(lda_model.num_topics)
]

In [None]:
# Show the per-topic term lists collected in `topic_top_words`.
print(topic_top_words)

In [None]:
# Print every topic with its leading terms, numbering topics from 1.
num_topics = best_topic_num  # how many topics to print
num_words = 10  # how many terms to show per topic

topic_summaries = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
for idx, topic in topic_summaries:
    print(f"Topic {idx+1}: {topic}")

In [None]:
def get_word(word_id):
    """Look up the token for `word_id` in the module-level gensim `dictionary`."""
    return dictionary.get(word_id)


def get_topic_words(topic_id, top_n=10):
    """Return the tokens of the `top_n` terms that the module-level `lda_model` reports for `topic_id`."""
    topic_words = lda_model.get_topic_terms(topic_id, topn=top_n)
    return [get_word(word_id) for word_id, prob in topic_words]


# Prepare the pyLDAvis view of the trained model.
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
# Kept as the LAST expression of the cell: a notebook only renders the value of the
# final expression, so with the helper definitions after it the visualisation
# object was discarded and nothing was shown.
pyLDAvis.display(vis_data)


In [None]:
import matplotlib.pyplot as plt
# Use a Korean-capable font for figure text and keep the ASCII minus sign on axes.
plt.rcParams.update({
    'font.family': 'NanumBarunGothic',
    'axes.unicode_minus': False,
})

In [None]:
# Print the top terms of every topic as a comma-separated list.
for topic_id in range(lda_model.num_topics):
    print(f"Topic {topic_id+1}:")
    topic_words = get_topic_words(topic_id)
    print(", ".join(topic_words))
    print()

# One word cloud per topic on a 3-column grid. Rows use ceiling division so a topic
# count that is a multiple of 3 no longer reserves an empty trailing row.
n_cols = 3
n_rows = (lda_model.num_topics + n_cols - 1) // n_cols

plt.figure(figsize=(20, 15))
for topic_id in range(lda_model.num_topics):
    topic_words = get_topic_words(topic_id)

    if not topic_words:
        print(f"Skipping Topic {topic_id} (no words assigned)")
        continue

    # The cloud is generated from the space-joined top terms of the topic.
    wordcloud = WordCloud(width=400, height=300, background_color='white', font_path='/content/NanumGothic.ttf').generate(' '.join(topic_words))

    plt.subplot(n_rows, n_cols, topic_id + 1)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Topic {topic_id+1}')

plt.tight_layout()
plt.show()

In [None]:
# Per-document topic distribution: ask the trained model for the topics of every
# bag-of-words document in `corpus`.
doc_topic_dist = lda_model.get_document_topics(corpus)


In [None]:
# Count documents per topic. A document adds one to EVERY topic that
# get_document_topics returned for it, so one document can count toward several topics.
topic_doc_counts = [0 for _ in range(lda_model.num_topics)]
for per_document in doc_topic_dist:
    for assigned_topic in (pair[0] for pair in per_document):
        topic_doc_counts[assigned_topic] += 1

In [None]:
# Bar chart of the per-topic document counts, with the count written above each bar.
topics = range(1, lda_model.num_topics+1)
doc_counts = topic_doc_counts

# NOTE(review): the palette has 8 entries; with more topics the colours presumably
# repeat - confirm this is acceptable.
colors = ['#4CAF50', '#FF9800', '#673AB7', '#E91E63', '#FFC107', '#03A9F4', '#795548', '#9C27B0']

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(topics, doc_counts, color=colors)

for rect in bars:
    count = rect.get_height()
    label_anchor = (rect.get_x() + rect.get_width() / 2, count)
    ax.annotate(str(int(count)),
                xy=label_anchor,
                xytext=(0, 3),
                textcoords="offset points",
                ha='center', va='bottom')

ax.set_xlabel('Topic', fontsize=12)
ax.set_ylabel('Document Count', fontsize=12)
ax.set_title('Topic Distribution', fontsize=16)
plt.show()

In [None]:
# Accumulate topic probabilities per year.
year_topic_dist = {}

for doc_index, (title, abstract) in enumerate(patents):
    # NOTE(review): the year is taken as the first four characters of the title, which
    # only works if titles look like "2021년 ..."; confirm against the data.
    year = title[:4]
    yearly_weights = year_topic_dist.setdefault(year, [0] * best_topic_num)

    # Add this document's topic probabilities to its year's running totals.
    for topic_id, prob in lda_model.get_document_topics(corpus[doc_index]):
        yearly_weights[topic_id] += prob

In [None]:
# Stacked bar chart of the accumulated topic weights per year.
# Years are sorted so the x axis is chronological rather than in insertion order.
years = sorted(year_topic_dist.keys())
topic_proportions = [year_topic_dist[year] for year in years]

fig, ax = plt.subplots(figsize=(12, 8))
x = range(len(years))
# Running top of each year's stack. Without `bottom`, every topic was drawn from zero
# at the same x position and the bars covered each other.
bottom = [0] * len(years)
for i in range(best_topic_num):
    topic_prop = [prop[i] for prop in topic_proportions]
    ax.bar(x, topic_prop, width=0.8, bottom=bottom, label=f"Topic {i+1}")
    bottom = [base + value for base, value in zip(bottom, topic_prop)]

ax.set_xticks(x)
ax.set_xticklabels(years, rotation=45, ha='right')
ax.set_xlabel("Year", fontsize=12)
ax.set_ylabel("Proportion", fontsize=12)
ax.set_title("Topic Distribution by Year", fontsize=16)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Sorted years and the number of topics tracked per year.
years = sorted(list(year_topic_dist.keys()))
num_topics = len(year_topic_dist[years[0]])

# One colour per topic from the 'tab20' colormap. plt.get_cmap is used because
# matplotlib.cm.get_cmap was deprecated and is removed in newer Matplotlib releases.
cmap = plt.get_cmap('tab20')
colors = [cmap(i) for i in range(num_topics)]

plt.figure(figsize=(16, 6))

# One line per topic: accumulated topic weight for each year.
for i in range(num_topics):
    topic_proportions = [year_topic_dist[year][i] for year in years]
    plt.plot(years, topic_proportions, marker='o', linestyle='-', color=colors[i], label=f'Topic {i+1}')

plt.xlabel('Year')
plt.ylabel('Proportion')
plt.title('Topic Distribution by Year')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
import csv
from datetime import datetime

import matplotlib.pyplot as plt

# Sorted years and the number of topics tracked per year.
years = sorted(list(year_topic_dist.keys()))
num_topics = len(year_topic_dist[years[0]])

# Normalise each year's accumulated topic weights so they sum to 1.
topic_proportions = {}
for year in years:
    year_total = sum(year_topic_dist[year])
    # Guard against a year whose total is zero instead of raising ZeroDivisionError.
    topic_proportions[year] = [
        (topic_count / year_total) if year_total else 0.0
        for topic_count in year_topic_dist[year]
    ]

# Deterministic, distinct colours from a colormap (the modulo keeps the index inside
# the palette). Unseeded random hex colours changed on every run.
cmap = plt.get_cmap('tab20')
colors = [cmap(i % cmap.N) for i in range(num_topics)]

plt.figure(figsize=(16, 8))
bar_width = 0.75
x = range(len(years))

# Stack the topic shares: `bottom` tracks the running top of each year's bar.
bottom = [0] * len(years)
for i in range(num_topics):
    topic_props = [topic_proportions[year][i] for year in years]
    plt.bar(x, topic_props, width=bar_width, bottom=bottom, color=colors[i], label=f'Topic {i+1}')
    bottom = [b + p for b, p in zip(bottom, topic_props)]

plt.xlabel('Year')
plt.ylabel('Proportion')
plt.title('Topic Proportion by Year')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.xticks(range(len(years)), years, rotation=45)
plt.tight_layout()
plt.show()

# Also write the per-year topic shares to a timestamped CSV file.
now = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_file_path = 'topic_proportions_' + now + '.csv'
# Explicit encoding so the output does not depend on the platform default.
with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Year'] + [f'Topic {i+1}' for i in range(num_topics)])  # header row
    for year in years:
        writer.writerow([year] + topic_proportions[year])  # one row of topic shares per year

In [None]:
import numpy as np
# One line chart per topic showing how its accumulated weight changes by year.
# Maps year -> list of accumulated topic probabilities.
year_topic_dist = {}

# NOTE(review): `row_index` is used as a position in `tokenized_patents`; this assumes
# `data` still has its default 0..n-1 index and the same row order - confirm.
for row_index, row in data.iterrows():
    # Year = first four characters of the string form of the 'application' column.
    year = str(row['application'])[:4]
    if year not in year_topic_dist:
        year_topic_dist[year] = [0] * lda_model.num_topics

    # Keep only tokens the dictionary knows, then infer this document's topics.
    doc = [token for token in tokenized_patents[row_index] if token in dictionary.token2id]
    doc_dist = lda_model.get_document_topics(bow=dictionary.doc2bow(doc))
    for topic_id, prob in doc_dist:
        year_topic_dist[year][topic_id] += prob

# Plot the yearly trend of each topic in its own subplot.
years = sorted(year_topic_dist.keys())
num_topics = lda_model.num_topics
colors = plt.cm.rainbow(np.linspace(0, 1, num_topics))  # one colour per topic

y_max = 1600  # shared upper y-axis limit for every subplot

# squeeze=False always returns a 2-D array, so indexing also works for a single topic.
fig, axes = plt.subplots(nrows=num_topics, ncols=1, figsize=(12, num_topics * 4), sharex=True, squeeze=False)
axes = axes.ravel()

for topic_id in range(num_topics):
    values = [year_topic_dist[year][topic_id] for year in years]

    ax = axes[topic_id]
    ax.plot(range(len(years)), values, marker='o', color=colors[topic_id], label=f"Topic {topic_id+1}")
    ax.set_ylim(0, y_max)  # y axis runs from 0 to y_max

    # Label the x axis with the years.
    ax.set_xticks(range(len(years)))
    ax.set_xticklabels(years, rotation=45, ha='right', fontsize=12)

    ax.set_xlabel('Year', fontsize=14)
    ax.set_ylabel('Proportion', fontsize=14)
    ax.set_title(f'Topic {topic_id+1} Evolution', fontsize=16)
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
import numpy as np
# One line chart per topic showing how its accumulated weight changes by year
# (same view as the 0-1600 chart, with a tighter 0-800 y range).
# Maps year -> list of accumulated topic probabilities.
year_topic_dist = {}

# NOTE(review): `row_index` is used as a position in `tokenized_patents`; this assumes
# `data` still has its default 0..n-1 index and the same row order - confirm.
for row_index, row in data.iterrows():
    # Year = first four characters of the string form of the 'application' column.
    year = str(row['application'])[:4]
    if year not in year_topic_dist:
        year_topic_dist[year] = [0] * lda_model.num_topics

    # Keep only tokens the dictionary knows, then infer this document's topics.
    doc = [token for token in tokenized_patents[row_index] if token in dictionary.token2id]
    doc_dist = lda_model.get_document_topics(bow=dictionary.doc2bow(doc))
    for topic_id, prob in doc_dist:
        year_topic_dist[year][topic_id] += prob

# Plot the yearly trend of each topic in its own subplot.
years = sorted(year_topic_dist.keys())
num_topics = lda_model.num_topics
colors = plt.cm.rainbow(np.linspace(0, 1, num_topics))  # one colour per topic

y_max = 800  # shared upper y-axis limit for every subplot

# squeeze=False always returns a 2-D array, so indexing also works for a single topic.
fig, axes = plt.subplots(nrows=num_topics, ncols=1, figsize=(12, num_topics * 4), sharex=True, squeeze=False)
axes = axes.ravel()

for topic_id in range(num_topics):
    values = [year_topic_dist[year][topic_id] for year in years]

    ax = axes[topic_id]
    ax.plot(range(len(years)), values, marker='o', color=colors[topic_id], label=f"Topic {topic_id+1}")
    ax.set_ylim(0, y_max)  # y axis runs from 0 to y_max

    # Label the x axis with the years.
    ax.set_xticks(range(len(years)))
    ax.set_xticklabels(years, rotation=45, ha='right', fontsize=12)

    ax.set_xlabel('Year', fontsize=14)
    ax.set_ylabel('Proportion', fontsize=14)
    ax.set_title(f'Topic {topic_id+1} Evolution', fontsize=16)
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np

def get_word(word_id):
    """Look up the token for `word_id` in the module-level gensim `dictionary`."""
    token = dictionary.get(word_id)
    return token

def get_topic_words(topic_id, top_n=10):
    """Return the tokens of the `top_n` terms that the module-level `lda_model` reports for `topic_id`."""
    tokens = []
    for term_id, _weight in lda_model.get_topic_terms(topic_id, topn=top_n):
        tokens.append(get_word(term_id))
    return tokens

def get_topic_word_dist(lda_model, top_n=10):
    """Build a binary topic-by-term indicator matrix from a topic model.

    Args:
        lda_model: model exposing `num_topics`, `id2word` (whose length is the
            vocabulary size) and `get_topic_terms(topic_id, topn=...)`.
        top_n: number of leading terms marked per topic.

    Returns:
        numpy array of shape (num_topics, vocabulary size) holding 1 where the term
        is among the topic's `top_n` terms and 0 elsewhere.
    """
    num_topics = lda_model.num_topics
    vocab_size = len(lda_model.id2word)
    topic_word_dist = np.zeros((num_topics, vocab_size))
    for topic_id in range(num_topics):
        # Read the term ids from the model that was passed in. The previous version
        # went through a helper bound to the global model and converted
        # id -> word -> id, so the `lda_model` argument was not actually honoured.
        for word_id, _ in lda_model.get_topic_terms(topic_id, topn=top_n):
            topic_word_dist[topic_id, word_id] = 1
    return topic_word_dist

# Binary topic-term matrix (only the top 10 terms of each topic are marked).
topic_word_dist = get_topic_word_dist(lda_model, top_n=10)

# k-means over the topic rows. n_init is pinned because its default differs between
# scikit-learn releases, which would otherwise change the clustering.
num_clusters = 5  # number of clusters
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
cluster_assignments = kmeans.fit_predict(topic_word_dist)

# Map cluster id -> list of zero-based topic ids assigned to it.
cluster_dict = {
    cluster_id: [i for i, c in enumerate(cluster_assignments) if c == cluster_id]
    for cluster_id in range(num_clusters)
}

# Print the topics of each cluster with their top five terms.
for cluster_id, cluster_topics in cluster_dict.items():
    print(f'Cluster {cluster_id}:')
    for topic_id in cluster_topics:
        top_words = get_topic_words(topic_id, top_n=5)
        print(f'  Topic {topic_id+1}: {", ".join(top_words)}')
    print()

# Build the topic network.
G = nx.Graph()

# One node per topic (node number = 1-based topic number), tagged with its cluster.
for topic_id, cluster_id in enumerate(cluster_assignments):
    G.add_node(topic_id+1, label=f"Topic {topic_id+1}", cluster=int(cluster_id))

# Edges weighted by cosine similarity of the topic rows. Pairs sharing no top term
# (similarity 0) get no edge; previously every pair was connected, so the drawing
# was always a complete graph.
for i in range(lda_model.num_topics):
    for j in range(i+1, lda_model.num_topics):
        norm_product = np.linalg.norm(topic_word_dist[i]) * np.linalg.norm(topic_word_dist[j])
        # An all-zero row would make the denominator zero; treat that as no similarity.
        similarity = np.dot(topic_word_dist[i], topic_word_dist[j]) / norm_product if norm_product else 0.0
        if similarity > 0:
            G.add_edge(i+1, j+1, weight=similarity)

# Node colour per cluster. The palette is cycled so changing num_clusters cannot raise
# a KeyError; for five clusters the mapping is unchanged.
palette = ['r', 'g', 'b', 'c', 'm']
cluster_colors = {cluster_id: palette[cluster_id % len(palette)] for cluster_id in range(num_clusters)}
node_colors = [cluster_colors[G.nodes[node]['cluster']] for node in G.nodes()]

# Draw the network; the layout seed makes the figure reproducible.
pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)
nx.draw(G, pos, with_labels=True, font_size=8, node_color=node_colors)
plt.show()