In [None]:
import sys
import os

# Run once per kernel: the globals() guard keeps a re-run of this cell from
# climbing one more directory level.
if 'root_dir' not in globals():
    # Use the parent of the current working directory as the project root
    # and make it the working directory.
    root_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    os.chdir(root_dir)


In [None]:
import pandas as pd

# Import the engine (and declarative Base) used to connect to the database
from my_codes.database_setting import Engine
from my_codes.database_setting import Base

# Import the ORM class mapped to the notes table
from my_codes.notes_database import Notes

from sqlalchemy.orm import sessionmaker
from sqlalchemy import func

# Create a session factory bound to the project's Engine, then open one
# session that the following cells use for queries.
Session = sessionmaker(bind=Engine)
session = Session()

In [None]:
# Fetch only the columns we need through the ORM, sampling rows at random.
num_samples = 10000  # number of rows to sample

# try/finally guarantees the session is closed even when the query raises,
# so a failed run does not leave a database connection checked out.
try:
    result = (
        session.query(Notes.key, Notes.tokenized_body)
        .order_by(func.random())  # random ordering is done by the database
        .limit(num_samples)
        .all()
    )
finally:
    session.close()

# Convert the list of result rows into a DataFrame.
data = pd.DataFrame(result, columns=['key','tokenized_body'])

In [None]:
# Show the sampled DataFrame as this cell's output.
data

In [None]:
# Check the data: first rows, then the (rows, columns) shape, one per line.
print(data.head(), data.shape, sep='\n')

In [None]:
# Inspect the Python type stored in the first row of 'tokenized_body'.
type(data['tokenized_body'][0])

In [None]:
import ast  # Converts the string representation of a list into an actual list


def _parse_token_list(value):
    """Return the list encoded in ``value`` when it is a string.

    Non-string values (for example a list produced by an earlier run of this
    cell) are returned unchanged. ``ast.literal_eval`` raises ``ValueError``
    when handed a list, so without this check the cell could not be re-run.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# Convert the string representation of lists in 'tokenized_body' to actual lists.
data['tokenized_body'] = data['tokenized_body'].apply(_parse_token_list)

# Display the transformed data to confirm the conversion.
data.head()


In [None]:
# Inspect the type of the first element of the first row's token list.
type(data['tokenized_body'][0][0])

In [None]:
from gensim import corpora, models
# !pip install scipy==1.12
# Note from the original author: with the latest scipy, `triu` cannot be
# loaded properly, so a pinned scipy version is installed instead.

NUM_TOPICS = 20        # number of LDA topics to fit
LDA_RANDOM_STATE = 42  # fixed seed so the same corpus yields the same topics

# Prepare the list of token lists for gensim.
texts = data['tokenized_body'].tolist()

# Create a dictionary representation of the documents.
dictionary = corpora.Dictionary(texts)

# Drop words that occur in fewer than 20 documents or in more than 50% of them.
dictionary.filter_extremes(no_below=20, no_above=0.5)

# Convert each document to bag-of-words format: a list of (token_id, token_count).
corpus = [dictionary.doc2bow(text) for text in texts]

# Fit the LDA model. Without random_state the topics change on every run,
# which makes the figures and tables below impossible to reproduce.
lda_model = models.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=10,
    random_state=LDA_RANDOM_STATE,
)

# Show every topic with its top terms. num_topics is passed explicitly so the
# listing does not rely on print_topics' own default matching the model size.
topics = lda_model.print_topics(num_topics=NUM_TOPICS, num_words=10)
topics


In [None]:
import math
import os

from wordcloud import WordCloud

import matplotlib.pyplot as plt

# Path to a font that can render Japanese. The macOS location stays the
# default; set WORDCLOUD_FONT_PATH to use the notebook on another machine.
font_path = os.environ.get('WORDCLOUD_FONT_PATH', '/Library/Fonts/Arial Unicode.ttf')

# Take (word, probability) pairs straight from the model. Splitting the
# formatted topic strings on '*' and re-tokenising them threw the weights
# away, so word size carried no information about the term's probability.
topic_terms = lda_model.show_topics(num_topics=-1, num_words=10, formatted=False)

# Size the grid from the actual number of topics (5 per row) instead of a
# fixed 4x5 layout, which raised IndexError for more than 20 topics.
n_cols = 5
n_rows = max(1, math.ceil(len(topic_terms) / n_cols))
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(4 * n_cols, 4 * n_rows),
    squeeze=False,  # always a 2-D array of axes, even with a single row
)

# Hide every axis frame up front so unused grid slots stay blank.
for ax in axes.flat:
    ax.axis('off')

# Draw one word cloud per topic, sized by the topic's term probabilities.
for ax, (topic_id, terms) in zip(axes.flat, topic_terms):
    frequencies = {word: float(weight) for word, weight in terms}
    cloud = WordCloud(
        font_path=font_path,
        width=400,
        height=200,
        colormap='Set2',
        background_color='white',
    ).generate_from_frequencies(frequencies)

    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(f'Topic {topic_id}')

# Adjust the spacing between subplots.
plt.tight_layout()

# Display the combined image of word clouds.
plt.show()




In [None]:
# Inspect the first entry of `topics`.
topics[0]

In [None]:
from gensim import similarities
import numpy as np

# Topic-term matrix of the fitted model: one row per topic.
topic_distributions = lda_model.get_topics()
n_topics, n_terms = topic_distributions.shape

# Express every topic row as a list of (term_id, weight) pairs so that it can
# be indexed and queried like a gensim corpus document.
corpus_topic_distributions = [list(enumerate(row)) for row in topic_distributions]

# Build the similarity index over all topics.
index = similarities.MatrixSimilarity(corpus_topic_distributions, num_features=n_terms)

# Query the index with each topic in turn; row k holds the similarities of
# topic k to every topic.
similarity_matrix = np.zeros((n_topics, n_topics))
for row_id, topic_bow in enumerate(corpus_topic_distributions):
    similarity_matrix[row_id] = index[topic_bow]

# Print the similarity matrix.
print(similarity_matrix)



In [None]:
# Draw the topic-similarity matrix as a heatmap.
# The tick count comes from the matrix itself, so the labels always match the
# plotted rows/columns regardless of how many entries `topics` holds.
n_topics = similarity_matrix.shape[0]
tick_positions = range(n_topics)
tick_labels = [f'Topic {i}' for i in tick_positions]

fig, ax = plt.subplots(figsize=(8, 6))
heatmap = ax.imshow(similarity_matrix, cmap='hot', interpolation='nearest')
fig.colorbar(heatmap, ax=ax, label='Similarity')
ax.set_title('Topic Similarity Matrix')
ax.set_xlabel('Topic')
ax.set_ylabel('Topic')
ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_labels, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(tick_labels)
plt.show()

In [None]:
# Get the topic distribution of every document. Each document comes back as a
# list of (topic_id, probability) pairs.
doc_topics_per_document = lda_model.get_document_topics(corpus, minimum_probability=0)

# Build a dense documents x topics table of probabilities keyed by topic id.
# Passing the gensim result straight to pd.DataFrame stores the
# (topic_id, probability) tuples themselves in the cells, positioned by list
# order rather than topic id, so numeric operations such as idxmax cannot work.
# Topics missing from a document's list are filled with 0.0.
doc_topic_dist = pd.DataFrame(
    [dict(doc_topics) for doc_topics in doc_topics_per_document],
    columns=list(range(lda_model.num_topics)),
).fillna(0.0)



In [None]:
# Preview the first rows of the topic-distribution DataFrame.
doc_topic_dist.head()

In [None]:
# Dominant topic per document: the label of the column holding the largest
# value in each row. A 'topic' column left over from an earlier run of this
# cell is excluded first; otherwise it would take part in the comparison and
# change the result on re-run.
topic_columns = doc_topic_dist.drop(columns='topic', errors='ignore')
doc_topic_dist['topic'] = topic_columns.idxmax(axis=1)