In [None]:
import sys
import os

if 'root_dir' not in globals():
    # First run in this kernel: treat the parent of the current working
    # directory as the project root and switch into it. The guard keeps a
    # re-run of this cell from climbing one more level up.
    parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
    root_dir = os.path.abspath(parent_of_cwd)
    os.chdir(root_dir)

In [None]:
import pandas as pd

#データベースへ接続するエンジンを作成
from my_codes.database_setting import Engine
from my_codes.database_setting import Base

#データベースのテーブルとマッピングする
from my_codes.notes_database import Notes

from sqlalchemy.orm import sessionmaker
from sqlalchemy import func

# Create a session factory bound to the imported Engine, then open one session
# from it. The query cell below uses this `session` and closes it afterwards.
Session = sessionmaker(bind=Engine)
session = Session()

In [None]:
# Search for notes whose tokenized body contains the given term and load a
# random sample of them into a DataFrame.
search_word = '料理'
num_samples = 10000

try:
    # `like` sends the pattern as a bound parameter; note that any `%` or `_`
    # inside `search_word` would act as a LIKE wildcard.
    # Ordering by the database's random() means the sample differs between runs.
    # NOTE(review): random() is not spelled the same on every backend
    # (e.g. MySQL uses rand()) — confirm against the configured Engine.
    sample_query = (
        session.query(
            Notes.key,
            Notes.urlname,
            Notes.created_at,
            Notes.tokenized_body,
        )
        .filter(Notes.tokenized_body.like(f'%{search_word}%'))
        .order_by(func.random())
        .limit(num_samples)
    )
    result = sample_query.all()
finally:
    # Always release the session, even when the query raises.
    session.close()

data = pd.DataFrame(result, columns=['key', 'urlname', 'created_at', 'tokenized_body'])

In [None]:
# Convert data.created_at to monthly periods.
# The conversion is guarded so the cell can be re-run safely: once the column
# already has a Period dtype, pd.to_datetime would reject it.
if not isinstance(data['created_at'].dtype, pd.PeriodDtype):
    data['created_at'] = pd.to_datetime(data['created_at']).dt.to_period('M')

# Summary of the month-level column (shown as the cell output).
data['created_at'].describe()

In [None]:
# Count the notes for each created_at value and draw the counts as a bar
# chart, ordered by the index (i.e. chronologically once created_at is monthly).
notes_per_month = data['created_at'].value_counts().sort_index()
notes_per_month.plot(kind='bar')

In [None]:
# Shape of the array of distinct urlname values in the sample
data['urlname'].unique().shape

In [None]:
import pandas as pd
from collections import Counter
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
import rpy2.robjects.numpy2ri
import numpy as np

In [None]:
# Enable automatic conversion between R objects and pandas / numpy objects.
# Both calls change rpy2's global conversion state for the whole kernel.
# NOTE(review): newer rpy2 releases deprecate the global activate() calls in
# favour of local converters — confirm against the installed rpy2 version.
pandas2ri.activate()
rpy2.robjects.numpy2ri.activate()

In [None]:
# Display the created_at column as the cell output
data['created_at']

In [None]:
# Build the vocabulary: every distinct item found when iterating the documents.
# The result is sorted so that the index assigned to each entry is the same on
# every run (plain set iteration order for strings changes between kernels).
# NOTE(review): iterating `doc` yields single characters if tokenized_body is
# stored as a string rather than a list of tokens — confirm the column format.
vocab = sorted({word for doc in data['tokenized_body'] for word in doc})

In [None]:
# Convert each document into a list of (vocab index, frequency) pairs.
# Indices are 0-based positions in `vocab`.
# A dict lookup replaces `vocab.index(word)`, which scanned the whole
# vocabulary list for every distinct word of every document.
vocab_position = {word: position for position, word in enumerate(vocab)}

documents = []
total_doc_n = len(data['tokenized_body'])
for i, doc in enumerate(data['tokenized_body'], start=1):
    word_counts = Counter(doc)
    documents.append(
        [(vocab_position[word], count) for word, count in word_counts.items()]
    )

    # Progress report every 1000 documents
    if i % 1000 == 0:
        print(f"Processed {i} out of {total_doc_n} documents.")

In [None]:
# Convert the documents to R lists.
#
# documents_r follows the native stm document format: one integer matrix per
# document with two rows, where row 1 holds the 1-indexed vocabulary entry and
# row 2 holds how often that term occurs. `documents` stores 0-based indices,
# hence the `+ 1`. R fills matrices column by column, so the values are
# interleaved as (index, count, index, count, ...) before reshaping to 2 rows.
r_matrix = ro.r['matrix']
documents_r = ro.ListVector([
    (
        i,
        r_matrix(
            ro.IntVector([value for index, count in doc for value in (index + 1, count)]),
            nrow=2,
        ),
    )
    for i, doc in enumerate(documents)
])

# Per-document count vectors, kept as a separate R list as before.
counts_r = ro.ListVector([(i, ro.IntVector([item[1] for item in doc])) for i, doc in enumerate(documents)])


In [None]:
# Import the R "stm" package through rpy2; its functions are then reachable
# as attributes of `stm` (e.g. stm.stm).
stm = importr('stm')

In [None]:
# Build the covariate table handed to R: only the created_at column, cast to
# strings before the pandas -> R conversion.
created_at_as_text = data[['created_at']].astype(str)
metadata = pandas2ri.py2rpy(created_at_as_text)

In [None]:
# Fit the structural topic model with K=3 topics.
# - `prevalence` must be a formula (or a model matrix) naming covariates found
#   in `data`; passing the data frame itself is rejected by stm, so the
#   created_at column is referenced through a one-sided formula instead.
# - `vocab` is passed as an explicit R character vector so that it does not
#   depend on how a Python list happens to be converted.
model = stm.stm(
    documents=documents_r,
    vocab=ro.StrVector(vocab),
    K=3,
    prevalence=ro.Formula('~ created_at'),
    data=metadata,
)

In [None]:
import matplotlib.pyplot as plt

# Show the fitted model using R's own `print` function
r_print = ro.r('print')
r_print(model)

# Visualise the topic distribution
def plot_topics(model):
    """Draw one horizontal bar chart per named entry of R's ``summary(model)``.

    Each entry is plotted with its values as bar lengths and its ``names`` as
    the y-axis tick labels; figures are titled ``Topic 1``, ``Topic 2``, ...

    NOTE(review): this assumes every entry of the summary is a named numeric
    vector and that there is exactly one entry per topic — confirm against
    what ``summary`` actually returns for an stm model.
    """
    model_summary = ro.r('summary')(model)
    for position in range(len(model_summary.names)):
        entry = model_summary[position]
        bar_positions = range(len(entry))
        plt.figure()
        plt.barh(bar_positions, entry, align='center')
        plt.yticks(bar_positions, entry.names)
        plt.title(f'Topic {position + 1}')
        plt.show()

# Render the charts for the fitted model
plot_topics(model)
