# レビューのソート

適切な評価指標が必要

In [52]:
# 必要なライブラリをインポート
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import json

# Jupyter Notebook用の設定
%matplotlib inline

In [None]:
# Path to the preprocessed review dataset (CSV), relative to the working directory.
data_path = 'data/processed/android_cleaned_mecab_with_topics.csv'

# Load the dataset into the DataFrame that every later cell operates on.
df = pd.read_csv(data_path)

# Print the (rows, columns) shape and preview the first rows.
print(f"データサイズ: {df.shape}")
df.head()

In [None]:
import numpy as np
from sklearn.preprocessing import QuantileTransformer


# Parse the review timestamps as ISO 8601; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
df['date'] = pd.to_datetime(df['date'], format='ISO8601', errors='coerce')

# The newest review date in the dataset is the reference point for review age.
latest_date = df['date'].max()

# Age of every review in whole days, measured from that reference date.
review_age = latest_date - df['date']
df['elapsed_days'] = review_age.dt.days

# An age of 0 days would make the division below divide by zero, so treat
# such reviews as one day old.
zero_day_rows = df['elapsed_days'] == 0
df.loc[zero_day_rows, 'elapsed_days'] = 1

# Engagement: thumbs-up count divided by the review's age in days.
thumbs_up = df['thumbsUp']
df['engagement'] = thumbs_up / df['elapsed_days']

# Quantile-normalize engagement (uniform output distribution).
qt = QuantileTransformer(output_distribution='uniform', random_state=0)
engagement_features = df[['engagement']]
df['engagement_quantile_normalized'] = qt.fit_transform(engagement_features)

# Preview the result.
df.head()

In [None]:
# Information content of a review, approximated by its length in characters.
review_text = df['text']
df['review_length'] = review_text.str.len()

# Quantile-normalize the length (uniform output distribution), the same way
# the other signals in this notebook are normalized.
qt = QuantileTransformer(output_distribution='uniform', random_state=0)
length_features = df[['review_length']]
df['review_length_quantile_normalized'] = qt.fit_transform(length_features)

# Preview the result.
df.head()

In [56]:
import MeCab
# Pass the dictionary directory (-d) and the mecabrc file (-r) explicitly
# instead of relying on MeCab's defaults.
MECAB_TAGGER_ARGS = "-d /opt/homebrew/lib/mecab/dic/mecab-ipadic-neologd -r /opt/homebrew/etc/mecabrc"

# Two separate tagger instances, both built from the same arguments.
mecab = MeCab.Tagger(MECAB_TAGGER_ARGS)
ochasen_tagger = MeCab.Tagger(MECAB_TAGGER_ARGS)
def lemmatize_entities(entity_list, ochasen_tagger):
    """Replace every entity in ``entity_list`` with its base (dictionary) form.

    Each entity is parsed with ``ochasen_tagger``. The parser output is read
    as one token per line in the form ``surface<TAB>feature,feature,...``,
    with the base form stored at feature index 6.

    Behaviour per entity:
      * The base forms of all tokens are concatenated, so an entity that is
        split into several tokens is not reduced to its last token.
      * A token whose base form is missing or ``*`` (no lemma available)
        contributes its surface form instead.
      * An entity that yields no token lines is left untouched.

    Args:
        entity_list: Mutable list of entity strings. It is modified in place.
        ochasen_tagger: Object exposing ``parse(text) -> str``
            (e.g. a ``MeCab.Tagger``).

    Returns:
        The same ``entity_list`` object, after in-place replacement.
    """
    for i, entity in enumerate(entity_list):
        parsed = ochasen_tagger.parse(entity)
        if not parsed:
            continue

        base_forms = []
        for line in parsed.split("\n"):
            # Skip the end-of-sentence marker and blank lines explicitly
            # rather than assuming they are always the last two lines.
            if line == "EOS" or not line.strip():
                continue

            fields = line.split("\t")
            if len(fields) < 2:
                continue

            surface = fields[0]
            features = fields[1].split(",")
            # features[6] holds the base form when the dictionary provides one.
            lemma = features[6] if len(features) > 6 else ""
            base_forms.append(lemma if lemma and lemma != "*" else surface)

        if base_forms:
            entity_list[i] = "".join(base_forms)
    return entity_list

def extract_effective_keywords(review_document, ochasen_tagger):
    """Collect the proper nouns and numeric tokens found in a review.

    The review is parsed with ``ochasen_tagger``. The output is read as one
    token per line in the form ``surface<TAB>pos,pos_sub1,...``. A token is
    kept when its major part of speech is ``名詞`` (noun) and its first
    sub-category is ``固有名詞`` (proper noun) or ``数`` (number).

    Args:
        review_document: Review text. Non-string values (``None``, NaN from a
            DataFrame) and blank strings yield an empty result instead of
            being handed to the tagger.
        ochasen_tagger: Object exposing ``parse(text) -> str``
            (e.g. a ``MeCab.Tagger``).

    Returns:
        ``{'proper_nouns': [...], 'numbers': [...]}`` holding the surface
        forms in order of appearance.
    """
    review_keywords_list = {'proper_nouns': [], 'numbers': []}

    # Missing or empty reviews contain no keywords; do not call the tagger.
    if not isinstance(review_document, str) or not review_document.strip():
        return review_keywords_list

    parsed = ochasen_tagger.parse(review_document)
    if not parsed:
        return review_keywords_list

    for line in parsed.split("\n"):
        if line == "EOS" or not line.strip():
            continue
        fields = line.split("\t")
        if len(fields) < 2:
            continue

        pos_info = fields[1].split(",")
        # A token without a sub-category cannot be a proper noun or a number.
        if len(pos_info) < 2:
            continue
        pos_major = pos_info[0]  # major part of speech: noun, verb, adjective, ...
        pos_sub1 = pos_info[1]   # first sub-category: proper noun, general, ...

        if pos_major != "名詞":
            continue
        if pos_sub1 == "固有名詞":
            review_keywords_list['proper_nouns'].append(fields[0])
        elif pos_sub1 == "数":
            review_keywords_list['numbers'].append(fields[0])

    return review_keywords_list


In [None]:
# Example usage: run the keyword extractor on one sample review and print the
# resulting dict of proper nouns and numbers.
review_document = '2024.09.12 アップデート後、動画が凄くスムーズに再生されるので凄く感動しています、それに貯蓄ポイントに利息が付いて来るなんて、ポイントを貯める事に熱が入るのは間違いない。今後も使いやすいアプリ目指して頑張ってください。'
keywords = extract_effective_keywords(review_document, ochasen_tagger)
print(keywords)

In [None]:
# Run morphological analysis on each negative review and keep the extracted
# proper nouns / numbers in a new column (only negative_review is processed
# here).
def _extract_negative_keywords(review):
    return extract_effective_keywords(review, ochasen_tagger)


# Concreteness of one review: number of proper nouns plus number of numeric
# tokens found in it.
def _concreteness_count(extracted):
    return len(extracted['proper_nouns']) + len(extracted['numbers'])


df['negative_review_concreteness'] = df['negative_review'].apply(_extract_negative_keywords)
df['negative_review_concreteness_count'] = df['negative_review_concreteness'].apply(_concreteness_count)

# Quantile-normalize the count (uniform output distribution).
qt = QuantileTransformer(output_distribution='uniform', random_state=0)
concreteness_features = df[['negative_review_concreteness_count']]
df['negative_review_concreteness_quantile_normalized'] = qt.fit_transform(concreteness_features)
df.head()

In [59]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from bertopic import BERTopic

# Load the trained BERTopic model from disk.
topic_model = BERTopic.load('models/android_topic_model')

# Keywords every negative review is compared against.
keywords = ["位置情報", "許可"]

# BERTopic wraps the underlying sentence encoder (a SentenceTransformer,
# according to the original note); look it up once and reuse it.
sentence_encoder = topic_model.embedding_model.embedding_model

# Embed the keywords.
keyword_embeddings = sentence_encoder.encode(keywords)

# Embed every negative review. A plain list is passed instead of the Series:
# the encoder indexes its input by position, which on a Series is a label
# lookup and only works while the index is exactly 0..n-1.
negative_review_embeddings = sentence_encoder.encode(list(df['negative_review']))

# For each review, keep its highest cosine similarity over the keywords.
df['negative_review_similarity'] = cosine_similarity(negative_review_embeddings, keyword_embeddings).max(axis=1)

In [None]:
# Reviews whose negative part consists only of "-" get a similarity of 0.
df.loc[df['negative_review'] == "-", 'negative_review_similarity'] = 0

# Quantile-normalize the similarity (uniform output distribution).
qt = QuantileTransformer(output_distribution='uniform', random_state=0)
df['negative_review_similarity_quantile_normalized'] = qt.fit_transform(df[['negative_review_similarity']])

# Show the reviews with the highest normalized similarity. A bare expression
# in the middle of a cell is evaluated and thrown away, so it is printed
# explicitly here.
print(df.sort_values('negative_review_similarity_quantile_normalized', ascending=False).head())

# Preview the result.
df.head()

In [None]:
# Representative-review score: a weighted sum of topic_probability and the
# quantile-normalized engagement, review length, negative-review concreteness
# and negative-review keyword similarity.

# Weight of each signal (the five weights add up to 1.0).
topic_weight = 0.02
engagement_weight = 0.05
review_length_weight = 0.02
negative_review_concreteness_weight = 0.01
negative_review_similarity_weight = 0.9

# Weighted sum over all five signals. The keyword-similarity term must be part
# of the sum; without it the dominant 0.9 weight would have no effect.
df['representative_review_score'] = (
    df['topic_probability'] * topic_weight
    + df['engagement_quantile_normalized'] * engagement_weight
    + df['review_length_quantile_normalized'] * review_length_weight
    + df['negative_review_concreteness_quantile_normalized'] * negative_review_concreteness_weight
    + df['negative_review_similarity_quantile_normalized'] * negative_review_similarity_weight
)

# Quantile-normalize the final score (uniform output distribution).
qt = QuantileTransformer(output_distribution='uniform', random_state=0)
df['representative_review_score_quantile_normalized'] = qt.fit_transform(df[['representative_review_score']])
df.head()


In [None]:
# Ask the topic model for up to 50 topics related to the search term "許可";
# the call returns two sequences, unpacked here as topic ids and scores.
# NOTE(review): `similarity` is not used afterwards in this cell.
similar_topics,similarity=topic_model.find_topics("許可",top_n=50)
# Inspect the third entry of the returned topics (index 2 is hard-coded).
topic_model.get_topic(similar_topics[2])

In [None]:
# For every topic, pick the review with the highest thumbsUp count.

# Row label of the maximum-thumbsUp review within each topic.
top_thumbs_labels = df.groupby('topic')['thumbsUp'].idxmax()

# Pull those rows out of the full DataFrame.
most_liked_reviews = df.loc[top_thumbs_labels]

# Preview the result.
most_liked_reviews.head()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Keep only the reviews assigned to topic 11; work on a copy so that columns
# added later do not touch df.
topic_id = 11
belongs_to_topic = df['topic'] == topic_id
filtered_df = df[belongs_to_topic].copy()

# Vectorize the query "位置情報" with the topic model's vectorizer.
query = "位置情報"
query_documents = [query]
query_vector = topic_model.vectorizer_model.transform(query_documents)

# Similarity of a single review to the query vector.
def calculate_similarity(review):
    """Return the cosine similarity between ``review`` and ``query_vector``.

    The review is vectorized with ``topic_model.vectorizer_model`` and
    compared with the module-level ``query_vector``.

    Args:
        review: Review text. Missing values (``None``, NaN) and other
            non-string values, as well as empty or whitespace-only strings,
            are scored 0 without being vectorized.

    Returns:
        The similarity as a scalar, or ``0`` when there is no text to compare.
    """
    # NaN is truthy, so a plain truthiness test would let it through to
    # ``.strip()``; check the type explicitly.
    if not isinstance(review, str) or not review.strip():
        return 0

    # Vectorize the review.
    review_vector = topic_model.vectorizer_model.transform([review])

    # Cosine similarity against the query; the result is a 1x1 matrix.
    similarity = cosine_similarity(query_vector, review_vector)
    return similarity[0, 0]

# Score every review text of the filtered topic against the query.
topic_texts = filtered_df['text']
filtered_df['similarity'] = topic_texts.apply(calculate_similarity)

# Keep the n reviews with the highest similarity.
n = 5
top_n_reviews = filtered_df.nlargest(n, 'similarity')

# Print the id and text of each selected review, followed by a separator line.
separator = "-" * 50
for review_id, review_text in zip(top_n_reviews['id'], top_n_reviews['text']):
    print(f"ID: {review_id}")
    print(f"レビュー: {review_text}")
    print(separator)

In [None]:
# Combined score per review: the sum of the normalized engagement, the topic
# probability and the normalized review length.
df['score'] = (
    df['engagement_quantile_normalized']
    + df['topic_probability']
    + df['review_length_quantile_normalized']
)

# Row label of the highest-scoring review within each topic.
best_score_labels = df.groupby('topic')['score'].idxmax()
most_liked_reviews = df.loc[best_score_labels]

# Preview the result.
most_liked_reviews.head()