In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(keywords, description):
    """
    Calculate the cosine similarity between a list of keywords and a product description using TF-IDF.

    The keywords and the description are vectorized together, so they share one
    vocabulary and one set of IDF weights. Each keyword vector is then compared
    against the description vector.

    Args:
    keywords (list of str): The list of keywords or phrases to compare.
    description (str): The full product description text.

    Returns:
    list of float: The list of cosine similarity scores for each keyword, in the
        same order as `keywords`. An empty list when `keywords` is empty.
    """
    # With no keywords the keyword slice of the matrix would have zero rows,
    # which cosine_similarity rejects; there is nothing to score anyway.
    if len(keywords) == 0:
        return []

    # Create a TF-IDF Vectorizer
    vectorizer = TfidfVectorizer()

    # Combine the list of keywords with the description into a single list where
    # the description is the last element
    texts = keywords + [description]

    # Fit and transform the texts
    tfidf_matrix = vectorizer.fit_transform(texts)

    # Calculate cosine similarity between each keyword vector (all rows but the
    # last) and the description vector (the last row)
    similarity_scores = cosine_similarity(tfidf_matrix[:-1], tfidf_matrix[-1:])

    # Flatten the (n_keywords, 1) array of scores and return it as a list
    return similarity_scores.flatten().tolist()


In [2]:
from transformers import BertModel, BertTokenizer
import torch
from scipy.spatial.distance import cosine

def get_bert_embedding(model, tokenizer, text):
    """
    Generate BERT embedding for the given text.

    The text is encoded with `tokenizer` (truncated to 512 tokens), run through
    `model` without gradient tracking, and the final-layer hidden state at
    sequence position 0 (the [CLS] slot) is returned with singleton
    dimensions squeezed away.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True,
    )

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        last_hidden = model(**encoded).last_hidden_state

    # Position 0 of the sequence axis is used as the sentence representation.
    cls_vector = last_hidden[:, 0, :]
    return cls_vector.squeeze()

def calculate_bert_similarity(keywords, description, model_name='bert-base-uncased'):
    """
    Calculate the cosine similarity between a list of keywords and a product description using BERT embeddings.

    The tokenizer and model named by `model_name` are loaded on every call.
    The description and each keyword are embedded with `get_bert_embedding`,
    and one score per keyword is returned, in input order, computed as
    1 - cosine distance to the description embedding.
    """
    encoder_tokenizer = BertTokenizer.from_pretrained(model_name)
    encoder = BertModel.from_pretrained(model_name)

    def embed(text):
        # scipy's cosine() works on NumPy arrays, so convert the tensor here.
        return get_bert_embedding(encoder, encoder_tokenizer, text).numpy()

    description_vector = embed(description)

    # Cosine similarity is 1 minus the cosine distance.
    return [1 - cosine(description_vector, embed(keyword)) for keyword in keywords]


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Example usage
# English ad keywords; each one is scored against `description` at the end of this cell.
keywords = ["AI tool", "neural network", "deep learning", "Sony AI", "CNN", "AI risk management", "Next-generation AI Technology", "Reinforcement Learning", "Research AI", "AI Courses"]


# Japanese keyword candidates around the Sony α camera line.
# NOTE(review): `string_list` is not referenced by any code visible in this
# notebook — confirm whether it is still needed.
string_list = [
    "ソニー α",
    "ソニー ミラーレスカメラ",
    "フルサイズミラーレスカメラ",
    "APS-Cミラーレスカメラ",
    "交換レンズカメラ",
    "デジタルカメラ",
    "写真撮影",
    "動画撮影",
    "ソニー α 初心者",
    "ソニー α 入門",
    "ソニー α おすすめ",
    "ソニー α エンジニア",
    "ソニー α プロ",
    "ソニー α ビデオカメラ",
    "ソニー α 動画撮影",
    "ソニー α 7R IV",
    "ソニー α 7S III",
    "ソニー α 1",
    "ソニー α 6000",
    "ソニー α 6500",
    "ソニー α オートフォーカス",
    "ソニー α 手ブレ補正",
    "ソニー α アイオートフォーカス",
    "ソニー α 4K動画",
    "ソニー α フルサイズセンサー",
    "ソニー α APS-Cセンサー",
    "ソニー α スローモーション",
    "ソニー α タイムラプス",
    "ソニー α ポートレート撮影",
    "ソニー α 風景撮影",
    "ソニー α 野生動物撮影",
    "ソニー α スポーツ撮影",
    "ソニー α レンズ互換性",
    "ソニー α アクセサリー",
    "ソニー α レビュー",
    "ソニー α 比較",
    "ソニー α vs. キヤノンEOS",
    "ソニー α vs. ニコンZ",
    "ソニー α カメラキット",
    "ソニー α レンズセット",
    "ソニー α 中古カメラ",
    "ソニー α カメラレンタル",
    "ソニー α カメラバッグ",
    "ソニー α カメラストラップ",
    "ソニー α カメラクリーニング",
    "ソニー α カメラ修理",
    "ソニー α カメラスクール",
    "ソニー α フォトグラファー",
    "ソニー α ビデオグラファー",
    "ソニー α 撮影テクニック"
]


# English product copy for Neural Network Console; this is the text every
# keyword is compared against below.
# NOTE(review): the text contains unbalanced ‘ ’ quotes and ends with a stray 」,
# presumably left over from the source it was copied from — left untouched
# because it is runtime data.
description = """
Neural Network Console: Easily design neural networks with drag-and-drop. Achieve advanced AI development using deep learning without coding. Edit easily with drag-and-drop. Sony has trained 2,000 AI talents. Automate tedious tuning tasks. Try it for free.
Over 60,000 registrants for the cloud version. ‘Utilize rich resources on the cloud to use deep learning anytime, anywhere. 
If you are considering commercial use or use on various operating systems, please check here.’ About the cloud version: ’For those who are about to engage in AI development using Deep Learning, and those who are already involved, do you have these concerns? 
You need to learn Python and mathematical formulas. Changing network structures and parameters through coding can be time-consuming due to trial and error. Managing dozens of learned neural networks can be challenging. Want to automate neural network tuning. Setting up the development environment is costly and time-consuming.」
"""

# TF-IDF cosine score of every keyword against the description, reported in
# the same order as the keyword list.
similarity_scores = calculate_similarity(keywords, description)
scored_keywords = zip(keywords, similarity_scores)
for keyword, score in scored_keywords:
    print(f"The cosine similarity score for '{keyword}' is: {score}")

The cosine similarity score for 'AI tool' is: 0.038207751426139164
The cosine similarity score for 'neural network' is: 0.23802787871164258
The cosine similarity score for 'deep learning' is: 0.19211976939629705
The cosine similarity score for 'Sony AI' is: 0.08455740067240541
The cosine similarity score for 'CNN' is: 0.0
The cosine similarity score for 'AI risk management' is: 0.02849578713966412
The cosine similarity score for 'Next-generation AI Technology' is: 0.023715506902070187
The cosine similarity score for 'Reinforcement Learning' is: 0.07623606131407872
The cosine similarity score for 'Research AI' is: 0.038207751426139164
The cosine similarity score for 'AI Courses' is: 0.038207751426139164


In [1]:
from bert_score import score
from transformers import AutoTokenizer, AutoModel

from transformers import logging

# Lower the transformers log level *before* any checkpoint is loaded. Set
# afterwards, the "Some weights of the model checkpoint ... were not used"
# notice from from_pretrained() is still printed into the cell output.
logging.set_verbosity_error()

# Load a pre-trained BERT model
model_name = "bert-base-multilingual-cased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Keywords and sentences
# NOTE(review): this rebinds the notebook-level name `keywords`; the cells
# below that read `keywords` see this BRAVIA list.
# The commented-out line below is an alternative (insurance-themed) keyword set
# kept from an earlier experiment.
# keywords = ["ソニー医療保険", "安い医療保険", "病気保険", "先進医療保険", "手術保険", "終身医療保険", "保険金請求", "手頃な保険料", "保険料割引", "インターネット割引", "自動車保険", "火災保険", "保険質問", "保障内容", "手術給付", "入院保障", "ソニー損保", "公式ホームページ"]
# Japanese ad keyword candidates for Sony BRAVIA televisions.
keywords = [
    "ソニー ブラビア",
    "ソニー テレビ",
    "4Kテレビ",
    "スマートテレビ",
    "Android TV",
    "8Kテレビ",
    "OLEDテレビ",
    "HDRテレビ",
    "フルHDテレビ",
    "大画面テレビ",
    "薄型テレビ",
    "高画質テレビ",
    "家電 テレビ",
    "テレビ 音質",
    "ソニー 家電",
    "ブラビア 価格",
    "ソニー ブラビア 評価",
    "テレビ おすすめ",
    "最新モデル テレビ",
    "テレビ 比較",
    "ブラビア 買い替え",
    "テレビ サイズ",
    "ブラビア レビュー",
    "テレビ セール",
    "ソニー 画質",
    "ソニー 音質",
    "ソニー 映像技術",
    "テレビ リモコン",
    "音響システム テレビ",
    "テレビ 映像美",
    "ソニー ストリーミング",
    "ソニー 映像プロセッサー",
    "テレビ クーポン",
    "ブラビア 高評価",
    "省エネ テレビ",
    "ソニー 動画視聴",
    "ソニー 映像美",
    "ソニー 4K対応",
    "8K対応テレビ",
    "ソニー 省エネ",
    "ソニー HDR10",
    "ブラビア 有機EL",
    "ソニー 映像エンジン",
    "テレビ 新製品",
    "ソニー インテリア",
    "スマートホーム テレビ",
    "ソニー ゲームモード",
    "ソニー 映画鑑賞",
    "テレビ 周辺機器",
    "ソニー テレビ ランキング"
]
# Reference texts the keywords are scored against.
# NOTE(review): the first two entries are identical, so that text is counted
# twice in every per-sentence average — confirm this is intended.
sentences = [
    'ソニー テレビ ブラビア公式ウェブサイト。4Kテレビを始めとしたラインアップや、高画質・高音質などのブラビアがもつ魅力、サポート情報を紹介.', 
    'ソニー テレビ ブラビア公式ウェブサイト。4Kテレビを始めとしたラインアップや、高画質・高音質などのブラビアがもつ魅力、サポート情報を紹介.', 
    'BRAVIA の製品一覧 · 画面サイズ：55V型(インチ) · 種類：4K液晶テレビ · 画素数：3840x2160 · HDMI端子：4端子 · 年間電気代：3780円 · 1V型(インチ)あたりの価格：¥2,514.', 
    '従来モデル「A80Lシリーズ」に比べ、ピーク輝度が1.2倍向上し更に高コントラストに進化。さらに薄型化。新しくトゥイーターを左右に搭載し、よりクリアで臨場感ある ...', 
    '326,970円. 32,697ポイント. 送料無料、8月26日（月）お届け. 中古品1点 159,980円～. お安くなりました. 下取り5,000円引き. 比較する. 液晶テレビ BRAVIA(ブラビア) ...', 
    'BRAVIA（ブラビア）シリーズを展開するソニーのテレビは、発色が良く、鮮やか画質を好まれる方におすすめ。4KモデルはOSがAndroidであるため、 ...', 
    'ソニー(SONY) 43V型 4K X80WKシリーズ 液晶 テレビ ブラビア KJ-43X80WK Google TV Dolby Atmos対応 4.5畳以上推奨 2022年モデル · 4.45つ星のうち4.4 (349).', 
    'ソニー ブラビア 有機ELテレビ フラッグシップモデル. ○4Kブラビア史上最高の明るさによる高コントラスト。 ○独自の新技術により圧倒的映像美を実現した、Mini LED搭載の ...', 
    '楽天市場-「ブラビア ソニー」8971件 人気の商品を価格比較・ランキング･レビュー・口コミで検討できます。ご購入でポイント取得がお得。セール商品・送料無料商品も ...'
    # add other sentences
]

# Calculate BERT Score
# bert_score.score() builds the model afresh on every call, so calling it once
# per (keyword, sentence) pair repeats that setup len(keywords) * len(sentences)
# times. Instead, lay out all pairs keyword-major and score them in one call.
pair_candidates = [keyword for keyword in keywords for _ in sentences]
pair_references = [sentence for _ in keywords for sentence in sentences]
if pair_candidates:
    _, _, pair_f1 = score(pair_candidates, pair_references, model_type=model_name, num_layers=1)
    # Row i holds keyword i, column j holds sentence j.
    pair_f1 = pair_f1.reshape(len(keywords), len(sentences))

for kw_idx, keyword in enumerate(keywords):
    print(f"Scores for keyword: {keyword}")
    for sent_idx, sentence in enumerate(sentences):
        # Only the first 30 characters of the sentence are shown to keep the output readable.
        print(f" - Sentence: {sentence[:30]}... F1 Score: {pair_f1[kw_idx, sent_idx].item():.4f}")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Scores for keyword: ソニー ブラビア
 - Sentence: ソニー テレビ ブラビア公式ウェブサイト。4Kテレビを始めと... F1 Score: 0.4453
 - Sentence: ソニー テレビ ブラビア公式ウェブサイト。4Kテレビを始めと... F1 Score: 0.4453
 - Sentence: BRAVIA の製品一覧 · 画面サイズ：55V型(インチ)... F1 Score: 0.2287
 - Sentence: 従来モデル「A80Lシリーズ」に比べ、ピーク輝度が1.2倍向... F1 Score: 0.2405
 - Sentence: 326,970円. 32,697ポイント. 送料無料、8月2... F1 Score: 0.3335
 - Sentence: BRAVIA（ブラビア）シリーズを展開するソニーのテレビは、... F1 Score: 0.3863
 - Sentence: ソニー(SONY) 43V型 4K X80WKシリーズ 液晶... F1 Score: 0.3895
 - Sentence: ソニー ブラビア 有機ELテレビ フラッグシップモデル. ○... F1 Score: 0.4123
 - Sentence: 楽天市場-「ブラビア ソニー」8971件 人気の商品を価格比... F1 Score: 0.3782
Scores for keyword: ソニー テレビ
 - Sentence: ソニー テレビ ブラビア公式ウェブサイト。4Kテレビを始めと... F1 Score: 0.4012
 - Sentence: ソニー テレビ ブラビア公式ウェブサイト。4Kテレビを始めと... F1 Score: 0.4012
 - Sentence: BRAVIA の製品一覧 · 画面サイズ：55V型(インチ)... F1 Score: 0.2806
 - Sentence: 従来モデル「A80Lシリーズ」に比べ、ピーク輝度が1.2倍向... F1 Score: 0.2388
 - Sentence: 326,970円. 32,697ポイント. 送料無料、8月2... F1 Score: 0.3025
 - Sentence: BRAVIA（ブラビア）シリーズを展開するソニーのテレビは、... 

In [7]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import BERTScorer
import MeCab

def evaluate_keywords_against_sentences2(sentences, keywords, lang="ja"):
    # Initialize MeCab Tokenizer
    tokenizer = MeCab.Tagger("-Owakati")
    
    # Initialize lists to store scores for each sentence
    bleu4_scores = []
    rouge1_scores = []
    bert_f1_scores = []

    # Initialize BERTScorer once for efficiency
    scorer = BERTScorer(lang=lang, model_type="bert-base-multilingual-cased", rescale_with_baseline=False)
    
    for sentence in sentences:
        # Tokenize the sentence and keywords
        tokenized_sentence = tokenizer.parse(sentence).strip().split()
        tokenized_keywords = tokenizer.parse(' '.join(keywords)).strip().split()

        # Calculate BLEU-4 score
        bleu4 = sentence_bleu([tokenized_sentence], tokenized_keywords, smoothing_function=SmoothingFunction().method1)
        bleu4_scores.append(bleu4)

        # Calculate ROUGE-1 score
        rouge = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
        rouge1 = rouge.score(' '.join(tokenized_sentence), ' '.join(tokenized_keywords))['rouge1'].fmeasure
        rouge1_scores.append(rouge1)

        # Calculate BERTScore
        keyword_str = ' '.join(keywords)
        P, R, F1 = scorer.score([keyword_str], [sentence])
        bert_f1_scores.append(F1.item())

    # Return the average scores or the scores for each sentence
    avg_bleu4 = sum(bleu4_scores) / len(bleu4_scores)
    avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
    avg_bert_f1 = sum(bert_f1_scores) / len(bert_f1_scores)
    
    return avg_bleu4, avg_rouge1, avg_bert_f1

In [3]:
from bert_score import score
from transformers import AutoTokenizer, AutoModel

from transformers import logging
logging.set_verbosity_error()

# Load a pre-trained BERT model
# NOTE: score() below selects its model through `model_type`; the `model` and
# `tokenizer` objects loaded here are not passed to it.
model_name = "bert-base-multilingual-cased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Calculate BERT Score and average
# Every keyword is paired with every sentence. Since each keyword has the same
# number of sentences, the mean of the per-keyword means equals the mean over
# all pairs, so a single batched score() call is enough. This also avoids
# rebuilding the model once per pair, which score() does on every call.
pair_candidates = [keyword for keyword in keywords for _ in sentences]
pair_references = [sentence for _ in keywords for sentence in sentences]
if pair_candidates:
    _, _, pair_f1 = score(pair_candidates, pair_references, model_type=model_name, num_layers=3)
    overall_average_f1_score = pair_f1.mean().item()
else:
    # No keywords or no sentences: report 0 instead of dividing by zero.
    overall_average_f1_score = 0.0

print(f"Overall Average F1 Score Across All Keywords: {overall_average_f1_score:.4f}")

Overall Average F1 Score Across All Keywords: 0.4161


In [8]:
# Average BLEU-4 / ROUGE-1 / BERTScore-F1 of the joined keyword list over all sentences.
bleu4, rouge1, bert_f1 = evaluate_keywords_against_sentences2(
    sentences=sentences,
    keywords=keywords,
)

# Raw values on one line first, then each metric rounded to four decimals.
print(f"BLEU-4: {bleu4}, ROUGE-1: {rouge1}, BERT-F1: {bert_f1}")
print(f"BLEU-4 Score: {bleu4:.4f}")
print(f"ROUGE-1 Score: {rouge1:.4f}")
print(f"BERTScore (F1): {bert_f1:.4f}")

BLEU-4: 0.009475339743422483, ROUGE-1: 0.1639968806242082, BERT-F1: 0.5869871841536628
BLEU-4 Score: 0.0095
ROUGE-1 Score: 0.1640
BERTScore (F1): 0.5870


In [11]:
import pandas as pd
from transformers import BertModel, BertTokenizer
# AutoTokenizer, AutoModel and BERTScorer are used below. Import them here so
# this cell does not depend on which other cells happened to run first.
from transformers import AutoModel, AutoTokenizer
from bert_score import BERTScorer
import torch
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

# Module-level tokenizer/model shared by jaccard_similarity and cosine_similarity_calc.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
bert_model = AutoModel.from_pretrained("bert-base-multilingual-cased")
# Initialize the BERTScorer for multilingual BERT or a Japanese-specific BERT
scorer = BERTScorer(model_type="bert-base-multilingual-cased", lang="ja", device='cuda' if torch.cuda.is_available() else 'cpu')

# Keyword-research export: tab-separated, UTF-16 encoded.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df_score = pd.read_csv('/home/ubuntu/reflexion/New_LLM_Agent_4_Ad_Keyword_and_Text/preprocessing/data/score_data/rakkokeyword_bravia.csv', delimiter='\t', quotechar='"', encoding='utf-16')

def jaccard_similarity(str1, str2):
    """
    Jaccard similarity between the token sets of two strings.

    Both strings are tokenized with the module-level `tokenizer` (its
    `.tokenize` method), duplicates are dropped, and the size of the
    intersection is divided by the size of the union. Returns 0.0 when
    neither string yields any token.
    """
    vocab_a = set(tokenizer.tokenize(str1))
    vocab_b = set(tokenizer.tokenize(str2))

    combined = vocab_a | vocab_b
    if len(combined) == 0:
        # Both token sets are empty; define the similarity as 0.
        return 0.0

    shared = vocab_a & vocab_b
    return float(len(shared) / len(combined))
    

def cosine_similarity_calc(str1, str2):
    """
    Cosine similarity between the [CLS]-position embeddings of two strings.

    Each string is encoded with the module-level `tokenizer` and passed through
    the module-level `bert_model` without gradient tracking. The final-layer
    hidden state at sequence position 0 is taken as the embedding, and the two
    embeddings are compared with sklearn's `cosine_similarity`.
    """
    def cls_row(text):
        # Encode, run the model, and return the position-0 hidden state as a
        # (1, hidden_size) NumPy row, the 2D shape cosine_similarity expects.
        encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            hidden = bert_model(**encoded).last_hidden_state
        return hidden[:, 0, :].squeeze().numpy().reshape(1, -1)

    row_1 = cls_row(str1)
    row_2 = cls_row(str2)

    # The result is a 1x1 matrix; return its single entry.
    return cosine_similarity(row_1, row_2)[0, 0]

def find_most_relevant_keywords(keyword_list, dataframe, keyword_column, traffic_column):
    """
    For each keyword, find the dataframe entry with the highest BERTScore precision.

    Relies on the module-level `scorer`, `jaccard_similarity` and
    `cosine_similarity_calc`.

    Args:
    keyword_list (iterable of str): Keywords to look up.
    dataframe (pandas.DataFrame): Table holding candidate entries and their traffic.
    keyword_column (str): Column of `dataframe` with the candidate entries.
    traffic_column (str): Column of `dataframe` with the traffic estimate.

    Returns:
    dict: keyword -> dict with the keys
        'Most Relevant Keyword' (best-matching entry; "" if there is none),
        'BERTScore' (float; the precision component returned by `scorer.score`;
            -1.0 if there is no entry),
        'Estimated Traffic' (value of `traffic_column` for the best entry; None
            if there is none),
        'Cosine Similarity' and 'Jaccard Similarity' between the keyword and the
            best entry (0 if there is none).
        When several entries tie for the best score, the first one wins.
    """
    entries = dataframe[keyword_column].tolist()

    results = {}
    for keyword in keyword_list:
        max_score = -1.0
        best_idx = None

        if entries:
            # Score the keyword against every entry in one batched call
            # instead of one scorer call per entry.
            precision, _, _ = scorer.score([keyword] * len(entries), entries)
            for idx, entry_score in enumerate(precision.tolist()):
                # Strict '>' keeps the first entry on ties.
                if entry_score > max_score:
                    max_score = entry_score
                    best_idx = idx

        if best_idx is None:
            # No entry was selected (e.g. empty dataframe): keep the sentinels
            # rather than failing on a missing score.
            most_relevant = ""
            estimated_traffic = None
            best_jaccard = 0
            best_cosine = 0
        else:
            most_relevant = entries[best_idx]
            estimated_traffic = dataframe[traffic_column].iloc[best_idx]
            # Only the winner's Jaccard/cosine values are reported, so compute
            # them once here instead of for every candidate entry.
            best_jaccard = jaccard_similarity(keyword, most_relevant)
            best_cosine = cosine_similarity_calc(keyword, most_relevant)

        results[keyword] = {
            'Most Relevant Keyword': most_relevant,
            'BERTScore': max_score,
            'Estimated Traffic': estimated_traffic,
            'Cosine Similarity': best_cosine,
            'Jaccard Similarity': best_jaccard,
        }
    return results

def update_clicks(df, kw_dict, traffic_column ):
    """
    Copy traffic and similarity values from `kw_dict` into the matching rows of `df`.

    `df` is modified in place and also returned. Rows whose 'Keyword' value is
    not a key of `kw_dict` are left unchanged.

    Args:
    df (pandas.DataFrame): Table with a 'Keyword' column. The 'Jacard', 'Cosine'
        and 'BERT' columns are created (filled with 0.0) when missing.
    kw_dict (dict): keyword -> dict containing `traffic_column`,
        'Jaccard Similarity', 'Cosine Similarity' and 'BERTScore'.
    traffic_column (str): Key in each `kw_dict` entry whose value is written to
        the 'Clicks' column.

    Returns:
    pandas.DataFrame: The same `df` object, updated.
    """
    # Create missing score columns as float. They receive float scores below,
    # and writing a float into an int column is a deprecated incompatible-dtype
    # assignment in recent pandas.
    for column in ('Jacard', 'Cosine', 'BERT'):
        if column not in df.columns:
            df[column] = 0.0

    # Only the 'Keyword' column is read, so iterate over it directly instead of
    # materializing every row with iterrows().
    for index, keyword in df['Keyword'].items():
        if keyword not in kw_dict:
            continue
        match = kw_dict[keyword]
        # 'Clicks' takes the traffic value stored under `traffic_column`.
        df.at[index, 'Clicks'] = match[traffic_column]
        df.at[index, 'Jacard'] = match['Jaccard Similarity']
        df.at[index, 'Cosine'] = match['Cosine Similarity']
        df.at[index, 'BERT'] = match['BERTScore']

    return df

# Match every BRAVIA keyword against the keyword-research table: entries come
# from the 'キーワード' column and traffic from the '推定流入数' column.
results = find_most_relevant_keywords(
    keyword_list=keywords,
    dataframe=df_score,
    keyword_column='キーワード',
    traffic_column='推定流入数',
)

results

{'ソニー ブラビア': {'Most Relevant Keyword': 'ソニー ブラビア',
  'BERTScore': 1.0,
  'Estimated Traffic': 4341,
  'Cosine Similarity': 0.99999994,
  'Jaccard Similarity': 1.0},
 'ソニー テレビ': {'Most Relevant Keyword': 'ソニーテレビ',
  'BERTScore': 0.9512385129928589,
  'Estimated Traffic': 11883,
  'Cosine Similarity': 0.9720487,
  'Jaccard Similarity': 0.5},
 '4Kテレビ': {'Most Relevant Keyword': '4kテレビ',
  'BERTScore': 0.9526677131652832,
  'Estimated Traffic': 3924,
  'Cosine Similarity': 0.9784382,
  'Jaccard Similarity': 0.5},
 'スマートテレビ': {'Most Relevant Keyword': 'ソニー スマートテレビ',
  'BERTScore': 0.9622127413749695,
  'Estimated Traffic': 57,
  'Cosine Similarity': 0.9587776,
  'Jaccard Similarity': 0.6666666666666666},
 'Android TV': {'Most Relevant Keyword': '4k tv',
  'BERTScore': 0.8391130566596985,
  'Estimated Traffic': 124,
  'Cosine Similarity': 0.9228644,
  'Jaccard Similarity': 0.0},
 '8Kテレビ': {'Most Relevant Keyword': '4kテレビ',
  'BERTScore': 0.9331984519958496,
  'Estimated Traffic': 3924,
  'Co

In [12]:
def calculate_mean_similarity(similarity_dict):
    """
    Average the Jaccard similarity, cosine similarity and estimated traffic over all entries.

    Each value of `similarity_dict` must provide the keys 'Jaccard Similarity',
    'Cosine Similarity' and 'Estimated Traffic'. Returns the three means as a
    tuple in that order, or (0, 0, 0) when the dictionary is empty.
    """
    entries = list(similarity_dict.values())
    count = len(entries)
    if count == 0:
        return 0, 0, 0

    def mean_of(field):
        # Plain arithmetic mean of one field across all entries.
        return sum(entry[field] for entry in entries) / count

    return (
        mean_of('Jaccard Similarity'),
        mean_of('Cosine Similarity'),
        mean_of('Estimated Traffic'),
    )

# Calculate the mean similarities
mean_jaccard, mean_cosine, mean_click = calculate_mean_similarity(results)

# Output the results, one metric per line, rounded to four decimals
for line in (
    f"Mean Jaccard Similarity: {mean_jaccard:.4f}",
    f"Mean Cosine Similarity: {mean_cosine:.4f}",
    f"Mean Estimated Traffic: {mean_click:.4f}",
):
    print(line)

Mean Jaccard Similarity: 0.3745
Mean Cosine Similarity: 0.9277
Mean Estimated Traffic: 833.3400
