In [1]:
import pandas as pd
import numpy as np
import csv
import MeCab
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import re
import neologdn
import demoji
import emoji

# model_dir = '/Users/iomacbookair2/Documents/lab/DEIM2023/entity_vector/entity_vector.model.bin'
model_dir = '/Users/labimac/Documents/lab/DEIM2023/entity_vector/entity_vector.model.bin'
model_word2vec = KeyedVectors.load_word2vec_format(model_dir, binary=True)

In [2]:
def calc_similarity_word2vec(sentence1, sentence2, model):
  mecab = MeCab.Tagger(
      '-d /opt/homebrew/lib/mecab/dic/mecab-ipadic-neologd')

  pre_sentence1 = sentence1.split(" ")
  sentence1_words = []
  for s1 in pre_sentence1:
    sentence1_words += [line.split("\t")[0]
                        for line in mecab.parse(s1).split("\n")[:-2]]
  # print(f"sentence1_words: {sentence1_words}")

  pre_sentence2 = sentence2.split(" ")
  sentence2_words = []
  for s2 in pre_sentence2:
    sentence2_words += [line.split("\t")[0]
                        for line in mecab.parse(s2).split("\n")[:-2]]
  # print(f"sentence2_words: {sentence2_words}")

  if not sentence1_words and not sentence2_words:
    similarity = 1
  elif not sentence1_words or not sentence2_words:
    similarity = 0
  elif (len(sentence1_words) == 1 and len(sentence2_words) != 1) or (len(sentence2_words) == 1 and len(sentence1_words) != 1):
    similarity = 0
  elif all(re.match(r'.*[a-zA-Z].*', word) for word in sentence1_words) or all(re.match(r'.*[a-zA-Z].*', word) for word in sentence2_words):
    similarity = 0
  else:
    # Compute word embeddings for each sentence
    sentence1_embedding = np.mean([model[word]
                                  for word in sentence1_words if word in model], axis=0)
    # print(f"sentence1_embedding: {sentence1_embedding}")
    sentence2_embedding = np.mean([model[word]
                                  for word in sentence2_words if word in model], axis=0)
    # print(f"sentence2_embedding: {sentence2_embedding}")
    if np.isnan(sentence1_embedding).any() or np.isnan(sentence2_embedding).any():
        similarity = 0
    else:
      similarity = cosine_similarity(
          [sentence1_embedding], [sentence2_embedding])[0][0]

  # print(f"similarity: {similarity}")

  return similarity


def calc_average_similarity_word2vec(sentences, key):
    similarities = []
    for i in range(len(sentences)):
        for j in range(i + 1, len(sentences)):
            similarity = calc_similarity_word2vec(
                sentences[i], sentences[j], model_word2vec)
            similarities.append(similarity)

    q = [0, 0.25, 0.5, 0.75, 1]
    outputs = {}
    for i in range(len(q)):
        outputs[f"quantile_{i}"] = np.quantile(similarities, q[i])
    outputs["average_similarity"] = np.mean(similarities)
    outputs["size"] = len(list(np.sort(np.array(similarities))))
    outputs["width_q"] = outputs["quantile_3"] - outputs["quantile_1"]
    outputs["standard_deviation"] = np.std(similarities)

    if key == "quantile_3":
        return outputs["quantile_3"]
    elif key == "average":
        return outputs["average_similarity"]
    elif key == "size":
        return outputs["size"]
    elif key == "width":
        return outputs["width_q"]
    elif key == "standard_deviation":
        return outputs["standard_deviation"]


In [3]:
# path = "/Users/iomacbookair2/Documents/lab/DEIM2023/tweet_csv/221214_ann_wed.csv"
path = "/Users/labimac/Documents/lab/DEIM2023/tweet_csv/221214_ann_wed.csv"
df = pd.read_csv((path))
df.sort_values(by = 'created_at', ascending = True, inplace = True)
df = df.reset_index(drop=True)
df['created_at'] = pd.to_datetime(df['created_at'])
df = df.drop("author_id", axis=1)
df = df.drop("username", axis=1)

df.to_csv("sorted.csv", index=False)
df

Unnamed: 0,created_at,text
0,2022-12-15 01:00:00,#乃木坂46ANN
1,2022-12-15 01:00:00,#乃木坂46ANN
2,2022-12-15 01:00:00,#乃木坂46ANN
3,2022-12-15 01:00:00,#乃木坂46ANN
4,2022-12-15 01:00:00,#乃木坂46ANN
...,...,...
13960,2022-12-15 03:03:55,おごりのカッコいい払い方って難しいけど、だからこそかっこいいのかな？ ちゃんとスポーツ観戦し...
13961,2022-12-15 03:03:56,お疲れ様でした #乃木坂46ANN
13962,2022-12-15 03:04:11,しおりさん、スタッフさんオツカレサマデシタ！！ 久々の1人喋りスポーツの話満載でしたが色んな...
13963,2022-12-15 03:04:13,12/14㈬#あちこちオードリー #乃木坂46ANN #SPYのボスの正体は佐久間宣行 #f...


In [4]:
def preprocess(text):
    text = emoji.replace_emoji(text, replace=' ')
    text = neologdn.normalize(text)
    text = re.sub(r'#\S+', '', text) # ハッシュタグをスペースに置き換え
    text = re.sub(r'＃\S+', '', text) # ハッシュタグをスペースに置き換え
    text = re.sub(r'http?://[\w/:%#\$&\?\(\)~\.=\+\-]+', '', text) # URLをスペースに置き換え
    text = re.sub(r'https?://\S+', ' ', text) # URLをスペースに置き換え
    text = re.sub(r'[!-/:-@[-`{-~]', r' ', text) # 記号をスペースに置き換え
    text = re.sub(
        u'[■-♯【】「」『』・ㅂﾟˊᗜ、。∀〇╰ˋω…╭´｀•˘д↑艸╯→°д̀ᴗ˃˂⁽⁾φl└＼※彡𖥦←ꂹ]', ' ', text) # 記号をスペースに置き換え
    text = re.sub(r'(\d)([,.])(\d+)', r'\1\3', text) # 小数点とカンマを消す
    text = re.sub(r'\d+', '0', text) # 数字を0に置き換え
    text = text.lower() # 英字を小文字に
    text = re.sub(r"[\u3000\t\r\n]", " ", text) # 空白文字をスペースに置き換え
    return text


df['text'] = df['text'].apply(preprocess)
df
df.to_csv('filtered3.csv', index=False)


In [5]:
groups = df.groupby(pd.Grouper(key='created_at', freq='min'))
df_texts_by_minute = pd.DataFrame({
    "texts_by_minute": groups.apply(lambda x: x["text"].tolist())
})
df_texts_by_minute = df_texts_by_minute.reset_index()
df_texts_by_minute
# df_texts_by_minute.to_csv("texts_by_minute.csv", index=False)

Unnamed: 0,created_at,texts_by_minute
0,2022-12-15 01:00:00,"[, , , , , , , , , , , , , さあ 今夜もお願いします お願いします..."
1,2022-12-15 01:01:00,"[ , , , , , , , , , こんなんなんぼあってもいいですからね, , , , ..."
2,2022-12-15 01:02:00,"[, 森高千里さんから可愛いをいただいた久保ちゃん 笑 可愛いなんてなんぼ言われてもいいよね..."
3,2022-12-15 01:03:00,"[やかましいわ, 久保ちゃん それ聞いちゃおしまいよ, , クリスマスこのままだとボッチ ,..."
4,2022-12-15 01:04:00,"[あの森高千里さんから かわいい 頂きましたw , , 森高千里さん美しいよね, クリパでも..."
...,...,...
120,2022-12-15 03:00:00,[今週もお疲れ様でした かっこつけようとして 失敗してしまったのがくぼちゃんらしいです笑次回...
121,2022-12-15 03:01:00,[ 乃木坂0のオールナイトニッポン 卒業を発表している齋藤飛鳥の生出演が決定 パーソナリテ...
122,2022-12-15 03:02:00,"[またねー, しおりんお疲れ様でした 来週も絶対聴く またね ノ, 今日も安定の爆笑..."
123,2022-12-15 03:03:00,"[敗者復活でやるような場所じゃない, 久保ちゃんお疲れ様でしたソロはひさびさでしたね来週はと..."


In [6]:
df_texts_by_minute["median"] = df_texts_by_minute["texts_by_minute"].apply(
    lambda x: 0 if len(x) <= 1 else calc_average_similarity_word2vec(x, "quantile_3"))


  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis

In [7]:
df_texts_by_minute["average"] = df_texts_by_minute["texts_by_minute"].apply(
    lambda x: 0 if len(x) <= 1 else calc_average_similarity_word2vec(x, "average"))


  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis

In [8]:
df_texts_by_minute["size"] = df_texts_by_minute["texts_by_minute"].apply(
    lambda x: 0 if len(x) <= 1 else calc_average_similarity_word2vec(x, "size"))


  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis

In [9]:
df_texts_by_minute["width"] = df_texts_by_minute["texts_by_minute"].apply(
    lambda x: 0 if len(x) <= 1 else calc_average_similarity_word2vec(x, "width"))


  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis

In [10]:
df_texts_by_minute["standard_deviation"] = df_texts_by_minute["texts_by_minute"].apply(
    lambda x: 0 if len(x) <= 1 else calc_average_similarity_word2vec(x, "standard_deviation"))


  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis

In [11]:
df_texts_by_minute

Unnamed: 0,created_at,texts_by_minute,median,average,size,width,standard_deviation
0,2022-12-15 01:00:00,"[, , , , , , , , , , , , , さあ 今夜もお願いします お願いします...",1.000000,0.887551,222778,0.000000,0.315150
1,2022-12-15 01:01:00,"[ , , , , , , , , , こんなんなんぼあってもいいですからね, , , , ...",1.000000,0.476690,20706,1.000000,0.473239
2,2022-12-15 01:02:00,"[, 森高千里さんから可愛いをいただいた久保ちゃん 笑 可愛いなんてなんぼ言われてもいいよね...",0.671236,0.354342,14535,0.671236,0.367582
3,2022-12-15 01:03:00,"[やかましいわ, 久保ちゃん それ聞いちゃおしまいよ, , クリスマスこのままだとボッチ ,...",0.597014,0.336402,12720,0.597014,0.305982
4,2022-12-15 01:04:00,"[あの森高千里さんから かわいい 頂きましたw , , 森高千里さん美しいよね, クリパでも...",0.603410,0.354516,12246,0.603410,0.302588
...,...,...,...,...,...,...,...
120,2022-12-15 03:00:00,[今週もお疲れ様でした かっこつけようとして 失敗してしまったのがくぼちゃんらしいです笑次回...,0.243878,0.160540,12720,0.243878,0.288074
121,2022-12-15 03:01:00,[ 乃木坂0のオールナイトニッポン 卒業を発表している齋藤飛鳥の生出演が決定 パーソナリテ...,0.510457,0.225801,3003,0.510457,0.313578
122,2022-12-15 03:02:00,"[またねー, しおりんお疲れ様でした 来週も絶対聴く またね ノ, 今日も安定の爆笑...",0.713293,0.316734,465,0.713293,0.353925
123,2022-12-15 03:03:00,"[敗者復活でやるような場所じゃない, 久保ちゃんお疲れ様でしたソロはひさびさでしたね来週はと...",0.749243,0.522638,36,0.346227,0.304620


In [12]:
df_texts_by_minute.to_csv('cosine_similarity_w2v_221214_ann_wed.csv', index=False)