In [1]:
import pandas as pd
import numpy as np
import csv
import MeCab
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import re
import neologdn
import demoji
import emoji

# Location of the word2vec-format binary embedding file (entity_vector).
model_dir = '/Users/iomacbookair2/Documents/lab/DEIM2023/entity_vector/entity_vector.model.bin'
# Load the vectors once; later cells read them through `model_word2vec`.
model_word2vec = KeyedVectors.load_word2vec_format(fname=model_dir, binary=True)

In [2]:
def _tokenize_with_mecab(sentence, tagger):
    """Split ``sentence`` on single spaces and return MeCab's surface forms for every chunk."""
    words = []
    for chunk in sentence.split(" "):
        if not chunk:
            # Consecutive spaces produce empty chunks; they contain no tokens.
            continue
        # Each parsed line is "surface<TAB>features". The final two items of the
        # split (presumably the EOS marker line and a trailing empty string)
        # are dropped, exactly as the original [:-2] slice did.
        words += [line.split("\t")[0]
                  for line in tagger.parse(chunk).split("\n")[:-2]]
    return words


def _mean_embedding(words, model):
    """Average the vectors of the in-vocabulary ``words``; return None when there are none."""
    vectors = [model[word] for word in words if word in model]
    if not vectors:
        return None
    return np.mean(vectors, axis=0)


def calc_similarity_word2vec(sentence1, sentence2, model, tagger=None, verbose=False):
    """Cosine similarity between the mean word vectors of two sentences.

    Both sentences are split on spaces, each chunk is tokenized with MeCab,
    and the vectors of the resulting tokens are averaged per sentence.

    Args:
        sentence1: First sentence (str).
        sentence2: Second sentence (str).
        model: Mapping from token to vector that supports ``token in model``
            and ``model[token]`` (e.g. the loaded word2vec vectors).
        tagger: Optional tagger exposing ``parse(str) -> str``. When omitted a
            new ``MeCab.Tagger()`` is created for this call; pass a shared
            instance to avoid rebuilding it for every sentence pair.
        verbose: When True, print the token lists of both sentences.

    Returns:
        The cosine similarity as a float, or ``nan`` when either sentence has
        no token with a vector in ``model`` (empty text or only
        out-of-vocabulary words), so one bad sentence no longer aborts a whole
        batch with a KeyError.
    """
    if tagger is None:
        tagger = MeCab.Tagger()

    sentence1_words = _tokenize_with_mecab(sentence1, tagger)
    sentence2_words = _tokenize_with_mecab(sentence2, tagger)
    if verbose:
        print(f"sentence1_words: {sentence1_words}")
        print(f"sentence2_words: {sentence2_words}")

    # Sentence embedding = mean of the vectors of its known tokens.
    sentence1_embedding = _mean_embedding(sentence1_words, model)
    sentence2_embedding = _mean_embedding(sentence2_words, model)
    if sentence1_embedding is None or sentence2_embedding is None:
        # No usable vector on at least one side: similarity is undefined.
        return float("nan")

    return cosine_similarity(
        [sentence1_embedding], [sentence2_embedding])[0][0]


def calc_average_similarity_word2vec(sentences, model=None):
    """Summarise the pairwise similarities of a list of sentences.

    Every unordered pair of sentences is scored with
    ``calc_similarity_word2vec``; the finite scores are then summarised.

    Args:
        sentences: Sequence of sentence strings.
        model: Embedding model forwarded to ``calc_similarity_word2vec``.
            Defaults to the notebook-level ``model_word2vec``.

    Returns:
        ``[quartiles, width_q, size]`` where
        * ``quartiles`` is a dict with keys ``quantile_0`` .. ``quantile_4``
          (the 0, 0.25, 0.5, 0.75 and 1 quantiles) and ``average_similarity``,
        * ``width_q`` is ``quantile_3 - quantile_1``,
        * ``size`` is the number of finite similarities that were summarised.
        When no finite similarity exists (fewer than two sentences, or every
        pair undefined) all statistics are ``nan`` and ``size`` is 0.
    """
    if model is None:
        model = model_word2vec  # fall back to the notebook-level embeddings

    similarities = []
    for i in range(len(sentences)):
        for j in range(i + 1, len(sentences)):
            similarities.append(
                calc_similarity_word2vec(sentences[i], sentences[j], model))

    # Drop undefined (NaN/inf) scores so they cannot poison every statistic.
    valid = np.asarray(similarities, dtype=float)
    valid = valid[np.isfinite(valid)]

    q = [0, 0.25, 0.5, 0.75, 1]
    if valid.size == 0:
        # np.quantile / np.mean are not meaningful on an empty sample.
        quartiles = {f"quantile_{i}": float("nan") for i in range(len(q))}
        quartiles["average_similarity"] = float("nan")
    else:
        quantile_values = np.quantile(valid, q)
        quartiles = {f"quantile_{i}": quantile_values[i] for i in range(len(q))}
        quartiles["average_similarity"] = np.mean(valid)

    size = int(valid.size)
    width_q = quartiles["quantile_3"] - quartiles["quantile_1"]

    return [quartiles, width_q, size]


In [3]:
# Raw tweet export for the broadcast being analysed.
path = "/Users/iomacbookair2/Documents/lab/DEIM2023/tweet_csv/221212_ann_mon.csv"

# Parse the timestamps BEFORE sorting so rows are ordered chronologically
# rather than by the lexicographic order of the raw strings, and use a stable
# sort so tweets sharing a timestamp keep their original file order.
df = (
    pd.read_csv(path)
    .assign(created_at=lambda frame: pd.to_datetime(frame['created_at']))
    .sort_values(by='created_at', ascending=True, kind='stable')
    .reset_index(drop=True)
    # Author identifiers are not used by the analysis below.
    .drop(columns=["author_id", "username"])
)

df.to_csv("sorted.csv", index=False)
df

Unnamed: 0,created_at,text
0,2022-12-13 01:00:00,#cnann
1,2022-12-13 01:00:00,#cnann
2,2022-12-13 01:00:00,#cnann
3,2022-12-13 01:00:00,#cnann
4,2022-12-13 01:00:00,#cnann
...,...,...
3193,2022-12-13 03:03:53,アルニキ、松ニキお疲れ様。 #cnann
3194,2022-12-13 03:03:53,来週、そこには劣等感が火を吹いて燃え尽きたDJニキが… #cnann
3195,2022-12-13 03:03:54,IOさんとKEIJUさんの呼び方はどうなるのか…。 そしてそのまま呼べるのか…。 お疲れ様で...
3196,2022-12-13 03:04:03,今日も楽しかったです😊 お疲れ様でした！ また来週〜👋 #cnann


In [4]:
def preprocess(text):
    """Clean one tweet for tokenization.

    Steps, in order:
      1. replace emoji with a space;
      2. remove URLs and hashtags from the raw text, *before* normalization --
         neologdn removes whitespace between Japanese and Latin characters, so
         stripping ``\\S+`` afterwards could also swallow the words that
         followed the URL/hashtag;
      3. normalize with neologdn;
      4. drop ``,``/``.`` separators inside numbers and collapse every digit
         run to ``0`` -- done before punctuation stripping, otherwise the
         separators are already gone and the rule can never match;
      5. replace ASCII punctuation and assorted symbols with spaces;
      6. lowercase and turn ideographic space, tab, CR and LF into spaces.

    Args:
        text: Raw tweet text. Non-string values (e.g. NaN for a missing cell)
            are treated as empty.

    Returns:
        The cleaned text (may be empty or contain runs of spaces).
    """
    if not isinstance(text, str):
        return ''

    text = emoji.replace_emoji(text, replace=' ')

    # URLs (http and https) and hashtags (ASCII '#' and full-width '＃').
    text = re.sub(r'https?://\S+', ' ', text)
    text = re.sub(r'[#＃]\S+', '', text)

    text = neologdn.normalize(text)

    # "1,000" / "3.14" -> "1000" / "314", then every number -> "0".
    text = re.sub(r'(\d)([,.])(\d+)', r'\1\3', text)
    text = re.sub(r'\d+', '0', text)

    # ASCII punctuation; '[' is escaped so the class is not read as a nested set.
    text = re.sub(r'[!-/:-@\[-`{-~]', ' ', text)
    text = re.sub('[■-♯【】「」『』・ㅂﾟˊᗜ、。∀〇ˋω…╭´｀•˘д←ꂹ╯]', ' ', text)

    text = text.lower()
    text = re.sub(r"[\u3000\t\r\n]", " ", text)
    return text


# Clean every tweet, persist the cleaned frame, then display it.
# The frame must be the LAST expression of the cell to be rendered; a bare
# `df` in the middle of the cell is silently discarded.
df['text'] = df['text'].apply(preprocess)
df.to_csv('filtered3.csv', index=False)
df


In [5]:
# Bucket the tweets into one-minute bins keyed on their timestamp.
groups = df.groupby(pd.Grouper(freq='min', key='created_at'))

# For every bin, gather the cleaned texts into a plain Python list, then turn
# the bin timestamp back into an ordinary column.
df_texts_by_minute = pd.DataFrame(
    {"texts_by_minute": groups.apply(lambda minute_df: minute_df["text"].tolist())}
).reset_index()
df_texts_by_minute
# Optional export: df_texts_by_minute.to_csv("texts_by_minute.csv", index=False)

Unnamed: 0,created_at,texts_by_minute
0,2022-12-13 01:00:00,"[, , , , , , , , , , , , , , , , , , , , creep..."
1,2022-12-13 01:01:00,"[, 今年の漢字は 戰 , , , , 今年の漢字 戦 , , , , , , , 戦, ,..."
2,2022-12-13 01:02:00,"[, , , , , 韻は無敵すぎw , 今年の漢字 戦 , 今年の漢字 韻 でしのぐ笑,..."
3,2022-12-13 01:03:00,"[今年の漢字やっぱりきた , 今年の漢字 0人はずっと皿と韻, 繋はオフィシャル過ぎる, ..."
4,2022-12-13 01:04:00,"[真摯に向き合った白紙, 韻 皿 擦 繋 , 今年の漢字rさん 韻 松永さん 皿 擦 繋..."
...,...,...
120,2022-12-13 03:00:00,"[お疲れ様でした来週のspw楽しみにしてます , お疲れっしたーわちゃわちゃ良かったよ,..."
121,2022-12-13 03:01:00,"[0人だけ回お疲れ様でした , お疲れ様でした, お疲れ様でした おやすみなさい , お..."
122,2022-12-13 03:02:00,"[お疲れ様でした, , r 指定と 俺 お疲れさまでしたー, おれお疲れ様でした , , ..."
123,2022-12-13 03:03:00,"[超楽しかった お疲れさまでした, 来週のswは色んな意味で楽しみです , アルニキ 松ニキ..."


In [None]:
# Score each minute: the stored value is the [quartiles, width_q, size] list
# returned by calc_average_similarity_word2vec for that minute's texts.
similarity_stats = df_texts_by_minute["texts_by_minute"].apply(
    calc_average_similarity_word2vec)
df_texts_by_minute["similarity"] = similarity_stats
df_texts_by_minute

In [None]:
# Persist the per-minute similarity table (row index omitted).
df_texts_by_minute.to_csv(
    path_or_buf='cosine_similarity_w2v_221212_ann_mon.csv',
    index=False,
)