In [1]:
import pandas as pd
import numpy as np
import csv
import MeCab
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import re
import neologdn
import demoji
import emoji

# model_dir = '/Users/iomacbookair2/Documents/lab/DEIM2023/entity_vector/entity_vector.model.bin'
model_dir = '/Users/labimac/Documents/lab/DEIM2023/entity_vector/entity_vector.model.bin'
model_word2vec = KeyedVectors.load_word2vec_format(model_dir, binary=True)

In [2]:
def calc_similarity_word2vec(sentence1, sentence2, model):
  mecab = MeCab.Tagger(
      '-d /opt/homebrew/lib/mecab/dic/mecab-ipadic-neologd')

  pre_sentence1 = sentence1.split(" ")
  sentence1_words = []
  for s1 in pre_sentence1:
    sentence1_words += [line.split("\t")[0]
                        for line in mecab.parse(s1).split("\n")[:-2]]
  # print(f"sentence1_words: {sentence1_words}")

  pre_sentence2 = sentence2.split(" ")
  sentence2_words = []
  for s2 in pre_sentence2:
    sentence2_words += [line.split("\t")[0]
                        for line in mecab.parse(s2).split("\n")[:-2]]
  # print(f"sentence2_words: {sentence2_words}")

  if not sentence1_words and not sentence2_words:
    similarity = 1
  elif not sentence1_words or not sentence2_words:
    similarity = 0
  elif (len(sentence1_words) == 1 and len(sentence2_words) != 1) or (len(sentence2_words) == 1 and len(sentence1_words) != 1):
    similarity = 0
  elif all(re.match(r'.*[a-zA-Z].*', word) for word in sentence1_words) or all(re.match(r'.*[a-zA-Z].*', word) for word in sentence2_words):
    similarity = 0
  else:
    # Compute word embeddings for each sentence
    sentence1_embedding = np.mean([model[word]
                                  for word in sentence1_words if word in model], axis=0)
    # print(f"sentence1_embedding: {sentence1_embedding}")
    sentence2_embedding = np.mean([model[word]
                                  for word in sentence2_words if word in model], axis=0)
    # print(f"sentence2_embedding: {sentence2_embedding}")
    if np.isnan(sentence1_embedding).any() or np.isnan(sentence2_embedding).any():
        similarity = 0
    else:
      similarity = cosine_similarity(
          [sentence1_embedding], [sentence2_embedding])[0][0]

  # print(f"similarity: {similarity}")

  return similarity


def calc_average_similarity_word2vec(sentences, key):
    similarities = []
    for i in range(len(sentences)):
        for j in range(i + 1, len(sentences)):
            similarity = calc_similarity_word2vec(
                sentences[i], sentences[j], model_word2vec)
            similarities.append(similarity)

    q = [0, 0.25, 0.5, 0.75, 1]
    outputs = {}
    for i in range(len(q)):
        outputs[f"quantile_{i}"] = np.quantile(similarities, q[i])
    outputs["average_similarity"] = np.mean(similarities)
    outputs["size"] = len(list(np.sort(np.array(similarities))))
    outputs["width_q"] = outputs["quantile_3"] - outputs["quantile_1"]
    outputs["variance"] = np.var(similarities)

    if key == "quantile_3":
        return outputs["quantile_3"]
    elif key == "average":
        return outputs["average_similarity"]
    elif key == "size":
        return outputs["size"]
    elif key == "width":
        return outputs["width_q"]
    elif key == "variance":
        return outputs["variance"]


In [3]:
# path = "/Users/iomacbookair2/Documents/lab/DEIM2023/tweet_csv/221217_ann_sat.csv"
path = "/Users/labimac/Documents/lab/DEIM2023/tweet_csv/221217_ann_sat.csv"
df = pd.read_csv((path))
df.sort_values(by = 'created_at', ascending = True, inplace = True)
df = df.reset_index(drop=True)
df['created_at'] = pd.to_datetime(df['created_at'])
df = df.drop("author_id", axis=1)
df = df.drop("username", axis=1)

df.to_csv("sorted.csv", index=False)
df

Unnamed: 0,created_at,text
0,2022-12-18 01:00:00,#annkw
1,2022-12-18 01:00:00,#annkw
2,2022-12-18 01:00:00,‰ªäÂπ¥ÊúÄÂæå„ÅÆ„Å≤„Å®„Å§„Çà„Åó„Å™„Å´ #annkw
3,2022-12-18 01:00:00,#annkw
4,2022-12-18 01:00:00,#annkw
...,...,...
6557,2022-12-18 03:04:47,„Åæ„ÅüÊù•Âπ¥ÔºÅ„Éü„ÉÉ„Éï„Ç£„Éº„Å°„ÇÉ„ÇìÔºÅ„Ç¢„Éá„Ç£„Ç™„ÇπÔºÅ #annkw
6558,2022-12-18 03:04:49,„ÅÇ„Åß„ÅÉ„Åä„Åô #annkw
6559,2022-12-18 03:04:50,„Çπ„Ç´„Ç∑Ëä∏‰∫∫ü§£ü§£üëèüëè #annkw
6560,2022-12-18 03:04:50,„Ç≠„É•„Éº„ÉêÊú¨„ÅÆ„Å®„Åç„ÅÆ„Ç§„É≥„Çø„Éì„É•„Ç¢„Éº„Å®ÂÜç‰ºö„Åó„ÅüËã•Êûó„ÄÅBS„ÅÆÁï™ÁµÑ„ÄÅËã•Êûó„ÅÆË°ó„Éñ„É©„ÄÅÈñ¢Êù±„Çπ„Ç´„Ç∑Ëä∏‰∫∫„Éë„Éº...


In [4]:
def preprocess(text):
    text = emoji.replace_emoji(text, replace=' ')
    text = neologdn.normalize(text)
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'ÔºÉ\S+', '', text)
    text = re.sub(r'http?://[\w/:%#\$&\?\(\)~\.=\+\-]+', '', text)
    text = re.sub(r'https?://\S+', ' ', text)
    text = re.sub(r'[!-/:-@[-`{-~]', r' ', text)
    text = re.sub(
        u'[‚ñ†-‚ôØ„Äê„Äë„Äå„Äç„Äé„Äè„Éª„ÖÇÔæüÀä·óú„ÄÅ„ÄÇ‚àÄ„Äá‚ï∞Àãœâ‚Ä¶‚ï≠¬¥ÔΩÄ‚Ä¢Àò–¥‚ÜëËâ∏‚ïØ‚Üí¬∞–¥ÃÄ·¥óÀÉÀÇ‚ÅΩ‚ÅæœÜÔºº‚ÄªÂΩ°ñ•¶‚ÜêÍÇπ]', ' ', text)
    text = re.sub(r'(\d)([,.])(\d+)', r'\1\3', text)
    text = re.sub(r'\d+', '0', text)
    text = text.lower()
    text = re.sub(r"[\u3000\t\r\n]", " ", text)
    return text


df['text'] = df['text'].apply(preprocess)
df
df.to_csv('filtered3.csv', index=False)


In [5]:
groups = df.groupby(pd.Grouper(key='created_at', freq='min'))
df_texts_by_minute = pd.DataFrame({
    "texts_by_minute": groups.apply(lambda x: x["text"].tolist())
})
df_texts_by_minute = df_texts_by_minute.reset_index()
df_texts_by_minute
# df_texts_by_minute.to_csv("texts_by_minute.csv", index=False)

Unnamed: 0,created_at,texts_by_minute
0,2022-12-18 01:00:00,"[, , ‰ªäÂπ¥ÊúÄÂæå„ÅÆ„Å≤„Å®„Å§„Çà„Åó„Å™„Å´, , , , , , , , , , , , , , ,..."
1,2022-12-18 01:01:00,"[0Âπ¥ÊúÄÂæå„ÅÆÊîæÈÄÅ, , , „Åì„Çì„Å∞„Çì„ÅØ , ‰ªäÂπ¥„É©„Çπ„Éà„Å≤„Å®„Å§„Çà„Åó„Å™„Å´, , , , , ..."
2,2022-12-18 01:02:00,"[, „Ç¢„Éê„Éà„Ç•„Éº„Çπ0years after , , ‰ªäÂπ¥ÊúÄÂæå„Åã , „ÇÅ„Çì„Å©„Åè„Åõ„Åá„Å™„ÅÅ, „Å≤..."
3,2022-12-18 01:03:00,[‰ªäÂπ¥„Å©„ÅÜ„Å†„Å£„Åü „ÅåÈù¢ÂÄí„Åè„Åï„ÅÑwËã• ËÅû„Åè„Åì„Å®„Å™„ÅÑ„Åã„ÇâËÅû„ÅÑ„Å¶„Çã„Çì„Å†„Çà Êò• Êöë„Åè„Å™„Å£„Åü„Çà„Å≠„Å®‰∏ÄÁ∑í...
4,2022-12-18 01:04:00,"[„Åù„Çì„Å™„Å´„Çπ„Éà„É¨„ÇπÊ∫ú„Åæ„Å£„Å¶„Åü„ÅÆ„Åã ÂçàÂâç0ÊôÇ„ÅÆÁÑºËÇâÈ£ü„Å£„Å¶„Å™„Åã„Å£„Åü„ÇâÊ≠ª„Çì„Åß„Åü„Å≠ , „Çø„Éê„Ç≥Ë≤∑„Å£„Åü..."
...,...,...
120,2022-12-18 03:00:00,"[„Éü„ÉÉ„Éï„Ç£„Éº„Å°„ÇÉ„Çì‰ªäÂπ¥„ÇÇ0Âπ¥„ÅÇ„Çä„Åå„Å®„ÅÜ„Åî„Åñ„ÅÑ„Åæ„Åó„Åü, Â§ß„Åç„Å™Âãï„Åç„Åå„ÅÇ„Çã„ÅÆ„Åã ‚Åà, ‰ªä..."
121,2022-12-18 03:01:00,"[ , Êù•Âπ¥„ÇÇ„Çà„Çç„Åó„Åè„ÅäÈ°ò„ÅÑ„Åó„Åæ„Åô „Éü„ÉÉ„Éï„Ç£„Éº„Å°„ÇÉ„Çì„Ç¢„Éá„Ç£„Ç™„Çπ , , , √ó ..."
122,2022-12-18 03:02:00,"[, Ê¨°Êù•Âπ¥„ÅãÊ•Ω„Åó„Åø„Å´„Åó„Å¶„Åæ„Åô „Éü„ÉÉ„Éï„Ç£„Éº„Å°„ÇÉ„Çì „Ç¢„Éá„Ç£„Ç™„Çπ , ‰ªäÂπ¥ÊúÄÂæå„ÅÆ ÊúüÂæÖ ..."
123,2022-12-18 03:03:00,[Âπ¥ÂÜÖ„É©„Çπ„Éà „ÅäÁñ≤„Çå„Åó„ÅüÂ∞ë„ÅóÊó©„ÅÑ„Åß„Åô„Åå l t „ÅÆÁöÜ„Åï„Åæ ËâØ„ÅÑ„ÅäÂπ¥„Çí „Åæ„ÅüÊù•Âπ¥„ÇÇÊ∑±Â§ú„Å´„Åü„Åè„Åï...


In [6]:
df_texts_by_minute["median"] = df_texts_by_minute["texts_by_minute"].apply(
    lambda x: 0 if len(x) <= 1 else calc_average_similarity_word2vec(x, "quantile_3"))


  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis

In [7]:
df_texts_by_minute["average"] = df_texts_by_minute["texts_by_minute"].apply(
    lambda x: 0 if len(x) <= 1 else calc_average_similarity_word2vec(x, "average"))


  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis

In [8]:
df_texts_by_minute["size"] = df_texts_by_minute["texts_by_minute"].apply(
    lambda x: 0 if len(x) <= 1 else calc_average_similarity_word2vec(x, "size"))


  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis

In [9]:
df_texts_by_minute["width"] = df_texts_by_minute["texts_by_minute"].apply(
    lambda x: 0 if len(x) <= 1 else calc_average_similarity_word2vec(x, "width"))


  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis

In [10]:
df_texts_by_minute["variance"] = df_texts_by_minute["texts_by_minute"].apply(
    lambda x: 0 if len(x) <= 1 else calc_average_similarity_word2vec(x, "variance"))


  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis

In [11]:
df_texts_by_minute

Unnamed: 0,created_at,texts_by_minute,median,average,size,width,variance
0,2022-12-18 01:00:00,"[, , ‰ªäÂπ¥ÊúÄÂæå„ÅÆ„Å≤„Å®„Å§„Çà„Åó„Å™„Å´, , , , , , , , , , , , , , ,...",1.000000,0.592689,181503,1.000000,0.234481
1,2022-12-18 01:01:00,"[0Âπ¥ÊúÄÂæå„ÅÆÊîæÈÄÅ, , , „Åì„Çì„Å∞„Çì„ÅØ , ‰ªäÂπ¥„É©„Çπ„Éà„Å≤„Å®„Å§„Çà„Åó„Å™„Å´, , , , , ...",0.837228,0.324429,40755,0.837228,0.181347
2,2022-12-18 01:02:00,"[, „Ç¢„Éê„Éà„Ç•„Éº„Çπ0years after , , ‰ªäÂπ¥ÊúÄÂæå„Åã , „ÇÅ„Çì„Å©„Åè„Åõ„Åá„Å™„ÅÅ, „Å≤...",0.444468,0.245027,12561,0.444468,0.114222
3,2022-12-18 01:03:00,[‰ªäÂπ¥„Å©„ÅÜ„Å†„Å£„Åü „ÅåÈù¢ÂÄí„Åè„Åï„ÅÑwËã• ËÅû„Åè„Åì„Å®„Å™„ÅÑ„Åã„ÇâËÅû„ÅÑ„Å¶„Çã„Çì„Å†„Çà Êò• Êöë„Åè„Å™„Å£„Åü„Çà„Å≠„Å®‰∏ÄÁ∑í...,0.558588,0.324761,4753,0.558588,0.089931
4,2022-12-18 01:04:00,"[„Åù„Çì„Å™„Å´„Çπ„Éà„É¨„ÇπÊ∫ú„Åæ„Å£„Å¶„Åü„ÅÆ„Åã ÂçàÂâç0ÊôÇ„ÅÆÁÑºËÇâÈ£ü„Å£„Å¶„Å™„Åã„Å£„Åü„ÇâÊ≠ª„Çì„Åß„Åü„Å≠ , „Çø„Éê„Ç≥Ë≤∑„Å£„Åü...",0.558502,0.328908,5565,0.558502,0.079215
...,...,...,...,...,...,...,...
120,2022-12-18 03:00:00,"[„Éü„ÉÉ„Éï„Ç£„Éº„Å°„ÇÉ„Çì‰ªäÂπ¥„ÇÇ0Âπ¥„ÅÇ„Çä„Åå„Å®„ÅÜ„Åî„Åñ„ÅÑ„Åæ„Åó„Åü, Â§ß„Åç„Å™Âãï„Åç„Åå„ÅÇ„Çã„ÅÆ„Åã ‚Åà, ‰ªä...",0.726675,0.500165,2278,0.445026,0.083709
121,2022-12-18 03:01:00,"[ , Êù•Âπ¥„ÇÇ„Çà„Çç„Åó„Åè„ÅäÈ°ò„ÅÑ„Åó„Åæ„Åô „Éü„ÉÉ„Éï„Ç£„Éº„Å°„ÇÉ„Çì„Ç¢„Éá„Ç£„Ç™„Çπ , , , √ó ...",0.801375,0.492935,435,0.801375,0.140329
122,2022-12-18 03:02:00,"[, Ê¨°Êù•Âπ¥„ÅãÊ•Ω„Åó„Åø„Å´„Åó„Å¶„Åæ„Åô „Éü„ÉÉ„Éï„Ç£„Éº„Å°„ÇÉ„Çì „Ç¢„Éá„Ç£„Ç™„Çπ , ‰ªäÂπ¥ÊúÄÂæå„ÅÆ ÊúüÂæÖ ...",0.741721,0.500327,210,0.466419,0.090802
123,2022-12-18 03:03:00,[Âπ¥ÂÜÖ„É©„Çπ„Éà „ÅäÁñ≤„Çå„Åó„ÅüÂ∞ë„ÅóÊó©„ÅÑ„Åß„Åô„Åå l t „ÅÆÁöÜ„Åï„Åæ ËâØ„ÅÑ„ÅäÂπ¥„Çí „Åæ„ÅüÊù•Âπ¥„ÇÇÊ∑±Â§ú„Å´„Åü„Åè„Åï...,0.730090,0.561285,45,0.299582,0.041671


In [12]:
df_texts_by_minute.to_csv('cosine_similarity_w2v_221217_ann_sat.csv', index=False)