In [1]:
import pandas as pd
import numpy as np
import csv
import MeCab
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import re
import neologdn
import demoji
import emoji

# model_dir = '/Users/iomacbookair2/Documents/lab/DEIM2023/entity_vector/entity_vector.model.bin'
model_dir = '/Users/labimac/Documents/lab/DEIM2023/entity_vector/entity_vector.model.bin'
model_word2vec = KeyedVectors.load_word2vec_format(model_dir, binary=True)

In [2]:
def calc_similarity_word2vec(sentence1, sentence2, model):
  mecab = MeCab.Tagger(
      '-d /opt/homebrew/lib/mecab/dic/mecab-ipadic-neologd')

  pre_sentence1 = sentence1.split(" ")
  sentence1_words = []
  for s1 in pre_sentence1:
    sentence1_words += [line.split("\t")[0]
                        for line in mecab.parse(s1).split("\n")[:-2]]
  # print(f"sentence1_words: {sentence1_words}")

  pre_sentence2 = sentence2.split(" ")
  sentence2_words = []
  for s2 in pre_sentence2:
    sentence2_words += [line.split("\t")[0]
                        for line in mecab.parse(s2).split("\n")[:-2]]
  # print(f"sentence2_words: {sentence2_words}")

  if not sentence1_words and not sentence2_words:
    similarity = 1
  elif not sentence1_words or not sentence2_words:
    similarity = 0
  elif (len(sentence1_words) == 1 and len(sentence2_words) != 1) or (len(sentence2_words) == 1 and len(sentence1_words) != 1):
    similarity = 0
  elif all(re.match(r'.*[a-zA-Z].*', word) for word in sentence1_words) or all(re.match(r'.*[a-zA-Z].*', word) for word in sentence2_words):
    similarity = 0
  else:
    # Compute word embeddings for each sentence
    sentence1_embedding = np.mean([model[word]
                                  for word in sentence1_words if word in model], axis=0)
    # print(f"sentence1_embedding: {sentence1_embedding}")
    sentence2_embedding = np.mean([model[word]
                                  for word in sentence2_words if word in model], axis=0)
    # print(f"sentence2_embedding: {sentence2_embedding}")
    if np.isnan(sentence1_embedding).any() or np.isnan(sentence2_embedding).any():
        similarity = 0
    else:
      similarity = cosine_similarity(
          [sentence1_embedding], [sentence2_embedding])[0][0]

  # print(f"similarity: {similarity}")

  return similarity


def calc_average_similarity_word2vec(sentences, key):
    similarities = []
    for i in range(len(sentences)):
        for j in range(i + 1, len(sentences)):
            similarity = calc_similarity_word2vec(
                sentences[i], sentences[j], model_word2vec)
            similarities.append(similarity)

    q = [0, 0.25, 0.5, 0.75, 1]
    outputs = {}
    for i in range(len(q)):
        outputs[f"quantile_{i}"] = np.quantile(similarities, q[i])
    outputs["average_similarity"] = np.mean(similarities)
    outputs["size"] = len(list(np.sort(np.array(similarities))))
    outputs["width_q"] = outputs["quantile_3"] - outputs["quantile_1"]
    outputs["standard_deviation"] = np.std(similarities)

    if key == "quantile_3":
        return outputs["quantile_3"]
    elif key == "average":
        return outputs["average_similarity"]
    elif key == "size":
        return outputs["size"]
    elif key == "width":
        return outputs["width_q"]
    elif key == "standard_deviation":
        return outputs["standard_deviation"]


In [3]:
# path = "/Users/iomacbookair2/Documents/lab/DEIM2023/tweet_csv/221214_ann_wed.csv"
path = "/Users/labimac/Documents/lab/DEIM2023/tweet_csv/221214_ann_wed.csv"
df = pd.read_csv((path))
df.sort_values(by = 'created_at', ascending = True, inplace = True)
df = df.reset_index(drop=True)
df['created_at'] = pd.to_datetime(df['created_at'])
df = df.drop("author_id", axis=1)
df = df.drop("username", axis=1)

df.to_csv("sorted.csv", index=False)
df

Unnamed: 0,created_at,text
0,2022-12-15 01:00:00,#‰πÉÊú®ÂùÇ46ANN
1,2022-12-15 01:00:00,#‰πÉÊú®ÂùÇ46ANN
2,2022-12-15 01:00:00,#‰πÉÊú®ÂùÇ46ANN
3,2022-12-15 01:00:00,#‰πÉÊú®ÂùÇ46ANN
4,2022-12-15 01:00:00,#‰πÉÊú®ÂùÇ46ANN
...,...,...
13960,2022-12-15 03:03:55,„Åä„Åî„Çä„ÅÆ„Ç´„ÉÉ„Ç≥„ÅÑ„ÅÑÊâï„ÅÑÊñπ„Å£„Å¶Èõ£„Åó„ÅÑ„Åë„Å©„ÄÅ„Å†„Åã„Çâ„Åì„Åù„Åã„Å£„Åì„ÅÑ„ÅÑ„ÅÆ„Åã„Å™Ôºü „Å°„ÇÉ„Çì„Å®„Çπ„Éù„Éº„ÉÑË¶≥Êà¶„Åó...
13961,2022-12-15 03:03:56,„ÅäÁñ≤„ÇåÊßò„Åß„Åó„Åü #‰πÉÊú®ÂùÇ46ANN
13962,2022-12-15 03:04:11,„Åó„Åä„Çä„Åï„Çì„ÄÅ„Çπ„Çø„ÉÉ„Éï„Åï„Çì„Ç™„ÉÑ„Ç´„É¨„Çµ„Éû„Éá„Ç∑„ÇøÔºÅÔºÅ ‰πÖ„ÄÖ„ÅÆ1‰∫∫Âñã„Çä„Çπ„Éù„Éº„ÉÑ„ÅÆË©±Ê∫ÄËºâ„Åß„Åó„Åü„ÅåËâ≤„Çì„Å™...
13963,2022-12-15 03:04:13,12/14„à¨#„ÅÇ„Å°„Åì„Å°„Ç™„Éº„Éâ„É™„Éº #‰πÉÊú®ÂùÇ46ANN #SPY„ÅÆ„Éú„Çπ„ÅÆÊ≠£‰Ωì„ÅØ‰Ωê‰πÖÈñìÂÆ£Ë°å #f...


In [4]:
def preprocess(text):
    text = emoji.replace_emoji(text, replace=' ')
    text = neologdn.normalize(text)
    text = re.sub(r'#\S+', '', text) # „Éè„ÉÉ„Ç∑„É•„Çø„Ç∞„Çí„Çπ„Éö„Éº„Çπ„Å´ÁΩÆ„ÅçÊèõ„Åà
    text = re.sub(r'ÔºÉ\S+', '', text) # „Éè„ÉÉ„Ç∑„É•„Çø„Ç∞„Çí„Çπ„Éö„Éº„Çπ„Å´ÁΩÆ„ÅçÊèõ„Åà
    text = re.sub(r'http?://[\w/:%#\$&\?\(\)~\.=\+\-]+', '', text) # URL„Çí„Çπ„Éö„Éº„Çπ„Å´ÁΩÆ„ÅçÊèõ„Åà
    text = re.sub(r'https?://\S+', ' ', text) # URL„Çí„Çπ„Éö„Éº„Çπ„Å´ÁΩÆ„ÅçÊèõ„Åà
    text = re.sub(r'[!-/:-@[-`{-~]', r' ', text) # Ë®òÂè∑„Çí„Çπ„Éö„Éº„Çπ„Å´ÁΩÆ„ÅçÊèõ„Åà
    text = re.sub(
        u'[‚ñ†-‚ôØ„Äê„Äë„Äå„Äç„Äé„Äè„Éª„ÖÇÔæüÀä·óú„ÄÅ„ÄÇ‚àÄ„Äá‚ï∞Àãœâ‚Ä¶‚ï≠¬¥ÔΩÄ‚Ä¢Àò–¥‚ÜëËâ∏‚ïØ‚Üí¬∞–¥ÃÄ·¥óÀÉÀÇ‚ÅΩ‚ÅæœÜl‚îîÔºº‚ÄªÂΩ°ñ•¶‚ÜêÍÇπ]', ' ', text) # Ë®òÂè∑„Çí„Çπ„Éö„Éº„Çπ„Å´ÁΩÆ„ÅçÊèõ„Åà
    text = re.sub(r'(\d)([,.])(\d+)', r'\1\3', text) # Â∞èÊï∞ÁÇπ„Å®„Ç´„É≥„Éû„ÇíÊ∂à„Åô
    text = re.sub(r'\d+', '0', text) # Êï∞Â≠ó„Çí0„Å´ÁΩÆ„ÅçÊèõ„Åà
    text = text.lower() # Ëã±Â≠ó„ÇíÂ∞èÊñáÂ≠ó„Å´
    text = re.sub(r"[\u3000\t\r\n]", " ", text) # Á©∫ÁôΩÊñáÂ≠ó„Çí„Çπ„Éö„Éº„Çπ„Å´ÁΩÆ„ÅçÊèõ„Åà
    return text


df['text'] = df['text'].apply(preprocess)
df
df.to_csv('filtered3.csv', index=False)


In [5]:
groups = df.groupby(pd.Grouper(key='created_at', freq='min'))
df_texts_by_minute = pd.DataFrame({
    "texts_by_minute": groups.apply(lambda x: x["text"].tolist())
})
df_texts_by_minute = df_texts_by_minute.reset_index()
df_texts_by_minute
# df_texts_by_minute.to_csv("texts_by_minute.csv", index=False)

Unnamed: 0,created_at,texts_by_minute
0,2022-12-15 01:00:00,"[, , , , , , , , , , , , , „Åï„ÅÇ ‰ªäÂ§ú„ÇÇ„ÅäÈ°ò„ÅÑ„Åó„Åæ„Åô „ÅäÈ°ò„ÅÑ„Åó„Åæ„Åô..."
1,2022-12-15 01:01:00,"[ , , , , , , , , , „Åì„Çì„Å™„Çì„Å™„Çì„Åº„ÅÇ„Å£„Å¶„ÇÇ„ÅÑ„ÅÑ„Åß„Åô„Åã„Çâ„Å≠, , , , ..."
2,2022-12-15 01:02:00,"[, Ê£ÆÈ´òÂçÉÈáå„Åï„Çì„Åã„ÇâÂèØÊÑõ„ÅÑ„Çí„ÅÑ„Åü„Å†„ÅÑ„Åü‰πÖ‰øù„Å°„ÇÉ„Çì Á¨ë ÂèØÊÑõ„ÅÑ„Å™„Çì„Å¶„Å™„Çì„ÅºË®Ä„Çè„Çå„Å¶„ÇÇ„ÅÑ„ÅÑ„Çà„Å≠..."
3,2022-12-15 01:03:00,"[„ÇÑ„Åã„Åæ„Åó„ÅÑ„Çè, ‰πÖ‰øù„Å°„ÇÉ„Çì „Åù„ÇåËÅû„ÅÑ„Å°„ÇÉ„Åä„Åó„Åæ„ÅÑ„Çà, , „ÇØ„É™„Çπ„Éû„Çπ„Åì„ÅÆ„Åæ„Åæ„Å†„Å®„Éú„ÉÉ„ÉÅ ,..."
4,2022-12-15 01:04:00,"[„ÅÇ„ÅÆÊ£ÆÈ´òÂçÉÈáå„Åï„Çì„Åã„Çâ „Åã„Çè„ÅÑ„ÅÑ È†Ç„Åç„Åæ„Åó„Åüw , , Ê£ÆÈ´òÂçÉÈáå„Åï„ÇìÁæé„Åó„ÅÑ„Çà„Å≠, „ÇØ„É™„Éë„Åß„ÇÇ..."
...,...,...
120,2022-12-15 03:00:00,[‰ªäÈÄ±„ÇÇ„ÅäÁñ≤„ÇåÊßò„Åß„Åó„Åü „Åã„Å£„Åì„Å§„Åë„Çà„ÅÜ„Å®„Åó„Å¶ Â§±Êïó„Åó„Å¶„Åó„Åæ„Å£„Åü„ÅÆ„Åå„Åè„Åº„Å°„ÇÉ„Çì„Çâ„Åó„ÅÑ„Åß„ÅôÁ¨ëÊ¨°Âõû...
121,2022-12-15 03:01:00,[ ‰πÉÊú®ÂùÇ0„ÅÆ„Ç™„Éº„É´„Éä„Ç§„Éà„Éã„ÉÉ„Éù„É≥ ÂçíÊ•≠„ÇíÁô∫Ë°®„Åó„Å¶„ÅÑ„ÇãÈΩãËó§È£õÈ≥•„ÅÆÁîüÂá∫Êºî„ÅåÊ±∫ÂÆö „Éë„Éº„ÇΩ„Éä„É™„ÉÜ...
122,2022-12-15 03:02:00,"[„Åæ„Åü„Å≠„Éº, „Åó„Åä„Çä„Çì„ÅäÁñ≤„ÇåÊßò„Åß„Åó„Åü Êù•ÈÄ±„ÇÇÁµ∂ÂØæËÅ¥„Åè „Åæ„Åü„Å≠ „Éé, ‰ªäÊó•„ÇÇÂÆâÂÆö„ÅÆÁàÜÁ¨ë..."
123,2022-12-15 03:03:00,"[ÊïóËÄÖÂæ©Ê¥ª„Åß„ÇÑ„Çã„Çà„ÅÜ„Å™Â†¥ÊâÄ„Åò„ÇÉ„Å™„ÅÑ, ‰πÖ‰øù„Å°„ÇÉ„Çì„ÅäÁñ≤„ÇåÊßò„Åß„Åó„Åü„ÇΩ„É≠„ÅØ„Å≤„Åï„Å≥„Åï„Åß„Åó„Åü„Å≠Êù•ÈÄ±„ÅØ„Å®..."


In [6]:
df_texts_by_minute["median"] = df_texts_by_minute["texts_by_minute"].apply(
    lambda x: 0 if len(x) <= 1 else calc_average_similarity_word2vec(x, "quantile_3"))


  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis

In [7]:
df_texts_by_minute["average"] = df_texts_by_minute["texts_by_minute"].apply(
    lambda x: 0 if len(x) <= 1 else calc_average_similarity_word2vec(x, "average"))


  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis

In [8]:
df_texts_by_minute["size"] = df_texts_by_minute["texts_by_minute"].apply(
    lambda x: 0 if len(x) <= 1 else calc_average_similarity_word2vec(x, "size"))


  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis

In [9]:
df_texts_by_minute["width"] = df_texts_by_minute["texts_by_minute"].apply(
    lambda x: 0 if len(x) <= 1 else calc_average_similarity_word2vec(x, "width"))


  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis

In [10]:
df_texts_by_minute["standard_deviation"] = df_texts_by_minute["texts_by_minute"].apply(
    lambda x: 0 if len(x) <= 1 else calc_average_similarity_word2vec(x, "standard_deviation"))


  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis

In [11]:
df_texts_by_minute

Unnamed: 0,created_at,texts_by_minute,median,average,size,width,standard_deviation
0,2022-12-15 01:00:00,"[, , , , , , , , , , , , , „Åï„ÅÇ ‰ªäÂ§ú„ÇÇ„ÅäÈ°ò„ÅÑ„Åó„Åæ„Åô „ÅäÈ°ò„ÅÑ„Åó„Åæ„Åô...",1.000000,0.887551,222778,0.000000,0.315150
1,2022-12-15 01:01:00,"[ , , , , , , , , , „Åì„Çì„Å™„Çì„Å™„Çì„Åº„ÅÇ„Å£„Å¶„ÇÇ„ÅÑ„ÅÑ„Åß„Åô„Åã„Çâ„Å≠, , , , ...",1.000000,0.476690,20706,1.000000,0.473239
2,2022-12-15 01:02:00,"[, Ê£ÆÈ´òÂçÉÈáå„Åï„Çì„Åã„ÇâÂèØÊÑõ„ÅÑ„Çí„ÅÑ„Åü„Å†„ÅÑ„Åü‰πÖ‰øù„Å°„ÇÉ„Çì Á¨ë ÂèØÊÑõ„ÅÑ„Å™„Çì„Å¶„Å™„Çì„ÅºË®Ä„Çè„Çå„Å¶„ÇÇ„ÅÑ„ÅÑ„Çà„Å≠...",0.671236,0.354342,14535,0.671236,0.367582
3,2022-12-15 01:03:00,"[„ÇÑ„Åã„Åæ„Åó„ÅÑ„Çè, ‰πÖ‰øù„Å°„ÇÉ„Çì „Åù„ÇåËÅû„ÅÑ„Å°„ÇÉ„Åä„Åó„Åæ„ÅÑ„Çà, , „ÇØ„É™„Çπ„Éû„Çπ„Åì„ÅÆ„Åæ„Åæ„Å†„Å®„Éú„ÉÉ„ÉÅ ,...",0.597014,0.336402,12720,0.597014,0.305982
4,2022-12-15 01:04:00,"[„ÅÇ„ÅÆÊ£ÆÈ´òÂçÉÈáå„Åï„Çì„Åã„Çâ „Åã„Çè„ÅÑ„ÅÑ È†Ç„Åç„Åæ„Åó„Åüw , , Ê£ÆÈ´òÂçÉÈáå„Åï„ÇìÁæé„Åó„ÅÑ„Çà„Å≠, „ÇØ„É™„Éë„Åß„ÇÇ...",0.603410,0.354516,12246,0.603410,0.302588
...,...,...,...,...,...,...,...
120,2022-12-15 03:00:00,[‰ªäÈÄ±„ÇÇ„ÅäÁñ≤„ÇåÊßò„Åß„Åó„Åü „Åã„Å£„Åì„Å§„Åë„Çà„ÅÜ„Å®„Åó„Å¶ Â§±Êïó„Åó„Å¶„Åó„Åæ„Å£„Åü„ÅÆ„Åå„Åè„Åº„Å°„ÇÉ„Çì„Çâ„Åó„ÅÑ„Åß„ÅôÁ¨ëÊ¨°Âõû...,0.243878,0.160540,12720,0.243878,0.288074
121,2022-12-15 03:01:00,[ ‰πÉÊú®ÂùÇ0„ÅÆ„Ç™„Éº„É´„Éä„Ç§„Éà„Éã„ÉÉ„Éù„É≥ ÂçíÊ•≠„ÇíÁô∫Ë°®„Åó„Å¶„ÅÑ„ÇãÈΩãËó§È£õÈ≥•„ÅÆÁîüÂá∫Êºî„ÅåÊ±∫ÂÆö „Éë„Éº„ÇΩ„Éä„É™„ÉÜ...,0.510457,0.225801,3003,0.510457,0.313578
122,2022-12-15 03:02:00,"[„Åæ„Åü„Å≠„Éº, „Åó„Åä„Çä„Çì„ÅäÁñ≤„ÇåÊßò„Åß„Åó„Åü Êù•ÈÄ±„ÇÇÁµ∂ÂØæËÅ¥„Åè „Åæ„Åü„Å≠ „Éé, ‰ªäÊó•„ÇÇÂÆâÂÆö„ÅÆÁàÜÁ¨ë...",0.713293,0.316734,465,0.713293,0.353925
123,2022-12-15 03:03:00,"[ÊïóËÄÖÂæ©Ê¥ª„Åß„ÇÑ„Çã„Çà„ÅÜ„Å™Â†¥ÊâÄ„Åò„ÇÉ„Å™„ÅÑ, ‰πÖ‰øù„Å°„ÇÉ„Çì„ÅäÁñ≤„ÇåÊßò„Åß„Åó„Åü„ÇΩ„É≠„ÅØ„Å≤„Åï„Å≥„Åï„Åß„Åó„Åü„Å≠Êù•ÈÄ±„ÅØ„Å®...",0.749243,0.522638,36,0.346227,0.304620


In [12]:
df_texts_by_minute.to_csv('cosine_similarity_w2v_221214_ann_wed.csv', index=False)