In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertJapaneseTokenizer, BertModel
import re
import neologdn
import demoji
import emoji

MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
path = "/Users/iomacbookair2/Documents/lab/DEIM2023/tweet_csv/221212_ann_mon.csv"
df = pd.read_csv((path))
df.sort_values(by = 'created_at', ascending = True, inplace = True)
df = df.reset_index(drop=True)
df['created_at'] = pd.to_datetime(df['created_at'])
df = df.drop("author_id", axis=1)
df = df.drop("username", axis=1)

df.to_csv("sorted.csv", index=False)
df


Unnamed: 0,created_at,text
0,2022-12-13 01:00:00,#cnann
1,2022-12-13 01:00:00,#cnann
2,2022-12-13 01:00:00,#cnann
3,2022-12-13 01:00:00,#cnann
4,2022-12-13 01:00:00,#cnann
...,...,...
3193,2022-12-13 03:03:53,アルニキ、松ニキお疲れ様。 #cnann
3194,2022-12-13 03:03:53,来週、そこには劣等感が火を吹いて燃え尽きたDJニキが… #cnann
3195,2022-12-13 03:03:54,IOさんとKEIJUさんの呼び方はどうなるのか…。 そしてそのまま呼べるのか…。 お疲れ様で...
3196,2022-12-13 03:04:03,今日も楽しかったです😊 お疲れ様でした！ また来週〜👋 #cnann


In [3]:
# def remove_emoji(text):
#   emoji_pattern = re.compile("["
#                              u"\U0001F600-\U0001F64F"
#                              u"\U0001F300-\U0001F5FF"
#                              u"\U0001F680-\U0001F6FF"
#                              u"\U0001F1E0-\U0001F1FF"
#                              u"\U0001FA70-\U0001FAFF"
#                              "]+", flags=re.UNICODE)
#   return emoji_pattern.sub(r' ', text)

def preprocess(text):
    text = emoji.replace_emoji(text, replace=' ')
    # text = re.sub(u'[\U0001F979]', '', text)
    # text = demoji.replace(string=text, repl=' ')
    # text = remove_emoji(text)
    text = neologdn.normalize(text)
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'＃\S+', '', text)
    text = re.sub(r'http?://[\w/:%#\$&\?\(\)~\.=\+\-]+', '', text)
    text = re.sub(r'https?://\S+', ' ', text)
    text = re.sub(r'[!-/:-@[-`{-~]', r' ', text)
    text = re.sub(u'[■-♯【】「」『』・]', ' ', text)
    text = re.sub(r'(\d)([,.])(\d+)', r'\1\3', text)
    text = re.sub(r'\d+', '0', text)
    text = text.lower()
    text = re.sub(r"[\u3000\t\r\n]", " ", text)
    return text


df['text'] = df['text'].apply(preprocess)
df
df.to_csv('filtered3.csv', index=False)


In [4]:
groups = df.groupby(pd.Grouper(key='created_at', freq='min'))
df_texts_by_minute = pd.DataFrame({
    "texts_by_minute": groups.apply(lambda x: x["text"].tolist())
})
df_texts_by_minute = df_texts_by_minute.reset_index()
df_texts_by_minute
# df_texts_by_minute.to_csv("texts_by_minute.csv", index=False)

Unnamed: 0,created_at,texts_by_minute
0,2022-12-13 01:00:00,"[, , , , , , , , , , , , , , , , , , , , creep..."
1,2022-12-13 01:01:00,"[, 今年の漢字は 戰 , , , , 今年の漢字 戦 , , , , , , , 戦, ,..."
2,2022-12-13 01:02:00,"[, , , , , 韻は無敵すぎw , 今年の漢字 戦 , 今年の漢字… 韻 でしのぐ笑,..."
3,2022-12-13 01:03:00,"[今年の漢字やっぱりきた , 今年の漢字、0人はずっと皿と韻, 繋はオフィシャル過ぎる, ..."
4,2022-12-13 01:04:00,"[真摯に向き合った白紙, 韻、皿、擦、繋。, 今年の漢字rさん 韻 松永さん 皿 擦 繋..."
...,...,...
120,2022-12-13 03:00:00,"[お疲れ様でした来週のspw楽しみにしてます , お疲れっしたーわちゃわちゃ良かったよ,..."
121,2022-12-13 03:01:00,"[0人だけ回お疲れ様でした , お疲れ様でした, お疲れ様でした おやすみなさい , お..."
122,2022-12-13 03:02:00,"[お疲れ様でした, , r 指定と、俺 お疲れさまでしたー, おれお疲れ様でした , , ..."
123,2022-12-13 03:03:00,"[超楽しかった お疲れさまでした, 来週のswは色んな意味で楽しみです。, アルニキ、松ニキ..."


In [5]:
def sentence_to_vector(model, tokenizer, sentence):
    tokens = tokenizer(sentence, add_special_tokens=True)["input_ids"]
    input = torch.tensor(tokens).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input, output_hidden_states=True)
        last_hidden_state = outputs[0][:, 0, :]
        averaged_hidden_state = last_hidden_state.mean(dim=0).unsqueeze(0)
    return averaged_hidden_state


def calc_similarity(sentence1, sentence2):
    print("{}\n{}".format(sentence1, sentence2))

    sentence_vector1 = sentence_to_vector(model, tokenizer, sentence1)
    sentence_vector2 = sentence_to_vector(model, tokenizer, sentence2)

    # Reshape the tensors to 1D
    sentence_vector1 = sentence_vector1.reshape(-1)
    sentence_vector2 = sentence_vector2.reshape(-1)

    similarity = float(torch.nn.functional.cosine_similarity(
        sentence_vector1, sentence_vector2, dim=0).detach().numpy().copy())
    print("Similarity:", similarity)

    return similarity


def calc_average_similarity(sentences):
    similarities = []
    for i in range(len(sentences)):
        for j in range(i + 1, len(sentences)):
            similarity = calc_similarity(sentences[i], sentences[j])
            similarities.append(similarity)

    similarities = np.array(similarities)
    quartiles = np.quantile(similarities, [0, 0.25, 0.5, 0.75, 1])
    average_similarity = np.mean(similarities)

    return list(quartiles) + [average_similarity]


In [6]:
df_texts_by_minute["similarity"] = df_texts_by_minute["texts_by_minute"].apply(
    calc_average_similarity)
df_texts_by_minute




Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209

creepy nutsのオールナイトニッポン ニッポン放送  0 0 0 月0 0 0 0    
Similarity: 0.6685909628868103


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.9999997615814209


Similarity: 0.99999976158

Unnamed: 0,created_at,texts_by_minute,similarity
0,2022-12-13 01:00:00,"[, , , , , , , , , , , , , , , , , , , , creep...","[0.5286118388175964, 0.9999997615814209, 0.999..."
1,2022-12-13 01:01:00,"[, 今年の漢字は 戰 , , , , 今年の漢字 戦 , , , , , , , 戦, ,...","[0.5314908027648926, 0.7041782736778259, 0.999..."
2,2022-12-13 01:02:00,"[, , , , , 韻は無敵すぎw , 今年の漢字 戦 , 今年の漢字… 韻 でしのぐ笑,...","[0.4594917297363281, 0.6734775304794312, 0.712..."
3,2022-12-13 01:03:00,"[今年の漢字やっぱりきた , 今年の漢字、0人はずっと皿と韻, 繋はオフィシャル過ぎる, ...","[0.5139971375465393, 0.6781930327415466, 0.725..."
4,2022-12-13 01:04:00,"[真摯に向き合った白紙, 韻、皿、擦、繋。, 今年の漢字rさん 韻 松永さん 皿 擦 繋...","[0.5779921412467957, 0.6848501265048981, 0.718..."
...,...,...,...
120,2022-12-13 03:00:00,"[お疲れ様でした来週のspw楽しみにしてます , お疲れっしたーわちゃわちゃ良かったよ,...","[0.30040568113327026, 0.6458816975355148, 0.72..."
121,2022-12-13 03:01:00,"[0人だけ回お疲れ様でした , お疲れ様でした, お疲れ様でした おやすみなさい , お...","[0.46405428647994995, 0.6911193132400513, 0.76..."
122,2022-12-13 03:02:00,"[お疲れ様でした, , r 指定と、俺 お疲れさまでしたー, おれお疲れ様でした , , ...","[0.3860442340373993, 0.592319905757904, 0.6912..."
123,2022-12-13 03:03:00,"[超楽しかった お疲れさまでした, 来週のswは色んな意味で楽しみです。, アルニキ、松ニキ...","[0.6792345643043518, 0.783223882317543, 0.8178..."


In [7]:
df_texts_by_minute.to_csv('cosine_similarity2_221212_ann_mon.csv', index=False)