## Functions

In [1]:
from pprint import pprint
import numpy as np
import json
import os
from omegaconf import OmegaConf

# Load the project settings from config.yaml (path is relative to the working directory).
# Cells below read conf.data.path_to_chat, conf.data.number_of_chats and conf.data.dataset_name.
conf = OmegaConf.load("config.yaml")


In [2]:

def get_chat_msgs(i):
    """Load the message list of the i-th exported chat.

    The file path is built by formatting ``conf.data.path_to_chat`` with ``i``.

    Args:
        i: Zero-based chat index; must satisfy
            ``0 <= i < conf.data.number_of_chats``.

    Returns:
        The value stored under the ``"messages"`` key of the chat's JSON file.

    Raises:
        AssertionError: If ``i`` is outside the valid range.
    """
    path_template, number_of_chats = conf.data.path_to_chat, conf.data.number_of_chats
    assert i < number_of_chats, f"i should be less than {number_of_chats}"
    # 0 is a valid index, so the message says "non-negative", not "greater than 0".
    assert i >= 0, "i should be non-negative"
    path_to_chat = path_template.format(i)

    # Explicit UTF-8: without it open() falls back to the platform locale
    # encoding, which can fail on non-ASCII (e.g. Cyrillic) chat contents.
    with open(path_to_chat, "r", encoding="utf-8") as f:
        chat = json.load(f)
    return chat["messages"]

# Use chat #5 as the working sample for the step-by-step cells below.
msgs = get_chat_msgs(5)

In [3]:
def extract_fields(msgs):
    """Keep only regular messages, reduced to their text, sender and date.

    Entries whose ``"type"`` is not ``"message"`` are dropped; every kept entry
    becomes a new dict with exactly the keys ``"text"``, ``"from"`` and
    ``"date"`` (in that order).
    """
    wanted_keys = ("text", "from", "date")
    extracted = []
    for entry in msgs:
        if entry["type"] != "message":
            continue
        extracted.append({key: entry[key] for key in wanted_keys})
    return extracted

# NOTE: msgs is rebound to the reduced dicts, which no longer carry a "type" key,
# so re-running this cell without reloading msgs raises KeyError.
msgs = extract_fields(msgs)

In [4]:
# convert date to day (datetime object)
from datetime import datetime



def convert_to_date(msgs):
    """Replace each message's ``"date"`` timestamp string with a ``datetime.date``.

    Timestamps are parsed with the layout ``2021-09-17T22:03:43``; the time of
    day is discarded. The message dicts are modified in place and the same
    ``msgs`` object is returned.
    """
    timestamp_layout = "%Y-%m-%dT%H:%M:%S"
    for entry in msgs:
        parsed = datetime.strptime(entry["date"], timestamp_layout)
        entry["date"] = parsed.date()
    return msgs

# NOTE: convert_to_date parses strings only, so this cell fails if it is
# re-run on msgs whose dates have already been converted.
msgs = convert_to_date(msgs)
pprint(msgs[:5])

[{'date': datetime.date(2021, 12, 9), 'from': 'Rodion Khvorostov', 'text': ''},
 {'date': datetime.date(2021, 12, 9),
  'from': 'Rodion Khvorostov',
  'text': [{'text': 'https://stepik.org/course/%D0%9B%D0%B8%D0%BA%D0%B1%D0%B5%D0%B7-%D0%BF%D0%BE-%D0%B4%D0%B8%D1%81%D0%BA%D1%80%D0%B5%D1%82%D0%BD%D0%BE%D0%B9-%D0%BC%D0%B0%D1%82%D0%B5%D0%BC%D0%B0%D1%82%D0%B8%D0%BA%D0%B5-91',
            'type': 'link'}]},
 {'date': datetime.date(2021, 12, 9), 'from': 'Любимая сестрёнка💜', 'text': ''},
 {'date': datetime.date(2021, 12, 9), 'from': 'Любимая сестрёнка💜', 'text': ''},
 {'date': datetime.date(2021, 12, 9), 'from': 'Любимая сестрёнка💜', 'text': ''}]


In [5]:


def fix_texts(msgs):
    """Flatten structured message texts into plain strings, in place.

    A message's ``"text"`` may be a list mixing plain strings with entity dicts
    (each carrying ``"type"`` and ``"text"``). Such a list is concatenated into
    one string, with every entity rendered as ``"<type>: <text>"``::

        ['See ', {'type': 'link', 'text': 'https://example.org'}, ' please']
        -> 'See link: https://example.org please'

    Texts that are already strings are left untouched. List elements that are
    neither ``str`` nor ``dict`` are ignored.

    Args:
        msgs: Sequence of message dicts with a ``"text"`` key.

    Returns:
        The same ``msgs`` object, with every non-string text replaced.
    """
    for msg in msgs:
        txt = msg["text"]
        if isinstance(txt, str):
            continue
        parts = []
        for elem in txt:
            if isinstance(elem, str):
                parts.append(elem)
            elif isinstance(elem, dict):
                # Append rather than assign: assigning here used to throw away
                # all text collected before the entity.
                parts.append(f"{elem['type']}: {elem['text']}")
        msg["text"] = "".join(parts)
    return msgs

# Flatten list-valued texts of the sample chat (the message dicts are modified in place).
msgs = fix_texts(msgs)

In [6]:
def remove_empty(msgs):
    """Return a new list without the messages whose ``"text"`` is an empty string.

    A message is dropped only when its text is falsy *and* is exactly a ``str``;
    messages with any other text type are always kept.
    """
    kept = []
    for msg in msgs:
        is_blank_string = not msg["text"] and type(msg["text"]) == str
        if not is_blank_string:
            kept.append(msg)
    return kept

# Remove empty messages and print the message count before and after the filter
print(f"Before: {len(msgs)}")
msgs = remove_empty(msgs)
print(f"After: {len(msgs)}")

Before: 908
After: 794


In [7]:
# Anonymised labels that change_names substitutes for the real participant names.
# NOTE(review): set at runtime rather than in config.yaml — confirm this is intended.
conf.actors = ['me', 'you']

In [8]:
# Replace the real participant names with anonymised labels (conf.actors by default)

def change_names(msgs, new_actors_names=None):
    """Return copies of ``msgs`` with every ``"from"`` replaced by a label.

    Participants are numbered in order of first appearance in ``msgs``; the
    k-th distinct participant receives ``new_actors_names[k]``. The input
    dicts are not modified.

    Args:
        msgs: Iterable of message dicts with a ``"from"`` key.
        new_actors_names: Sequence of replacement labels. Defaults to
            ``conf.actors`` when omitted.

    Returns:
        A new list of message dicts with the ``"from"`` values replaced.

    Raises:
        IndexError: If ``msgs`` contains more distinct participants than
            there are labels.
    """
    if new_actors_names is None:
        new_actors_names = conf.actors
    # dict.fromkeys de-duplicates while keeping first-appearance order. A set
    # iterates str keys in hash-randomised order, which made the name -> label
    # assignment differ between kernel runs.
    actors_names = list(dict.fromkeys(msg["from"] for msg in msgs))
    if len(actors_names) > len(new_actors_names):
        raise IndexError(
            f"{len(actors_names)} distinct participants but only "
            f"{len(new_actors_names)} replacement names"
        )
    actor_to_name = {actor: new_actors_names[i] for i, actor in enumerate(actors_names)}
    return [{**msg, "from": actor_to_name[msg["from"]]} for msg in msgs]

# Anonymise the sample chat and show its first five messages.
msgs = change_names(msgs)
pprint(msgs[:5])

[{'date': datetime.date(2021, 12, 9),
  'from': 'me',
  'text': 'link: '
          'https://stepik.org/course/%D0%9B%D0%B8%D0%BA%D0%B1%D0%B5%D0%B7-%D0%BF%D0%BE-%D0%B4%D0%B8%D1%81%D0%BA%D1%80%D0%B5%D1%82%D0%BD%D0%BE%D0%B9-%D0%BC%D0%B0%D1%82%D0%B5%D0%BC%D0%B0%D1%82%D0%B8%D0%BA%D0%B5-91'},
 {'date': datetime.date(2022, 2, 18), 'from': 'me', 'text': 'Люб, привет!'},
 {'date': datetime.date(2022, 2, 18), 'from': 'me', 'text': 'Там всё хорошо?'},
 {'date': datetime.date(2022, 2, 18),
  'from': 'me',
  'text': 'Что-то мама не отвечает...'},
 {'date': datetime.date(2022, 2, 19), 'from': 'you', 'text': 'Привет'}]


In [9]:
# Sanity checks on the cleaned sample chat.
assert all([type(msg["text"]) == str for msg in msgs]), "Not all messages are strings"
assert all([bool(msg["text"]) for msg in msgs]), "Not all messages are non-empty strings"
# more than 3 messages containing "link: "
assert len([msg for msg in msgs if "link: " in msg["text"]]) > 3, "Not enough links"
# the real sender name must no longer appear after change_names
assert not any([msg["from"] == "Rodion Khvorostov" for msg in msgs]), "Rodion is here"

def pprint_rnd_elem(lst):
    """Pretty-print one element of ``lst`` chosen at a random index.

    The index is drawn with ``np.random.randint(len(lst))``, i.e. from NumPy's
    global random state.
    """
    position = np.random.randint(len(lst))
    chosen = lst[position]
    pprint(chosen)
# Report the size of the cleaned sample and show one randomly chosen message.
print("Number of messages: ", len(msgs))
pprint_rnd_elem(msgs)

Number of messages:  794
{'date': datetime.date(2022, 7, 18), 'from': 'you', 'text': 'Окей'}


In [10]:
# add pair (text, from) to dialogues and actors respectively while the dialogue is during the same day

def extract_dialogues(msgs):
    """Group consecutive messages that share the same date into dialogues.

    Every run of adjacent messages with an equal ``"date"`` forms one dialogue.

    Returns:
        A dict with two parallel lists: ``"dialogue"`` holds the lists of
        message texts, ``"actors"`` holds the lists of the matching ``"from"``
        values, one entry per run.
    """
    dialogues, actors = [], []
    group_day = None
    for msg in msgs:
        # Open a new group for the first message and whenever the day changes.
        if not dialogues or not (msg["date"] == group_day):
            group_day = msg["date"]
            dialogues.append([])
            actors.append([])
        dialogues[-1].append(msg["text"])
        actors[-1].append(msg["from"])
    return {"dialogue": dialogues, "actors": actors}

# Split the sample chat into per-day dialogues; n is reused below to draw a random dialogue index.
data = extract_dialogues(msgs)
n = len(data["dialogue"])
print(f"Number of dialogues: {n}")

Number of dialogues: 91


In [11]:
import numpy as np



# Spot-check: print one randomly chosen dialogue together with its speakers.
rnd = np.random.randint(n)
print(f"Random dialogue number: {rnd}")
pprint(data['dialogue'][rnd])
print(f"Actors: {data['actors'][rnd]}")

Random dialogue number: 30
['Привет)',
 'Как дела?',
 'Привет. По-тихоньку адаптируюсь:)\nУ тебя как? Началась учёба?',
 'Сегодня было собрание',
 'А учëба завтра начнëтся',
 'Но пока не понятно',
 'У нас завтра рисунок 8 часов',
 'Не понятно, что будем делать',
 'Ну ничего, желаю удачи!',
 'Спасибо) 💖',
 'Скинь номер телефона Дяди пожалуйста',
 'Спасибо']
Actors: ['you', 'you', 'me', 'you', 'you', 'you', 'you', 'you', 'me', 'you', 'you', 'you']


## Full dataset

In [12]:
def full_process(i):
    """Run the whole cleaning pipeline on chat ``i`` and return its dialogues.

    The messages loaded by ``get_chat_msgs(i)`` are passed, in order, through
    ``extract_fields``, ``convert_to_date``, ``fix_texts``, ``remove_empty``
    and ``change_names``; the result is grouped by ``extract_dialogues``.
    """
    pipeline = (extract_fields, convert_to_date, fix_texts, remove_empty, change_names)
    msgs = get_chat_msgs(i)
    for step in pipeline:
        msgs = step(msgs)
    return extract_dialogues(msgs)

def process_all():
    """Process every chat and concatenate the per-chat results.

    Calls ``full_process`` for each index in
    ``range(conf.data.number_of_chats)`` and returns one dict whose
    ``"dialogue"`` and ``"actors"`` lists hold the entries of all chats in
    index order.
    """
    all_dialogues, all_actors = [], []
    for chat_idx in range(conf.data.number_of_chats):
        chat_data = full_process(chat_idx)
        all_dialogues.extend(chat_data["dialogue"])
        all_actors.extend(chat_data["actors"])
    return {"dialogue": all_dialogues, "actors": all_actors}

# Build the dialogues of all chats; this rebinds `data`, replacing the single-chat sample above.
data = process_all()

In [13]:
from datasets import Dataset

# Wrap the dict of parallel lists in a Dataset with "dialogue" and "actors" columns.
dataset = Dataset.from_dict(data)



In [14]:
# Display the dataset summary (features and row count).
dataset

Dataset({
    features: ['dialogue', 'actors'],
    num_rows: 1187
})

In [20]:
import os
# Read the Hub token from the environment so it is never stored in the notebook.
token = os.environ["HUGGING_FACE_HUB_TOKEN"]
# os.environ[...] already raises KeyError for an unset variable; this assert
# additionally rejects a variable that is set but empty.
assert token, "No token found"

In [21]:
from huggingface_hub import notebook_login, login

# Authenticate this session against the Hugging Face Hub using `token`.
login(token=token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/user010/.cache/huggingface/token
Login successful


In [23]:
# Upload the dataset to the Hub as a private repository named by conf.data.dataset_name.
dataset.push_to_hub(conf.data.dataset_name, private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]