In [3]:
import pickle
import torch.nn as nn
import torch
import numpy as np
import random
import time
from openai import OpenAI

In [4]:
# Seed every random number generator used in this notebook (torch, NumPy and
# the stdlib `random` module, in that order) so that reruns are repeatable.
for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    seed_fn(42)

In [5]:
# Load the pickled chat data from disk.
# NOTE(security): pickle.load can execute arbitrary code while deserializing,
# so only run this cell on a data file that comes from a trusted source.
with open("data/190325_data.pkl", "rb") as f:
    chat_data = pickle.load(f)

# Remove service chat (stored under the key -1000). The membership check keeps
# the cell from raising KeyError when that entry is not present in the file.
if -1000 in chat_data:
    del chat_data[-1000]

In [6]:
# Show the schema: the field names of one game record and of one chat message.
# The lookups are done outside the f-strings because reusing the enclosing
# quote character inside an f-string expression is a SyntaxError before
# Python 3.12.
first_game = chat_data[next(iter(chat_data.keys()))]
# 1895 is a hard-coded game ID; this raises KeyError if that game is not in the data.
example_message = chat_data[1895]["messages"][0]
print(f"Keys for game:\n{first_game.keys()}\n")
print(f"Keys for message:\n{example_message.keys()}")

Keys for game:
dict_keys(['gameID', 'starttime', 'duration', 'botmodel', 'prompt', 'winner', 'createTS', 'updateTS', 'language', 'botname', 'messages', 'player_info'])

Keys for message:
dict_keys(['gameID', 'oldidx', 'color', 'userID', 'botID', 'message', 'create_time', 'colorID', 'messageidx'])


In [7]:
# Flatten all games into three parallel lists (text, label, game ID).
# GameMaster messages are never collected. As soon as a GameMaster message
# contains one of the end-of-game markers below, the rest of that game's
# messages are dropped, which excludes chat written after the game is over.
END_OF_GAME_MARKERS = ("won", "surrendered", "canceled", "lost", "timed out", "disconnected")

messages, labels, game_ids = [], [], []
for game_id, game_data in chat_data.items():
    for entry in game_data["messages"]:
        if entry["userID"] == "GameMaster":
            announcement = entry["message"]
            if any(marker in announcement for marker in END_OF_GAME_MARKERS):
                break  # game finished: ignore everything that follows
            continue  # other GameMaster messages are skipped, the game goes on
        messages.append(entry["message"])
        bot_flag = entry["botID"]
        # Label layout is [human, bot]; assumes botID is 0/1 (or bool) -- TODO confirm.
        labels.append([int(not bot_flag), bot_flag])
        game_ids.append(entry["gameID"])

print(f"The dataset includes {len(messages)} messages from {len(chat_data)} games")

The dataset includes 22696 messages from 1559 games


In [8]:
# Example message with label
# randrange excludes its upper bound. randint(0, len(messages)) includes it and
# could return len(messages), which is one past the last valid index.
random_nr = random.randrange(len(messages))
print(f"Message nr: {random_nr}\n")
print(messages[random_nr])
# The first label element is the "human" flag (1 when botID is falsy).
print("Message from human" if labels[random_nr][0] == 1 else "Message from bot")
print(f"\nMessage from game with id: {game_ids[random_nr]}")

Message nr: 20952

wie geht es deiner lieben mutti?
Message from human

Message from game with id: 3863


In [9]:
# Pick the torch device: use the GPU when CUDA is available, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Running on {device}")

# Name of the embedding model requested from the server.
embedding_model = "intfloat/multilingual-e5-large-instruct"

# OpenAI-compatible client pointed at a server on localhost:8000.
# NOTE(review): "EMPTY" looks like a placeholder key for a local server that
# does not check credentials -- confirm before pointing this at a real endpoint.
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="EMPTY",
)

Running on cpu


In [10]:
# Embed every collected message with a single request to the embedding server,
# then save the vectors and the labels to disk.
start = time.perf_counter()  # monotonic clock, unaffected by system clock changes
embedding_response = client.embeddings.create(input=messages, model=embedding_model)
# Keep only the raw vectors; the response object gets its own name so that
# `message_encodings` always refers to the list of vectors.
message_encodings = [item.embedding for item in embedding_response.data]
print(f"Took {time.perf_counter() - start:.2f}s")

# The encodings and labels go into separate files and are matched up only by
# position, so refuse to write them if the counts differ.
assert len(message_encodings) == len(labels), (
    f"Got {len(message_encodings)} embeddings for {len(labels)} labels"
)

with open('data/message_encodings.pkl', 'wb') as f:
    pickle.dump(message_encodings, f)
with open('data/labels.pkl', 'wb') as f:
    pickle.dump(labels, f)

Took 28.69s
