In [1]:
import json
import pandas as pd

import os
import glob
import uuid
from pathlib import Path

from pprint import pprint

# Filters

In [2]:
# filters from prod


def request_number(message):
    """Return True if ``message`` asks to move the chat to another channel.

    The message is split on whitespace and every forbidden term has to match a
    run of whole, consecutive tokens: ``"call"`` matches ``"please call me"``
    but not ``"recall"``. Matching is case-sensitive and punctuation stays
    attached to its token, so ``"call?"`` does not match ``"call"``.

    Multi-word terms such as ``"leave this app"`` are matched against
    consecutive tokens. Testing them for membership in the token list can
    never succeed, because no single token contains a space.

    Args:
        message: Text of one chat message.

    Returns:
        True when at least one forbidden term is present, otherwise False.
    """
    forbidden = [
        "call",
        "phone",
        "number",
        "email",
        "emails",
        "exchange",
        "share",
        "skype",
        "whatsapp",
        "facebook",
        "leave this app",
        "talk on the phone",
        "hear your voice",
        "your voice",
        "hear you",
        "to message me",
        "to text me at",
    ]
    # Normalise all whitespace to single spaces and pad both ends, so that a
    # term wrapped in spaces can only match on token boundaries.
    padded = " " + " ".join(message.split()) + " "
    return any(f" {term} " in padded for term in forbidden)


def request_date(message):
    """Return True if ``message`` contains a phrase that proposes meeting offline.

    Each entry of the phrase list is looked up as a case-sensitive substring
    of ``message``.

    Every phrase is a separate list element. Adjacent string literals without
    a comma between them are concatenated by Python into one string, which
    makes the individual phrases unmatchable.

    Args:
        message: Text of one chat message.

    Returns:
        True when at least one phrase occurs in ``message``, otherwise False.
    """
    forbidden = [
        "let's go",
        "lets go",
        "let's meet",
        "lets meet",
        "let's visit",
        "let's date",
        "let's hang out",
        "meet",
        "meet?",
        "Would you like to meet",
        "meet me",
        "date",
        "go out",
        "go on a date",
        "date you",
        "date with you",
        "offline",
        "visit",
        "meet you",
        "see you",
        "visit you",
        "in person",
        "join me",
        "hang out with me",
        "date me",
        "my place",
        "can come over",
        "Let's meet up.",
        "Can we get together?",
        "face-to-face",
        "Wanna grab coffee",
        "grab lunch",
        "grab dinner?",
        "Meet for a drink?",
        "Let's catch up in person.",
        "Want to see each other?",
        "How about a real-life meetup?",
        "Are you free to hang out?",
        "I'd love to meet you in person.",
        "Shall we plan a meet-up?",
        "Would you like to meet for real?",
        "Time for an offline rendezvous?",
        "Let's take this conversation offline.",
        "Meeting IRL soon?",
        "IRL",
        "Care to meet face-to-face?",
        "Coffee date?",
        "Let's make plans to meet.",
        "Up for an in-person chat?",
        "Meeting in the physical world?",
        "We should connect offline.",
        "How about a meetup this weekend?",
        "Let's schedule a real-life encounter.",
        "I'd enjoy meeting you for real.",
        "Any chance we could meet?",
        "Want to meet somewhere?",
        "Let's arrange a live meeting.",
        "Meeting over a meal?",
        "Let's get together outside the digital world.",
        "I propose an in-person get-together.",
    ]
    return any(phrase in message for phrase in forbidden)


def has_numbers(inputString) -> bool:
    """Report whether ``inputString`` contains at least one digit character."""
    for symbol in inputString:
        if symbol.isdigit():
            return True
    return False


def has_roleplay(inputString) -> bool:
    """Report whether ``inputString`` contains an asterisk.

    The function name suggests the asterisk is taken as a sign of role-play
    text (e.g. ``*smiles*``).
    """
    roleplay_marker = "*"
    return roleplay_marker in inputString


# Registry of per-message filters: maps the key stored under a turn's
# "base_filters" to the predicate that is called with the turn's text.
FILTERS_MAP = {
    "has_roleplay": has_roleplay,
    "has_numbers": has_numbers,
    "request_date": request_date,
    "request_number": request_number,
}

# Preprocess EVA.AI filtered dataset

In [3]:
# Locations of the cleaned EVA.AI text dumps (relative to the working directory).
base_path = Path("../data")
eva_ai_path = base_path.joinpath("eva_ai_cleaned", "clean")

train_path = eva_ai_path / "train.txt"
test_path = eva_ai_path / "test.txt"

In [4]:
def read_data(path: Path):
    """Parse an EVA.AI text dump into a list of dialogue records.

    The file content is split into chunks on ``<|endoftext|>`` (and on any
    literal ``<s>``). Within a chunk the first non-empty line is used as the
    bio and each following non-empty line as one turn, split at its first
    ``:`` into speaker and text.

    Args:
        path: Text file to read; it is decoded as UTF-8.

    Returns:
        A list of dicts with the keys ``bio_info``, ``dialogue``, ``source``
        (always ``"eva_ai"``), ``instruction`` (empty string) and
        ``session_id`` (a fresh random UUID string). Every entry of
        ``dialogue`` holds ``speaker``, ``text``, ``filters`` (the result of
        each ``FILTERS_MAP`` predicate applied to the text) and
        ``order_number`` (position of the turn within its dialogue).
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()

    # A blank line is inserted after the first line of every chunk. It is
    # discarded again when empty lines are dropped below; its only effect is
    # one extra character in the length checked by the size filter.
    chunks = [d.split("\n") for d in text.split("<|endoftext|>") if d]
    chunks = ["\n".join([d[0]] + [""] + d[1:]) for d in chunks]
    text = "<|endoftext|>".join(chunks).replace("<|endoftext|>", "<s>")

    # Chunks of 100 characters or fewer are dropped.
    dialogs = [dialog.strip(" ") for dialog in text.split("<s>") if len(dialog) > 100]

    new_dialogues = []
    for dialog in dialogs:
        lines = [line for line in dialog.split("\n") if len(line) > 0]
        if not lines:
            # Nothing but line breaks: there is no bio line to read.
            continue
        bio_info, turns = lines[0], lines[1:]

        new_dialogue = []
        for i, turn in enumerate(turns):
            # Only the first colon separates speaker from text; later colons
            # belong to the text itself.
            speaker, _, reply = turn.partition(":")
            speaker, reply = speaker.strip(), reply.strip()
            filters_res = {key: check(reply) for key, check in FILTERS_MAP.items()}
            new_dialogue.append(
                {
                    "speaker": speaker,
                    "text": reply,
                    "filters": {"base_filters": filters_res},
                    "order_number": i,
                }
            )
        new_dialogues.append(
            {
                "bio_info": bio_info,
                "dialogue": new_dialogue,
                "source": "eva_ai",
                "instruction": "",
                "session_id": str(uuid.uuid4()),
            }
        )
    return new_dialogues


# Parse both splits of the cleaned EVA.AI dump into dialogue records.
train_data = read_data(train_path)
test_data = read_data(test_path)

In [5]:
def filtered_data(data):
    """Keep the samples in which only a minority of replies trip a filter.

    A reply counts as flagged when at least one value in its
    ``filters["base_filters"]`` dict is truthy. A sample is kept when its
    number of flagged replies is strictly smaller than
    ``len(sample["dialogue"]) // 2``; dialogues with fewer than two replies
    are therefore never kept.

    Args:
        data: Iterable of dialogue records with a ``dialogue`` list.

    Returns:
        A new list with the retained records, in their original order.
    """
    kept = []
    for sample in data:
        replies = sample["dialogue"]
        flagged = sum(
            1 for reply in replies if any(reply["filters"]["base_filters"].values())
        )
        if flagged < len(replies) // 2:
            kept.append(sample)
    return kept


# Filter both splits and print "<size before> <size after>" for each of them.
filtered_train_data, filtered_test_data = filtered_data(train_data), filtered_data(test_data)
print(len(train_data), len(filtered_train_data))
print(len(test_data), len(filtered_test_data))

1206 906
172 170


In [6]:
# Output directory for the JSON version of the EVA.AI dataset.
json_path = base_path / "eva_ai_cleaned" / "json_clean"


def dump_data(data, path):
    """Write ``data`` to ``path`` as JSON indented by four spaces.

    The file is opened in text mode with UTF-8 encoding and overwritten if it
    already exists.
    """
    with open(path, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4)


# Create the output directory on first run.
if not os.path.exists(json_path):
    os.mkdir(json_path)

# Each split goes to its own file: test.json must receive the filtered *test*
# split, not a second copy of the training split.
dump_data(filtered_train_data, json_path / "train.json")
dump_data(filtered_test_data, json_path / "test.json")

# Preprocess Once dialogues dataset

In [7]:
# Location of the raw Once dialogue spreadsheets (relative to the working directory).
base_path = Path("../data")
once_path = base_path / "once_dialogues" / "raw"

In [8]:
# Load both parts of the Once dump.
# NOTE(review): "dualogues" in the file names looks like a typo, but the
# strings must match the files on disk — confirm before renaming anything.
data_1 = pd.read_excel(once_path / "Once_dualogues_p_1.xlsx")
data_2 = pd.read_excel(once_path / "Once_dualogues_p_2.xlsx")

# Flatten both sheets into one list of row dicts (part 1 rows first).
data_merged = data_1.to_dict(orient="records")
data_merged.extend(data_2.to_dict(orient="records"))

In [9]:
def data_to_dialogues(data):
    """Group flat message records into per-session dialogues.

    Each record's ``session_id`` is replaced in place by its string form and
    records sharing a session id are collected together, sorted by
    ``order_number``.

    Args:
        data: Iterable of dicts with ``session_id`` and ``order_number`` keys.

    Returns:
        A list of ``(session_id, records)`` tuples, ordered by the first
        appearance of each session id in ``data``.
    """
    sessions = {}
    for record in data:
        session_key = str(record["session_id"])
        record["session_id"] = session_key
        sessions.setdefault(session_key, []).append(record)

    return [
        (session_key, sorted(records, key=lambda rec: rec["order_number"]))
        for session_key, records in sessions.items()
    ]


# Group the merged Once rows into (session_id, ordered messages) pairs.
train_dialogues = data_to_dialogues(data_merged)

In [10]:
def prepare_dataset(data):
    """Convert grouped Once sessions into dataset records.

    Args:
        data: Iterable of ``(session_id, messages)`` pairs. Every message is a
            dict with the keys ``speaker``, ``text``, ``rank_result``,
            ``rules_result``, ``total_score`` and ``order_number``.

    Returns:
        A list with one dict per session holding ``bio_info``, ``dialogue``,
        ``source`` (``"once_dump"``), ``session_id`` and ``instruction``.

    Raises:
        KeyError: If a message's speaker is neither ``"Animator"`` nor
            ``"User"``.

    Note:
        A message whose ``text`` is not a string is overwritten in place with
        a fixed placeholder sentence before it is processed.
    """
    speaker_names = {"Animator": "Alice", "User": "David"}
    bio = (
        "Alice is charming and beautiful girl. Alice is interested in David. David is looking for relatioships. This is a conversation between David and Alice."
    )

    prepared = []
    for sess_id, dialogue in data:
        turns = []
        for message in dialogue:
            if not isinstance(message["text"], str):
                message["text"] = "Hm, i don't know what to say"
            text = message["text"]
            turns.append(
                {
                    "speaker": speaker_names[message["speaker"]],
                    "text": text,
                    "filters": {
                        "base_filters": {name: check(text) for name, check in FILTERS_MAP.items()},
                        "rank_filters": {
                            # The opening message has no ranking verdict; it is stored as "ok".
                            "rank_result": (
                                "ok" if message["rank_result"] == "first_message" else message["rank_result"]
                            ),
                            "rules_result": message["rules_result"],
                            "total_score": message["total_score"],
                        },
                    },
                    "order_number": message["order_number"],
                }
            )

        prepared.append(
            {
                "bio_info": bio,
                "dialogue": turns,
                "source": "once_dump",
                "session_id": str(sess_id),
                "instruction": "",
            }
        )
    return prepared


# Build the final Once records from the grouped sessions.
new_train_data = prepare_dataset(train_dialogues)

In [11]:
# Output directory for the JSON version of the Once dialogues.
json_path = base_path / "once_dialogues" / "json_clean"


def dump_data(data, path):
    """Write ``data`` to ``path`` as JSON indented by four spaces.

    The file is opened in text mode with UTF-8 encoding and overwritten if it
    already exists.
    """
    with open(path, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4)


# Create the output directory on first run (os.mkdir does not create parents).
if not os.path.exists(json_path):
    os.mkdir(json_path)

# Everything from the Once dump goes into a single training file.
dump_data(new_train_data, json_path / "train.json")

# Process Once FR dialogues

In [1]:
from pathlib import Path
import pandas as pd


# Location of the raw French Once spreadsheets (relative to the working directory).
base_path = Path("../data")
once_path = base_path / "once_dialogues_fr" / "raw"

# Forbidden-topics workbook: only the four columns used downstream are kept.
forbidden_topics_df = pd.read_excel(once_path / "Dataset_Once_forbidden_topics_FR.xlsx")[
    ["Rank", "Sender", "Text", "Topic"]
]
# Main workbook, "User FEM" sheet: only the first 5433 rows are kept.
# NOTE(review): 5433 is a hard-coded row limit — presumably the end of the
# usable rows in this sheet; confirm against the workbook.
once_main_fem_df = pd.read_excel(once_path / "Dataset_Once_MAIN_FR.xlsx", sheet_name="User FEM")[
    ["#", "Sender", "Text", "Topic"]
].iloc[:5433]
once_main_fem_df["#"] = once_main_fem_df["#"].astype(int)

# Main workbook, "User MAL" sheet: no row limit and no integer cast of "#".
# NOTE(review): unlike the FEM sheet, "#" keeps whatever dtype pandas
# inferred here — confirm that this is intended.
once_main_mal_df = pd.read_excel(once_path / "Dataset_Once_MAIN_FR.xlsx", sheet_name="User MAL")[
    ["#", "Sender", "Text", "Topic"]
]
# Rename "#" to "Rank" so that all three frames share the same column names.
once_main_fem_df.columns = ["Rank", "Sender", "Text", "Topic"]
once_main_mal_df.columns = ["Rank", "Sender", "Text", "Topic"]

# Normalise the sender labels of the FEM sheet to "User(f)" / "Bot(m)".
# Series.map yields NaN for any label that is not a key of this dict.
once_main_fem_df["Sender"] = once_main_fem_df["Sender"].map(
    {"User(f)": "User(f)", "Bot(m)": "Bot(m)", "Animator (m)": "Bot(m)", "User (f)": "User(f)"}
)

# Convert every frame into a list of row dicts.
forbidden_topics_rec = forbidden_topics_df.to_dict(orient="records")
once_main_fem_rec = once_main_fem_df.to_dict(orient="records")
once_main_mal_rec = once_main_mal_df.to_dict(orient="records")

In [2]:
import uuid


def process_sample(sample):
    """Convert one spreadsheet row into a dialogue turn.

    The speaker is assigned by role: every bot-side sender label (``"bot"``,
    ``"Bot(f)"``, ``"Bot(m)"``, ...) becomes ``"Alice"`` and everything else
    becomes ``"David"``. Accepting only ``"bot"`` and ``"Bot(f)"`` would label
    the ``"Bot(m)"`` rows of the female-user sheet as David and leave those
    dialogues with a single speaker.
    NOTE(review): role-based naming is assumed here (bot side = Alice) —
    confirm that this is the intended mapping for the female-user sheet.

    Args:
        sample: Mapping with the keys ``Sender``, ``Text``, ``Rank`` and
            ``Topic``.

    Returns:
        A dict with ``speaker``, ``text``, ``filters`` (all base filters set
        to False), ``order_number`` (``Rank - 1``) and ``Topic``.
    """
    sender = sample["Sender"]
    # Non-string senders (e.g. NaN from an unmapped label) are never the bot.
    is_bot = isinstance(sender, str) and sender.strip().lower().startswith("bot")
    return {
        "speaker": "Alice" if is_bot else "David",
        "text": sample["Text"],
        "filters": {
            "base_filters": {
                "has_roleplay": False,
                "has_numbers": False,
                "request_date": False,
                "request_number": False,
            }
        },
        "order_number": sample["Rank"] - 1,
        "Topic": sample["Topic"],
    }


def process_dataset(data, source):
    """Split a flat list of rows into dialogues and wrap each one in a record.

    The first row always opens a dialogue. After that, a row with
    ``Rank > 1`` continues the current dialogue and any other row closes it
    and opens the next one.

    Args:
        data: List of row dicts accepted by ``process_sample``.
        source: Value stored in the ``source`` field of every record.

    Returns:
        A list of dicts with ``bio_info``, ``dialogue``, ``source``,
        ``session_id`` (a fresh random UUID), ``instruction`` (empty),
        ``topic`` and ``language`` (``"fr"``). ``topic`` is the ``Topic`` of
        the dialogue's first row when that is a string, otherwise ``""``.
        The per-turn ``Topic`` key is removed from every turn. An empty
        ``data`` yields an empty list.
    """
    if not data:
        return []

    bio_info = "Alice est une fille charmante et belle. Alice s'intéresse à David. David recherche des relations. Il s'agit d'une conversation entre David et Alice."

    # Group the processed turns into dialogues.
    dialogues = []
    current = []
    for row in data:
        if current and not row["Rank"] > 1:
            dialogues.append(current)
            current = []
        current.append(process_sample(row))
    dialogues.append(current)

    records = []
    for turns in dialogues:
        first_topic = turns[0]["Topic"]
        # "Topic" is only needed to label the dialogue, not the single turns.
        for turn in turns:
            del turn["Topic"]
        records.append(
            {
                "bio_info": bio_info,
                "dialogue": turns,
                "source": source,
                "session_id": str(uuid.uuid4()),
                "instruction": "",
                # Empty spreadsheet cells arrive as float NaN rather than
                # None; only real strings are kept as topic.
                "topic": first_topic if isinstance(first_topic, str) else "",
                "language": "fr",
            }
        )
    return records

In [3]:
# Convert the three French row lists into dialogue records, tagging each
# with its own source label.
forbidden_topics_processed = process_dataset(forbidden_topics_rec, "forbidden_topics")
once_main_fem_processed = process_dataset(once_main_fem_rec, "once_main_fr_fem")
once_main_mal_processed = process_dataset(once_main_mal_rec, "once_main_fr_mal")

In [6]:
import os
import json

# Output directory for the JSON version of the French Once dialogues.
json_path = base_path / "once_dialogues_fr" / "json_clean"


def dump_data(data, path):
    """Write ``data`` to ``path`` as JSON indented by four spaces.

    The file is opened in text mode with UTF-8 encoding and overwritten if it
    already exists.
    """
    with open(path, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4)


# Create the output directory on first run (os.mkdir does not create parents).
if not os.path.exists(json_path):
    os.mkdir(json_path)

# One training file per French source.
dump_data(forbidden_topics_processed, json_path / "train_forbidden_topics.json")
dump_data(once_main_fem_processed, json_path / "train_once_main_fem.json")
dump_data(once_main_mal_processed, json_path / "train_once_main_mal.json")

# Preprocess forbidden topics dataset

In [4]:
from pathlib import Path
import pandas as pd


# Load the "Chats" sheet of the English forbidden-topics workbook, keep the
# four columns used downstream and drop the rows that have no sender.
data = pd.read_excel("../data/once_forbidden_topics/raw/Once_dataset_Forbidden topics_ENG.xlsx", sheet_name="Chats")[
    ["Rank", "Sender", "Text", "Topic"]
].dropna(subset=["Sender"])
# Rank is cast to int so that the derived order numbers are integers.
data["Rank"] = data["Rank"].astype(int)

# One dict per spreadsheet row.
forbidden_topics_rec_en = data.to_dict(orient="records")

In [24]:
import uuid


def process_sample(sample):
    """Convert one spreadsheet row into a dialogue turn.

    Args:
        sample: Mapping with the keys ``Sender``, ``Text``, ``Rank`` and
            ``Topic``.

    Returns:
        A dict with ``speaker`` (``"Alice"`` when the sender is ``"bot"``,
        otherwise ``"David"``), ``text``, ``filters`` (all base filters set to
        False), ``order_number`` (``Rank - 1``) and ``Topic``.
    """
    # Only the literal "{username}" placeholder is substituted. str.format
    # would treat every other brace in the message as format syntax and raise
    # on text such as ":}" or "{x}".
    text = str(sample["Text"]).replace("{username}", "David")
    return {
        "speaker": "Alice" if sample["Sender"] == "bot" else "David",
        "text": text,
        "filters": {
            "base_filters": {
                "has_roleplay": False,
                "has_numbers": False,
                "request_date": False,
                "request_number": False,
            }
        },
        "order_number": sample["Rank"] - 1,
        "Topic": sample["Topic"],
    }


def process_dataset(data, source):
    """Split a flat list of rows into dialogues and wrap each one in a record.

    The first row always opens a dialogue. After that, a row with
    ``Rank > 1`` continues the current dialogue and any other row closes it
    and opens the next one.

    Args:
        data: List of row dicts accepted by ``process_sample``.
        source: Value stored in the ``source`` field of every record.

    Returns:
        A list of dicts with ``bio_info``, ``dialogue``, ``source``,
        ``session_id`` (a fresh random UUID), ``instruction`` (empty),
        ``topic`` and ``language`` (``"en"``). ``topic`` is the ``Topic`` of
        the dialogue's first row when that is a string, otherwise ``""``.
        The per-turn ``Topic`` key is removed from every turn. An empty
        ``data`` yields an empty list instead of raising ``IndexError``.
    """
    if not data:
        return []

    bio_info = "Alice is charming and beautiful 22 years old girl. Alice is interested in David. David is looking for relatioships. This is a conversation between Alice and David."

    # Group the processed turns into dialogues.
    dialogues = []
    current = []
    for row in data:
        if current and not row["Rank"] > 1:
            dialogues.append(current)
            current = []
        current.append(process_sample(row))
    dialogues.append(current)

    records = []
    for turns in dialogues:
        first_topic = turns[0]["Topic"]
        # "Topic" is only needed to label the dialogue, not the single turns.
        for turn in turns:
            del turn["Topic"]
        records.append(
            {
                "bio_info": bio_info,
                "dialogue": turns,
                "source": source,
                "session_id": str(uuid.uuid4()),
                "instruction": "",
                # Non-string topics (None, NaN from an empty cell) become "".
                "topic": first_topic if isinstance(first_topic, str) else "",
                "language": "en",
            }
        )
    return records


# Convert the English forbidden-topics rows into dialogue records.
forbidden_topics_processed = process_dataset(forbidden_topics_rec_en, "forbidden_topics_en")

In [25]:
import os
import json

# Output directory for the JSON version of the English forbidden-topics data.
base_path = Path("../data")
json_path = base_path / "once_forbidden_topics" / "json_clean"


def dump_data(data, path):
    """Write ``data`` to ``path`` as JSON indented by four spaces.

    The file is opened in text mode with UTF-8 encoding and overwritten if it
    already exists.
    """
    with open(path, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4)


# Create the output directory on first run (os.mkdir does not create parents).
if not os.path.exists(json_path):
    os.mkdir(json_path)

# Full dialogues, one record per conversation.
dump_data(forbidden_topics_processed, json_path / "forbidden_topics.json")

In [32]:
# Expand every dialogue into one sample per Alice reply: each sample holds the
# conversation from its start up to and including that reply.
forbidden_topics_processed_new = []

for sample in forbidden_topics_processed:
    d = []
    for turn in sample["dialogue"]:
        d.append(
            {
                "speaker": turn["speaker"],
                "text": turn["text"],
                "to_train": True,
                "order_number": turn["order_number"],
            }
        )
        # A prefix is emitted only when it ends on an Alice turn and contains
        # at least one earlier turn.
        if d[-1]["speaker"] == "Alice" and len(d) > 1:
            forbidden_topics_processed_new.append(
                {
                    "bio_info": sample["bio_info"],
                    # Copy the list so that later turns do not leak into this prefix.
                    "dialogue": d[:],
                    "source": sample["source"],
                    "session_id": sample["session_id"],
                    "instruction": sample["instruction"],
                    "topic": sample["topic"],
                    "language": sample["language"],
                }
            )

len(forbidden_topics_processed_new)

539

In [33]:
# Prefix-expanded samples become the training file.
dump_data(forbidden_topics_processed_new, json_path / "train.json")

In [35]:
import torch
import transformers
import peft


# Merge a PEFT adapter checkpoint into its base model and save the result.
model_name = "HuggingFaceH4/zephyr-7b-beta"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model = transformers.AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

# Attach the adapter stored in the training checkpoint directory.
model = peft.PeftModel.from_pretrained(model, "../checkpoints/zephyr_7b_forbidden_topics/checkpoint-432")

# merge_and_unload() returns the base model with the adapter merged in; the
# model and the tokenizer are saved side by side in the same directory.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("../checkpoints/zephyr_7b_forbidden_topics/final_checkpoint/")
tokenizer.save_pretrained("../checkpoints/zephyr_7b_forbidden_topics/final_checkpoint/")

Loading checkpoint shards: 100%|██████████| 8/8 [00:02<00:00,  3.48it/s]


In [5]:
import json
import pandas as pd

import os
import glob
import uuid
from pathlib import Path
import copy

from pprint import pprint


# Reload the prefix-expanded training samples.
with open("../data/once_forbidden_topics/json_clean/train.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Build a copy of every sample in which David's turns carry to_train=False.
new_data = []
for sample in data:
    # Deep copy so that the loaded `data` stays untouched.
    new_sample = copy.deepcopy(sample)
    # The slice copies only the list; its turn dicts are still the ones held
    # by new_sample, so the assignment below changes new_sample itself.
    dialogue = new_sample["dialogue"][:]
    for i in range(len(dialogue)):
        if dialogue[i]["speaker"] == "David":
            dialogue[i]["to_train"] = False
    new_data.append(new_sample)


def dump_data(data, path):
    """Write ``data`` to ``path`` as JSON indented by four spaces.

    The file is opened in text mode with UTF-8 encoding and overwritten if it
    already exists.
    """
    with open(path, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4)


# Samples with David's turns marked to_train=False go to their own file.
dump_data(new_data, "../data/once_forbidden_topics/json_clean/train_mask_user.json")

In [6]:
import torch
import transformers
import peft


# Merge a PEFT adapter checkpoint into its base model and save the result.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model = transformers.AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

# Attach the adapter stored in the training checkpoint directory.
model = peft.PeftModel.from_pretrained(
    model, "../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora/checkpoint-1260"
)

# merge_and_unload() returns the base model with the adapter merged in; the
# model and the tokenizer are saved side by side in the same directory.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora/final_checkpoint/")
tokenizer.save_pretrained("../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora/final_checkpoint/")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.25it/s]


('../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora/final_checkpoint/tokenizer_config.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora/final_checkpoint/special_tokens_map.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora/final_checkpoint/tokenizer.model',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora/final_checkpoint/added_tokens.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora/final_checkpoint/tokenizer.json')

In [4]:
import json
import pandas as pd

import os
import glob
import uuid
from pathlib import Path
import copy

from pprint import pprint


# Load the full (not prefix-expanded) forbidden-topics dialogues.
with open("../data/once_forbidden_topics/json_clean/forbidden_topics.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Distinct topic labels present in the data (not used further in this cell).
unique_topics = list(set([s["topic"] for s in data]))

# Instruction text for every topic label; the empty topic maps to no instruction.
topic_insturction_map = {
    "": "",
    "ask phone number": "Alice should not agree to communicate with phone",
    "call": "Alice should not agree to join any type of calls",
    "offline meeting": "Alice should not agree to offline meetings",
    "social": "Alice should not agree to share her social media",
    "give phone number": "Alice should not agree to communicate with phone",
}

# Move the topic into the instruction field and blank out the topic itself.
# A topic that is missing from the map raises KeyError.
new_data = []
for sample in data:
    new_sample = copy.deepcopy(sample)
    new_sample["instruction"] = topic_insturction_map[new_sample["topic"]]
    new_sample["topic"] = ""
    new_data.append(new_sample)


def dump_data(data, path):
    """Write ``data`` to ``path`` as JSON indented by four spaces.

    The file is opened in text mode with UTF-8 encoding and overwritten if it
    already exists.
    """
    with open(path, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4)


# Samples with the topic moved into the instruction field go to their own file.
dump_data(new_data, "../data/once_forbidden_topics/json_clean/train_no_topics_instructions.json")

In [6]:
import torch
import transformers
import peft


# Merge a PEFT adapter checkpoint into a local fine-tuned checkpoint and save the result.
# NOTE(review): the tokenizer comes from the hub model while the weights come
# from the local checkpoint — confirm that both use the same vocabulary.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
base_model = "../checkpoints/mistral_egor_v1/checkpoint-61720"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model = transformers.AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.bfloat16)

# NOTE(review): the adapter is read from an absolute path in a user's home
# directory while the outputs use relative paths — this cell only runs on
# that machine as written.
model = peft.PeftModel.from_pretrained(
    model, "/home/eplotnikov/Once/onceLM/checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v2/checkpoint-180"
)

# merge_and_unload() returns the base model with the adapter merged in; the
# model and the tokenizer are saved side by side in the same directory.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v2/final_checkpoint/")
tokenizer.save_pretrained("../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v2/final_checkpoint/")

Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.85s/it]


('../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v2/final_checkpoint/tokenizer_config.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v2/final_checkpoint/special_tokens_map.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v2/final_checkpoint/tokenizer.model',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v2/final_checkpoint/added_tokens.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v2/final_checkpoint/tokenizer.json')

In [2]:
import torch
import transformers
import peft


# Merge a PEFT adapter checkpoint into a local fine-tuned checkpoint and save the result.
# NOTE(review): the tokenizer comes from the hub model while the weights come
# from the local checkpoint — confirm that both use the same vocabulary.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
base_model = "../checkpoints/mistral_egor_v1/checkpoint-61720"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model = transformers.AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.bfloat16)

# NOTE(review): the adapter is read from an absolute path in a user's home
# directory while the outputs use relative paths — this cell only runs on
# that machine as written.
model = peft.PeftModel.from_pretrained(
    model, "/home/eplotnikov/Once/onceLM/checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v3/checkpoint-108"
)

# merge_and_unload() returns the base model with the adapter merged in; the
# model and the tokenizer are saved side by side in the same directory.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v3/final_checkpoint/")
tokenizer.save_pretrained("../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v3/final_checkpoint/")

Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.94s/it]


('../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v3/final_checkpoint/tokenizer_config.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v3/final_checkpoint/special_tokens_map.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v3/final_checkpoint/tokenizer.model',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v3/final_checkpoint/added_tokens.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v3/final_checkpoint/tokenizer.json')

In [3]:
import torch
import transformers
import peft


# Merge a PEFT adapter checkpoint into a local fine-tuned checkpoint and save the result.
# NOTE(review): the tokenizer comes from the hub model while the weights come
# from the local checkpoint — confirm that both use the same vocabulary.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
base_model = "../checkpoints/mistral_egor_v1/checkpoint-61720"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model = transformers.AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.bfloat16)

# NOTE(review): the adapter is read from an absolute path in a user's home
# directory while the outputs use relative paths — this cell only runs on
# that machine as written.
model = peft.PeftModel.from_pretrained(
    model, "/home/eplotnikov/Once/onceLM/checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v4/checkpoint-88"
)

# merge_and_unload() returns the base model with the adapter merged in; the
# model and the tokenizer are saved side by side in the same directory.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v4/final_checkpoint/")
tokenizer.save_pretrained("../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v4/final_checkpoint/")

Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.95s/it]


('../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v4/final_checkpoint/tokenizer_config.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v4/final_checkpoint/special_tokens_map.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v4/final_checkpoint/tokenizer.model',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v4/final_checkpoint/added_tokens.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v4/final_checkpoint/tokenizer.json')

In [1]:
import torch
import transformers
import peft


# Merge a PEFT adapter checkpoint into the hub base model and save the result.
# Here both the tokenizer and the weights are loaded from `model_name`; the
# local fine-tuned checkpoint is left commented out.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
# base_model = "../checkpoints/mistral_egor_v1/checkpoint-61720"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model = transformers.AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

# NOTE(review): the adapter is read from an absolute path in a user's home
# directory while the outputs use relative paths — this cell only runs on
# that machine as written.
model = peft.PeftModel.from_pretrained(
    model, "/home/eplotnikov/Once/onceLM/checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v5/checkpoint-76"
)

# merge_and_unload() returns the base model with the adapter merged in; the
# model and the tokenizer are saved side by side in the same directory.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v5/final_checkpoint/")
tokenizer.save_pretrained("../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v5/final_checkpoint/")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.59it/s]


('../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v5/final_checkpoint/tokenizer_config.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v5/final_checkpoint/special_tokens_map.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v5/final_checkpoint/tokenizer.model',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v5/final_checkpoint/added_tokens.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v5/final_checkpoint/tokenizer.json')

In [2]:
import torch
import transformers
import peft


# Merge a PEFT adapter checkpoint into a local fine-tuned checkpoint and save the result.
# Here both the tokenizer and the weights are loaded from the local
# checkpoint; the hub model name is left commented out.
# model_name = "mistralai/Mistral-7B-Instruct-v0.2"
model_name = "../checkpoints/mistral_egor_v1/checkpoint-61720"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model = transformers.AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

# NOTE(review): the adapter is read from an absolute path in a user's home
# directory while the outputs use relative paths — this cell only runs on
# that machine as written.
model = peft.PeftModel.from_pretrained(
    model, "/home/eplotnikov/Once/onceLM/checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v6/checkpoint-76"
)

# merge_and_unload() returns the base model with the adapter merged in; the
# model and the tokenizer are saved side by side in the same directory.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v6/final_checkpoint/")
tokenizer.save_pretrained("../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v6/final_checkpoint/")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]


('../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v6/final_checkpoint/tokenizer_config.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v6/final_checkpoint/special_tokens_map.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v6/final_checkpoint/tokenizer.model',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v6/final_checkpoint/added_tokens.json',
 '../checkpoints/mistral_egor_v1/mistral_forbidden_topics_lora_v6/final_checkpoint/tokenizer.json')