# Dataset preparation

In [None]:
import json
import pandas as pd

with open("tags_preparing/parsed_tags/tags_charcterized.json", "r") as f:
    tags_characterized = json.load(f)

In [2]:
print(tags_characterized.keys())

dict_keys(['Technology & Engineering', 'Data & Analytics', 'Business & Management', 'IT & Computing', 'Science & Research', 'Healthcare & Life Sciences', 'Creative & Design', 'Professional Services', 'Industrial & Manufacturing', 'Personal & Soft Skills', 'Tools & Platforms', 'Other Specializations'])


## We make characteristics/bio of "people" based on tags. Also some tag categories will be selected more frequently, other less (based on weights)

In [None]:
# categories and their respective weights
category_weights = {
    "IT & Computing": 5,
    "Technology & Engineering": 4, "Data & Analytics": 4,
    "Personal & Soft Skills": 3, "Other Specializations": 3, "Creative & Design": 3,
    "Business & Management": 2, "Science & Research": 2, "Tools & Platforms": 2,
    "Healthcare & Life Sciences": 1, "Professional Services": 1, "Industrial & Manufacturing": 1
}

# categories and limits for selecting tags
tag_limits = {
    "Technology & Engineering": (1, 3),
    "Business & Management": (1, 3),
    "Data & Analytics": (3, 5),
    "IT & Computing": (3, 6),
    "Science & Research": (1, 1),
    "Healthcare & Life Sciences": (1, 1),
    "Professional Services": (1, 1),
    "Industrial & Manufacturing": (1, 1),
    "Creative & Design": (1, 2),
    "Personal & Soft Skills": (1, 2),
    "Tools & Platforms": (1, 2),
    "Other Specializations": (1, 2)
}
dataset_pref = pd.read_csv("sinthetic_dataset.csv")

dataset_size_left = 50000 - dataset_pref.shape[0]
print(dataset_size_left)



50000


## Model

In [21]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

def load_model():
    model_name = "microsoft/phi-2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, 
    )
    return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)

## Characteristics based on selected tags

In [22]:
def generate_characteristics(generator, tags):
    prompt = f"Imagine you are a career consultant writing profiles. Generate a professional bio for someone with expertise in these areas: {', '.join(tags)}.\nInclude: 1) Summary, 2) Key Skills, and 3) Achievements. \nDo not include any introductory or explanatory text, only the profile itself. \nOutput:"
    response = generator(
        prompt,
        max_length=512,
        do_sample=True,
        temperature=0.6,
    )
    return response[0]['generated_text'].split("Output:")[-1].strip()

## Tags generation
Number of categories per person will be normally ditributed

In [13]:
import random
def generate_tags():
    num_categories = min(5, max(1, int(random.gauss(3, 1))))
    categories = random.choices(list(category_weights.keys()),
                                    weights=category_weights.values(),
                                    k=num_categories)
    categories = list(set(categories))
    tags = []
    for category in categories:
        min_tags, max_tags = tag_limits[category]
        num_tags = random.randint(min_tags, max_tags)
        category_tags = random.sample(tags_characterized[category], num_tags)
        tags.extend(category_tags)
    return list(set(tags))
    

In [None]:
def save_to_csv(data):
    df = pd.DataFrame(data)
    df.to_csv("synthetic_dataset.csv", index=False)

In [None]:
def make_synthetic_dataset(generator, entries_count=1000):
    data = []
    for i in range(0, entries_count):
        tags = generate_tags()
        characteristics = generate_characteristics(generator, tags)
        data.append({
            "characteristics": characteristics,
            "tags": tags
        })
        if i % 1000 == 0:
            print(f"Generated {i + 1} entries so far...")
            if i == 0:
                print(data)
        if i % 100 == 0 and i != 0:
            save_to_csv(data)
    save_to_csv(data)
    

In [23]:
generator = load_model()

Loading checkpoint shards: 100%|██████████| 2/2 [00:18<00:00,  9.14s/it]
Device set to use cpu


## Dataset making

very time consuming on CPU (up to 4-5 minutes per dataset object), so will be better to run on GPU

In [24]:

dataset = make_synthetic_dataset(generator, entries_count=dataset_size_left)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/

Generated 0 entries so far...
[{'characteristics': "1) Summary:\nThis expert has extensive knowledge and experience in various data-related fields, including Hadoop, Data Migration, NLP, Topic Modeling, Web Scraping, and Root Cause analysis. They have a proven track record of successfully implementing complex data solutions and have worked with a wide range of clients across different industries.\n2) Key Skills:\n- Proficient in Hadoop, Data Migration, NLP, Topic Modeling, Web Scraping, and Root Cause analysis\n- Experienced in developing and implementing data solutions\n- Strong problem-solving skills and ability to work under pressure\n- Excellent communication and collaboration skills\n- Familiar with various programming languages and tools\n3) Achievements:\n- Successfully implemented a complex data solution for a major client, resulting in a 50% increase in revenue\n- Developed and implemented a data migration strategy that saved a client time and money\n- Conducted root cause ana

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_tok

KeyboardInterrupt: 