# Setting up the environment!

In [None]:
!pip install -q -U bitsandbytes
!pip install datasets transformers
!pip install -U autotrain-advanced
!pip install deepspeed

In [None]:
import getpass
import json
import os

import numpy as np
import pandas as pd
import torch

#Load the data!

In [None]:
# Read the question/answer/entities records into a DataFrame and preview them.
# NOTE(review): the ".jsonl" extension suggests JSON Lines, which pd.read_json
# only parses with lines=True — confirm the file's actual layout.
df = pd.read_json('entities1000.jsonl')
df.head()

Unnamed: 0,question,answer,entities
0,What is the primary seminary of the Congregati...,Moreau Seminary,[Organization: Congregation of the Holy Cross]
1,What is the oldest structure at Notre Dame?,The Log Chapel,[Organization: Notre Dame]
2,What individuals live at Fatima House at Notre...,Residential Students,"[Location: Fatima House, Organization: Notre D..."
3,Which prize did Frederick Buechner create?,Princeton University's Laureate Award for Writing,[Name: Frederick Buechner]
4,How many BS level degrees are offered in the C...,8,"[Degree: College of Engineering, Field of Stud..."


#Process the data!

In [None]:
def transform_info(entities):
    """
    Convert "Category: value" strings into single-key dictionaries.

    Parameters:
    entities: iterable of strings of the form "Category: value"
        (one cell of the `entities` column).

    Returns:
    list[dict]: one `{category: value}` dictionary per input string, in the
        same order as the input.

    Raises:
    ValueError: if an item does not contain the ": " separator.
    """
    transformed_list = []
    for item in entities:
        # Split on the first ": " only, so values may themselves contain ": "
        key, separator, value = item.partition(": ")
        if not separator:
            raise ValueError(f"entity {item!r} is not in 'Category: value' form")
        transformed_list.append({key: value})
    return transformed_list


# Apply the function to the 'entities' column.
# NOTE: this overwrites the column in place, so the cell is not re-runnable —
# a second run would call .split() on the dicts produced by the first run.
df['entities'] = df['entities'].apply(transform_info)

In [None]:
def json_output(df):
    """
    Serialize every row of `df` to a pretty-printed JSON string.

    Parameters:
    df (pd.DataFrame): The input DataFrame; each row is dumped with all of its columns.

    Returns:
    pd.DataFrame: A one-column DataFrame ('json_output') holding one JSON string
        per input row. It carries the SAME index as `df`, so assigning it back
        to `df` lines up row for row even when `df`'s index has gaps.
    """
    # Convert each row to a dictionary and then to an indented JSON string
    data = [json.dumps(row.to_dict(), indent=4) for _, row in df.iterrows()]

    # Reuse the input index: a fresh 0..n-1 index would be re-aligned by label
    # on assignment, pairing rows with the wrong JSON and leaving NaN rows.
    return pd.DataFrame(data, columns=['json_output'], index=df.index)

# Transform the DataFrame.
# Assign by position (.to_numpy()) rather than by index label: `df` has gaps in
# its index, so label alignment could pair rows with the wrong JSON or leave NaN.
df['json_output'] = json_output(df)['json_output'].to_numpy()

In [None]:
# Function to replace entities
def extract_keys(info):
    """
    Build the lower-cased, comma-separated list of entity categories used in the instructions.

    Parameters:
    info: one cell of the `entities` column — a list of single-key dictionaries.

    Returns:
    str: the first key of every dictionary, joined with ", " and lower-cased.
    """
    categories = []
    for entity in info:
        # Each dictionary holds one category; take its (first) key
        categories.append(list(entity.keys())[0])
    joined = ", ".join(categories)
    return joined.lower()

# Derive the 'entity_extraction' column from the 'entities' column and preview it
df['entity_extraction'] = df['entities'].apply(extract_keys)
df['entity_extraction'].head()

0                            organization
1                            organization
2                  location, organization
3                                    name
4    degree, field of study, organization
Name: entity_extraction, dtype: object

In [None]:
df

Unnamed: 0,question,answer,entities,json_output,entity_extraction
0,What is the primary seminary of the Congregati...,Moreau Seminary,[{'Organization': 'Congregation of the Holy Cr...,"{\n ""question"": ""What is the primary semina...",organization
1,What is the oldest structure at Notre Dame?,The Log Chapel,[{'Organization': 'Notre Dame'}],"{\n ""question"": ""What is the oldest structu...",organization
2,What individuals live at Fatima House at Notre...,Residential Students,"[{'Location': 'Fatima House'}, {'Organization'...","{\n ""question"": ""What individuals live at F...","location, organization"
3,Which prize did Frederick Buechner create?,Princeton University's Laureate Award for Writing,[{'Name': 'Frederick Buechner'}],"{\n ""question"": ""Which prize did Frederick ...",name
4,How many BS level degrees are offered in the C...,8,"[{'Degree': 'College of Engineering'}, {'Field...","{\n ""question"": ""How many BS level degrees ...","degree, field of study, organization"
...,...,...,...,...,...
1046,Who is the author of 'Frankenstein'?,Mary Shelley.,"[{'Work': 'Frankenstein'}, {'Author': 'Mary Sh...",,"work, author"
1047,What is the capital city of Australia?,Canberra.,"[{'Location': 'Australia'}, {'Feature': 'Capit...",,"location, feature"
1048,Who painted 'Girl with a Pearl Earring'?,Johannes Vermeer.,"[{'Work': 'Girl with a Pearl Earring'}, {'Arti...",,"work, artist"
1049,What is the chemical symbol for potassium?,K.,"[{'Element': 'Potassium'}, {'Symbol': 'K'}]",,"element, symbol"


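The DataFrame's index labels have gaps (they run past the number of rows) because I manually deleted rows from the dataset. When the frame returned by `json_output` carries a fresh 0-based index, pandas aligns it with `df` by index label on assignment, so rows whose labels exceed the row count end up with NaN in `json_output` (and other rows can be paired with the wrong JSON). Any rows with a NaN `json_output` are removed in the next cell.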

In [None]:
# Delete the rows with NaN `json_output` by testing for NaN instead of relying
# on a hard-coded row count; .copy() yields an independent frame so adding
# columns afterwards does not raise SettingWithCopyWarning.
df = df.dropna(subset=['json_output']).copy()

In [None]:
# Formatting according to AutoTrain requirements
def format_interaction(row):
    """
    Render one row as a single instruction-tuning training string.

    Parameters:
    row: mapping with 'question', 'entity_extraction' and 'json_output' entries.

    Returns:
    str: the instruction wrapped in <s>[INST]...[/INST], followed by the JSON
        answer inside a ```json fence and the closing </s>.
    """
    # Instruction part: the question plus the extraction request
    instruction = f"<s>[INST]{row['question']} Answer the question, extract the {row['entity_extraction']} and return in Json format.[/INST]"
    # Response part: the expected JSON inside a fenced block
    response = f"```json\n{row['json_output']}\n```</s>"
    return instruction + response


In [None]:
# Build one formatted prompt/response training string per row, then preview the column
df['formatted_data'] = df.apply(format_interaction, axis=1)
df['formatted_data']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['formatted_data'] = df.apply(format_interaction, axis=1)


0       <s>[INST]What is the primary seminary of the C...
1       <s>[INST]What is the oldest structure at Notre...
2       <s>[INST]What individuals live at Fatima House...
3       <s>[INST]Which prize did Frederick Buechner cr...
4       <s>[INST]How many BS level degrees are offered...
                              ...                        
1036    <s>[INST]Who wrote 'The Grapes of Wrath'? Answ...
1037    <s>[INST]What is the longest river in the Midd...
1038    <s>[INST]Who is the author of 'Wuthering Heigh...
1039    <s>[INST]What is the capital city of Italy? An...
1040    <s>[INST]Who painted 'The Girl with a Pearl Ea...
Name: formatted_data, Length: 1031, dtype: object

In [None]:
# Checking the last row: select by position, because the index labels have gaps
# and label 1030 is not the final row.
df['formatted_data'].iloc[-1]

'<s>[INST]Who is the author of \'The Divine Comedy\'? Answer the question, extract the work, author and return in Json format.[/INST]```json\n{\n    "question": "Who painted \'The Girl with a Pearl Earring\'?",\n    "answer": "Johannes Vermeer.",\n    "entities": [\n        {\n            "Work": "The Girl with a Pearl Earring"\n        },\n        {\n            "Artist": "Johannes Vermeer"\n        }\n    ]\n}\n```</s>'

The output would look like this:
````
<s>[INST]Who is the author of 'The Divine Comedy'? Answer the question, extract the work, author and return in Json format.[/INST]```json
{
    "question": "Who painted 'The Girl with a Pearl Earring'?",
    "answer": "Johannes Vermeer.",
    "entities": [
        {
            "Work": "The Girl with a Pearl Earring"
        },
        {
            "Artist": "Johannes Vermeer"
        }
    ]
}
```</s>
````

#Save the data!

In [None]:
# Persist the formatted training text as the CSV used for training
if __name__ == "__main__":
    save_path = 'formatted_data/training_dataset'
    file_path = os.path.join(save_path, 'formatted_train.csv')
    # The target directory must exist before the CSV is written
    os.makedirs(save_path, exist_ok=True)
    # Only the 'formatted_data' column is exported, without the index
    df.loc[:, ['formatted_data']].to_csv(file_path, index=False)
    print("Dataset formatted and saved.")

Dataset formatted and saved.


#Setup the training variables!
Make sure to change your preferred base model, your token, your project name, and the training variables.

In [None]:
#login huggingface
!pip install huggingface_hub
from huggingface_hub import notebook_login
notebook_login()


In [None]:
!autotrain setup

[1mINFO    [0m | [32m2024-06-22 11:10:40[0m | [36mautotrain.cli.run_setup[0m:[36mrun[0m:[36m43[0m - [1mInstalling latest xformers[0m
[1mINFO    [0m | [32m2024-06-22 11:10:40[0m | [36mautotrain.cli.run_setup[0m:[36mrun[0m:[36m45[0m - [1mSuccessfully installed latest xformers[0m


In [None]:
username = 'Pennlaine'
model_name = 'mistral-community/Mistral-7B-v0.2'
push_to_hub = True
# Never store a real token in the notebook: take it from the HF_TOKEN
# environment variable when set, otherwise prompt for it without echoing.
hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")
# Named after the actual base model (Mistral, not Llama) so that it matches the
# "Pennlaine/Mistral-7B-v02-Entity-Extraction" repository loaded for testing.
project_name = 'Mistral-7B-v02-Entity-Extraction'

In [None]:
# Training hyperparameters. Each value is exported as an environment variable
# further down and forwarded to the matching `autotrain llm` flag shown here.
learning_rate = 1e-4  # --lr
num_epochs = 3  # --epochs
batch_size = 4  # --batch-size
block_size = 512  # --block-size
warmup_ratio = 0.05  # --warmup-ratio
weight_decay = 0.005  # --weight-decay
lora_r = 8  # --lora-r
lora_alpha = 16  # --lora-alpha
lora_dropout = 0.01  # --lora-dropout

In [None]:
# Expose the account/project settings to the shell command that starts training
os.environ.update({
    "USER_NAME": username,
    "PROJECT_NAME": project_name,
    "MODEL_NAME": model_name,
    "PUSH_TO_HUB": str(push_to_hub),  # environment values must be strings
    "HF_TOKEN": hf_token,
})

In [None]:
# Expose the training hyperparameters to the shell; every value is stringified
# because environment variables can only hold text
os.environ.update({
    "LEARNING_RATE": str(learning_rate),
    "NUM_EPOCHS": str(num_epochs),
    "BATCH_SIZE": str(batch_size),
    "BLOCK_SIZE": str(block_size),
    "WARMUP_RATIO": str(warmup_ratio),
    "WEIGHT_DECAY": str(weight_decay),
    "LORA_R": str(lora_r),
    "LORA_ALPHA": str(lora_alpha),
    "LORA_DROPOUT": str(lora_dropout),
})

In [None]:
# Feature switches checked (as the literal string "True") by the training command
os.environ.update(
    USE_FP16="True",
    USE_PEFT="True",
    USE_INT4="True",
)

#Start training your model!

In [None]:
# Launch training. Optional flags are emitted only when their environment switch
# equals "True"; this now includes --push-to-hub, which previously ignored
# PUSH_TO_HUB. POSIX `[ ... = ... ]` tests are used so the line also works when
# the `!` command is run by a plain `sh` rather than bash.
!autotrain llm \
 --train \
 --model "${MODEL_NAME}" \
 --project-name "${PROJECT_NAME}" \
 --data-path "formatted_data/training_dataset" \
 --text-column "formatted_data" \
 --lr "${LEARNING_RATE}" \
 --batch-size "${BATCH_SIZE}" \
 --epochs "${NUM_EPOCHS}" \
 --block-size "${BLOCK_SIZE}" \
 --warmup-ratio "${WARMUP_RATIO}" \
 --lora-r "${LORA_R}" \
 --lora-alpha "${LORA_ALPHA}" \
 --lora-dropout "${LORA_DROPOUT}" \
 --weight-decay "${WEIGHT_DECAY}" \
 --username "${USER_NAME}" \
 --token "${HF_TOKEN}" \
 $( [ "$PUSH_TO_HUB" = "True" ] && echo "--push-to-hub" ) \
 $( [ "$USE_FP16" = "True" ] && echo "--mixed-precision fp16" ) \
 $( [ "$USE_PEFT" = "True" ] && echo "--use-peft" ) \
 $( [ "$USE_INT4" = "True" ] && echo "--quantization int4" )

#Test the results!

In [None]:
# Function to provide an instruction
def ask(model, tokenizer, question, max_new_tokens=1256):
    """
    Generate the model's continuation of an instruction prompt.

    Parameters:
    model: causal language model exposing `generate(...)`.
    tokenizer: tokenizer matching `model`; called to encode and used to decode.
    question (str): the full prompt text to continue.
    max_new_tokens (int): upper bound on the number of generated tokens.

    Returns:
    str: the decoded first returned sequence, with special tokens skipped.
    """
    # Calling the tokenizer (instead of .encode) also returns the attention
    # mask, which generate() needs for reliable results.
    encoded = tokenizer(question, return_tensors='pt')

    # Put the inputs on the same device as the model when it exposes one
    device = getattr(model, 'device', None)
    if device is not None:
        encoded = encoded.to(device)

    # Set the padding token explicitly, falling back to EOS when none is defined
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        pad_token_id=pad_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Repository id that the tokenizer and model are loaded from
# (presumably the fine-tuned model pushed by the training step — confirm).
model_path = "Pennlaine/Mistral-7B-v02-Entity-Extraction"
# Load the tokenizer and the causal-LM weights for that repository
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

In [None]:
# Sample prompt in the training layout: instruction inside [INST]...[/INST],
# ending with an opening ```json fence so the model continues with the JSON body.
question = '''[INST]Type 2 Diabetes Mellitus is a chronic metabolic disorder characterized by insulin resistance and relative insulin deficiency. This condition leads to chronic hyperglycemia, which can cause significant damage to various body systems over time. Write a short summary of how to treat patients with diabetes. Answer the question, extract the disorder, type of disorder, causes, effect, ICD Code, and return in Json format.[/INST]```json'''

# Generate and show the full decoded text for the prompt
response = ask(model, tokenizer, question)
print(response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST]Type 2 Diabetes Mellitus is a chronic metabolic disorder characterized by insulin resistance and relative insulin deficiency. This condition leads to chronic hyperglycemia, which can cause significant damage to various body systems over time. Write a short summary of how to treat patients with diabetes. Answer the question, extract the disorder, type of disorder, causes, effect, ICD Code, and return in Json format.[/INST]```json
{
    "question": "Type 2 Diabetes Mellitus is a chronic metabolic disorder characterized by insulin resistance and relative insulin deficiency. This condition leads to chronic hyperglycemia, which can cause significant damage to various body systems over time. Write a short summary of how to treat patients with diabetes.",
    "answer": "Treatment for diabetes typically involves a combination of lifestyle modifications, medication, and insulin therapy.",
    "entities": [
        {
            "Disorder": "Type 2 Diabetes Mellitus"
        },
        {
 

#Do evaluation!

In [None]:
!pip install bert_score

In [None]:
# prompt: use bert score to evaluate my model with the dataset because the model generates open ended responses

import os

from transformers import AutoTokenizer, AutoModelForCausalLM, BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from bert_score import score  # `score` is called below and was never imported

# Load the model and tokenizer (once each).
# The access token is read from the environment instead of being written into
# the notebook; None falls back to the credentials saved by the Hugging Face login.
hf_access_token = os.environ.get("HF_TOKEN") or None
tokenizer = AutoTokenizer.from_pretrained(model_path, token=hf_access_token)
model = AutoModelForCausalLM.from_pretrained(model_path, token=hf_access_token)


# Load the dataset
dataset = pd.read_json('eval_data.json')

# Function to generate outputs from the model
def generate_outputs(prompt):
    """
    Generate text for one prompt.

    Parameters:
    prompt (str): the instruction text fed to the model.

    Returns:
    str: the first decoded sequence with special tokens skipped.
    """
    inputs = tokenizer(prompt, return_tensors='pt')
    # NOTE(review): max_length caps prompt + generated tokens together, and the
    # decoded text is the whole returned sequence — confirm this matches what
    # the reference 'text' column contains.
    outputs = model.generate(
        **inputs,
        max_length=512,
        num_return_sequences=1,
        # Explicit padding token: same value generate() would otherwise pick,
        # without emitting a warning on every call
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Lists to store generated and reference texts
generated_texts = []
reference_texts = []

# Iterate over the dataset and generate outputs
for prompt, reference_text in zip(dataset['instruction'], dataset['text']):
    # Generate output based on the prompt
    generated_text = generate_outputs(prompt)

    # Store generated and reference texts
    generated_texts.append(generated_text)
    reference_texts.append(reference_text)

# Calculate BERTScore
P, R, F1 = score(generated_texts, reference_texts, lang='en', verbose=True)

# Print BERTScore results
print(f"Precision: {P.mean().item():.4f}")
print(f"Recall: {R.mean().item():.4f}")
print(f"F1 score: {F1.mean().item():.4f}")



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 3.24 seconds, 15.45 sentences/sec
Precision: 0.9795
Recall: 0.9830
F1 score: 0.9813


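Great! The BERTScore is high (F1 ≈ 0.98), which suggests the generated outputs are semantically very close to the references. Keep in mind that BERTScore measures embedding similarity rather than exact correctness, and raw (non-rescaled) scores tend to be high, so it is worth spot-checking the extracted entities as well. Congratulations! 😃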