<a href="https://colab.research.google.com/github/Sciform/sciform-hwz-ai-in-controlling/blob/main/Conversational_health_bot_pretrained_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# We build an GPT-2 based conversational health bot

We use a pretrained GPT-2 by OpenAI and provided through HuggingFace and fine-tune it for a Kaggle data set of mental health conversations.

In [1]:
!pip install transformers[torch]

Collecting transformers[torch]
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m3

# Load conversational health data

In [2]:
# Let's download the a conversational health data set
# (Originally from Kaggle: https://www.kaggle.com/datasets/elvis23/mental-health-conversational-data)

!wget https://raw.githubusercontent.com/sciform/sciform-hwz-ai-in-controlling/main/data/conversational_health_intents.json

--2023-08-23 13:40:48--  https://raw.githubusercontent.com/sciform/sciform-hwz-ai-in-controlling/main/data/conversational_health_intents.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 39353 (38K) [text/plain]
Saving to: ‘conversational_health_intents.json’


2023-08-23 13:40:49 (17.0 MB/s) - ‘conversational_health_intents.json’ saved [39353/39353]



# Preprocess raw data

In [3]:
import json

def preprocess_intents_json(intents_file):
    with open(intents_file, "r") as f:
        data = json.load(f)

    preprocessed_data = []

    # save in .txt-file with format
    # "User: <Question>"
    # "Assistant: <Answer>"
    for intent in data["intents"]:
        for pattern in intent["patterns"]:
            preprocessed_data.append(f"User: {pattern}\n")
            for response in intent["responses"]:
                preprocessed_data.append(f"Assistant: {response}\n")

    return "".join(preprocessed_data)

def save_preprocessed_data(preprocessed_data, output_file):
    with open(output_file, "w") as f:
        f.write(preprocessed_data)


intents_json_file = "conversational_health_intents.json"
output_txt_file = "conversational_health_data.txt"

data = preprocess_intents_json(intents_json_file)
save_preprocessed_data(data, output_txt_file)

# Create training and validation data

In [4]:
import numpy as np

# Read the entire dataset into a list
with open(output_txt_file, 'r') as f:
    data = f.readlines()

# Randomly shuffle the dataset
np.random.seed(1)
np.random.shuffle(data)

# Split the dataset into training and validation sets (80% - 20%)
split_index = int(len(data) * 0.8)
train_data = data[:split_index]
val_data = data[split_index:]

# Save the training and validation sets as separate files
with open('train_data.txt', 'w') as f:
    f.writelines(train_data)

with open('validation_data.txt', 'w') as f:
    f.writelines(val_data)

# Train and evaluate the model

Use GPU, if possible.

In [5]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, EvalPrediction
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import numpy as np
from scipy.special import softmax
from sklearn.metrics import log_loss


# compute perplexity metrics
def compute_metrics(p: EvalPrediction):
    logits = p.predictions
    labels = p.label_ids
    probabilities = softmax(logits, axis=-1)
    loss = log_loss(labels.flatten(), probabilities.reshape(-1, probabilities.shape[-1]), labels=[i for i in range(logits.shape[-1])])
    perplexity = np.exp(loss)
    return {"perplexity": perplexity}


def fine_tune_gpt2(model_name, train_file, validation_file, output_dir):

    # Load GPT-2 tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # Load GPT-2 model
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Load training dataset
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_file,
        block_size=128)

    # Load validation dataset
    val_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=validation_file,
        block_size=128)

    # Create data collator for language modeling
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False)

    # Set training arguments
    training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_strategy='epoch',
    evaluation_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    save_total_limit=2,
)

    # Train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Save the fine-tuned model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

# Fine-tune the model
fine_tune_gpt2("gpt2", "train_data.txt", "validation_data.txt", "output")

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,Perplexity
1,No log,2.327704,38049.86024
2,No log,2.132983,56967.855561
3,No log,2.093661,69787.930502
