### Step 1: Convert intents.json into a Training Dataset

In [20]:
import json
import random
import pandas as pd

# Load intents.json
with open("intents.json", "r") as f:
    data = json.load(f)

# Create a list of (input, response) pairs
conversations = []
for intent in data["intents"]:
    for pattern in intent["patterns"]:
        for response in intent["responses"]:
            conversations.append({"text": f"User: {pattern}", "response": f"AI: {response}"})

# Shuffle data to mix different categories
random.shuffle(conversations)

# Convert to DataFrame
df = pd.DataFrame(conversations)

# Split into Train (80%) and Test (20%)
train_size = int(0.8 * len(df))
train_df = df[:train_size]
test_df = df[train_size:]

# Save to CSV or JSON for easier processing
train_df.to_csv("train.csv", index=False)
test_df.to_csv("test.csv", index=False)

print("Dataset structured successfully! Training and Testing sets created.")

Dataset structured successfully! Training and Testing sets created.


In [24]:
print(train_df)

                             text  \
0                       User: yes   
1                       User: Ola   
2        User: I am so burned out   
3               User: I feel down   
4                  User: Bye then   
..                            ...   
523   User: I still feel stressed   
524                  User: family   
525         User: I am so useless   
526  User: Nobody understands me.   
527                    User: yeah   

                                              response  
0                       AI: Can you elaborate on that?  
1             AI: Hi there. How are you feeling today?  
2    AI: I am sorry to hear that. What is the reaso...  
3              AI: Why do you think you feel this way?  
4                                 AI: Have a nice day.  
..                                                 ...  
523  AI: I am sorry to hear that. What is the reaso...  
524                              AI: I see. What else?  
525  AI: i first want to let you know that

### Step 2: Tokenize the Dataset

In [1]:
%pip install transformers datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [38]:
from transformers import AutoTokenizer
import pandas as pd
from datasets import Dataset

# Load GPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Add padding token (GPT-2 doesn't have one by default)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    inputs = tokenizer(
        [" ".join(pair) for pair in zip(examples["text"], examples["response"])],
        truncation=True,
        padding="max_length",
        max_length=32
    )
    inputs["labels"] = inputs["input_ids"].copy()  # GPT-2 needs labels to compute loss
    return inputs

# Load train and test data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Convert to Hugging Face dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Check tokenized data
print(tokenized_train[0])

Map:   0%|          | 0/528 [00:00<?, ? examples/s]

Map:   0%|          | 0/133 [00:00<?, ? examples/s]

{'text': 'User: yes', 'response': 'AI: Can you elaborate on that?', 'input_ids': [12982, 25, 3763, 9552, 25, 1680, 345, 15962, 319, 326, 30, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [12982, 25, 3763, 9552, 25, 1680, 345, 15962, 319, 326, 30, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]}


### Step 3: Convert Dataset for Training

In [6]:
%pip install torch



Convert to PyTorch/TensorFlow Dataset

In [39]:
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import DataLoader

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # No masked language modeling since this is causal LM
)

train_dataloader = DataLoader(tokenized_train, batch_size=8, shuffle=True, collate_fn=data_collator)
test_dataloader = DataLoader(tokenized_test, batch_size=8, shuffle=False, collate_fn=data_collator)

Load a GPT-2 Model

In [40]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")


### Step 4: Fine-tune GPT-2

Define Training Arguments

In [41]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2-mental-health",  # Save model checkpoints
    evaluation_strategy="epoch",        # Evaluate after each epoch
    save_strategy="epoch",              # Save after each epoch
    logging_dir="./logs",               # Logs for tracking progress
    per_device_train_batch_size=8,      # Adjust batch size based on GPU
    per_device_eval_batch_size=8,
    num_train_epochs=5,                 # Number of epochs (adjust as needed)
    weight_decay=0.01,                   # Regularization
    logging_steps=10,
    save_total_limit=2,                  # Keep only last 2 checkpoints
    push_to_hub=False
)




Create Trainer

In [42]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
)


  trainer = Trainer(


Start Training

In [43]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.1932,1.024824
2,0.7532,0.787048
3,0.752,0.69268
4,0.6018,0.657955
5,0.5192,0.640254


TrainOutput(global_step=330, training_loss=0.8551688555515173, metrics={'train_runtime': 268.1859, 'train_samples_per_second': 9.844, 'train_steps_per_second': 1.23, 'total_flos': 43113185280000.0, 'train_loss': 0.8551688555515173, 'epoch': 5.0})

Save and Load the Model

In [44]:
trainer.save_model("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")


('fine_tuned_model/tokenizer_config.json',
 'fine_tuned_model/special_tokens_map.json',
 'fine_tuned_model/vocab.json',
 'fine_tuned_model/merges.txt',
 'fine_tuned_model/added_tokens.json',
 'fine_tuned_model/tokenizer.json')

Test the Model: Generate responses from your fine-tuned chatbot using

In [47]:
from transformers import pipeline

chat_model = pipeline(
    "text-generation",
    model="fine_tuned_model",
    tokenizer="fine_tuned_model",
    truncation=True  # Add this to avoid warnings
)

input_texts = [
    "User: I feel so alone.",
    "User: Nothing feels right anymore.",
    "User: I'm really stressed out about my job."
]
for text in input_texts:
    output = chat_model(text, max_length=50)
    print(output[0]['generated_text'])

Device set to use cuda:0


User: I feel so alone. AI: Oh ok what brings you today?
User: Nothing feels right anymore. AI: I'm sorry to hear that. I'm trying my best to help
User: I'm really stressed out about my job. AI: Ola, you seem really stressed out. What do you think is behind this?
