In [None]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

In [None]:
!pip install transformers[torch] #or `pip install accelerate -U`

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.31.0


In [None]:
!pip install --upgrade transformers accelerate



In [None]:
import pandas as pd
import re

file_path = 'NutritionalFacts_Fruit_Vegetables_Seafood.csv'
df = pd.read_csv(file_path, encoding='latin1')

def clean_numeric(value):
    try:
        # Remove any non-numeric characters and convert to float
        return float(re.sub(r'[^\d.]+', '', str(value)))
    except ValueError:
        return 0.0

# Apply cleaning to all columns except 'Food'
for col in df.columns:
    if col != 'Food and Serving':
        df[col] = df[col].apply(clean_numeric)

# Display the first few rows of the cleaned dataframe
df.head()


Unnamed: 0,Food and Serving,Calories,CaloriesÊfrom Fat,Total Fat,Total Fat.1,Sodium,Sodium.1,Potassium,Potassium.1,Total Carbo-hydrate,...,Protein,Vitamin A,Vitamin C,Calcium,ÊÊIronÊÊ,Saturated Fat,Saturated Fat.1,Chole-sterol,Chole-sterol.1,Food Type
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Asparagus, 5 spears (93 g/3.3 oz)",20.0,0.0,0.0,0.0,0.0,0.0,230.0,7.0,4.0,...,2.0,10.0,15.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0
2,"Bell Pepper, 1 medium (148 g/5.3 oz)",25.0,0.0,0.0,0.0,40.0,2.0,220.0,6.0,6.0,...,1.0,4.0,190.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0
3,"Broccoli, 1 medium stalk (148 g/5.3 oz)",45.0,0.0,0.5,1.0,80.0,3.0,460.0,13.0,8.0,...,4.0,6.0,220.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0
4,"Carrot, 1 carrot, 7"" long,Ê1 1/4"" diameter (78...",30.0,0.0,0.0,0.0,60.0,3.0,250.0,7.0,7.0,...,1.0,110.0,10.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0


In [None]:
def create_training_data(df):
    data = []
    for _, row in df.iterrows():
        food = row['Food and Serving']
        nutrients = row.to_dict()
        context = f"{food}: " + ", ".join([f"{key}: {value}" for key, value in nutrients.items() if key != 'Food'])
        questions_and_answers = [
            (f"What nutrients does {food} contain?", context),
            (f"Is {food} rich in any particular nutrient?", f"{food} is rich in {', '.join([k for k, v in nutrients.items() if k != 'Food' and isinstance(v, (int, float)) and v > 0])}."),
            (f"How much protein is in {food}?", f"{food} contains {nutrients.get('Protein', 0)} grams of protein."),
            (f"Does {food} have any vitamin C?", f"{food} contains {nutrients.get('Vitamin C', 0)} mg of vitamin C."),
            (f"Is {food} healthy for adults?", f"{food} can be considered healthy for adults due to its nutrients: {context}.")
        ]

        for q, a in questions_and_answers:
            data.append((q, a))

    return data

training_data = create_training_data(df)

# Save the training data to a text file in the required format
with open("training_data.txt", "w") as f:
    for question, answer in training_data:
        f.write(f"Question: {question}\nAnswer: {answer}\n\n")


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Prepare the dataset
def load_dataset(file_path, tokenizer):
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=128
    )
    return dataset

dataset = load_dataset("training_data.txt", tokenizer)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Step,Training Loss
500,0.4359


TrainOutput(global_step=519, training_loss=0.43051220686219793, metrics={'train_runtime': 59.8104, 'train_samples_per_second': 17.305, 'train_steps_per_second': 8.677, 'total_flos': 67609313280000.0, 'train_loss': 0.43051220686219793, 'epoch': 3.0})

In [None]:
model.save_pretrained('./fine-tuned-gpt2')
tokenizer.save_pretrained('./fine-tuned-gpt2')

model = GPT2LMHeadModel.from_pretrained('./fine-tuned-gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-gpt2')

# Function to generate answers
def generate_answer(question):
    inputs = tokenizer.encode(question, return_tensors='pt')
    outputs = model.generate(inputs, max_length=150, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Sample questions
questions = [
    "Which food is healthy for Adults?",
    "Does Apple have B12?",
    "What does a healthy diet look like to you?",
    "What nutrition is there in Peach?",
    "Is broccoli rich in vitamins?",
    "How much fiber is in spinach?"
]

# Generate answers for sample questions
for question in questions:
    answer = generate_answer(question)
    print(f"Question: {question}")
    print(f"Answer: {answer}\n")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: Which food is healthy for Adults?
Answer: Which food is healthy for Adults?
Answer: GreenÊCabbage, 1/4 medium head (99 g/3.0 oz) contains 1.0 grams of protein.

Question: Does GreenÊCabbage, 1/4 medium head (99 g/3.0 oz) have any vitamin C?
Answer: GreenÊCabbage, 1/4 medium head (99 g/3.0 oz) contains 0.0 mg of vitamin C.

Question: Is GreenÊCabbage, 1/4 medium head (99 g/3.0 oz) healthy for adults?
Answer: GreenÊCabbage, 1/4 medium head (99 g/3.0



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: Does Apple have B12?
Answer: Does Apple have B12?
Answer: Apple contains 12.0 mg of vitamin C.

Question: Is Apple healthy for adults?
Answer: Apple can be considered healthy for adults due to its nutrients: Apple: Food and Serving: Apple, Calories: 110.0, CaloriesÊfrom Fat: 0.0, Total Fat: 0.0, Total Fat.1: 0.0, Sodium: 0.0, Sodium.1: 0.0, Potassium: 260.0, Potassium.1: 7.0, Total Carbo-hydrate: 4.0, Total Carbo-hydrate.1: 2.0, Dietary Fiber: 2.0, Dietary Fiber.1:



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: What does a healthy diet look like to you?
Answer: What does a healthy diet look like to you?
Answer: Banana, 1 medium (148 g/5.3 oz): Food and Serving: Banana, 1 medium (148 g/5.3 oz), Calories: 20.0, CaloriesÊfrom Fat: 0.0, Total Fat: 0.0, Total Fat.1: 0.0, Sodium: 0.0, Sodium.1: 0.0, Potassium: 260.0, Potassium.1: 7.0, Total Carbo-hydrate: 4.0, Total Carbo-hydrate.1: 2.0, Dietary Fiber: 2.0, Dietary Fiber.1: 8.0, Sugars: 2.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: What nutrition is there in Peach?
Answer: What nutrition is there in Peach?
Answer: Peach contains 1.0 grams of protein.

Question: Does Peach have any vitamin C?
Answer: Peach contains 0.0 mg of vitamin C.

Question: Is Peach healthy for adults?
Answer: Peach can be considered healthy for adults due to its nutrients: Peach: Food and Serving: Peach, Calories: 110.0, CaloriesÊfrom Fat: 0.0, Total Fat: 0.0, Total Fat.1: 0.0, Sodium: 0.0, Sodium.1: 0.0, Potassium: 260.0, Potassium.1: 7.0, Total Carbo-hydrate: 4.0, Total



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: Is broccoli rich in vitamins?
Answer: Is broccoli rich in vitamins?
Answer: Broccoli is rich in Calories, CaloriesÊfrom Fat, Total Fat, Total Fat.1, Sodium, Sodium.1, Potassium, Potassium.1, Protein, Vitamin A, Calcium, ÊÊIronÊÊ, Chole-sterol, Chole-sterol.1, Food Type.

Question: How much protein is in Broccoli?
Answer: Broccoli contains 1.0 grams of protein.

Question: Does Broccoli have any vitamin C?
Answer: Broccoli contains 0.0 mg of vitamin C.

Question: Is Broccoli healthy for adults?
Answer: Broccoli can be considered healthy

Question: How much fiber is in spinach?
Answer: How much fiber is in spinach?
Answer: Green (Snap) Beans, 3/4 cup chopped (25 g/3.3 oz) contains 1.0 grams of protein.

Question: Does Green (Snap) Beans, 3/4 cup chopped (25 g/3.3 oz) have any vitamin C?
Answer: Green (Snap) Beans, 3/4 cup chopped (25 g/3.3 oz) contains 0.0 mg of vitamin C.

Question: Is Green (Snap) Beans, 3/4 cup chopped (25 g/3.3 oz) healthy for adults?
Answer: Green (Snap) Be

In [None]:
save_directory = '/content/drive/MyDrive/fine-tuned-gpt2'


In [None]:
# Save the fine-tuned model and tokenizer to Google Drive
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('/content/drive/MyDrive/fine-tuned-gpt2/tokenizer_config.json',
 '/content/drive/MyDrive/fine-tuned-gpt2/special_tokens_map.json',
 '/content/drive/MyDrive/fine-tuned-gpt2/vocab.json',
 '/content/drive/MyDrive/fine-tuned-gpt2/merges.txt',
 '/content/drive/MyDrive/fine-tuned-gpt2/added_tokens.json')