In [8]:
!pip install torch torchtext transformers sentencepiece pandas tqdm datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0

In [None]:
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
import ast
import datasets
from tqdm import tqdm
import time

In [None]:
data_sample = load_dataset("HazSylvia/Fitness_Unformatted")

In [None]:
data_sample

DatasetDict({
    train: Dataset({
        features: ['Human', 'Assistant'],
        num_rows: 928
    })
})

In [None]:
# Convert to a pandas dataframe
updated_data = [{'HUman': item['Human'], 'Assistant': item['Assistant']} for item in data_sample['train']]
df = pd.DataFrame(updated_data)

In [None]:
df.head(5)

# Just extract the Symptoms
df['Assistant'] = df['Assistant'].apply(lambda x: ', '.join(x.split(', ')))
display(df.head())

Unnamed: 0,HUman,Assistant
0,can you recommend effective ab exercises,planks bicycle crunches and leg raises are gre...
1,what are some effective strategies for managin...,make time for relaxation take time to catch y...
2,how can i incorporate regular movement and phy...,take the stairs instead of the elevator whene...
3,are there any specific strategies for maintain...,eat a healthy diet eating a healthy diet rich...
4,how can i manage stress and maintain a healthy...,manage your time wisely prioritize your tasks...


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split


In [None]:
# If you have an NVIDIA GPU attached, use 'cuda'
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    # If Apple Silicon, set to 'mps' - otherwise 'cpu' (not advised)
    try:
        device = torch.device('mps')
    except Exception:
        device = torch.device('cpu')

device

device(type='cuda')

In [None]:
# The tokenizer turns texts to numbers (and vice-versa)
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

# The transformer
model = GPT2LMHeadModel.from_pretrained('distilgpt2').to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
model

# Model params
BATCH_SIZE = 8

df.describe()

Unnamed: 0,HUman,Assistant
count,928,928
unique,720,719
top,how can i create a supportive and positive soc...,the cable seated row is a popular exercise to ...
freq,6,4


In [None]:
class LanguageDataset(Dataset):
    """
    An extension of the Dataset object to:
      - Make training loop cleaner
      - Make ingestion easier from pandas df's
    """
    def __init__(self, df, tokenizer):
        self.labels = df.columns
        self.data = df.to_dict(orient='records')
        self.tokenizer = tokenizer
        x = self.fittest_max_length(df)  # Fix here
        self.max_length = x

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx][self.labels[0]]
        y = self.data[idx][self.labels[1]]
        text = f"{x} | {y}"
        tokens = self.tokenizer.encode_plus(text, return_tensors='pt', max_length=128, padding='max_length', truncation=True)
        return tokens

    def fittest_max_length(self, df):  # Fix here
        """
        Smallest power of two larger than the longest term in the data set.
        Important to set up max length to speed training time.
        """
        max_length = max(len(max(df[self.labels[0]], key=len)), len(max(df[self.labels[1]], key=len)))
        return max_length

In [None]:
data_sample = LanguageDataset(df, tokenizer)

data_sample

<__main__.LanguageDataset at 0x7a7da38e09a0>

In [None]:
# Create train, valid
train_size = int(0.8 * len(data_sample))
valid_size = len(data_sample) - train_size
train_data, valid_data = random_split(data_sample, [train_size, valid_size])


In [None]:
# Make the iterators
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE)

In [None]:
num_epochs = 10

# Training parameters
batch_size = BATCH_SIZE
model_name = 'distilgpt2'
gpu = 0

In [None]:
## CrossEntropyLoss measures how close answers to the truth.
## More punishing for high confidence wrong answers
criterion = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=5e-4)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Init a results dataframe
results = pd.DataFrame(columns=['epoch', 'transformer', 'batch_size', 'gpu',
                                'training_loss', 'validation_loss', 'epoch_duration_sec'])


In [None]:
# The training loop
for epoch in range(num_epochs):
    start_time = time.time()  # Start the timer for the epoch

    # Training
    ## This line tells the model we're in 'learning mode'
    model.train()
    epoch_training_loss = 0
    train_iterator = tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs} Batch Size: {batch_size}, Transformer: {model_name}")
    for batch in train_iterator:
        optimizer.zero_grad()
        inputs = batch['input_ids'].squeeze(1).to(device)
        targets = inputs.clone()
        outputs = model(input_ids=inputs, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_iterator.set_postfix({'Training Loss': loss.item()})
        epoch_training_loss += loss.item()
    avg_epoch_training_loss = epoch_training_loss / len(train_iterator)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

model.save_pretrained("/content/drive/MyDrive/Fine Tuning/fitnessmodel")
tokenizer.save_pretrained("/content/drive/MyDrive/Fine Tuning/fitnessmodel")

Non-default generation parameters: {'max_length': 50, 'do_sample': True}


('/content/drive/MyDrive/Fine Tuning/fitnessmodel/tokenizer_config.json',
 '/content/drive/MyDrive/Fine Tuning/fitnessmodel/special_tokens_map.json',
 '/content/drive/MyDrive/Fine Tuning/fitnessmodel/vocab.json',
 '/content/drive/MyDrive/Fine Tuning/fitnessmodel/merges.txt',
 '/content/drive/MyDrive/Fine Tuning/fitnessmodel/added_tokens.json',
 '/content/drive/MyDrive/Fine Tuning/fitnessmodel/tokenizer.json')

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

model_path = "/content/drive/MyDrive/Fine Tuning/fitmodel"

# Load the tokenizer and model from the saved directory
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
# generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
# result=generator("How can I get healthy hair?")
# print(result)



In [None]:
!pip install huggingface_hub



In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your ter

In [None]:
model.push_to_hub("fitnessmodel")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Non-default generation parameters: {'max_length': 50, 'do_sample': True}


model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/DhritiShah/fitnessmodel/commit/8ee95f679306b50c350142b1089e211e4041e55a', commit_message='Upload model', commit_description='', oid='8ee95f679306b50c350142b1089e211e4041e55a', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
tokenizer.push_to_hub("fitnessmodel")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/DhritiShah/fitnessmodel/commit/89a61afdedea19461641823be7105622b572d508', commit_message='Upload tokenizer', commit_description='', oid='89a61afdedea19461641823be7105622b572d508', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
result=generator("I am traveling right now,so can I maintain my fitness?")
print(result)

[{'generated_text': 'I am traveling right now,so can I maintain my fitness? what am i doing wrong | its normal to plan a brisk walk but its manageable pack resistance bands and light weights to support muscle and strength\n consider interval training choose interval training in your fitness'}]


In [None]:
result=generator("How can I improve my posture?")
print(result)

[{'generated_text': 'How can I improve my posture? | good posture is essential strengthen core muscles with exercises like planks and bridges be mindful of your sitting and standing positions consider yoga or pilates to enhance flexibility and alignment'}]


In [None]:
result=generator("what are some practical tips for staying hydrated throughout the day",max_length=600)
print(result)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'what are some practical tips for staying hydrated throughout the day |  carry a reusable water bottle with you wherever you go and refill it throughout the day\n drink a glass of water as soon as you wake up in the morning even before you have coffee or tea\n add fresh fruits and herbs like citrus slices and mint leaves to your water for flavor\n replace sugary soda or other drinks with a glass of water\n during meals alternate between bites and sips of water \n during meals alternate between bites and sips of water \n during meals alternate between bites and sips of water \n during meals alternate between bites and sips of water \n during meals alternate between bites and sips of water \n'}]


In [None]:
what are some practical tips for staying hydrated throughout the day

In [None]:
 !pip install openvino optimum-intel gradio

In [None]:
#inference through Openvino
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from optimum.intel import OVModelForCausalLM

model_path = "/content/drive/MyDrive/Fine Tuning/fitmodel"

# Load the tokenizer and model from the saved directory
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = OVModelForCausalLM.from_pretrained(model_path,export=True)

In [None]:
import random
import time

from transformers import pipeline
import gradio as gr

# Initialize the DistilGPT-2 text generation pipeline
generator = pipeline('text-generation', model="DhritiShah/fitnessmodel")

def gpt2_generate_text(prompt, max_length=100):
    # Generate text using the DistilGPT-2 model
    generated = generator(prompt, max_length=max_length, num_return_sequences=1)
    text = generated[0]['generated_text']
    if "|" in text:
        text = text.split("|", 1)[1].strip()
    return text


with gr.Blocks() as demo:
    gr.Markdown("# Fitness Chatbot")
    gr.Markdown("### Example questions to ask: 'How can I manage stress and maintain healthy worklife balance', 'I have a busy travel schedule how can I maintain fitness routine', 'How do I prevent overthinking', 'How can I improve body posture'")
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    submit=gr.Button("Submit")
    clear = gr.Button("Clear")



    def user(user_message, history):
        return "", history + [[user_message, None]]

    def bot(history):
        bot_message = gpt2_generate_text(history[-1][0])
        history[-1][1] = ""
        for character in bot_message:
            history[-1][1] += character
            time.sleep(0.05)
            yield history

    submit.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue()
demo.launch()