In [None]:
# Colab-only setup: mount Google Drive at /content/drive. Later cells read the
# source documents from, and save the fine-tuned model to, paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the third-party packages imported by the next cell:
# transformers + torch (GPT-2 fine-tuning), PyPDF2 (PDF text), python-docx (.docx text).
# NOTE(review): versions are not pinned, so a re-run may pull different releases —
# consider pinning and using `%pip` so the install targets the kernel's environment.
!pip install transformers
!pip install torch
!pip install -U PyPDF2
!pip install python-docx

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [None]:
import os
import re
import PyPDF2 as PDFReader
import docx
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments


In [None]:
def read_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages in page order, joined with a single newline.
        Pages that yield no text (None or an empty string) are skipped.
    """
    with open(file_path, 'rb') as file:
        pdf_reader = PDFReader.PdfReader(file)
        # Extract while the file handle is still open. `or ''` keeps a page
        # that produces no text from breaking the concatenation.
        page_texts = [page.extract_text() or '' for page in pdf_reader.pages]
    # A newline between pages prevents the last word of one page from being
    # fused with the first word of the next.
    return '\n'.join(page_text for page_text in page_texts if page_text)

def read_word(file_path):
    """Return the text of a Word (.docx) document, one line per paragraph.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The text of every paragraph in document order, each followed by a
        newline. A document with no paragraphs yields an empty string.
    """
    document = docx.Document(file_path)
    # Build the result in one pass instead of growing a string in a loop.
    return ''.join(para.text + '\n' for para in document.paragraphs)

def read_txt(file_path):
    """Read a plain-text file and return its entire contents.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded contents of the file as a single string.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def read_documents_from_directory(directory):
  """Read every supported document in a directory into one string.

  Supported files are those whose name ends in .pdf, .docx or .txt (matched
  case-insensitively); they are read with read_pdf, read_word and read_txt
  respectively. Sub-directories and all other entries are ignored; the
  directory is not searched recursively.

  Args:
    directory: Path of the directory to scan.

  Returns:
    The text of all supported files, concatenated in sorted filename order.
    An empty string if the directory holds no supported files.
  """
  readers = (
      ('.pdf', read_pdf),
      ('.docx', read_word),
      ('.txt', read_txt),
  )
  parts = []
  # os.listdir() returns entries in arbitrary order; sorting makes the
  # combined text (and any split derived from it) reproducible across runs.
  for filename in sorted(os.listdir(directory)):
    file_path = os.path.join(directory, filename)
    lowered = filename.lower()
    reader = next((fn for ext, fn in readers if lowered.endswith(ext)), None)
    # Skip unsupported entries, and directories that merely look like documents.
    if reader is None or not os.path.isfile(file_path):
      continue
    parts.append(reader(file_path))
  # A single join avoids re-copying the accumulated text for every file.
  return ''.join(parts)

In [None]:
def train_chatbot(directory, model_output_path, train_fraction=0.8,
                  model_name='gpt2', block_size=64, num_train_epochs=50,
                  batch_size=4):
  """Fine-tune a GPT-2 language model on the documents found in a directory.

  The documents are read with read_documents_from_directory, runs of newlines
  are collapsed to one, and the text is split by character position into a
  training part and a held-out part. Both parts are written to 'train.txt'
  and 'test.txt' in the current working directory, tokenized into fixed-size
  blocks, and passed to a Hugging Face Trainer. The trained model and the
  tokenizer are saved to model_output_path.

  Args:
    directory: Directory containing the source documents.
    model_output_path: Directory where checkpoints, the final model and the
      tokenizer are written.
    train_fraction: Fraction of the text (by characters) used for training;
      must satisfy 0 < train_fraction <= 1. The remainder goes to 'test.txt'.
    model_name: Pretrained checkpoint to start from
      (gpt2, gpt2-medium, gpt2-large or gpt2-xl).
    block_size: Number of tokens per training example.
    num_train_epochs: Number of passes over the training set.
    batch_size: Per-device batch size used for both training and evaluation.

  Raises:
    ValueError: If train_fraction is out of range, or if no text could be
      read from directory.
  """
  if not 0 < train_fraction <= 1:
    raise ValueError(f'train_fraction must be in (0, 1], got {train_fraction!r}')

  combined_text = read_documents_from_directory(directory)
  combined_text = re.sub(r'\n+', '\n', combined_text).strip()
  if not combined_text:
    # Fail early with a clear message instead of training on an empty file.
    raise ValueError(f'No text could be read from {directory!r}')

  split_index = int(train_fraction * len(combined_text))
  train_text = combined_text[:split_index]
  test_text = combined_text[split_index:]

  # Write as UTF-8 explicitly so the files do not depend on the platform's
  # default encoding.
  with open('train.txt', 'w', encoding='utf-8') as f:
    f.write(train_text)

  with open('test.txt', 'w', encoding='utf-8') as f:
    f.write(test_text)

  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  model = GPT2LMHeadModel.from_pretrained(model_name)

  # train.txt / test.txt are rewritten on every call, so any feature cache
  # TextDataset left next to them from a previous run is stale; rebuild it.
  train_dataset = TextDataset(tokenizer=tokenizer, file_path='train.txt',
                              block_size=block_size, overwrite_cache=True)
  test_dataset = TextDataset(tokenizer=tokenizer, file_path='test.txt',
                             block_size=block_size, overwrite_cache=True)
  # mlm=False selects causal (next-token) language modelling rather than masked LM.
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

  training_args = TrainingArguments(
      output_dir=model_output_path,
      overwrite_output_dir=True,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      num_train_epochs=num_train_epochs,
      save_steps=10_000,
      save_total_limit=2,
      logging_dir='./logs',
  )

  # The held-out split is handed to the Trainer, but this function never
  # calls trainer.evaluate() itself.
  trainer = Trainer(
      model=model,
      args=training_args,
      data_collator=data_collator,
      train_dataset=train_dataset,
      eval_dataset=test_dataset,
  )

  trainer.train()
  trainer.save_model(model_output_path)

  # Save the tokenizer alongside the model so both can be reloaded from one path.
  tokenizer.save_pretrained(model_output_path)

In [None]:
def generate_response(model, tokenizer, prompt, max_length=100, **generate_kwargs):
  """Generate a continuation of a prompt with a causal language model.

  Args:
    model: Model exposing generate(); if it has a `device` attribute the
      inputs are moved to that device first.
    tokenizer: Tokenizer providing encode(), decode() and eos_token_id.
    prompt: Text to continue.
    max_length: Value forwarded as generate()'s max_length.
    **generate_kwargs: Extra keyword arguments forwarded to model.generate()
      (for example no_repeat_ngram_size or do_sample). They override the
      defaults set here.

  Returns:
    The decoded text of the first generated sequence, with special tokens
    removed.
  """
  input_ids = tokenizer.encode(prompt, return_tensors='pt')

  # Keep the inputs on the same device as the model; otherwise generation
  # fails as soon as the model has been moved to a GPU.
  device = getattr(model, 'device', None)
  if device is not None:
    input_ids = input_ids.to(device)

  # A single un-padded prompt: every position is a real token.
  attention_mask = torch.ones_like(input_ids)

  generation_args = {
      'max_length': max_length,
      'num_return_sequences': 1,
      'attention_mask': attention_mask,
      # The EOS token is supplied as the padding token.
      'pad_token_id': tokenizer.eos_token_id,
  }
  generation_args.update(generate_kwargs)

  output = model.generate(input_ids, **generation_args)
  return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
def main(directory='/content/drive/MyDrive/Naman_LLM',
         model_output_path='/content/drive/MyDrive/Naman_LLM/model_2',
         prompt='What is NCQA?'):
    """Train the chatbot, reload it from disk, and print one sample response.

    Args:
        directory: Directory containing the training documents.
        model_output_path: Directory the fine-tuned model and tokenizer are
            saved to and then reloaded from.
        prompt: Prompt used for the sample generation.
    """
    # Train the chatbot
    train_chatbot(directory, model_output_path)

    # Reload from disk rather than reusing in-memory objects, so the saved
    # artefacts themselves are what gets exercised.
    model = GPT2LMHeadModel.from_pretrained(model_output_path)
    tokenizer = GPT2Tokenizer.from_pretrained(model_output_path)

    # Test the chatbot
    response = generate_response(model, tokenizer, prompt)
    print("Generated response:", response)

In [None]:
# Entry point: run the full pipeline defined in main() — train, reload the
# saved model, and print one generated response.
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Step,Training Loss
500,2.0557
1000,1.1924
1500,0.6503
2000,0.3516
2500,0.2154
3000,0.15
3500,0.1186
4000,0.1005
4500,0.0902
5000,0.0813


Generated response: What is NCQA?
NCQA
stands
for
Council
for
Affordable
Quality
Health-
Portal
which
maintains
the
conflict
between
providers
and
clients.
It
is
the
documentation
required
for
providers
to
stand
for
an
NCQA
provider.
TOPIC
4:
Education,
Departments
and
Special


In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [None]:
def generate_response(model, tokenizer, prompt, max_length=500, **generate_kwargs):
  """Generate a continuation of a prompt with a causal language model.

  Args:
    model: Model exposing generate(); if it has a `device` attribute the
      inputs are moved to that device first.
    tokenizer: Tokenizer providing encode(), decode() and eos_token_id.
    prompt: Text to continue.
    max_length: Value forwarded as generate()'s max_length.
    **generate_kwargs: Extra keyword arguments forwarded to model.generate()
      (for example no_repeat_ngram_size, which can be used to curb repeated
      phrases). They override the defaults set here.

  Returns:
    The decoded text of the first generated sequence, with special tokens
    removed.
  """
  input_ids = tokenizer.encode(prompt, return_tensors='pt')

  # Inputs must live on the model's device, or generation fails once the
  # model has been moved to a GPU.
  target_device = getattr(model, 'device', None)
  if target_device is not None:
    input_ids = input_ids.to(target_device)

  # Caller-supplied options win over the defaults below.
  options = dict(
      max_length=max_length,
      num_return_sequences=1,
      # A single un-padded prompt: every position is a real token.
      attention_mask=torch.ones_like(input_ids),
      # The EOS token is supplied as the padding token.
      pad_token_id=tokenizer.eos_token_id,
  )
  options.update(generate_kwargs)

  output = model.generate(input_ids, **options)
  return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Directory holding the fine-tuned model and tokenizer; this is the same path
# main() passes to train_chatbot() as model_output_path.
model_path = "/content/drive/MyDrive/Naman_LLM/model_2"

# Load the saved model and tokenizer so prompts can be tried without retraining.
my_chat_model = GPT2LMHeadModel.from_pretrained(model_path)
my_chat_tokenizer = GPT2Tokenizer.from_pretrained(model_path)

In [None]:
    # Test the chatbot: generate up to max_length=500 for one prompt using the
    # model and tokenizer loaded in the previous cell, then print the result.
    # NOTE(review): this cell is indented as if copied from main(). It evidently
    # ran in the notebook, but would be an IndentationError in a plain .py
    # export — consider dedenting.
    prompt = "What is Credentialing?"  # Replace with your desired prompt
    response = generate_response(my_chat_model, my_chat_tokenizer, prompt, max_length=500)
    print("Generated response:", response)

Generated response: What is Credentialing?
Credentialing
is
a
vital
process
for
healthcare
institutions.
In
simple
terms,
credentialing
is
the
process
of
verifying/assessing
the
work
of
a
provider.
In
faster
terms,
credentialing
is
the
process
of
assessing
the
satisfaction
of
members
in
a
network.
In
simple
terms,
credentialing
is
the
process
of
verifying/assessing
the
quality
of
education
provided
to
members.
In
faster
terms,
credentialing
is
the
process
of
assessing
the
satisfaction
of
payers/
providers.
In
fact,
credentialing
is
the
process
of
assessing
the
satisfaction
of
payers/
providers.
In
fact,
credentialing
is
the
process
of
assessing
the
efficiency
of
our
medical
system.
In
fact,
credentialing
is
the
process
of
assessing
the
efficiency
of
our
efforts/
risk-management
efforts.
In
fact,
credentialing
is
the
process
of
assessing
the
satisfaction
of
payers/
providers.
In
fact,
credentialing
is
the
process
of
assessing
the
satisfaction
of
payers/
providers.
In
fact,
credentialing