In [None]:
# Install necessary packages
!pip install accelerate -U
!pip install transformers datasets
!pip install torch


Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/265.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/265.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1

In [None]:
# Import required libraries
import pandas as pd
import os
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments


In [None]:
# Make sure the working folders for the combined text and the model output exist
for _workdir in ('/content/combined_text', '/content/model_output'):
    os.makedirs(_workdir, exist_ok=True)


In [None]:
!unzip test_data.zip # Unzip the test data and remove unnecessary files


Archive:  test_data.zip
   creating: test_data/
  inflating: test_data/daily_read-log.csv  
  inflating: test_data/weekly_grocery-log.csv  
  inflating: test_data/weekly_hobby-log.csv  
  inflating: test_data/travel-log.csv  
  inflating: test_data/persona.json  
  inflating: test_data/marriages-log.csv  
  inflating: test_data/moves-log.csv  
  inflating: test_data/weekly_bakeorcook-log.csv  
  inflating: test_data/weekly_dating-log.csv  
  inflating: test_data/travel_dining-log.csv  
  inflating: test_data/monthly_pet_care-log.csv  
  inflating: test_data/test.json     
  inflating: test_data/daily_meal-log.csv  
  inflating: test_data/daily_exercise-log.csv  
  inflating: test_data/daily_chat-log.csv  
  inflating: test_data/daily_watchtv-log.csv  
  inflating: test_data/travel_places_visited-log.csv  
  inflating: test_data/annual_medical_care-log.csv  


In [None]:
!rm test_data.zip
!rm -r __MACOSX

rm: cannot remove '__MACOSX': No such file or directory


In [None]:
import json
# Function to rewrite JSON files by removing unnecessary keys

def rewrite_json_file(file_path,
                      keys_to_remove=('eid', 'text_model_based', 'multihop_qa_pairs')):
    """Strip unneeded keys from every event of a JSON file, rewriting it in place.

    The file is read as a two-level mapping: each top-level value is itself a
    mapping whose values are dicts of event details. The listed keys are
    removed from each details dict and the result is written back to the same
    path with 4-space indentation.

    Args:
        file_path: Path of the JSON file to read and overwrite.
        keys_to_remove: Keys to drop from each event's details. Keys that are
            not present are ignored. Defaults to the three keys this notebook
            does not use for training.

    Returns:
        None. The file at ``file_path`` is overwritten.
    """
    # Explicit UTF-8 so reading does not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for events in data.values():
        for details in events.values():
            for key in keys_to_remove:
                # pop with a default: a missing key is not an error
                details.pop(key, None)

    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

# Example usage: strip the unneeded keys from the extracted test.json.
# Note that rewrite_json_file overwrites the file in place.
file_path = '/content/test_data/test.json'  # Replace with your file path

rewrite_json_file(file_path)


In [None]:
# Functions to read different file types and combine text data
def read_pdf(file_path):
    """Return the extracted text of every page of a PDF, joined in page order.

    NOTE(review): ``PdfReader`` is not imported in any notebook cell visible
    here (presumably it comes from PyPDF2/pypdf) — confirm the import exists
    before calling this function.

    Args:
        file_path: Path of the PDF file, opened in binary mode.

    Returns:
        The concatenation of ``extract_text()`` for each page, with no
        separator added between pages.
    """
    with open(file_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        # Extract while the file is still open.
        page_texts = [page.extract_text() for page in reader.pages]
    return "".join(page_texts)

def read_word(file_path):
    """Return the text of a Word document, one paragraph per line.

    NOTE(review): ``docx`` is not imported in any notebook cell visible here
    (presumably the python-docx package) — confirm the import exists before
    calling this function.

    Args:
        file_path: Path passed to ``docx.Document``.

    Returns:
        Each paragraph's ``text`` followed by a newline, concatenated in
        document order.
    """
    document = docx.Document(file_path)
    return "".join(para.text + "\n" for para in document.paragraphs)

def read_txt(file_path, encoding="utf-8"):
    """Return the full contents of a text file.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding.

    Returns:
        The file contents as a single string.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read()

def read_documents_from_directory(directory):
    """Concatenate the text of every .pdf, .docx and .txt file in a directory.

    Files are visited in the order returned by ``os.listdir``; files with any
    other extension are skipped. No separator is inserted between files.

    NOTE: a later cell of this notebook defines another function with this
    same name; once that cell has run, this definition is no longer the one
    being called.

    Args:
        directory: Folder whose files are read.

    Returns:
        The combined text of all recognised files.
    """
    pieces = []
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        if entry.endswith(".pdf"):
            pieces.append(read_pdf(entry_path))
        elif entry.endswith(".docx"):
            pieces.append(read_word(entry_path))
        elif entry.endswith(".txt"):
            pieces.append(read_txt(entry_path))
    return "".join(pieces)

In [None]:
# Process CSV Data
def process_csv(folder_path):
    """Flatten every CSV file in a folder into one space-separated string.

    Each row's values are converted to strings and joined with spaces; the
    rows of a file are joined with spaces and followed by one trailing space.
    Column headers are not included in the output.

    Files are processed in sorted name order. ``os.listdir`` returns entries
    in arbitrary order, so without sorting the generated training text could
    differ from run to run.

    Args:
        folder_path: Folder to scan; only names ending in '.csv' are read.

    Returns:
        The concatenated text of all CSV files ('' when there are none).
    """
    chunks = []
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith('.csv'):
            continue
        df = pd.read_csv(os.path.join(folder_path, file))
        # One string per row, built from the row's values only.
        rows = df.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
        chunks.append(' '.join(rows) + ' ')
    return ''.join(chunks)

# Process Persona JSON Data
def process_persona(folder_path):
    """Render persona.json from a folder as a single line of 'key: value' pairs.

    Args:
        folder_path: Folder containing 'persona.json'.

    Returns:
        The top-level entries of the JSON object formatted as 'key: value'
        and joined with single spaces, in the order they appear in the file.
    """
    persona_path = os.path.join(folder_path, 'persona.json')
    with open(persona_path, 'r') as handle:
        fields = json.load(handle)
    return ' '.join(f"{name}: {content}" for name, content in fields.items())

# Process Main Lifelog JSON Data
def process_json(folder_path):
    """Turn test.json from a folder into a flat text of statements and Q&A pairs.

    For every event under every date, the output contains
    " On <date> <text_template_based> " followed by one
    "Question: <q> Answer: <a> " fragment per entry of ``atomic_qa_pairs``
    (the first two items of each pair are used). Missing keys fall back to
    an empty template and no pairs.

    Args:
        folder_path: Folder containing 'test.json'.

    Returns:
        All fragments concatenated in file order.
    """
    with open(os.path.join(folder_path, 'test.json'), 'r') as handle:
        lifelog = json.load(handle)

    fragments = []
    for date, events in lifelog.items():
        for details in events.values():
            # Dated template sentence first, then its question-answer pairs
            template = details.get("text_template_based", "")
            fragments.append(f" On {date} {template} ")
            fragments.extend(
                f"Question: {pair[0]} Answer: {pair[1]} "
                for pair in details.get("atomic_qa_pairs", [])
            )
    return "".join(fragments)

def read_documents_from_directory(directory):
    """Build the training text for a data folder from its CSV and JSON files.

    Combines, separated by single spaces and in this order: the persona text,
    the flattened CSV text, and the lifelog text produced by process_persona,
    process_csv and process_json respectively.

    NOTE: this definition replaces the earlier function of the same name in
    this notebook (the one that reads .pdf/.docx/.txt files).

    Args:
        directory: Folder containing the CSV logs, persona.json and test.json.

    Returns:
        The combined text.
    """
    csv_part = process_csv(directory)
    persona_part = process_persona(directory)
    lifelog_part = process_json(directory)
    return ' '.join((persona_part, csv_part, lifelog_part))

In [None]:
# Folder with the extracted CSV logs, persona.json and test.json used to build the training text
train_directory = '/content/test_data'

In [None]:

# Build the training text from train_directory and collapse runs of newlines.
# Alternative input locations (disabled):
#train_directory = '/content/drive/MyDrive/ColabNotebooks/data/chatbot_docs/training_data/full_text'
# train_directory = '/content/drive/MyDrive/ColabNotebooks/data/chatbot_docs/training_data/q_and_a'
text_data = re.sub(r'\n+', '\n', read_documents_from_directory(train_directory)).strip()

In [None]:
# Save the combined text as the training file.
# Written explicitly as UTF-8 so the result does not depend on the platform's default encoding.
with open("/content/combined_text/train.txt", "w", encoding="utf-8") as f:
    f.write(text_data)

In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [None]:

def load_dataset(file_path, tokenizer, block_size = 128):
    """Wrap a text file in a ``TextDataset`` for language-model training.

    Args:
        file_path: Path of the training text file.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Value passed as ``TextDataset``'s ``block_size`` (default 128).

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)

In [None]:
def load_data_collator(tokenizer, mlm = False):
    """Create the ``DataCollatorForLanguageModeling`` used to batch examples.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Forwarded as the collator's ``mlm`` flag (masked language
            modeling); defaults to False.

    Returns:
        The configured data collator object.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


In [None]:
# Train the model with the prepared dataset

def train(train_file_path, model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps,
          save_total_limit=3):
    """Fine-tune a GPT-2 language model on a text file and save the result.

    Loads the tokenizer and model named by ``model_name``, builds the dataset
    and data collator with ``load_dataset`` / ``load_data_collator``, runs a
    ``Trainer`` and finally calls ``trainer.save_model()``.

    Args:
        train_file_path: Path of the training text file.
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer, the model and the
            training output.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``.
        save_total_limit: Forwarded to ``TrainingArguments``. Defaults to 3,
            the value that was previously fixed inside this function.

    Returns:
        None.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer next to the model so output_dir can be loaded on its own.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Saved before training starts, i.e. these are still the pretrained weights.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
        save_total_limit=save_total_limit,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [None]:
# Training configuration consumed by the train(...) call

# Base checkpoint to fine-tune
model_name = 'gpt2'

# Input text file
#train_file_path = "/content/drive/MyDrive/ColabNotebooks/data/chatbot_docs/combined_text/full_text/train.txt"
train_file_path = "/content/combined_text/train.txt"

# Where the tokenizer, model and checkpoints are written
#output_dir = '/content/drive/MyDrive/ColabNotebooks/models/chat_models/custom_full_text'
output_dir = '/content/model_output/custom_q_and_a'
overwrite_output_dir = False

# Optimisation schedule
num_train_epochs = 30.0
per_device_train_batch_size = 32

# Checkpointing
save_steps = 5_000
# Defined here, but the train(...) call in this notebook does not pass it on
save_total_limit = 3

In [None]:

# Start fine-tuning with the parameters defined in the configuration cell
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
    # save_total_limit=save_total_limit  (not forwarded; train() applies its own checkpoint limit)
)

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Step,Training Loss
500,0.5077
1000,0.3989
1500,0.3793
2000,0.3712
2500,0.3639
3000,0.3607
3500,0.354
4000,0.3505
4500,0.3501
5000,0.3443


In [None]:
# Archive the whole model_output folder, including the saved checkpoint directories.
# NOTE(review): the relative paths assume the working directory is /content — confirm.
!zip -r model_output.zip model_output


  adding: model_output/ (stored 0%)
  adding: model_output/custom_q_and_a/ (stored 0%)
  adding: model_output/custom_q_and_a/training_args.bin (deflated 51%)
  adding: model_output/custom_q_and_a/special_tokens_map.json (deflated 74%)
  adding: model_output/custom_q_and_a/checkpoint-20000/ (stored 0%)
  adding: model_output/custom_q_and_a/checkpoint-20000/training_args.bin (deflated 51%)
  adding: model_output/custom_q_and_a/checkpoint-20000/config.json (deflated 52%)
  adding: model_output/custom_q_and_a/checkpoint-20000/trainer_state.json (deflated 79%)
  adding: model_output/custom_q_and_a/checkpoint-20000/generation_config.json (deflated 24%)
  adding: model_output/custom_q_and_a/checkpoint-20000/model.safetensors (deflated 7%)
  adding: model_output/custom_q_and_a/checkpoint-20000/optimizer.pt (deflated 8%)
  adding: model_output/custom_q_and_a/checkpoint-20000/scheduler.pt (deflated 55%)
  adding: model_output/custom_q_and_a/checkpoint-20000/rng_state.pth (deflated 25%)
  adding:

In [None]:
# Mount Google Drive at /content/drive (only available when running in Colab)
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Create the destination folder on the mounted Drive (-p: no error if it already exists)
!mkdir -p '/content/drive/My Drive/gator_sched'

# Copy the zipped model output into that folder
!cp /content/model_output.zip '/content/drive/My Drive/gator_sched/'

