In [1]:
!pip install -U PyPDF2
!pip install python-docx



In [2]:
import pandas as pd
import numpy as np
import re
from PyPDF2 import PdfReader
import os
import docx
import glob

In [3]:
# Functions to read different file types
def read_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode and
            handed to ``PdfReader``.

    Returns:
        The text of all pages, in page order, joined with a newline so the
        last word of one page does not fuse with the first word of the next.
    """
    with open(file_path, "rb") as file:
        pdf_reader = PdfReader(file)
        # `or ""` guards against a page whose extraction yields nothing
        # (assumed possible for image-only pages -- TODO confirm).
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "\n".join(page_texts)

def read_word(file_path):
    """Return the paragraph text of a .docx file, one paragraph per line.

    Only ``Document.paragraphs`` is read; every paragraph's text is followed
    by a newline, so an empty document yields an empty string.
    """
    document = docx.Document(file_path)
    return "".join(f"{para.text}\n" for para in document.paragraphs)

def read_txt(file_path):
    """Read a plain-text file and return its full contents.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding; ``utf-8-sig`` additionally drops a leading byte-order mark so
    it does not end up inside the training text.

    Args:
        file_path: Path of the text file.

    Returns:
        The decoded file contents as a single string.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, "r", encoding="utf-8-sig") as file:
        return file.read()

def read_documents_from_directory(directory):
    """Read every .pdf, .docx and .txt file in a directory into one string.

    Files are visited in sorted name order (``os.listdir`` gives no ordering
    guarantee), extensions are matched case-insensitively, and anything that
    is not a regular file is skipped. Sub-directories are not descended into.

    Args:
        directory: Path of the directory to scan.

    Returns:
        The text of all supported files joined with a newline, so the end of
        one document does not run into the start of the next. An empty string
        is returned when no supported file is found.
    """
    documents = []
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            continue
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            documents.append(read_pdf(file_path))
        elif lowered.endswith(".docx"):
            documents.append(read_word(file_path))
        elif lowered.endswith(".txt"):
            documents.append(read_txt(file_path))
    return "\n".join(documents)

In [4]:
# Gather every supported document found directly under the training directory.
# NOTE(review): the next cell writes train.txt into this same directory, so a
# re-run of this cell will presumably pick that file up as input -- confirm
# and clear it (or use a separate input folder) before rebuilding the corpus.
train_directory = '/content/'
text_data = read_documents_from_directory(train_directory)

# Collapse runs of newlines into a single one, then trim both ends.
text_data = re.sub(r'\n+', '\n', text_data)
text_data = text_data.strip()

In [5]:
# Persist the cleaned corpus. The encoding is pinned to UTF-8 so the file does
# not depend on the platform's default encoding and non-ASCII text extracted
# from the documents cannot fail to encode.
with open("/content/train.txt", "w", encoding="utf-8") as f:
    f.write(text_data)

In [6]:
!pip install transformers



In [7]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from transformers import Trainer, TrainingArguments
import matplotlib.pyplot as plt

In [8]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` over a text file.

    Args:
        file_path: Path of the text file, forwarded as ``file_path``.
        tokenizer: Tokenizer instance, forwarded as ``tokenizer``.
        block_size: Forwarded as ``block_size``; defaults to 128.

    Returns:
        The constructed ``TextDataset``.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)

In [9]:
def load_data_collator(tokenizer, mlm = False):
    """Create the ``DataCollatorForLanguageModeling`` used for batching.

    Args:
        tokenizer: Tokenizer instance handed to the collator.
        mlm: Passed through unchanged as the collator's ``mlm`` flag;
            defaults to False.

    Returns:
        The constructed collator.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [10]:
def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a pretrained GPT-Neo checkpoint on a text file.

    Args:
        train_file_path: Path of the training text file.
        model_name: Name or path given to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer, the model and the
            trainer's output.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments`` as the checkpoint
            interval (previously accepted but silently ignored).

    Returns:
        A pandas DataFrame with one row per logged training-loss entry and
        the columns ``Epoch`` and ``Loss``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Keep the tokenizer files next to the weights so output_dir can later be
    # passed to from_pretrained for both.
    tokenizer.save_pretrained(output_dir)

    model = GPTNeoForCausalLM.from_pretrained(model_name)

    # Snapshot of the starting weights; presumably replaced by
    # trainer.save_model() below, which is given no other directory.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

    # Only log entries that carry a "loss" key are training-loss records;
    # other entries in the history are skipped.
    loss_logs = [entry for entry in trainer.state.log_history if "loss" in entry]
    log_df = pd.DataFrame({
        "Epoch": [entry["epoch"] for entry in loss_logs],
        "Loss": [entry["loss"] for entry in loss_logs],
    })
    print(log_df)
    return log_df

In [11]:
# --- Training configuration -------------------------------------------------
# Input corpus and the checkpoint to start from.
train_file_path = "/content/train.txt"
model_name = "EleutherAI/gpt-neo-1.3B"

# Where the tokenizer, model and trainer output are written.
output_dir = "/content/"
overwrite_output_dir = False

# Values handed to train() below.
per_device_train_batch_size = 8
num_train_epochs = 50.0
save_steps = 50_000


In [12]:
!pip install transformers[torch]

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.22.0-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.22.0


In [None]:
# Launch fine-tuning with the values set in the configuration cell above.
# As the cell's last expression, the returned loss table is also displayed.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]



Downloading model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

Step,Training Loss
500,0.5698
1000,0.0455
1500,0.0274
2000,0.0241
2500,0.0227
3000,0.0207
3500,0.0183


In [None]:
from transformers import PreTrainedTokenizerFast, GPTNeoForCausalLM, GPTNeoTokenizerFast, GPT2Tokenizer

In [None]:
def load_model(model_path):
    """Load a ``GPTNeoForCausalLM`` from ``model_path`` via ``from_pretrained``."""
    return GPTNeoForCausalLM.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Load a ``GPT2Tokenizer`` from ``tokenizer_path`` via ``from_pretrained``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

def generate_text(model_path,max_length,prompt=None):
    """Generate a continuation for a prompt and print it.

    The model and tokenizer are both loaded from ``model_path``.

    Args:
        model_path: Directory (or name) passed to ``load_model`` and
            ``load_tokenizer``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        prompt: Text to continue. When None (the default) the prompt is read
            interactively with ``input``, as before.

    Returns:
        None. The decoded text is printed after a "Generated text: " line.

    Raises:
        ValueError: If the prompt is an empty string, which would otherwise
            produce an empty input tensor.
    """
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    if prompt is None:
        prompt = input("Enter your prompt: ")
    if not prompt:
        raise ValueError("Prompt must not be empty.")

    # Calling the tokenizer (instead of .encode) also yields the attention
    # mask, which is passed on explicitly because the pad token is set to the
    # EOS token below.
    encoded = tokenizer(prompt, add_special_tokens=True, return_tensors="pt")
    output = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    print("Generated text: ")
    print(tokenizer.decode(output[0], skip_special_tokens=True))








In [None]:
# Load the fine-tuned model from the training output directory and generate
# a continuation for an interactively entered prompt.
model1_path = "/content/"
max_len = 100
generate_text(model_path=model1_path, max_length=max_len)