In [1]:
# Dependency installation commands, kept disabled by wrapping them in a string
# literal: running this cell only evaluates to that string and installs nothing.
# To install, run the commands (without the surrounding quotes) in their own cell.
'''!pip install transformers
!pip install torch
!pip install -U PyPDF2
!pip install python-docx'''

'!pip install transformers\n!pip install torch\n!pip install -U PyPDF2\n!pip install python-docx'

# 1. Libraries 

In [2]:
import os
import re
from PyPDF2 import PdfReader
import docx
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# 2. Functions

## 2.1 Reading functions

In [3]:
# Functions for reading a PDF, DOCX, or TXT file, or every supported file in a directory.
def read_pdf(file_path):
    """Extract the text of every page of a PDF file as a single string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages, concatenated in page order with no
        separator inserted between pages.
    """
    with open(file_path, "rb") as pdf_handle:
        reader = PdfReader(pdf_handle)
        # Extraction happens while the file handle is still open.
        page_texts = [page.extract_text() for page in reader.pages]
    return "".join(page_texts)

def read_word(file_path):
    """Return the text of a Word (.docx) document, one paragraph per line.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        Every paragraph's text followed by a newline, concatenated in document
        order (an empty string when the document has no paragraphs).
    """
    document = docx.Document(file_path)
    return "".join(para.text + "\n" for para in document.paragraphs)

def read_txt(file_path, encoding="utf-8"):
    """Read a plain-text file and return its full contents.

    The file is decoded with an explicit encoding instead of the platform
    default, so accented characters are read the same way on every operating
    system.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8;
            pass another codec (e.g. "cp1252") for files saved differently.

    Returns:
        The decoded contents of the file as a string.

    Raises:
        UnicodeDecodeError: If the file is not valid in the given encoding.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read()

def read_documents_from_directory(directory):
    """Read every supported document in a directory into one string.

    Files are visited in sorted name order so the combined text (and any
    train/validation split derived from it) is the same on every run, and the
    extension check ignores case so names such as "REPORT.PDF" are included.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        The text of all .pdf, .docx and .txt files concatenated in sorted
        filename order, with no separator added between files. Files with any
        other extension are ignored.
    """
    # Extension -> reader, checked in the same order as the original if/elif chain.
    readers = (
        (".pdf", read_pdf),
        (".docx", read_word),
        (".txt", read_txt),
    )

    parts = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(directory)):
        lowered = filename.lower()
        for extension, reader in readers:
            if lowered.endswith(extension):
                parts.append(reader(os.path.join(directory, filename)))
                break
    return "".join(parts)

## 2.2 Training Functions

In [4]:
def train_chatbot(directory, model_output_path, train_fraction=0.8, model_name="gpt2-medium"):
    """Fine-tune a GPT-2 checkpoint on the documents found in ``directory``.

    The documents are merged into one text, split by character position into a
    training part and a validation part, written to ``train.txt`` / ``val.txt``
    in the current working directory, and used to train the model. The trained
    model and its tokenizer are saved to ``model_output_path``.

    Args:
        directory: Directory containing the .pdf/.docx/.txt source documents.
        model_output_path: Directory where checkpoints, the final model and the
            tokenizer are written.
        train_fraction: Fraction of the combined text (by characters) used for
            training; the remainder becomes the validation text.
        model_name: Name or path of the pretrained GPT-2 checkpoint to start
            from. Defaults to "gpt2-medium".

    Raises:
        ValueError: If no text could be read from ``directory``.
    """
    # Read documents from the directory
    combined_text = read_documents_from_directory(directory)
    combined_text = re.sub(r'\n+', '\n', combined_text).strip()  # Remove excess newline characters

    # Fail early with a clear message instead of loading a large model and
    # then training on empty files.
    if not combined_text:
        raise ValueError(f"No text could be read from directory: {directory!r}")

    # Split the text into training and validation sets (character-level split)
    split_index = int(train_fraction * len(combined_text))
    train_text = combined_text[:split_index]
    val_text = combined_text[split_index:]

    # Save the training and validation data as text files. The encoding is
    # explicit so non-ASCII characters are written identically on every platform.
    with open("train.txt", "w", encoding="utf-8") as f:
        f.write(train_text)
    with open("val.txt", "w", encoding="utf-8") as f:
        f.write(val_text)

    # Tokenizer and Model Preparation
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)  # Tokenizer matching the chosen checkpoint
    model = GPT2LMHeadModel.from_pretrained(model_name)  # Pretrained weights to fine-tune

    # Dataset Preparation
    train_dataset = TextDataset(tokenizer=tokenizer, file_path="train.txt", block_size=128)
    val_dataset = TextDataset(tokenizer=tokenizer, file_path="val.txt", block_size=128)
    # mlm=False selects causal (next-token) language modeling rather than masked LM.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Training Arguments
    # NOTE(review): no evaluation strategy is configured here, so val_dataset is
    # presumably only used if evaluate() is called explicitly - confirm.
    training_args = TrainingArguments(
        output_dir=model_output_path,
        overwrite_output_dir=True,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=30,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()
    trainer.save_model(model_output_path)

    # Save the tokenizer next to the model so both can be reloaded from one path
    tokenizer.save_pretrained(model_output_path)


## 2.3 Answering function

In [5]:
def generate_response(model, tokenizer, prompt, max_length=100):
    """Generate a text continuation of ``prompt`` with the given model.

    Args:
        model: Object exposing ``generate`` (e.g. a loaded GPT-2 LM head model).
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and ``eos_token_id``.
        prompt: Text the generation starts from.
        max_length: Value forwarded as ``max_length`` to ``model.generate``.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        skipped.
    """
    prompt_ids = tokenizer.encode(prompt, return_tensors="pt")

    generation_kwargs = {
        "max_length": max_length,
        "num_return_sequences": 1,
        # The prompt is a single unpadded sequence, so the mask is all ones.
        "attention_mask": torch.ones_like(prompt_ids),
        # The end-of-sequence token id doubles as the padding id.
        "pad_token_id": tokenizer.eos_token_id,
    }
    generated = model.generate(prompt_ids, **generation_kwargs)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


## 2.4 Main Function for the program

In [6]:
def main():
    """Train the chatbot, reload the saved model, and print one sample response.

    Trains on the files in the "Documents" directory, saves to "ModelOutput",
    loads the model and tokenizer back from that directory, and prints the text
    generated for a fixed test prompt.
    """
    # Input documents live in `source_dir`; replace with your own directory.
    source_dir = "Documents"
    output_dir = "ModelOutput"

    # Fine-tune and persist the model plus tokenizer.
    train_chatbot(source_dir, output_dir)

    # Reload both artifacts from disk, exactly as a later session would.
    fine_tuned_model = GPT2LMHeadModel.from_pretrained(output_dir)
    fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained(output_dir)

    # Smoke-test the chatbot. PROMPT: put the text to complete here.
    test_prompt = "Norma Boliviana"
    answer = generate_response(fine_tuned_model, fine_tuned_tokenizer, test_prompt)
    print("Generated response:", answer)
     

In [7]:
# Entry point: run the whole pipeline defined in main() - training first, then
# one sample generation printed to the output.
if __name__ == "__main__":
    main()



  0%|          | 0/10380 [00:00<?, ?it/s]

{'loss': 2.6778, 'learning_rate': 4.759152215799615e-05, 'epoch': 1.45}
{'loss': 2.1097, 'learning_rate': 4.518304431599229e-05, 'epoch': 2.89}
{'loss': 1.7656, 'learning_rate': 4.2774566473988445e-05, 'epoch': 4.34}
{'loss': 1.4837, 'learning_rate': 4.036608863198459e-05, 'epoch': 5.78}
{'loss': 1.2201, 'learning_rate': 3.7957610789980736e-05, 'epoch': 7.23}
{'loss': 0.9925, 'learning_rate': 3.554913294797688e-05, 'epoch': 8.67}
{'loss': 0.7999, 'learning_rate': 3.314065510597303e-05, 'epoch': 10.12}
{'loss': 0.6209, 'learning_rate': 3.073217726396917e-05, 'epoch': 11.56}
{'loss': 0.512, 'learning_rate': 2.832369942196532e-05, 'epoch': 13.01}
{'loss': 0.3946, 'learning_rate': 2.5915221579961463e-05, 'epoch': 14.45}
{'loss': 0.3333, 'learning_rate': 2.3506743737957612e-05, 'epoch': 15.9}
{'loss': 0.2658, 'learning_rate': 2.1098265895953757e-05, 'epoch': 17.34}
{'loss': 0.225, 'learning_rate': 1.8689788053949906e-05, 'epoch': 18.79}
{'loss': 0.1947, 'learning_rate': 1.628131021194605e-0

In [8]:
# 3. Using the trained model

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [5]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [6]:
def generate_response(model, tokenizer, prompt, max_length=250):
    """Return the model's continuation of ``prompt`` as decoded text.

    Args:
        model: Object exposing ``generate`` (e.g. a loaded GPT-2 LM head model).
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and ``eos_token_id``.
        prompt: Text the generation starts from.
        max_length: Value forwarded as ``max_length`` to ``model.generate``
            (250 unless overridden).

    Returns:
        The first generated sequence decoded to a string, with special tokens
        skipped.
    """
    encoded_prompt = tokenizer.encode(prompt, return_tensors="pt")

    # All-ones mask (nothing in the prompt is padding); EOS id is reused for padding.
    mask = torch.ones_like(encoded_prompt)
    eos_as_pad = tokenizer.eos_token_id

    sequences = model.generate(
        encoded_prompt,
        attention_mask=mask,
        pad_token_id=eos_as_pad,
        max_length=max_length,
        num_return_sequences=1,
    )
    best = sequences[0]
    return tokenizer.decode(best, skip_special_tokens=True)

In [7]:
# Directory the fine-tuned model and tokenizer are loaded from; "ModelOutput"
# is the same path main() passes to train_chatbot as the output directory.
model_path = "ModelOutput"
# Load the fine-tuned model and tokenizer (no training happens in this cell)
my_chat_model = GPT2LMHeadModel.from_pretrained(model_path)
my_chat_tokenizer = GPT2Tokenizer.from_pretrained(model_path)

In [9]:
# Text the model should continue.
prompt = "Factor de potencia"  # Replace with your desired prompt
# Alternative example prompt (disabled):
#prompt = "What is the most promising future technology?"
response = generate_response(my_chat_model, my_chat_tokenizer, prompt, max_length=100)  # override the default max_length of 250
print("Generated response:", response)

Generated response: Factor de potencia 
Es la relación entre la demanda máxima y la potencia total instalada para satisfacer está 
demanda, es valido para un determinado punto y período de tiempo. 
 
2.74 Falla 
 
Unión entre dos puntos a potencial diferente o ausencia temporal o permanente de la 
energía
