# Chatbot de Préstamos y Créditos Bancarios basado en GPT-2

### Libraries 

In [1]:
!pip install --upgrade numpy==1.23.0
!pip install transformers
!pip install torch
!pip install -U PyPDF2
!pip install python-docx



Collecting numpy==1.23.0
  Downloading numpy-1.23.0.tar.gz (10.7 MB)
     ---------------------------------------- 0.0/10.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/10.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/10.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/10.7 MB ? eta -:--:--
     --------------------------------------- 0.0/10.7 MB 145.2 kB/s eta 0:01:14
     --------------------------------------- 0.0/10.7 MB 140.3 kB/s eta 0:01:17
     --------------------------------------- 0.1/10.7 MB 204.8 kB/s eta 0:00:53
     --------------------------------------- 0.1/10.7 MB 261.7 kB/s eta 0:00:41
      -------------------------------------- 0.1/10.7 MB 369.8 kB/s eta 0:00:29
      -------------------------------------- 0.2/10.7 MB 479.2 kB/s eta 0:00:22
      -------------------------------------- 0.2/10.7 MB 497.3 kB/s eta 0:00:22
     - ------------------------------------- 0.4/10.7 MB 734.3 kB/s eta 0:00:15
   

  error: subprocess-exited-with-error
  
  × Building wheel for numpy (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [242 lines of output]
      Running from numpy source directory.
      
        `numpy.distutils` is deprecated since NumPy 1.23.0, as a result
        of the deprecation of `distutils` itself. It will be removed for
        Python >= 3.12. For older Python versions it will remain present.
        It is recommended to use `setuptools < 60.0` for those Python versions.
        For more details, see:
          https://numpy.org/devdocs/reference/distutils_status_migration.html
      
      
        import numpy.distutils.command.sdist
      Cythonizing sources
      Processing numpy/random\_bounded_integers.pxd.in
      Processing numpy/random\bit_generator.pyx
      Processing numpy/random\mtrand.pyx
      Processing numpy/random\_bounded_integers.pyx.in
      Processing numpy/random\_common.pyx
      Processing numpy/random\_generator.pyx
      Process



### Imports

In [1]:
import os
import re
from PyPDF2 import PdfReader
#import docx
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments


  from .autonotebook import tqdm as notebook_tqdm


### Read .txt and .pdf

In [2]:

# Functions to read different file types
def read_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``.

    The file is opened in binary mode and parsed with ``PdfReader``. The
    ``extract_text()`` result of each page is concatenated in page order with
    no separator between pages.
    """
    with open(file_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        # Extraction happens while the file is still open.
        page_texts = [page.extract_text() for page in reader.pages]
        return "".join(page_texts)

"""def read_word(file_path):
    doc = docx.Document(file_path)
    text = 
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text
"""

def read_txt(file_path, encoding="utf-8"):
    """Return the full contents of the text file at ``file_path``.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            accented characters decode the same way on every platform instead
            of depending on the locale's default encoding.

    Returns:
        The decoded file contents as a single string.

    Raises:
        UnicodeDecodeError: If the file content is not valid in ``encoding``.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read()

def read_documents_from_directory(directory):
    """Concatenate the text of every supported document found in ``directory``.

    Files are visited in sorted name order so the combined text (and any split
    derived from it) is the same on every run; ``os.listdir`` on its own
    returns entries in arbitrary order. Extensions are matched
    case-insensitively, so ``REPORT.PDF`` is read just like ``report.pdf``.
    Entries with any other extension are skipped.

    Args:
        directory: Path of the folder to scan (sub-folders are not descended).

    Returns:
        The text of all ``.pdf`` and ``.txt`` files, joined with no separator.
    """
    parts = []
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            parts.append(read_pdf(file_path))
        # .docx support is currently disabled:
        #elif lowered.endswith(".docx"):
        #   parts.append(read_word(file_path))
        elif lowered.endswith(".txt"):
            parts.append(read_txt(file_path))
    return "".join(parts)



The train_chatbot function uses the combined text data to train a GPT-2 model using the provided training arguments. The resulting trained model and tokenizer are then saved to a specified output directory.

In [3]:
def train_chatbot(directory, model_output_path, train_fraction=0.8,
                  num_train_epochs=100, block_size=128, batch_size=4):
    """Fine-tune the pretrained ``gpt2`` model on the documents in ``directory``.

    The combined document text is split by character position into a training
    part and a validation part, written to ``train.txt`` and ``val.txt`` in the
    current working directory, tokenized into fixed-size blocks and handed to a
    ``Trainer``. The fine-tuned model, its tokenizer and a ``model.pth`` state
    dict are saved under ``model_output_path``.

    Args:
        directory: Folder containing the source documents.
        model_output_path: Folder that receives checkpoints, the final model,
            the tokenizer files and ``model.pth``.
        train_fraction: Fraction of the characters (from the start of the
            combined text) used for training; the remainder is validation text.
        num_train_epochs: Number of passes over the training set.
        block_size: Number of tokens per training/validation example.
        batch_size: Per-device batch size used for both training and evaluation.

    Raises:
        ValueError: If no training text is left after reading and splitting
            (for example, the directory holds no supported documents).
    """
    # Read the documents and collapse runs of newlines into a single newline.
    combined_text = read_documents_from_directory(directory)
    combined_text = re.sub(r'\n+', '\n', combined_text).strip()

    # Character-level split: [0, split_index) trains, [split_index, end) validates.
    split_index = int(train_fraction * len(combined_text))
    train_text = combined_text[:split_index]
    val_text = combined_text[split_index:]

    # Fail early with a clear message instead of an obscure error inside the trainer.
    if not train_text:
        raise ValueError(
            f"No training text available from {directory!r} "
            f"with train_fraction={train_fraction!r}"
        )

    # TextDataset decodes its input files as UTF-8, so write them as UTF-8
    # rather than in the platform's locale encoding.
    with open("train.txt", "w", encoding="utf-8") as f:
        f.write(train_text)
    with open("val.txt", "w", encoding="utf-8") as f:
        f.write(val_text)

    # Start from the pretrained GPT-2 tokenizer (text -> token ids) and language model.
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    # Tokenize both files into blocks of `block_size` tokens. overwrite_cache=True
    # forces re-tokenization: the text files were just rewritten, so a feature
    # cache left by an earlier run would describe stale data.
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path="train.txt",
        block_size=block_size,
        overwrite_cache=True,
    )
    val_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path="val.txt",
        block_size=block_size,
        overwrite_cache=True,
    )
    # mlm=False: no tokens are masked (causal language modelling, not masked LM).
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir=model_output_path,
        overwrite_output_dir=True,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
    )

    # NOTE(review): eval_dataset is supplied but no evaluation schedule is
    # configured here, so validation loss is presumably never computed during
    # training - confirm and enable if it is wanted.
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()
    trainer.save_model(model_output_path)

    # Save the tokenizer next to the model so both can be reloaded from one folder.
    tokenizer.save_pretrained(model_output_path)

    # Additionally save the raw weights as a plain PyTorch state dict.
    torch.save(model.state_dict(), os.path.join(model_output_path, "model.pth"))


The generate_response function takes a trained model, tokenizer, and a prompt string as input and generates a response using the GPT-2 model.

In [4]:
def generate_response(model, tokenizer, prompt, max_length=100):
    """Generate a continuation of ``prompt`` and return it as text.

    No sampling arguments are passed to ``model.generate``, so the model's
    default decoding settings apply. In particular no ``temperature`` is
    passed: it only influences sampling (``do_sample=True``) and would have no
    effect here.

    Args:
        model: Language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and ``eos_token_id``.
        prompt: Text the generation starts from.
        max_length: ``max_length`` value forwarded to ``model.generate``.

    Returns:
        The first (and only) generated sequence decoded to text, with special
        tokens removed.
    """
    # Encode the prompt as a PyTorch tensor and place it on the model's device,
    # so generation also works after the model has been moved to a GPU.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    # The prompt contains no padding, so every token is attended to (all ones).
    attention_mask = torch.ones_like(input_ids)

    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        attention_mask=attention_mask,
        # The end-of-sequence id doubles as the padding id.
        pad_token_id=tokenizer.eos_token_id,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

The main function is the entry point for the program. It specifies the path to the directory containing the training data and the path to the output directory for the trained model and tokenizer. It trains the chatbot with the train_chatbot function and then reloads the saved model and tokenizer from the output directory. Generating a response with the generate_response function is shown in the testing section at the end of the notebook.

In [5]:

def main(directory="./Input/", model_output_path="./Output/", prompt=None):
    """Train the chatbot and reload the saved model and tokenizer.

    Args:
        directory: Folder containing the training documents.
        model_output_path: Folder the fine-tuned model and tokenizer are
            written to and then reloaded from.
        prompt: Optional smoke-test prompt. When given, a response is generated
            with the reloaded model and printed; when ``None`` (the default)
            nothing is generated.
    """
    # Train the chatbot
    train_chatbot(directory, model_output_path)

    # Reload the fine-tuned model and tokenizer from disk; this confirms the
    # saved artifacts are loadable.
    model = GPT2LMHeadModel.from_pretrained(model_output_path)
    tokenizer = GPT2Tokenizer.from_pretrained(model_output_path)

    # Optional smoke test of the freshly trained model.
    if prompt is not None:
        response = generate_response(model, tokenizer, prompt)
        print("Generated response:", response)

In [6]:
# Print "gpu" when PyTorch reports an available CUDA device; print nothing otherwise.
if torch.cuda.is_available():
    print("gpu")

gpu


In [7]:
# Entry point: call main() when this code runs as the top-level module.
if __name__ == "__main__":
    main()


  8%|▊         | 500/6600 [05:20<1:03:06,  1.61it/s]

{'loss': 3.0389, 'learning_rate': 4.621212121212121e-05, 'epoch': 7.58}


 15%|█▌        | 1000/6600 [10:32<58:45,  1.59it/s] 

{'loss': 1.9606, 'learning_rate': 4.242424242424243e-05, 'epoch': 15.15}


 23%|██▎       | 1500/6600 [15:31<48:04,  1.77it/s]  

{'loss': 1.1493, 'learning_rate': 3.8636363636363636e-05, 'epoch': 22.73}


 30%|███       | 2000/6600 [20:12<43:24,  1.77it/s]

{'loss': 0.5708, 'learning_rate': 3.484848484848485e-05, 'epoch': 30.3}


 38%|███▊      | 2500/6600 [24:53<38:24,  1.78it/s]

{'loss': 0.2917, 'learning_rate': 3.106060606060606e-05, 'epoch': 37.88}


 45%|████▌     | 3000/6600 [29:34<34:20,  1.75it/s]

{'loss': 0.1713, 'learning_rate': 2.7272727272727273e-05, 'epoch': 45.45}


 53%|█████▎    | 3500/6600 [35:17<27:08,  1.90it/s]   

{'loss': 0.1157, 'learning_rate': 2.3484848484848487e-05, 'epoch': 53.03}


 61%|██████    | 4000/6600 [39:59<24:33,  1.76it/s]

{'loss': 0.0872, 'learning_rate': 1.9696969696969697e-05, 'epoch': 60.61}


 68%|██████▊   | 4500/6600 [44:40<19:51,  1.76it/s]

{'loss': 0.0709, 'learning_rate': 1.590909090909091e-05, 'epoch': 68.18}


 76%|███████▌  | 5000/6600 [49:21<15:06,  1.76it/s]

{'loss': 0.0591, 'learning_rate': 1.2121212121212122e-05, 'epoch': 75.76}


 83%|████████▎ | 5500/6600 [54:02<10:23,  1.77it/s]

{'loss': 0.0508, 'learning_rate': 8.333333333333334e-06, 'epoch': 83.33}


 91%|█████████ | 6000/6600 [58:43<05:38,  1.77it/s]

{'loss': 0.0462, 'learning_rate': 4.5454545454545455e-06, 'epoch': 90.91}


 98%|█████████▊| 6500/6600 [1:03:23<00:56,  1.77it/s]

{'loss': 0.0431, 'learning_rate': 7.575757575757576e-07, 'epoch': 98.48}


100%|██████████| 6600/6600 [1:04:19<00:00,  1.71it/s]


{'train_runtime': 3859.558, 'train_samples_per_second': 6.762, 'train_steps_per_second': 1.71, 'train_loss': 0.5806021789348487, 'epoch': 100.0}


## Now, let us test the model.

Use the following code if you are only performing inference (generating text). This can be placed in a separate notebook.

In [8]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel


In [9]:
def generate_response(model, tokenizer, prompt, max_length=250):
    """Generate a continuation of ``prompt`` and return it as text.

    No sampling arguments are passed to ``model.generate``, so the model's
    default decoding settings apply.

    Args:
        model: Language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and ``eos_token_id``.
        prompt: Text the generation starts from.
        max_length: ``max_length`` value forwarded to ``model.generate``.

    Returns:
        The first (and only) generated sequence decoded to text, with special
        tokens removed.
    """
    # Encode the prompt as a PyTorch tensor and place it on the model's device,
    # so generation also works after the model has been moved to a GPU.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    # The prompt contains no padding, so every token is attended to (all ones).
    attention_mask = torch.ones_like(input_ids)

    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        attention_mask=attention_mask,
        # The end-of-sequence id doubles as the padding id.
        pad_token_id=tokenizer.eos_token_id,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)


In [10]:
# Directory the fine-tuned model and tokenizer are loaded from.
model_path = "./Output/"
# Load the fine-tuned model and its tokenizer; both names are used by the
# generation cell further down.
my_chat_model = GPT2LMHeadModel.from_pretrained(model_path)
my_chat_tokenizer = GPT2Tokenizer.from_pretrained(model_path)

The GPT-2 tokenizer uses a byte-pair encoding (BPE) algorithm, which splits text into subword units. As a result, a single word may be represented by several tokens.

For example, if you set max_length to 50, the output is limited to 50 tokens in total (the prompt's tokens count toward this limit), which usually corresponds to fewer than 50 words.

In [14]:
prompt = "que es la taza de interes"  # Prompt to complete; replace with your own question
# Alternative example prompt (English):
#prompt = "What is the most promising future technology?"
# max_length=100 is passed explicitly for this call.
response = generate_response(my_chat_model, my_chat_tokenizer, prompt, max_length=100)
print("Generated response:", response)

Generated response: que es la taza de intereses. Este intereses se va reduciendo en proporción a la amortización del capital, ya que el capital pendiente de amortización será menor. El resultado de este sistema es que, durante la primera fase, el abono va dirigido principalmente a los intereses, pero a medida que se van pag
