In [2]:
import os
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

In [3]:
# Step 1: Preprocess the text data
def preprocess_text(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    # Additional preprocessing steps can be added here if needed
    return text

In [4]:
# Step 2: Fine-tune a BERT model on the preprocessed data
def fine_tune_bert(file_path):
    text = preprocess_text(file_path)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)

    # Example: Fine-tuning BERT for classification
    labels = torch.tensor([1]).unsqueeze(0)  # Assuming binary classification with a single label
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)  # Use PyTorch AdamW optimizer

    for _ in range(3):  # Train for 3 epochs (you can adjust the number of epochs)
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Save the model
    model.save_pretrained(os.path.splitext(file_path)[0] + "_classification_model")

    return model

In [5]:
# Step 4: Fine-tune a BERT model for question answering
def fine_tune_bert_qa():
    qa_model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
    return qa_model

In [6]:
# Step 5: Split text into smaller chunks and perform question-answering
def perform_qa_on_chunks(qa_model, text):
    max_chunk_length = 512
    chunks = [text[i:i+max_chunk_length] for i in range(0, len(text), max_chunk_length)]
    answers = []

    for i, chunk in enumerate(chunks):
        # Tokenize the chunk
        inputs = tokenizer(chunk, return_tensors="pt", max_length=max_chunk_length, truncation=True)

        # Perform question-answering task
        with torch.no_grad():
            outputs = qa_model(**inputs)

        # Process outputs
        answer_start_scores = outputs.start_logits
        answer_end_scores = outputs.end_logits

        # Get the most likely answer
        answer_start = torch.argmax(answer_start_scores)
        answer_end = torch.argmax(answer_end_scores) + 1
        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))
        answers.append(answer)

    return answers

In [7]:
# Fine-tune BERT models for classification on each file
directory = "D:/Fintech_lab/AXP"
classification_models = []
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        file_path = os.path.join(directory, filename)
        model = fine_tune_bert(file_path)
        classification_models.append(model)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are

In [8]:
# Fine-tune BERT model for question answering
qa_model = fine_tune_bert_qa()
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Define your question
question = "What company is it?"

# Perform question answering on each file
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        file_path = os.path.join(directory, filename)
        text = preprocess_text(file_path)
        inputs = tokenizer(question, text, return_tensors="pt", max_length=512, truncation=True)
        
        # Perform question-answering task
        with torch.no_grad():
            outputs = qa_model(**inputs)

        # Process outputs
        answer_start_scores = outputs.start_logits
        answer_end_scores = outputs.end_logits

        # Get the most probable answer
        answer_start = torch.argmax(answer_start_scores)
        answer_end = torch.argmax(answer_end_scores) + 1
        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))

        print("File:", filename)
        print("Question:", question)
        print("Answer:", answer)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


File: 20161231.txt
Question: What company is it?
Answer: american express company


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


File: 20171231.txt
Question: What company is it?
Answer: american express company


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


File: 20181231.txt
Question: What company is it?
Answer: american express is a globally integrated payments company


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


File: 20191231.txt
Question: What company is it?
Answer: american express is a globally integrated payments company


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


File: 20201231.txt
Question: What company is it?
Answer: american express is a globally integrated payments company


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


File: 20211231.txt
Question: What company is it?
Answer: american express is a globally integrated payments company


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


File: 20221231.txt
Question: What company is it?
Answer: globally integrated payments company


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


File: 20231231.txt
Question: What company is it?
Answer: globally integrated payments company
