In [9]:
from transformers import BertTokenizer, BertForQuestionAnswering, Trainer, TrainingArguments
from datasets import Dataset

In [4]:
# 1. Extract Data from PDFs
# You'll need a tool to extract text and tables from PDFs. Some popular options include:
# Parsio: An AI parser that can extract complex tables from PDFs1
# Microsoft AI Builder: Allows you to train a model to extract data from PDFs2
# Hugging Face LayoutLMv3: A state-of-the-art model for document layout analysis3

import pdfplumber
import pandas as pd

def extract_text_tables_from_pdf(pdf_path):
    text_data = ""
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract text
            text_data += page.extract_text() + "\n\n"
            # Extract tables
            page_tables = page.extract_tables()
            tables.extend(page_tables)
    return text_data, tables

pdf_path = 'M-Review_July 2024 2-1-22.pdf'
text_data, tables = extract_text_tables_from_pdf(pdf_path)
for table in tables:
    df = pd.DataFrame(table[1:], columns=table[0])


In [6]:

# 2. Preprocess the Data
# Clean and preprocess the extracted data to make it suitable for training. This might involve:
# Removing unnecessary elements like headers, footers, and page numbers4
# Converting tables into a structured format (e.g., CSV).
def preprocess_tables(tables):
    dataframes = []
    for table in tables:
        df = pd.DataFrame(table[1:], columns=table[0])
        dataframes.append(df)
    return dataframes

dataframes = preprocess_tables(tables)


In [7]:
qa_pairs = [
    {"question": "What is the total sales?", "answer": "The total sales is $1,000,000."},
    {"question": "Who is the top salesperson?", "answer": "The top salesperson is John Doe."},
    # Add more question-answer pairs based on your data
]

qa_df = pd.DataFrame(qa_pairs)
print(qa_df.head())


                      question                            answer
0     What is the total sales?    The total sales is $1,000,000.
1  Who is the top salesperson?  The top salesperson is John Doe.


In [None]:

# 3. Train Your Model
# You can use open-source models to train your AI:
# GPT-NeoX-20B: A versatile language model for text generation5
# Falcon 180B: An advanced language model with 180 billion parameters5
# .

# LangChain: A platform to train a large language model (LLM) on your PDF data6
# .



# Load pre-trained model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# Prepare dataset
train_dataset = Dataset.from_pandas(qa_df)

def preprocess_function(examples): 
    return tokenizer(examples['question'], truncation=True, padding=True)

train_dataset = train_dataset.map(preprocess_function, batched=True)

# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Training arguments
training_args = TrainingArguments(output_dir="./results", num_train_epochs=3, per_device_train_batch_size=8)

# Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset  # Example: using the same dataset for simplicity
)

# Train the model
trainer.train()


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

  0%|          | 0/3 [00:00<?, ?it/s]

ValueError: The model did not return a loss from the inputs, only the following keys: start_logits,end_logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.

In [None]:
# 4. Generate Insights
# Once your model is trained, you can use it to generate insights and answer questions about the data:

# Pecan AI: A tool for data synthesis and predictive modeling7
# .

# IBM AI Analytics: Uses machine learning and natural language processing to interpret data8
# .



In [12]:
def abc(a:int)->int:
    for i in range(a):
        print(i)
        
    return 'abc'

In [13]:
a = abc(3)

0
1
2
