In [None]:
# Python dependencies. `%pip` installs into the environment of the running kernel.
# The legacy PyPI package `docx` is intentionally not installed: `python-docx`
# already provides the `docx` module that the parsing cell imports.
%pip install fasttext
%pip install huggingface_hub
%pip install transformers
%pip install torch
%pip install pdf2image
%pip install pytesseract
%pip install python-docx
%pip install fpdf

# System binaries used by the PDF path: poppler (for pdf2image) and the
# tesseract OCR engine (for pytesseract). `-y` answers the confirmation prompt
# so the cell cannot stall waiting for input.
!apt-get install -y poppler-utils tesseract-ocr

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m71.7/73.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.5-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.5-py3-none-any.whl (240 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4246564 sha256=2892fd3a030a52

In [None]:
#1: Parse Documents

#2: Train Source Material Model

#3: Define the Query and Tokenize

#4: Retrieve Information Using the Source Material Model

#5: Combine and Contextualize Information

#6: Compose final answer using Interrogator Model

In [None]:
#1: Parse Documents

import pytesseract
from pdf2image import convert_from_path
import docx
import os
from fpdf import FPDF
from docx import Document

# Create a dummy directory that will hold the generated sample documents

# Relative path, so the directory is created under the current working directory.
# exist_ok=True keeps this cell re-runnable when the directory already exists.
dummy_dir = "dummy_documents"
os.makedirs(dummy_dir, exist_ok=True)

def create_updated_word_doc(filepath):
    """Write the sample Word contract to ``filepath``.

    The document consists of a level-0 'Contract Agreement' heading followed by
    a single paragraph: the payment-terms sentence and, after a line break, an
    unrelated sentence.
    """
    heading_text = 'Contract Agreement'
    body_text = "This contract specifies the payment terms as net 45 days.\nI like to bbq in the summer"

    sample = Document()
    sample.add_heading(heading_text, 0)
    sample.add_paragraph(body_text)
    sample.save(filepath)

def create_updated_pdf(filepath):
    """Write the sample PDF contract to ``filepath``.

    One page in 12pt Arial: a 'Contract Agreement' title cell (align="C")
    followed by a multi-line body with the payment terms and the project deadline.
    """
    title_text = "Contract Agreement"
    body_text = "This contract specifies the payment terms as net 45 days.\nThe project must be completed within 4 months."

    sheet = FPDF()
    sheet.add_page()
    sheet.set_font("Arial", size=12)
    sheet.cell(200, 10, txt=title_text, ln=True, align="C")
    sheet.multi_cell(0, 10, txt=body_text)
    sheet.output(filepath)

# Write both sample files into the dummy directory: the Word contract first, then the PDF
for build_sample, sample_name in (
    (create_updated_word_doc, "contract_1.docx"),
    (create_updated_pdf, "contract_2.pdf"),
):
    build_sample(os.path.join(dummy_dir, sample_name))

print(f"Dummy directory '{dummy_dir}' with updated sample documents created.")

# Parse Documents from the Dummy Directory

def extract_text_from_pdf(pdf_path):
    """Return the OCR text of every page of the PDF at ``pdf_path``.

    The pages come from ``convert_from_path`` and are passed one by one to
    ``pytesseract.image_to_string``; the per-page strings are concatenated
    without a separator. Any exception is reported with ``print`` and turned
    into an empty string, so callers never see it.
    """
    try:
        pages = convert_from_path(pdf_path)
        return "".join(pytesseract.image_to_string(page) for page in pages)
    except Exception as err:
        print(f"Error extracting text from PDF {pdf_path}: {err}")
        return ""

def extract_text_from_word(docx_path):
    """Return the paragraph texts of the Word file at ``docx_path``, one per line.

    Only ``doc.paragraphs`` is read. Any exception is reported with ``print``
    and turned into an empty string.
    """
    try:
        paragraphs = docx.Document(docx_path).paragraphs
        lines = []
        for paragraph in paragraphs:
            lines.append(paragraph.text)
        return "\n".join(lines)
    except Exception as err:
        print(f"Error extracting text from Word document {docx_path}: {err}")
        return ""

def parse_documents(directory):
    """Extract the text of every supported document found directly in ``directory``.

    Files are visited in sorted name order so the result is reproducible
    (``os.listdir`` gives no ordering guarantee). The extension check is
    case-insensitive, so ``.PDF`` and ``.Docx`` are handled like ``.pdf`` and
    ``.docx``. Other files are skipped, as are documents whose extracted text
    is empty or whitespace only.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A list of extracted text strings, one per non-empty supported document.
    """
    document_texts = []
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        lowered = filename.lower()  # match extensions regardless of case
        if lowered.endswith(".pdf"):
            text = extract_text_from_pdf(filepath)
        elif lowered.endswith(".docx"):
            text = extract_text_from_word(filepath)
        else:
            continue  # Skip non-supported files
        if text.strip():  # Ensure non-empty text
            document_texts.append(text)
    return document_texts

# Parse the documents in the dummy directory.
# `documents` is a list of extracted text strings; later cells reuse it as the
# training texts and as the corpus that retrieval scores.
documents = parse_documents(dummy_dir)
print("Parsed Documents:")
for doc in documents:
    print(doc)


Dummy directory 'dummy_documents' with updated sample documents created.
Parsed Documents:
Contract Agreement
This contract specifies the payment terms as net 45 days.

The project must be completed within 4 months.

Contract Agreement
This contract specifies the payment terms as net 45 days.
I like to bbq in the summer


In [None]:
#2: Train Source Material Model

from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
import torch

# Training set: every parsed document, each tagged with the placeholder label 0
# (the labels carry no real signal; they only make the demo trainable)
train_texts = documents
train_labels = [0 for _ in documents]

# Tokenizer and two-class classifier come from the same checkpoint
# (swap the name for another model if needed)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Encode all texts as one batch, padded to a common length and cut off at 512 tokens
train_encodings = tokenizer(
    train_texts,
    truncation=True,
    padding=True,
    max_length=512,
)

# Create a dataset object
class SimpleDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with one label per example.

    ``encodings`` maps a field name to a per-example sequence; ``labels`` is
    indexed in parallel and its length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label under the "labels" key
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

train_dataset = SimpleDataset(train_encodings, train_labels)

# Set Up Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # No warmup: this run has only a handful of optimizer steps (3 in the
    # recorded output), so a 500-step warmup would never finish and the
    # learning rate would stay close to zero for the whole run.
    warmup_steps=0,
    weight_decay=0.01,
    logging_dir='./logs',
    # Log every step; with so few steps a larger interval never triggers and
    # the loss table stays empty.
    logging_steps=1,
)

# Initialize and Train the Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


TrainOutput(global_step=3, training_loss=0.5361911455790201, metrics={'train_runtime': 13.9981, 'train_samples_per_second': 0.429, 'train_steps_per_second': 0.214, 'total_flos': 73999984320.0, 'train_loss': 0.5361911455790201, 'epoch': 3.0})

In [None]:
#3: Define the Query and Tokenize
query = "What are the payment terms mentioned in the contract?"

# Tokenize the query into PyTorch tensors (return_tensors='pt') with whatever
# tokenizer the global name `tokenizer` is bound to when this cell runs.
# NOTE(review): retrieve_information() accepts query_encoding but never reads it —
# confirm whether the query was meant to influence the relevance score.
query_encoding = tokenizer(query, return_tensors='pt')

In [None]:
#4: Retrieve Information using the Source Material Model
def retrieve_information(query_encoding, documents, model, threshold=0.1):  # Lower threshold
    """Score each document with ``model`` and keep those scoring above ``threshold``.

    Args:
        query_encoding: Tokenized query. Accepted for interface compatibility
            but not used: the score is computed from the document text alone.
        documents: Iterable of document strings.
        model: Sequence-classification model. It is called as
            ``model(**encoding)`` and must return an object with ``logits``;
            the softmax probability of class index 1 is used as the score.
        threshold: Documents are kept when their score is strictly greater.

    Returns:
        A list of ``(document, relevance_score)`` tuples, highest score first.
    """
    # The model may still be in training mode (e.g. right after fine-tuning),
    # where dropout makes the scores non-deterministic. Score in eval mode and
    # restore the previous mode afterwards.
    was_training = model.training
    model.eval()

    # The inputs must live on the same device as the model weights.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    retrieved_info = []
    try:
        with torch.no_grad():
            for doc in documents:
                # Tokenize the document
                doc_encoding = tokenizer(doc, return_tensors='pt', truncation=True, padding=True, max_length=512)
                doc_encoding = {name: tensor.to(device) for name, tensor in doc_encoding.items()}

                # Use the model to predict relevance
                logits = model(**doc_encoding).logits
                relevance_score = torch.softmax(logits, dim=-1)[0, 1].item()  # Get relevance score

                # Debugging: Print out the relevance score for each document
                print(f"Document: {doc[:30]}... Relevance Score: {relevance_score}")

                if relevance_score > threshold:  # Adjusted threshold for relevance
                    retrieved_info.append((doc, relevance_score))
    finally:
        model.train(was_training)

    # Sort by relevance score
    retrieved_info.sort(key=lambda x: x[1], reverse=True)
    return retrieved_info

# Run retrieval for the query over all parsed documents
retrieved_info = retrieve_information(query_encoding, documents, model)

# Show the text of each retrieved document, in the order returned
print("Retrieved Information:")
if retrieved_info:
    for doc_text, _score in retrieved_info:
        print(doc_text)
else:
    print("No relevant information found.")

Document: Contract Agreement
This contra... Relevance Score: 0.3628292381763458
Document: Contract Agreement
This contra... Relevance Score: 0.4159141182899475
Retrieved Information:
Contract Agreement
This contract specifies the payment terms as net 45 days.
I like to bbq in the summer
Contract Agreement
This contract specifies the payment terms as net 45 days.

The project must be completed within 4 months.



In [None]:
#5: Combine and Contextualize Information

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Summarization checkpoint; the tokenizer and the seq2seq model are both loaded
# from this one name so that they stay matched.
summary_model_name = "facebook/bart-large-cnn"  # Model for summarization // Alternative: "t5-small"
summary_tokenizer = AutoTokenizer.from_pretrained(summary_model_name)
summary_model = AutoModelForSeq2SeqLM.from_pretrained(summary_model_name)

def combine_and_contextualize_info(retrieved_info, summary_model, summary_tokenizer, max_length=200):
    """Join the retrieved documents and summarize them with ``summary_model``.

    Args:
        retrieved_info: Sequence of ``(document_text, score)`` tuples; only the
            text (index 0) is used.
        summary_model: Model exposing ``generate``.
        summary_tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        max_length: Upper bound passed to ``generate`` for the summary length.

    Returns:
        The decoded summary, or ``""`` when there is no text to summarize.
    """
    combined_text = "\n".join([info[0] for info in retrieved_info])
    print("Combined Text:")
    print(combined_text)

    # Nothing was retrieved (or only whitespace): summarizing the bare prompt
    # prefix would produce text that is grounded in no document at all.
    if not combined_text.strip():
        return ""

    # Prepare the input for the summarization model
    inputs = summary_tokenizer.encode("summarize: " + combined_text, return_tensors="pt", max_length=1024, truncation=True)

    # Keep the input ids on the same device as the model when it exposes one
    model_device = getattr(summary_model, "device", None)
    if model_device is not None:
        inputs = inputs.to(model_device)

    # Generate the summary
    summary_ids = summary_model.generate(
        inputs,
        max_length=max_length,
        min_length=10,  # Ensure minimum length to avoid overly short summaries
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
        do_sample=True,         # Enable sampling
        temperature=0.1,        # Adjust temperature for balance between randomness and coherence
        top_p=0.9               # Nucleus sampling to include top-p probability mass
    )

    summary = summary_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Summarize everything retrieved for the query; the resulting text is the
# context later handed to the question-answering step
contextualized_answer = combine_and_contextualize_info(retrieved_info, summary_model, summary_tokenizer)
print("Contextualized Answer:")
print(contextualized_answer)


Combined Text:
Contract Agreement
This contract specifies the payment terms as net 45 days.
I like to bbq in the summer
Contract Agreement
This contract specifies the payment terms as net 45 days.

The project must be completed within 4 months.

Contextualized Answer:
The project must be completed within 4 months. The payment terms are net 45 days.


In [None]:
#6: Compose final answer using Interrogator Model

from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load the pre-trained extractive QA model and its tokenizer.
# They get their own names so that the global `tokenizer` read by the retrieval
# step is not rebound to a different checkpoint when this cell runs.
qa_model_name = "distilbert-base-uncased-distilled-squad"

qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)

def get_answer(context, question):
    """Extract the answer span for ``question`` from ``context``.

    Args:
        context: Text in which the answer is searched.
        question: Natural-language question.

    Returns:
        The text between the highest-scoring start token and the
        highest-scoring end token (inclusive), or ``""`` when the predicted
        end lies before the predicted start.
    """
    # Tokenize the question/context pair. Only the context (second segment) is
    # truncated, so an over-long context no longer makes the model call fail;
    # 512 mirrors the limit used for tokenization elsewhere in this notebook.
    inputs = qa_tokenizer(
        question,
        context,
        add_special_tokens=True,
        return_tensors="pt",
        truncation="only_second",
        max_length=512,
    )

    # Get the model's prediction
    with torch.no_grad():
        outputs = qa_model(**inputs)
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits

    # Get the most likely start and end token positions (end is exclusive)
    start_index = int(torch.argmax(start_scores))
    end_index = int(torch.argmax(end_scores)) + 1

    # An end before the start is not a valid span
    if end_index <= start_index:
        return ""

    # Decode the answer from the token indices
    answer_tokens = qa_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_index:end_index])
    answer = qa_tokenizer.convert_tokens_to_string(answer_tokens)

    return answer

# Get the final answer
final_answer = get_answer(contextualized_answer, query)

# Present the final answer
def present_answer(answer, query):
    """
    Print the question, a blank line, and then the answer under an 'Answer:' label.
    """
    print(f"Question: {query}\n\nAnswer:\n{answer}")

# Print the question together with the answer extracted for it
present_answer(final_answer, query)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

Question: What are the payment terms mentioned in the contract?

Answer:
net 45 days
