In [1]:
# Install necessary libraries
!pip install transformers torch pandas faker pymupdf

import fitz  # PyMuPDF
import pandas as pd
from faker import Faker
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Function to extract text from a PDF
def extract_pdf_text(file_path: str) -> str:
    """Return the concatenated plain text of every page in a PDF.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of all pages in page order, with nothing inserted between
        pages. An empty string if the document has no pages.
    """
    # Opening the document as a context manager guarantees it is closed (and
    # its file handle released) even if text extraction fails part-way.
    with fitz.open(file_path) as pdf_document:
        # join() consumes the generator while the document is still open.
        return "".join(page.get_text() for page in pdf_document)

# Function to generate synthetic contract data
def create_synthetic_contract_data() -> pd.DataFrame:
    """Build one synthetic contract as a table of labelled clauses.

    A fresh ``Faker`` instance supplies the party names, amount, dates,
    governing state and notice period, which are substituted into fixed
    clause templates.

    Returns:
        A DataFrame with one row per clause and two columns: ``text`` (the
        clause wording) and ``category`` (an integer label). Labels used:
        0 services, 1 payment, 2 term, 3 confidentiality, 4 termination,
        5 governing law, 6 party identification (two rows).
    """
    fake_data_generator = Faker()
    service_provider = fake_data_generator.company()
    client_company = fake_data_generator.company()
    contract_amount = fake_data_generator.random_number(digits=5)
    # Two independent draws can come out in either order; sort them so the
    # contract never ends before it starts (the dates may still coincide).
    contract_start_date, contract_end_date = sorted(
        (
            fake_data_generator.date_this_year(),
            fake_data_generator.date_this_year(),
        )
    )
    governing_state = fake_data_generator.state()
    termination_notice_days = fake_data_generator.random_int(min=30, max=90)

    # NOTE(review): the payment clause reuses termination_notice_days as the
    # payment deadline, so both periods are always equal -- confirm intended.
    contract_clauses = [
        [f"{service_provider} agrees to provide the following services to {client_company}. Services include service1, service2, and service3.", 0],
        [f"{client_company} agrees to pay {service_provider} ${contract_amount} for the described services. Payment is due within {termination_notice_days} days of invoice receipt.", 1],
        [f"This contract starts on {contract_start_date} and ends on {contract_end_date} unless terminated earlier per the Termination clause.", 2],
        ["Both parties will maintain the confidentiality of proprietary information disclosed during this contract. This obligation continues beyond contract termination.", 3],
        [f"Either party can terminate this contract with {termination_notice_days} days written notice. {service_provider} will be paid for services up to the termination date.", 4],
        [f"This contract is governed by the laws of the State of {governing_state}.", 5],
        [f"Service Provider: {service_provider}", 6],
        [f"Client: {client_company}", 6],
    ]
    return pd.DataFrame(contract_clauses, columns=["text", "category"])

# Generate multiple synthetic contract data samples
all_synthetic_data = [create_synthetic_contract_data() for _ in range(10)]
# Every per-contract frame carries its own 0..7 index; ignore_index gives the
# combined frame unique row labels instead of repeating that range ten times.
combined_synthetic_data = pd.concat(all_synthetic_data, ignore_index=True)
# The index is not written, so the CSV holds only the text/category columns.
combined_synthetic_data.to_csv("/content/synthetic_contract_data.csv", index=False)

# Load the pre-trained BERT checkpoint and its matching tokenizer from Hugging Face.
# NOTE: only the encoder weights come from the checkpoint; the 7-way
# classification head is newly initialised, so predictions are not meaningful
# until the model has been fine-tuned on labelled clauses.
pretrained_model_name = "bert-base-uncased"
bert_model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name,
    num_labels=7,
)
bert_tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)

# Prefer the GPU when one is available; otherwise fall back to the CPU.
computation_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(computation_device)

# Function to predict the class of a text using the pre-trained BERT model
def classify_text(text):
    """Return the predicted category index for ``text``.

    The text is tokenized with the module-level ``bert_tokenizer`` (truncated
    and padded to 128 tokens), run through ``bert_model`` on
    ``computation_device`` in evaluation mode, and the index of the largest
    logit is returned as a plain Python int.
    """
    tokenized = bert_tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )

    # The model and its inputs must live on the same device.
    model_inputs = {}
    for name, tensor in tokenized.items():
        model_inputs[name] = tensor.to(computation_device)

    # Evaluation mode, and no gradient tracking, for pure inference.
    bert_model.eval()
    with torch.no_grad():
        logits = bert_model(**model_inputs).logits
        best_index = torch.argmax(logits, dim=1)
        return best_index.item()

# Script entry point: extract a sample PDF, classify one example clause, then
# label every synthetic clause and save the results.
if __name__ == "__main__":
    # Pull the raw text out of a sample contract PDF and display it
    pdf_file_path = "/content/Contract_0.pdf"
    pdf_text = extract_pdf_text(pdf_file_path)
    print(f"Extracted Text from PDF:\n{pdf_text}\n")

    # Run the classifier on a single hand-written clause
    sample_text = "Cole LLC agrees to provide the following services to Hines, Munoz and Dennis. Services are service1, service2, service3."
    sample_prediction = classify_text(sample_text)
    print(f"Predicted class for example text: {sample_prediction}")

    # Reload the generated clauses and classify each one in turn
    synthetic_contract_df = pd.read_csv("/content/synthetic_contract_data.csv")
    clause_predictions = synthetic_contract_df['text'].map(classify_text)
    synthetic_contract_df['predicted_category'] = clause_predictions
    print(synthetic_contract_df)

    # Persist the clauses together with their predicted labels
    synthetic_contract_df.to_csv(
        "/content/synthetic_contract_data_with_predictions.csv",
        index=False,
    )


Collecting faker
  Downloading Faker-26.0.0-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymupdf
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m84.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-c

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Extracted Text from PDF:
Business Contract
1. Services Provided:
Tanner-Leblanc agrees to provide the following services to Hernandez-Kaiser. services are service1, service2,
service3.
2. Payment:
Hernandez-Kaiser agrees to pay Tanner-Leblanc the amount of $62878 for the services described above. Payment
shall be made within 90 days of receiving an invoice from Tanner-Leblanc.
3. Term:
This contract will commence on 2024-05-04 and will continue until 2024-01-11 unless terminated earlier in
accordance with the Termination clause.
4. Confidentiality:
Both parties agree to maintain the confidentiality of any proprietary or confidential information disclosed during the
term of this contract. This obligation will continue beyond the termination of this contract.
5. Termination:
Either party may terminate this contract with 90 days written notice to the other party. In the event of termination,
Tanner-Leblanc will be compensated for all services performed up to the date of termination.
6. Go