# Installation

In [27]:
# !pip uninstall -y ipython
# !pip install ipython
# !pip install python-dotenv

In [26]:
# !pip install transformers
# !pip install ipywidgets
# !pip3 install torch torchvision torchaudio
# !pip install spacy sentence-transformers
# !python -m spacy download en_core_web_sm
# !pip install pdfplumber
# !pip install pandas
# !pip install numpy --upgrade

# Imports

In [30]:
import os
import re
import sys
from dotenv import load_dotenv

import numpy as np
import pandas as pd

import pdfplumber

In [2]:
import IPython.display

# Register IPython.display under the legacy module path "IPython.core.display"
# before spaCy is imported.
# NOTE(review): presumably a compatibility shim so that displacy's import of
# IPython.core.display resolves on newer IPython releases - confirm, and drop
# it once the installed spaCy/IPython pair no longer needs it.
sys.modules["IPython.core.display"] = IPython.display

import spacy
from spacy import displacy

In [3]:
import logging

# Only let ERROR-level (and above) records through the "pdfminer" logger.
# NOTE(review): presumably this silences warnings emitted while PDFs are
# parsed - confirm against the PDF library in use.
pdfminer_logger = logging.getLogger("pdfminer")
pdfminer_logger.setLevel(logging.ERROR)

In [20]:
from sentence_transformers import SentenceTransformer

In [31]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this provides the API key for the OpenAI client
# created further down - confirm; never hardcode the key in the notebook.
load_dotenv()

True

# Constants

In [4]:
# Folder that holds the tender PDFs; joined with a file name via os.path.join.
PDF_FOLDER = "data/"
# File name of the example tender notice used throughout this notebook.
PDF_EXAMPLE = "2025-OJS100-00339516-en.pdf"

# Functions

In [5]:
def load_pdf(file_path):
    """Load a PDF file and return the text of all pages as one string.

    Args:
        file_path: Path to the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        The extracted text of every page, joined with a newline between
        pages. A page without extractable text contributes an empty string.
    """
    with pdfplumber.open(file_path) as pdf:
        # Some pdfplumber versions return None for a page with no text
        # (e.g. a scanned image); str.join would raise TypeError on that,
        # so normalise it to an empty string.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return '\n'.join(page_texts)

In [6]:
def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between two vectors.

    Args:
        vec1: First vector (any 1-D array-like accepted by NumPy).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the two Euclidean norms.
        If either vector has zero norm the similarity is undefined; 0.0 is
        returned instead of NaN so that sorting by score stays well-defined.
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard against 0/0, which would yield NaN plus a RuntimeWarning.
    if norm_product == 0:
        return 0.0
    return dot_product / norm_product

# Read Data

In [12]:
# Build the path to the example tender and extract its full text once;
# tender_text is the input for every extraction approach below.
file_name = os.path.join(PDF_FOLDER, PDF_EXAMPLE)
tender_text = load_pdf(file_name)

In [13]:
# Preview only the first 1000 characters rather than dumping the whole notice.
print(tender_text[:1000])

339516-2025 - Competition See the notice on TED website
Sweden – IT services: consulting, software development, Internet and support – Support,
underhåll och uppgraderingar - Diver och Spectre
OJ S 100/2025 26/05/2025
Contract or concession notice – standard regime
Services
1. Buyer
1.1. Buyer
Official name: Tillväxtverket
Email: kjell.gunnarsson@tillvaxtverket.se
Legal type of the buyer: Central government authority
Activity of the contracting authority: General public services
2. Procedure
2.1. Procedure
Title: Support, underhåll och uppgraderingar - Diver och Spectre
Description: Tillväxtverket avser att upphandla programvaruunderhåll och teknisk support för
The Diver Platform, Spectre och Measure factory (programvarorna).
Procedure identifier: 37b313d6-8055-4085-be59-39490e04e57b
Internal identifier: Ä 2025-1172
Type of procedure: Open
The procedure is accelerated: no
2.1.1. Purpose
Main nature of the contract: Services
Main classification (cpv): 72000000 IT services: consulting, s

# Rule-based extraction

In [14]:
def extract_links(text):
    """
    Extracts all http(s) URLs from the input text.

    A URL runs from ``http://`` or ``https://`` up to the next whitespace,
    quote, angle bracket or parenthesis. Trailing sentence punctuation
    (``. , ; : !``) is not treated as part of the URL, so
    "see https://example.com/page." yields ``https://example.com/page``.

    Args:
        text: The text to scan.

    Returns:
        A list of URL strings in order of appearance (duplicates are kept).
    """
    # The final character class forbids the match from ending on sentence
    # punctuation; "?" is deliberately still allowed as a URL character.
    url_pattern = r'https?://[^\s<>"\'()]*[^\s<>"\'().,;:!]'
    return re.findall(url_pattern, text)

# Show every URL found in the tender text (duplicates included).
extract_links(tender_text)

['https://tendsign.com/doc.aspx?',
 'https://tendsign.com/doc.aspx?',
 'https://tendsign.com/doc.aspx?',
 'https://tendsign.com/doc.aspx?',
 'http://www.tillvaxtverket.se',
 'https://www.domstol.se/forvaltningsratten-i-stockholm/']

In [15]:
def extract_emails(text):
    """
    Extracts all email addresses from the input text.

    The domain must consist of at least two dot-separated labels, and every
    label after the first must end in a letter or digit. This keeps trailing
    sentence punctuation out of the match: "mail a@b.se." yields ``a@b.se``.

    Args:
        text: The text to scan.

    Returns:
        A list of email address strings in order of appearance
        (duplicates are kept).
    """
    email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]*[a-zA-Z0-9])+'
    return re.findall(email_pattern, text)

# Show every email address found in the tender text (duplicates included).
extract_emails(tender_text)

['kjell.gunnarsson@tillvaxtverket.se',
 'kjell.gunnarsson@tillvaxtverket.se',
 'forvaltningsrattenistockholm@dom.se']

# Basic NER with SpaCy

In [16]:
# Load the spaCy pipeline named "en_core_web_sm"; it must be downloaded
# beforehand (see the commented-out install cell at the top).
# NOTE(review): this is an English pipeline while the notice also contains
# Swedish text - entity quality on those passages should be checked.
nlp = spacy.load("en_core_web_sm")

In [17]:
# Sanity-check the pipeline on a short, well-known sentence first.
# The result is bound to its own name so that re-running this cell never
# replaces `doc`, which holds the parsed tender and feeds the semantic search.
text_sample = "What video sharing service did Steve Chen, Chad Hurley, and Jawed Karim create in 2005?"
sample_doc = nlp(text_sample)

displacy.render(sample_doc, style="ent", jupyter=True)

In [19]:
# Run the pipeline over the whole tender text. `doc` is reused by the
# semantic-search section below for sentence splitting.
doc = nlp(tender_text)

# Render the detected entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

# Simple Semantic Search

In [24]:
# Sentence-embedding model used to encode both the queries and the tender
# sentences in the search cell below.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [25]:
# possible queries
queries = [
    "IT services",
    "License and ISO",
    "Audited financial statements",
]

# Encode all queries once, as a single batch. The default output is a NumPy
# array (one row per query), the same kind of object as the sentence
# embeddings below, so each row can be passed straight to cosine_similarity.
query_embeddings = model.encode(queries)

# Split the text into chunks (spaCy sentences in this case), dropping
# segments that are empty after stripping whitespace.
sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

# Create embeddings for all sentences
sentence_embeddings = model.encode(sentences)

# For each query, find the most similar sentences
top_k = 3
for query, query_embedding in zip(queries, query_embeddings):
    print(f"\nQuery: {query}")
    print("Most relevant sentences:")

    # Score every sentence against the current query
    scored = [
        (idx, cosine_similarity(query_embedding, sent_embedding))
        for idx, sent_embedding in enumerate(sentence_embeddings)
    ]

    # Highest similarity first; sorted() is stable, so ties keep text order
    top_matches = sorted(scored, key=lambda pair: pair[1], reverse=True)[:top_k]

    # Print results
    for idx, score in top_matches:
        print(f"- {sentences[idx]} (Score: {score:.4f})")
        print("=" * 50)



Query: IT services
Most relevant sentences:
- Purpose
Main nature of the contract: Services
Main classification (cpv): 72000000 IT services: consulting, software development, Internet
and support
Additional classification (cpv): 72266000 Software consultancy services
2.1.2. (Score: 0.3814)
- Purpose
Main nature of the contract: Services
Main classification (cpv): 72000000
IT services: consulting, software development, Internet and support
Additional classification (cpv): 72266000 Software consultancy services
5.1.2. (Score: 0.3803)
- Purpose
Main nature of the contract: Services
Main classification (cpv): 72000000
IT services: consulting, software development, Internet and support
Additional classification (cpv): 72266000 Software consultancy services
5.1.2. (Score: 0.3803)

Query: License and ISO
Most relevant sentences:
- General information
Legal basis:
Directive 2014/24/EU
2.1.5. (Score: 0.2950)
- 339516-2025 - Competition See the notice on TED website
Sweden – IT services: consul

# Structured Output with LLM

In [34]:
# !pip install openai

In [38]:
from openai import OpenAI
from pydantic import BaseModel

# Client shared by both LLM sections below. No credentials are passed here.
# NOTE(review): presumably the API key comes from the environment populated
# by load_dotenv() earlier in the notebook - confirm.
client = OpenAI()

# Schema handed to the model (via text_format) for the structured tender
# summary. Every field is required, so a value is always returned even when
# the notice does not state it explicitly - treat the output with care.
class TenderContent(BaseModel):
    # Country of the notice (recorded runs: 'Sweden').
    country: str
    # Free-text category of the procured service.
    service_type: str
    # In the recorded runs this holds the buyer's official name.
    company_name: str
    # Numeric amount only; the schema carries no currency.
    budget: float
    # Dates are plain strings with no enforced format: the two recorded runs
    # returned DD/MM/YYYY in one case and YYYY-MM-DD in the other.
    start_date: str
    end_date: str
    # Short free-text summary of what is being procured.
    description: str

# Send the full extracted tender text to the model and have the reply parsed
# against the TenderContent schema passed as text_format.
response = client.responses.parse(
    model="gpt-4o-2024-08-06",
    input=[
        {"role": "system", "content": "Extract the tender information."},
        {
            "role": "user",
            "content": tender_text,
        },
    ],
    text_format=TenderContent,
)

# Parsed result; shown as a plain dict in the next cell.
tender_info = response.output_parsed

In [42]:
# Display the parsed result as a plain dict. TenderContent defines no field
# aliases, so by_alias leaves the keys as the field names.
tender_info.model_dump(by_alias=True, exclude_unset=True)

{'country': 'Sweden',
 'service_type': 'IT services: consulting, software development, Internet and support',
 'company_name': 'Tillväxtverket',
 'budget': 4200000.0,
 'start_date': '01/08/2025',
 'end_date': '31/07/2032',
 'description': 'Tillväxtverket is seeking to procure software maintenance and technical support for The Diver Platform, Spectre, and Measure factory.'}

# OCR-free extraction with VLM

In [49]:
# Upload the original PDF so the model can read the document itself instead
# of the pdfplumber text. The handle is opened in a `with` block so it is
# closed as soon as the upload call returns, even if the upload raises.
with open(os.path.join(PDF_FOLDER, PDF_EXAMPLE), "rb") as pdf_handle:
    file = client.files.create(
        file=pdf_handle,
        purpose="user_data"
    )

# Ask for the same structured summary as in the text-based section, but
# reference the uploaded file by id rather than sending extracted text.
response = client.responses.parse(
    model="gpt-4.1",
    input=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_file",
                    "file_id": file.id,
                },
                {
                    "type": "input_text",
                    "text": "Extract the key tender information.",
                },
            ]
        }
    ],
    text_format=TenderContent,
)

# Note: this rebinds tender_info, replacing the text-based result above.
tender_info = response.output_parsed
tender_info.model_dump(by_alias=True, exclude_unset=True)

{'country': 'Sweden',
 'service_type': 'IT services: consulting, software development, Internet and support, including software maintenance and technical support for The Diver Platform, Spectre and Measure Factory',
 'company_name': 'Tillväxtverket',
 'budget': 4200000.0,
 'start_date': '2025-08-01',
 'end_date': '2032-07-31',
 'description': 'Procurement of software maintenance and technical support for The Diver Platform, Spectre, and Measure Factory. The contract includes IT consulting, software development, Internet support, software consultancy services, and associated technical and quality requirements for accessibility and environmental standards. Open procedure for service providers across Sweden, with electronic submission via the given portal.'}