## dependencies


In [1]:
!apt-get install -y tesseract-ocr
!pip install pytesseract pillow pdf2image
!apt-get install -y poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pytesseract, pdf2image
Successfully installed pdf2image-1.17.0 pytesseract-0.3.13
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 1 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/ma

##ocr

In [4]:
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import os

def ocr_image(path):
    img = Image.open(path)
    return pytesseract.image_to_string(img)

def ocr_pdf(path):
    pages = convert_from_path(path)
    text = ""
    for i, page in enumerate(pages):
        text += f"\n--- Page {i+1} ---\n"
        text += pytesseract.image_to_string(page)
    return text

file_path = "/content/sample-report.pdf"  # change file name

if file_path.lower().endswith(".pdf"):
    extracted_text = ocr_pdf(file_path)
else:
    extracted_text = ocr_image(file_path)

with open("ocr_output.txt", "w", encoding="utf-8") as f:
    f.write(extracted_text)

print("OCR done")


OCR done


##SLA and Clause extraction

In [11]:
import re

# ------------------ Helpers ------------------

def clean_text(text):
    # basic normalization to reduce OCR noise
    t = text.replace('\r', ' ').replace('\n', ' ')
    t = re.sub(r'\s+', ' ', t)         # collapse whitespace
    t = t.replace('O', '0')            # optional: common OCR mistake (careful)
    return t.strip()

def parse_amount(s):
    if not s: return None
    s = s.replace(',', '').replace(' ', '')
    # handle $ and parentheses etc
    s = s.replace('(', '').replace(')', '')
    m = re.search(r'(-)?\s*\$?([0-9]*\.?[0-9]+)([kK])?', s)
    if not m:
        # try to extract digits any way
        m2 = re.search(r'([0-9]*\.?[0-9]+)([kK])?', s)
        if not m2: return None
        sign, num, k = None, m2.group(1), m2.group(2)
    else:
        sign, num, k = m.group(1), m.group(2), m.group(3)
    val = float(num)
    if k: val *= 1000
    if sign: val = -val
    return val

def parse_percentage(s):
    if not s: return None
    m = re.search(r'([0-9]+(?:\.[0-9]+)?)', s)
    return float(m.group(1)) if m else None

def parse_int(s):
    if not s: return None
    s = s.replace(',', '').strip()
    m = re.search(r'([0-9]+)', s)
    return int(m.group(1)) if m else None

def find_first(patterns, text):
    """Try a list of patterns; return first match group(1) or None"""
    for p in patterns:
        m = re.search(p, text, re.IGNORECASE)
        if m:
            # prefer group 1 if exists; otherwise full match
            return m.group(1) if m.lastindex else m.group(0)
    return None

# ------------------ Field extractors ------------------

def extract_apr(text):
    patterns = [
        r'(?:APR|annual percentage rate|interest rate)[^\d%]{0,15}([0-9]+(?:\.[0-9]+)?\s*%)',
        r'interest rate[^\d]{0,15}([0-9]+(?:\.[0-9]+)?)\s*percent'
    ]
    raw = find_first(patterns, text)
    return raw, parse_percentage(raw) if raw else (None, None)

def extract_lease_term(text):
    patterns = [
        r'(?:lease term|term of lease|term)[:\s]{0,10}([0-9]{1,3}\s*(?:months|mos|yrs|years|year))',
        r'([0-9]{1,3}\s*(?:months|mos))',
        r'([0-9]{1,2})\s*(?:years|yrs)'
    ]
    raw = find_first(patterns, text)
    if not raw:
        return None, None
    # normalize to months if possible
    m = re.search(r'([0-9]{1,3})\s*(months|mos)', raw, re.IGNORECASE)
    if m:
        return raw, int(m.group(1))
    m2 = re.search(r'([0-9]{1,2})\s*(years|yrs|year)', raw, re.IGNORECASE)
    if m2:
        return raw, int(m2.group(1)) * 12
    # fallback: numeric only
    n = parse_int(raw)
    return raw, n

def extract_monthly_payment(text):
    # look for forms: "monthly payment $420", "payment of $420 per month", "installment $420"
    patterns = [
        r'(?:monthly payment|monthly instalment|monthly installment|monthly lease payment)[^\d\$]{0,20}(\$?[0-9,]+(?:\.[0-9]+)?\s*[kK]?)',
        r'([Pp]ayment of|installment of|installment)[^\d\$]{0,20}(\$?[0-9,]+(?:\.[0-9]+)?\s*[kK]?)\s*(?:per month|/month|monthly)?',
        r'(\$[0-9,]+(?:\.[0-9]+)?\s*(?:per month|/month|monthly))'
    ]
    # patterns return group 1 or 2; unify by searching manually
    for p in patterns:
        m = re.search(p, text, re.IGNORECASE)
        if m:
            # pick last capturing group with digits
            for g in reversed(m.groups()):
                if g and re.search(r'[0-9]', g):
                    raw = g
                    return raw, parse_amount(raw)
    return None, None

def extract_down_payment(text):
    patterns = [
        r'down payment[^\d\$]{0,20}(\$?[0-9,]+(?:\.[0-9]+)?\s*[kK]?)',
        r'deposit[^\d\$]{0,20}(\$?[0-9,]+(?:\.[0-9]+)?\s*[kK]?)'
    ]
    raw = find_first(patterns, text)
    return raw, parse_amount(raw) if raw else (None, None)

def extract_residual_value(text):
    patterns = [
        r'(?:residual value|residual)[^\d\$]{0,20}(\$?[0-9,]+(?:\.[0-9]+)?\s*[kK]?)',
        r'residual amount[^\d\$]{0,20}(\$?[0-9,]+)'
    ]
    raw = find_first(patterns, text)
    return raw, parse_amount(raw) if raw else (None, None)

def extract_mileage_and_overage(text):
    # mileage allowance per year
    mileage_patterns = [
        r'(\d{1,3}(?:,[0-9]{3})?|\d{1,2}k)\s*(?:miles per year|miles/year|mi/year|miles per annum|miles p[er]{0,2} year)',
        r'mileage allowance[^\d]{0,20}(\d{1,3}(?:,[0-9]{3})?|\d{1,2}k)'
    ]
    overage_patterns = [
        r'(\$[0-9]+(?:\.[0-9]+)?)\s*(?:per mile|/mile|per mi)',
        r'(?:overage|excess mileage|excess miles)[^\d\$]{0,20}(\$?[0-9]+(?:\.[0-9]+)?)'
    ]
    raw_m = find_first(mileage_patterns, text)
    raw_o = find_first(overage_patterns, text)
    # normalize mileage ('12k' -> 12000)
    def norm_mileage(s):
        if not s: return None
        s = s.lower().replace(',', '').strip()
        k = False
        if s.endswith('k'):
            k = True
            s = s[:-1]
        try:
            val = int(float(s))
            if k: val *= 1000
            return val
        except:
            return None
    return raw_m, norm_mileage(raw_m), raw_o, parse_amount(raw_o) if raw_o else (None, None)

def extract_buyout_price(text):
    patterns = [
        r'(?:buyout|purchase option|purchase price|purchase option price)[^\d\$]{0,30}(\$?[0-9,]+(?:\.[0-9]+)?\s*[kK]?)',
        r'payoff amount[^\d\$]{0,30}(\$?[0-9,]+)'
    ]
    raw = find_first(patterns, text)
    return raw, parse_amount(raw) if raw else (None, None)

# ------------------ Clause snippet extraction ------------------

def extract_clause_snippet(text, keywords, window=300):
    # return first found snippet around any of the keywords
    tiny = text.lower()
    for kw in keywords:
        i = tiny.find(kw.lower())
        if i != -1:
            start = max(0, i - 60)
            end = min(len(text), i + window)
            return text[start:end].strip()
    return None

# ------------------ Main wiring ------------------

if __name__ == "__main__":
    with open("ocr_output.txt", "r", encoding="utf-8") as fh:
        raw_text = fh.read()

    text = clean_text(raw_text)

    result = {}

    # APR
    raw_apr, apr_val = extract_apr(text)
    result['apr_raw'] = raw_apr
    result['apr_percent'] = apr_val

    # Lease term
    raw_term, term_months = extract_lease_term(text)
    result['lease_term_raw'] = raw_term
    result['lease_term_months'] = term_months

    # Monthly payment
    raw_monthly, monthly_val = extract_monthly_payment(text)
    result['monthly_payment_raw'] = raw_monthly
    result['monthly_payment'] = monthly_val

    # Down payment
    raw_down, down_val = extract_down_payment(text)
    result['down_payment_raw'] = raw_down
    result['down_payment'] = down_val

    # Residual
    raw_res, res_val = extract_residual_value(text)
    result['residual_value_raw'] = raw_res
    result['residual_value'] = res_val

    # Mileage & overage
    raw_mile, mile_val, raw_over, over_val = extract_mileage_and_overage(text)
    result['mileage_raw'] = raw_mile
    result['mileage_per_year'] = mile_val
    result['overage_raw'] = raw_over
    result['overage_per_mile'] = over_val

    # Buyout / purchase option
    raw_buy, buy_val = extract_buyout_price(text)
    result['buyout_raw'] = raw_buy
    result['buyout_price'] = buy_val

    # Clause snippets
    clauses = {}
    clauses['early_termination'] = extract_clause_snippet(text, ['early termination', 'termination', 'terminate'])
    clauses['maintenance'] = extract_clause_snippet(text, ['maintenance', 'service', 'maintain'])
    clauses['warranty'] = extract_clause_snippet(text, ['warranty', 'guarantee'])
    clauses['insurance'] = extract_clause_snippet(text, ['insurance', 'insurer', 'insured'])
    clauses['penalties'] = extract_clause_snippet(text, ['penalty', 'late fee', 'late payment', 'default'])

    # print result
    print("\n=== EXTRACTED (raw + normalized) ===\n")
    for k, v in result.items():
        print(f"{k:25} : {v}")
    print("\n=== CLAUSE SNIPPETS ===\n")
    for k, v in clauses.items():
        print(f"{k}:\n{v}\n")



=== EXTRACTED (raw + normalized) ===

apr_raw                   : 8.5%
apr_percent               : 8.5
lease_term_raw            : 36 months
lease_term_months         : 36
monthly_payment_raw       : 42,500 
monthly_payment           : 42500.0
down_payment_raw          : 150,000 
down_payment              : 150000.0
residual_value_raw        : 900,000 
residual_value            : 900000.0
mileage_raw               : 15,000
mileage_per_year          : 15000
overage_raw               : 312
overage_per_mile          : 312.0
buyout_raw                : None
buyout_price              : (None, None)

=== CLAUSE SNIPPETS ===

early_termination:
ty coverage throughout the lease term. 8. EARLY TERMINATI0N Early termination of this lease before the Lease End Date will result in: a) Payment of all outstanding lease payments b) An early termination fee of 75,000 c) Any applicable excess mileage or damage charges 9. PURCHASE 0PTI0N At the end of the lease term, the Lessee may purchase the vehicle 

##vehicle details extraction

In [1]:
import requests

vin = "enter vin number"
url = f"https://vpic.nhtsa.dot.gov/api/vehicles/decodevinvalues/{vin}?format=json"

data = requests.get(url).json()
car = data["Results"][0]

for key, value in car.items():
    value = str(value).strip()
    if value:
        print(f"{key}: {value}")


AdditionalErrorText: Invalid character(s): 6:_, 8:I, 9:N, 10:_, 14:B, 15:E, 16:R.
ErrorCode: 6,7,11,400
ErrorText: 6 - Incomplete VIN; 7 - Manufacturer is not registered with NHTSA for sale or importation in the U.S. for use on U.S roads; Please contact the manufacturer directly for more information; 11 - Incorrect Model Year - Position 10 does not match valid model year codes (I, O, Q, U, Z, 0). Decoded data may not be accurate.; 400 - Invalid Characters Present
SuggestedVIN: ENTER!V!!!NUM!!!
VIN: enter vin number
VehicleDescriptor: ENTER VI* N
