In [1]:
!pip install pdfplumber
!pip install faiss-cpu

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pdfplumber
import re
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
import faiss
import json
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
def extractpdf(path, pgno):
  """Return the text of a single page of a PDF file.

  Args:
    path: Path of the PDF to open with pdfplumber.
    pgno: Zero-based page index into ``pdf.pages``.

  Returns:
    The result of ``extract_text()`` for the requested page. Failures are
    reported as a message string instead of being raised:
    ``"File not found: <path>"`` when the file does not exist, and
    ``"Error: <details>"`` for anything else, including a page index that
    lies outside the document.
  """
  try:
    with pdfplumber.open(path) as pdf:
      page_count = len(pdf.pages)
      if 0 <= pgno < page_count:
        return pdf.pages[pgno].extract_text()
      # Report an out-of-range page explicitly. Falling through here would
      # return an implicit None, which only surfaces later as an
      # AttributeError when the caller runs text.splitlines().
      return f"Error: page index {pgno} is out of range (document has {page_count} pages)"
  except FileNotFoundError:
    return f"File not found: {path}"
  except Exception as e:
    # Deliberate best-effort: any other failure is turned into a message.
    return f"Error: {e}"
def extract_table(text):
    """Keep the lines of ``text`` that look like part of a numeric table.

    Lines are stripped and scanned in order. Scanning stops at the first
    line containing one of the closing phrases below (case-insensitive).
    Blank lines and lines with more than ten whitespace-separated words are
    dropped. Collection begins with the first remaining line that contains
    a digit; from then on every remaining line is kept.

    Returns the kept lines joined with newlines ("" when nothing is kept).
    """
    conclusive_phrases = [
        "for Chartered Accountant", "Managing Director", "Chief Financial Officer",
        "Company Secretary", "Director", "Chartered Accountants", "Membership No",
        "Firm’s Registration No", "As per our report"
    ]
    stop_markers = [phrase.lower() for phrase in conclusive_phrases]

    kept_rows = []
    table_started = False
    for raw_line in text.splitlines():
        row = raw_line.strip()
        lowered = row.lower()
        if any(marker in lowered for marker in stop_markers):
            # Signature / auditor block reached: the table is over.
            break
        if row and len(row.split()) <= 10:
            if not table_started:
                table_started = any(ch.isdigit() for ch in row)
            if table_started:
                kept_rows.append(row)
    return "\n".join(kept_rows)
def convert_to_float(s):
    """Convert a formatted number token to ``float``.

    Thousands separators (``,``) are removed. A token fully wrapped in
    parentheses, e.g. ``"(1,234.50)"``, is treated as negative (accounting
    notation). A lone leading or trailing parenthesis is discarded instead
    of being passed to ``float()``: the number regex used by the callers
    makes each parenthesis optional independently, so tokens such as
    ``"12)"`` can occur and previously raised ``ValueError``.

    Args:
        s: The token, e.g. ``"1,234"``, ``"(56.7)"`` or ``"12)"``.

    Returns:
        The parsed value as a float.

    Raises:
        ValueError: If what remains after cleaning is not a valid number.
    """
    cleaned = s.strip().replace(',', '')
    is_negative = cleaned.startswith('(') and cleaned.endswith(')')
    # Drop any parentheses left at either end, balanced or stray.
    value = float(cleaned.strip('()'))
    return -value if is_negative else value
def extract_struct_data(text):
    """Parse table text into a ``{label: [numbers]}`` dictionary.

    Each stripped line is classified by the first rule that matches:

    * a text label followed by four or five numbers: stored under
      ``<current prefix><label>``; the label is used exactly as the label
      regex captured it (it is not stripped);
    * a line made up only of numbers: stored under
      ``subtotal-<current prefix>``;
    * a heading of at most five words built from letters and ``& % , . / : -``:
      becomes the new prefix (``<heading>-``) for the lines that follow.

    Lines matching none of the rules are ignored. A later line producing an
    existing key overwrites the earlier entry.
    """
    def extract_key_and_values(line, prefix):
        # First label-like run becomes the key; every number token on the
        # line (including any inside the label area) becomes a value.
        label_runs = re.findall(r'\b[A-Za-z\s\:\-,₹()]+\b', line)
        number_tokens = re.findall(r'\(?\d{1,3}(?:,\d{3})*(?:\.\d+)?\)?', line)
        return prefix + label_runs[0], [convert_to_float(tok.strip()) for tok in number_tokens]

    parsed = {}
    current_prefix = ''
    for raw_line in text.split('\n'):
        row = raw_line.strip()

        # Rule 1: label followed by 4-5 numeric columns.
        if re.search(r'\b[A-Za-z\s\:\-,₹()]+(?:\s+\(?\d{1,3}(?:,\d{3})*(?:\.\d+)?\)?){4,5}\b', row):
            label, figures = extract_key_and_values(row, current_prefix)
            parsed[label] = figures
            continue

        # Rule 2: numbers only -> subtotal of the current section.
        if re.search(r'^\s*(\(?\d{1,3}(?:,\d{3})*(?:\.\d+)?\)?(?:\s+\(?\d{1,3}(?:,\d{3})*(?:\.\d+)?\)?)*\s*)$', row):
            tokens = re.findall(r'\(?\d{1,3}(?:,\d{3})*(?:\.\d+)?\)?', row)
            parsed[f'subtotal-{current_prefix}'] = [convert_to_float(tok) for tok in tokens]
            continue

        # Rule 3: short text-only heading -> new section prefix.
        heading = re.match(r'^[A-Za-z\s&%,./:\-]+$', row)
        if heading and len(heading.group(0).split()) <= 5:
            current_prefix = heading.group(0) + '-'
    return parsed
def pad_dict_values(data):
    """Left-pad every list in ``data`` with zeros to a common length.

    The lists are padded in place and the same dictionary is returned.
    Each short list receives as many leading zeros as it is missing, so
    all values end up with the length of the longest one. (Inserting a
    single zero, as before, left lists that were short by two or more
    still ragged.)

    Args:
        data: Mapping of key -> list of numbers.

    Returns:
        ``data`` itself; an empty mapping is returned unchanged instead of
        failing on ``max()`` of an empty sequence.
    """
    if not data:
        return data
    max_len = max(len(v) for v in data.values())
    for v in data.values():
        missing = max_len - len(v)
        if missing > 0:
            # Slice assignment keeps the same list object, mutated in place.
            v[:0] = [0] * missing
    return data
def convdf(dic,ind):
  """Build a DataFrame from ``dic`` and label its rows with ``ind``.

  ``dic`` supplies the columns (one per key); ``ind`` supplies the row
  labels, which replace the default integer index.
  """
  row_labels = pd.Index(ind)
  return pd.DataFrame(dic).set_index([row_labels])
def negval(df1):
    """Replace every negative entry of ``df1`` with its absolute value.

    Columns are rewritten on ``df1`` itself, and that same frame is returned.
    """
    def _drop_sign(x):
        # Non-negative values pass through untouched.
        return abs(x) if x < 0 else x

    for column in df1.columns:
        df1[column] = df1[column].apply(_drop_sign)
    return df1
def df_to_dict(dataframe):
    """Flatten a DataFrame into one record per cell.

    Cells are visited row by row, and within a row column by column. Each
    record has a ``"text"`` description of the form ``"<col>, <row>: <value>"``
    and a ``"metadata"`` dict holding the row label, column label and value.
    """
    return [
        {
            "text": f'{col}, {row}: {dataframe.loc[row, col]}',
            "metadata": {'row': row, 'col': col, 'value': dataframe.loc[row, col]},
        }
        for row in dataframe.index
        for col in dataframe.columns
    ]
def add_embeddings_to_data(data):
    """Attach an ``'embedding'`` entry to every record in ``data``.

    A fresh ``all-MiniLM-L6-v2`` SentenceTransformer is created on each
    call and used to encode each record's ``'text'`` individually. The
    records are modified in place and the same list is returned.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    for record in data:
        record.update(embedding=encoder.encode(record['text']))
    return data
def create_faiss_index(data, index_file='index.faiss', metadata_file='metadata.json'):
    """Write a flat L2 FAISS index and its metadata to disk.

    The ``'embedding'`` of every record is stacked into a float32 matrix
    and added to a ``faiss.IndexFlatL2`` sized from the matrix's second
    dimension; the index is saved to ``index_file``. The ``'metadata'``
    dicts are dumped, in the same order, as JSON to ``metadata_file``.
    """
    vectors = np.array([record['embedding'] for record in data]).astype('float32')
    records_meta = [record['metadata'] for record in data]

    dimension = vectors.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(vectors)
    faiss.write_index(flat_index, index_file)

    with open(metadata_file, "w") as handle:
        json.dump(records_meta, handle)
def load_index_and_metadata(index_file='index.faiss', metadata_file='metadata.json'):
    """Read the FAISS index and the JSON metadata back from disk.

    Returns a ``(index, metadata)`` tuple: the object produced by
    ``faiss.read_index(index_file)`` and the parsed content of
    ``metadata_file``.
    """
    loaded_index = faiss.read_index(index_file)
    with open(metadata_file, "r") as handle:
        loaded_metadata = json.loads(handle.read())
    return loaded_index, loaded_metadata
def retrieve_context(query, k=5, index_file='index.faiss', metadata_file='metadata.json'):
    """Return the metadata entry of the first search hit for ``query``.

    The index and metadata are loaded from disk, the query is encoded with
    a newly created ``all-MiniLM-L6-v2`` model, and ``k`` neighbours are
    requested from the index. Only the first hit of the first (and only)
    query row is used; the remaining neighbours and all distances are
    discarded.
    """
    index, metadata = load_index_and_metadata(index_file, metadata_file)
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    # The index is searched with a float32 matrix of shape (1, dim).
    query_vector = encoder.encode(query).astype('float32').reshape(1, -1)
    _, neighbour_ids = index.search(query_vector, k)
    first_hit = neighbour_ids[0][0]
    return metadata[first_hit]
def generate_answer(query, context):
    """Generate an answer to ``query`` from a single context record.

    ``context`` must provide ``'row'``, ``'col'`` and ``'value'``; they are
    rendered as ``"<row>,<col>,<value>"`` inside the prompt. The
    ``google/flan-t5-large`` model and tokenizer are loaded on every call.
    The prompt is tokenized with truncation at 512 tokens, one sequence of
    at most 150 tokens is generated, and it is returned decoded without
    special tokens.
    """
    checkpoint = "google/flan-t5-large"
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    context_text = f"{context['row']},{context['col']},{context['value']}"
    prompt = f"Based on the provided context, write a detailed and fact-based answer of around 30 words to address the following question. Be specific, concise, and focus on the key financial details.\n\nContext: {context_text}\n\nQuestion: {query}\n\nAnswer:The [col from the context] for the [row from the context] is [value from the context].Replace [] accordingly with the value from the context "

    encoded = tokenizer(prompt, return_tensors='pt', max_length=512, truncation=True)
    generated = seq2seq_model.generate(encoded.input_ids, max_length=150, num_return_sequences=1)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
def rag_pipeline(query):
    """Answer ``query``: retrieve one context record, then generate from it."""
    return generate_answer(query, retrieve_context(query))
def filter_index(df, labels=None):
  """Drop rows from ``df`` by index label and return the result.

  Args:
    df: DataFrame whose rows should be filtered.
    labels: Optional row label, or iterable of row labels, to drop. When
      given, nothing is printed and nothing is read from stdin, so the
      step can be re-run reproducibly. When omitted (the default), the
      original interactive behaviour is used: the index is printed, the
      user is asked how many labels to drop and then for each label.

  Returns:
    A DataFrame without the selected rows; ``df`` itself is not modified.

  Raises:
    KeyError: If a label is not present in the index.
    ValueError: In interactive mode, if the count entered is not an integer.
  """
  if labels is not None:
    # A bare string is one label, not an iterable of characters.
    if isinstance(labels, str):
      labels=[labels]
    return df.drop(list(labels))
  print(df.index)
  r=int(input("Enter the number of index you want to filter:"))
  for _ in range(r):
    ind=input("Enter the index:")
    df=df.drop(ind)
  return df

In [3]:
class pipeline():
  """End-to-end driver: PDF page -> table -> FAISS index -> question answering.

  NOTE: this class name shadows the ``pipeline`` function imported from
  ``transformers`` in the imports cell. The name is kept because existing
  cells instantiate ``pipeline()``.
  """

  # Row labels used by ``processing`` when the caller supplies none.
  DEFAULT_INDEX_LABELS=['note','Quarterly of 2024','Quarterly of 2023','Yearly of 2024','Yearly of 2023']

  def processing(self,pdffile,pgno,index_labels=None):
    """Extract one PDF page's table and write the search index to disk.

    Args:
      pdffile: Path of the PDF file.
      pgno: Zero-based page index passed to ``extractpdf``.
      index_labels: Optional row labels for the intermediate DataFrame;
        defaults to ``DEFAULT_INDEX_LABELS``. There must be one label per
        value in each parsed table row.

    The step ``filter_index`` prompts on stdin for the rows to drop. The
    index and metadata are written to the default files used by
    ``create_faiss_index``. Nothing is returned.
    """
    if index_labels is None:
      index_labels=self.DEFAULT_INDEX_LABELS
    extracttext=extractpdf(pdffile,pgno)
    tablecontents=extract_table(extracttext)
    structureddata=extract_struct_data(tablecontents)
    paddeddata=pad_dict_values(structureddata)
    df=convdf(paddeddata,list(index_labels))
    df=filter_index(df)
    df=negval(df)
    data=df_to_dict(df)
    embedded_data=add_embeddings_to_data(data)
    create_faiss_index(embedded_data)

  def rag_pipeline(self,query):
    """Answer ``query`` using the index files written by ``processing``.

    Delegates to the module-level ``rag_pipeline`` function, whose
    retrieval step loads the index and metadata itself; the extra
    load whose result was thrown away has been removed.
    """
    return rag_pipeline(query)
# Build the search index from page index 2 (zero-based) of the sample statement.
# processing() prompts on stdin for row labels to drop; the recorded run below
# dropped the 'note' row.
pl=pipeline()
pl.processing('Sample Financial Statement.pdf',2)
# Query the index.faiss / metadata.json files that processing() just wrote.
print(pl.rag_pipeline("how much is the total tax for Quarterly 2023"))

Index(['note', 'Quarterly of 2024', 'Quarterly of 2023', 'Yearly of 2024',
       'Yearly of 2023'],
      dtype='object')
Enter the number of index you want to filter:1
Enter the index:note


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Tax expense:-Current tax ,2260.0


In [4]:
# Reuses the index files written by pl.processing(...); only the question changes.
print(pl.rag_pipeline("what is the total income for the Yearly of 2023"))

Total income ,149468.0


In [5]:
# The question names only the year; retrieval still returns a single table cell.
print(pl.rag_pipeline("How much revenue is generated in 2023?"))

Revenue from operations is 146767.0.


In [6]:
# "TravelExpenses" is written as one word here; matching relies on embedding similarity.
print(pl.rag_pipeline("How much TravelExpenses is done in Yearly of 2023?"))

Travel expenses ,1525.0


In [7]:
# Broad question: only the first retrieved cell is passed to the generator,
# so the answer covers one expense line rather than a total.
print(pl.rag_pipeline("please say me the expenses for Quarterly of 2023"))

Expenses-Other expenses ,1146.0
