In [107]:
import configparser
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
import requests
from io import BytesIO
from pinecone import Pinecone
from litellm import completion
import litellm
import os
from transformers import AutoTokenizer

config = configparser.ConfigParser()
config.read('configuration.properties')

['configuration.properties']

In [98]:
def pinecone_connection():
    try:
        pinecone_api_key = config['PINECONE']['pinecone_api_key']
        index_name = config['PINECONE']['index']
        return pinecone_api_key, index_name
    except Exception as e:
        print("Exception in pinecone_connection function: ",e)
        return "", ""

In [99]:
def read_pdf_data(pdf_link):
    try:
        response = requests.get(pdf_link)
        if response.status_code == 200:
            pdf_bytes = BytesIO(response.content)

            # Step 2: Read and extract text
            reader = PdfReader(pdf_bytes)
            text = ""
            for page in reader.pages:
                text += page.extract_text() + "\n"
        else:
            print("Failed to download the PDF.")
            text = ""
        return text
    except Exception as e:
        print("Exception in read_pdf_data function: ",e)
        return ""

In [None]:
def chunk_text(text, chunk_size=512):
    try:
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

        # Tokenize the entire text
        tokens = tokenizer.encode(text, truncation=False, add_special_tokens=False)

        # Split tokens into chunks of max `chunk_size`
        token_chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]

        # Decode token chunks back to text
        text_chunks = [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in token_chunks]

        return text_chunks
    except Exception as e:
        print("Exception in chunk_text function:", e)
        return []

In [101]:
def summarize_text(prompt, text):
    try:
        # Log API base URL
        api_base = config['OLLAMA']['OLLAMA_API_BASE']

        # Test connection first
        conn_test = requests.get(f"{api_base}/api/version")

        # Hardcoded test query
        test_message = f"{prompt} {text}"

        # Make request to Ollama with more detailed logging
        response = completion(
            model="ollama/llama3.2:1b",  # Updated model name to match available model
            messages=[{"role": "user", "content": test_message}],
            api_base=api_base,
            temperature=0.7,
            request_timeout=30
        )

        return response.choices[0].message.content
    except Exception as e:
        print("Exception in summarize_text function: ",e)
        return ""

In [121]:
async def embedding(final_summary):
    try:

        response = litellm.embedding(
            model="ollama/nomic-embed-text",  # Ollama model
            # Log API base URL
            api_base = config['OLLAMA']['OLLAMA_API_BASE'], # Ollama API base
            input=final_summary
        )

        return response.data[0]["embedding"]

    except Exception as e:
        print("Exception in embedding function: ",e)
        return []

In [None]:
def upsert_to_pinecone(embedded_vector):
    pinecone_api_key, index_name = pinecone_connection()
    print("Pinecone API Key: ", pinecone_api_key)
    print("Index Name: ", index_name)
    if pinecone_api_key == "" or index_name == "":
        return None

    pinecone = Pinecone(api_key=pinecone_api_key)
    index = pinecone.Index(name=index_name)

from embed.connections import snowflake_connection, pinecone_connection, aws_connection
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
import torch
import pandas as pd
from io import BytesIO, StringIO
import numpy as np


def fetch_table_from_s3(csv_key):
    '''
    function to fetch latest(past 24 hours) jobs data from S3
    '''
    try:
        s3_client, bucket_name = aws_connection()
        print("AWS connection established. Fetching data")

        # fetching raw csv
        response = s3_client.get_object(Bucket=bucket_name, Key=f"validated_jobs/{csv_key}")
        csv_obj_content = response['Body'].read()
        
        # file like object creation
        pdfFileObj = BytesIO(csv_obj_content)
        pdf_df = pd.read_csv(pdfFileObj, sep="|")
        pdf_df = pdf_df.replace(np.nan, None)
        print("CSV fetched. Length of data: ", len(pdf_df))

        return pdf_df

    except Exception as e:
        print("Exception in fetch_data_from_snowflake function", e)
        return ''


def storing_pinecone(csv_key):
    '''
    function to store set a in pinecone
    '''
    try:
        df = fetch_table_from_s3(csv_key)
        df.drop(columns=['min_salary', 'max_salary', 'employment_type', 'source'], inplace=True)
        print(df.head())
        
        # Pinecone
        pinecone_api_key, index_name = pinecone_connection()
        pinecone = Pinecone(api_key=pinecone_api_key)
        index = pinecone.Index(name=index_name)

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        if device != 'cuda':
            model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

        all_embeddings = []

        # iterating over pandas dataframe
        print("Generating embeddings..")
        for _, row in df.iterrows():
            _id = str(row['job_id'])
            job_title = row['job_title']
            company = row['company']
            location = row['job_location']
            url = row['job_url']
            date_posted = row['date_posted']
            description = row['job_desc']

            # Concatenating all relevant text data for embedding with new lines
            full_text = f"{job_title}\n{company}\n{location}\n{url}\n{date_posted}\n{description}"
            display_text = f"job_title: {job_title}\ncompany: {company}\nlocation: {location}\nurl: {url}\ndate_posted: {date_posted}"

            # embedding data
            embedding = model.encode(full_text)

            print(f"Embedded job for id: {_id}")

            embedding_data = {
                "id": _id,
                "values": embedding,
                "metadata": {
                    "id": _id,
                    "file_name": 'jobmatch_data.csv',
                    "text": display_text
                }
            }

            # embedding question and answer separately
            all_embeddings.append(embedding_data)
        print(len(embedding))
        print("Embeddings generated")

        # upserting the embeddings to pinecone namespace
        index.upsert(all_embeddings)

        return "successful"

    except Exception as e:
        print("Exception in storing_pinecone() function: ", e)
        return "failed"


In [102]:
pdf_link = 'https://public-inspection.federalregister.gov/2025-00479.pdf?1736516733'
text = read_pdf_data(pdf_link)

In [None]:
chunks = chunk_text(text, chunk_size=512)
summarized_text_chunk = ""

In [105]:
chunk_summzrize_prompt = "Below is the pdf content of the regulation update by goverment agencies in food and agriculture industry. Can you summarize the important content that can affect the small scale food businesses? In summary, keep the important content like agency name and numbers and remove the unnecessary details."
final_summary_prompt = "Can you strictly summarize the important content from the below summarized text? Keep important content like agency name and numbers and remove the unnecessary details."

for _,text in enumerate(chunks):
    summarized_text_chunk += summarize_text(chunk_summzrize_prompt, text)

print("-----------------")
print("Final Summary:")
final_summary = summarize_text(final_summary_prompt, summarized_text_chunk)
print(final_summary)

-----------------
Final Summary:
There are several important updates to reimbursement rates for the Summer Food Service Program (SFSP), which provides financial assistance to food service providers who serve children in low-income families. Here's a summary of the key changes:

**Reimbursement Rate Increases**

* The annual increases will take effect starting January 1, 2024.
* For meal periods that are combined and individually authorized by the Food Away from Home series of the Consumer Price Index (CPI), the rate adjustments will be applied to each component separately.

**Breakdown of Reimbursement Rates for SFSP**

| Meal Period | Reimbursement Rate |
| --- | --- |
| Breakfast | $3.09-$3.03 per meal, or $8.08-$8.61 per meal (up from $2.78-$2.83) |
| Lunch/Supper | $5.00-$5.31 per meal, or $8.76-$8.77 per meal (up from $4.50-$4.63) |
| Snack | $1.28-$1.25 per snack, or $2.02-$2.06 per snack (no change) |

**Reimbursement Rate Changes for SFSP Providers**

* Alaska: Rural or self-pr

In [122]:
embedded_vector = await embedding(final_summary)


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m

Exception in embedding function:  litellm.APIConnectionError: asyncio.run() cannot be called from a running event loop
Traceback (most recent call last):
  File "d:\Old Laptop\Hackathon Dev\Civic Hackathon\civic_ai\venv\lib\site-packages\litellm\main.py", line 3703, in embedding
    response = ollama_embeddings_fn(  # type: ignore
  File "d:\Old Laptop\Hackathon Dev\Civic Hackathon\civic_ai\venv\lib\site-packages\litellm\llms\ollama\completion\handler.py", line 92, in ollama_embeddings
    return asyncio.run(
  File "C:\Users\raisi\AppData\Local\Programs\Python\Python310\lib\asyncio\runners.py", line 33, in run
    raise RuntimeError(
RuntimeError: asyncio.run() cannot be called from a running event loop

