In [20]:
import requests

def download_pdf(url, filename, timeout=30):
    """Download the resource at ``url`` and save it to ``filename``.

    Args:
        url: Address of the file to fetch.
        filename: Local path the response body is written to (binary mode).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        ``filename``, unchanged, so the call can be chained or displayed.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection failures and timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of saving an error page as a
    # file that later PDF parsing would choke on.
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)
    return filename

# Demo: fetch the sample slide deck; the saved path is the cell's output.
url = "https://www.hunter.cuny.edu/dolciani/pdf_files/workshop-materials/mmc-presentations/tables-charts-and-graphs-with-examples-from.pdf"
filename = "downloaded_pdf.pdf"
download_pdf(url=url, filename=filename)


'downloaded_pdf.pdf'

In [21]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page in a PDF, concatenated in order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A single string holding the ``"text"`` extraction of each page, in
        page order, with nothing inserted between pages.
    """
    # The context manager closes the document (and its file handle) even if
    # extraction fails part-way through; the original never closed it.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text("text") for page in doc)

# Pull the text out of the file saved by the download step, then preview
# only the first 1000 characters so the output stays readable.
pdf_text = extract_text_from_pdf(pdf_path=filename)
print(pdf_text[:1000])


Tables, Charts, and 
Graphs 
with Examples from History, Economics, 
Education, Psychology, Urban Affairs and 
Everyday Life
REVISED: MICHAEL LOLKUS 2018
Tables, Charts, and 
Graphs Basics
We use charts and graphs to visualize data.  
This data can either be generated data, data gathered from 
an experiment, or data collected from some source.
A picture tells a thousand words so it is not a surprise that 
many people use charts and graphs when explaining data.
Types of Visual 
Representations of Data
Table of Yearly U.S. GDP by 
Industry (in millions of dollars)
Year
2010
2011
2012
2013
2014
2015
All Industries
26093515
27535971
28663246
29601191
30895407
31397023
Manufacturing
4992521
5581942
5841608
5953299
6047477
5829554
Finance,
Insurance, Real 
Estate, Rental, 
Leasing
4522451
4618678
4797313
5031881
5339678
5597018
Arts, 
Entertainment, 
Recreation, 
Accommodation,
and Food Service
964032
1015238
1076249
1120496
1189646
1283813
Other
15614511
16320113
16948076
17495515
183186

In [22]:
def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    The split is purely positional: pieces do not overlap, and words or
    sentences may be cut at a chunk boundary. The last piece may be shorter.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings that concatenate back to ``text``. An empty
        ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A negative size used
            to return an empty list silently, dropping all the text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# Split the extracted text using the default chunk size and report the count.
chunks = chunk_text(text=pdf_text)
print(f"Number of chunks: {len(chunks)}")


Number of chunks: 11


In [23]:
from sentence_transformers import SentenceTransformer

def embed_text_chunks(chunks, model=None):
    """Encode text chunks into embedding vectors.

    Args:
        chunks: Sequence of strings to embed.
        model: Optional object exposing ``encode(list_of_str)``. Pass an
            already-loaded SentenceTransformer to avoid reloading the model
            on every call and to reuse the same encoder for queries. When
            omitted, ``'all-MiniLM-L6-v2'`` is loaded, as before.

    Returns:
        Whatever ``model.encode(chunks)`` returns, one embedding per chunk.
    """
    if model is None:
        # Default kept for backward compatibility with existing callers.
        model = SentenceTransformer('all-MiniLM-L6-v2')  # A lightweight transformer model
    return model.encode(chunks)

# Embed every chunk and show the resulting matrix shape.
embeddings = embed_text_chunks(chunks=chunks)
print(f"Embedding shape: {embeddings.shape}")


Embedding shape: (11, 384)


In [24]:
import faiss
import numpy as np

def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index over the given embedding vectors.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dim)``. Lists and
            non-float32 arrays are accepted and converted.

    Returns:
        A ``faiss.IndexFlatL2`` of dimension ``dim`` holding every row of
        ``embeddings``, in input order.

    Raises:
        ValueError: If ``embeddings`` is not a 2-D matrix (for example an
            empty list or a single flat vector).
    """
    # Convert up front: the original read `.shape` before converting, so
    # plain lists failed. FAISS is documented to work on float32 matrices.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D (n_vectors, dim) array, got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])  # Index for L2 similarity
    index.add(vectors)  # Add embeddings to the index
    return index

# Index the chunk embeddings so they can be searched by similarity.
index = create_faiss_index(embeddings=embeddings)


In [25]:
def search_query(query, index, model, top_k=3):
    """Find the stored vectors closest to ``query``.

    The query is wrapped in a one-element list, encoded with ``model``, and
    passed to ``index.search`` together with ``top_k``.

    Args:
        query: The text to search for.
        index: Object exposing ``search(vectors, k)`` that returns a
            ``(distances, indices)`` pair.
        model: Object exposing ``encode(list_of_str)``.
        top_k: Number of neighbours requested from the index.

    Returns:
        The indices half of the search result; the distances are discarded.
    """
    query_vectors = np.array(model.encode([query]))
    search_result = index.search(query_vectors, top_k)
    return search_result[1]

query = "What is the unemployment rate based on degree?"
# `model` was previously only a local inside embed_text_chunks, so this cell
# raised NameError on a fresh kernel. Load the same encoder that produced the
# indexed embeddings so the query lives in the same vector space.
model = SentenceTransformer('all-MiniLM-L6-v2')
relevant_indices = search_query(query, index, model)
print(f"Top matching chunk indices: {relevant_indices}")


Top matching chunk indices: [[2 1 5]]


In [26]:
def compare_data(data1, data2):
    """Line up two records side by side for comparison.

    Each argument is a mapping with ``"degree"`` and ``"unemployment_rate"``
    keys (e.g. values extracted from a table row).

    Returns:
        A dict with a ``"Degree"`` list and an ``"Unemployment Rate"`` list,
        each holding the value from ``data1`` followed by that of ``data2``.
    """
    records = (data1, data2)
    degrees = [record["degree"] for record in records]
    rates = [record["unemployment_rate"] for record in records]
    return {"Degree": degrees, "Unemployment Rate": rates}

# Demo: compare two hand-written sample records.
data1 = {
    "degree": "Bachelor's",
    "unemployment_rate": 5.3,
}
data2 = {
    "degree": "Master's",
    "unemployment_rate": 3.8,
}
comparison_result = compare_data(data1=data1, data2=data2)
print(comparison_result)


{'Degree': ["Bachelor's", "Master's"], 'Unemployment Rate': [5.3, 3.8]}


In [None]:
import openai

import os

# Never hardcode the key in the notebook: read it from the OPENAI_API_KEY
# environment variable so it cannot leak when the notebook is shared.
openai.api_key = os.environ.get("OPENAI_API_KEY")

def generate_response(retrieved_data, query, model="gpt-3.5-turbo-instruct", max_tokens=200):
    """Ask an OpenAI completion model to answer ``query`` from ``retrieved_data``.

    Args:
        retrieved_data: Context text appended to the prompt after the question.
        query: The user's question.
        model: Name of the completions model to call. The previously
            hard-coded ``text-davinci-003`` has been retired by OpenAI, so
            the default is now its documented replacement.
        max_tokens: Upper bound on the length of the generated answer.

    Returns:
        The text of the first returned choice, stripped of surrounding
        whitespace.

    NOTE(review): ``openai.Completion`` is the pre-1.0 SDK interface; confirm
    the installed ``openai`` version still provides it.
    """
    # Construct the prompt using the retrieved data and the user's query
    prompt = f"Answer the following question using the provided data: {query}\n\n{retrieved_data}"

    # Send the prompt to the completions endpoint
    response = openai.Completion.create(
        model=model,
        prompt=prompt,
        max_tokens=max_tokens,  # Limit the length of the response
    )

    # Return the text generated by the model
    return response.choices[0].text.strip()

# Demo: answer the sample question from a single hand-written fact.
query = "What is the unemployment rate based on degree?"
retrieved_data = "Bachelor's degree has an unemployment rate of 5.3%."

response = generate_response(retrieved_data=retrieved_data, query=query)
print(response)


In [None]:
import openai

import os

# Read the key from the OPENAI_API_KEY environment variable rather than
# committing a credential to the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY")
try:
    # Make a simple request to check the API key validity
    models = openai.Model.list()  # List available models
    print("API Key is valid. Models available:", models)
except Exception as e:
    # Deliberately broad: this cell is a diagnostic, so any failure
    # (authentication, network, SDK mismatch) is reported, not raised.
    print("Error:", e)
