In [3]:
# contact rhand7@illinois.edu for questions or concerns
# https://github.com/RobHand27/Wiki-Generator
import requests
from google import genai
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from serpapi.google_search import GoogleSearch
import os
import io

In [4]:
# Initializes the Gemini API client.
# The .env location can be overridden with the DOTENV_PATH environment variable so the
# notebook is not tied to one machine; the original absolute path stays the default.
dotenv_path = os.getenv("DOTENV_PATH", "/Users/roberthand/Desktop/FDL/.env")
if not load_dotenv(dotenv_path):
    # load_dotenv returns False when it did not load anything (e.g. the file is missing)
    print(f"Warning: no environment variables were loaded from {dotenv_path}")
Google_api_key = os.getenv('API_KEY')
if not Google_api_key:
    # Only warn: the client is still constructed exactly as before
    print("Warning: API_KEY is not set; Gemini requests may fail to authenticate")
client = genai.Client(api_key=Google_api_key)

# Optional: use a free proxy generator to avoid a Google Scholar IP block; it isn't required for this implementation (I think...)
# from fp.fp import FreeProxy
# proxy = FreeProxy(rand=True, timeout=1, country_id=['US'], anonym=True).get()
# print(proxy)

In [7]:
# function searches for a pdf and returns the link(s)
def search_for_pdf(query: str, num_results: int = 1):
    """Search Google through SerpAPI for PDFs about ``query`` and return their links.

    Args:
        query: Search terms; ``filetype:pdf`` is appended to them.
        num_results: Maximum number of PDF links to return.

    Returns:
        A list of at most ``num_results`` result links whose URL (ignoring any
        query string or fragment) ends in ``.pdf``, case-insensitively, in the
        order the search returned them. Each accepted link is also printed.
    """
    if num_results <= 0:
        return []  # nothing requested, so don't spend an API call

    params = {
        "engine": "google",
        "q": f"{query} filetype:pdf",
        "api_key": os.getenv('SERP_API_KEY')
    }
    if num_results > 10:
        # SerpAPI's `num` parameter sets the maximum number of results to return;
        # only sent when more links are wanted than its documented default of 10
        params["num"] = num_results

    search = GoogleSearch(params)
    results = search.get_dict()
    toreturn = []
    for result in results.get("organic_results", []):
        if len(toreturn) >= num_results:
            break # we have enough results
        link = result.get("link") or ""  # tolerate a missing or null link
        # Drop the fragment and query string so "x.PDF" and "x.pdf?download=1" both count
        bare_link = link.split("#", 1)[0].split("?", 1)[0]
        if bare_link.lower().endswith(".pdf"):
            print(f"Found PDF: {link}")
            toreturn.append(link)

    return toreturn

# turns the pdf into text by reading in the pdf
def get_pdf_text(pdf_urls: list, timeout: float = 500):
    """Download each PDF and return the extracted text of every readable one.

    Args:
        pdf_urls: URLs to download.
        timeout: Seconds passed to ``requests.get`` as the request timeout.
            The default matches the value that used to be hard-coded.

    Returns:
        A list with one string per PDF that yielded text (page texts joined by
        newlines). URLs that do not return status 200, do not look like a PDF,
        raise while being fetched or parsed, or yield no text are skipped with
        a printed message, so the list can be shorter than ``pdf_urls``.
    """
    documents = []
    for pdf_url in pdf_urls:
        try:
            r = requests.get(pdf_url, timeout=timeout)
            content_type = r.headers.get("Content-Type", "").lower()
            # Some servers send PDFs with a generic type (e.g. application/octet-stream),
            # so also accept a body that carries the "%PDF-" marker near its start
            looks_like_pdf = "pdf" in content_type or b"%PDF-" in r.content[:1024]
            if r.status_code != 200 or not looks_like_pdf: # check if the pdf is valid
                print(f"Skipping invalid PDF: {pdf_url}")
                continue

            reader = PdfReader(io.BytesIO(r.content))
            # Keep only pages that actually produced text
            text = [page_text for page_text in (page.extract_text() for page in reader.pages) if page_text]

            if text:
                documents.append("\n".join(text))
            else:
                print(f"Warning: No text extracted from {pdf_url}")

        except Exception as e:
            # Best effort: one bad URL must not abort the remaining downloads
            print(f"Error processing {pdf_url}: {str(e)}")

    return documents

# Smoke test: search for a few PDFs and extract their text.
# NOTE: running this cell performs a live SerpAPI search and downloads the PDFs.
a = search_for_pdf("covid-19", 5)
b = get_pdf_text(a)
# print(a)
# Show a short preview of each document instead of dumping the full text into the output
for i, doc in enumerate(b, start=1):
    print(f"Document {i}: {len(doc)} characters, starts with {doc[:200]!r}")
print(len(b))

Found PDF: https://www.cdc.gov/vaccines/hcp/current-vis/downloads/covid-19.pdf
Found PDF: https://www.covid.gov/sites/default/files/documents/Services-and-Supports-for-Longer-Term-Impacts-of-COVID-19-08012022.pdf
Found PDF: https://www.osha.gov/sites/default/files/publications/OSHA3990.pdf
Found PDF: https://www.gov.ca.gov/wp-content/uploads/2022/01/COVID-19-Budget-Fact-Sheet-1.8.22.pdf
Found PDF: https://www.cms.gov/files/document/medicare-covid-19-data-snapshot-fact-sheet.pdf
Skipping invalid PDF: https://www.osha.gov/sites/default/files/publications/OSHA3990.pdf
4


In [1]:
# splitting generation prompt into separate prompts may reduce hallucinations. 
# could also move generation prompt into a separate txt file for readability and style

def generate_article(input: str, num_articles: int = 5):
    """Search for PDFs about ``input``, then ask Gemini to summarize them.

    Args:
        input: Topic to write about; used both as the search query and inside
            the prompt.
        num_articles: Maximum number of PDF links to search for.

    Returns:
        None. Prints the generated text followed by the number of links found.
        If no article text could be extracted, prints a notice and does not
        call the model.
    """
    generation_prompt = f'''
    You will be provided with content from some articles about {input}.
    Your goal will be to summarize the articles following the schema provided.
    You must cite your sources in an APA format.
    You must only use knowledge from the articles provided. Pretend as if you have no prior knowledge of the topic.
    Here is a description of the parameters:
    - summary: what is {input}? Explain what {input} is in simple terms
    - background: how and when was {input} created? What was the motivation behind its creation?
    - how_it_works: how does {input} work? What are the main components of {input}?
    - types: what are the different types of {input}?
    Feel free to add up to 2 of your own parameters based on the content you read, however you must always use the 4 provided.
    Here are said articles.''' # remove this line when better integrated

    links = search_for_pdf(input, num_articles)

    # Fetch one URL at a time: get_pdf_text drops unreadable PDFs, so a single batched
    # call would lose track of which text came from which link
    sources = []
    for link in links:
        texts = get_pdf_text([link])
        if texts:
            sources.append((link, texts[0]))

    if not sources:
        # With nothing to summarize the model could only answer from prior knowledge,
        # which the prompt explicitly forbids
        print(f"No readable PDF articles found for {input!r} ({len(links)} links); skipping generation.")
        return

    # Label each article with its URL so the model has something to cite
    formatted_articles = "\n\n".join(
        f"Article {i} (source: {link}):\n{text}"
        for i, (link, text) in enumerate(sources, start=1)
    )
    response = client.models.generate_content(
        model="gemini-2.0-flash", # this is the best model I could get to work for free
        # blank line keeps the instructions from running into the first article header
        contents=generation_prompt + "\n\n" + formatted_articles
    )
    print(response.text)
    print(len(links))


In [2]:
# Example run: look for up to 20 PDFs on quantum computing and summarize them.
# See the covid-19 example for complete output without running the code.
generate_article("Quantum Computing", num_articles=20)

NameError: name 'search_for_pdf' is not defined

In [5]:
# Gemini test cell: send one fixed greeting to check that the client and API key work

response = client.models.generate_content(
    contents="Hello, how are you today?",
    model="gemini-2.0-flash",  # this is the best model I could get to work for free
)
print(response.text)

I am doing well, thank you for asking! How are you today?

