In [1]:
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor

In [None]:
%pip install PyPDF2

from tqdm import tqdm
import concurrent
import PyPDF2

In [None]:
%pip install pandas

import os
import pandas as pd
import base64

In [5]:
from dotenv import load_dotenv, find_dotenv
# Load environment variables (including the API key) from the local 'openai.env' file.
# Alternative kept from the original: _ = load_dotenv(find_dotenv())  # read local .env file
load_dotenv('openai.env')
# .get() returns None when OPENAI_API_KEY is not set, so a missing key is not
# reported on this line.
api_key =  os.environ.get('OPENAI_API_KEY')
# OpenAI client shared by the upload, search and response cells below.
client = OpenAI(api_key=api_key)

In [6]:
# Folder that holds the source documents for this notebook.
dir_pdfs = 'input-data'
# Keep only regular files with a .pdf extension (any letter case) so that
# sub-folders and stray files are not treated as PDFs. Sorting makes the
# processing order reproducible, since os.listdir returns entries in arbitrary order.
pdf_files = sorted(
    os.path.join(dir_pdfs, f)
    for f in os.listdir(dir_pdfs)
    if f.lower().endswith('.pdf') and os.path.isfile(os.path.join(dir_pdfs, f))
)

In [8]:
def upload_single_pdf(file_path: str, vector_store_id: str):
    """Upload one file to OpenAI and attach it to a vector store.

    Args:
        file_path: Path of the file to upload.
        vector_store_id: ID of the vector store the uploaded file is attached to.

    Returns:
        A dict with "file" (the base name of ``file_path``) and "status"
        ("success" or "failed"). Failed results also carry an "error" string.

    Any ``Exception`` raised while opening, uploading or attaching is printed
    and reported through the returned dict instead of being re-raised.
    """
    file_name = os.path.basename(file_path)
    try:
        # The context manager closes the handle as soon as the upload call
        # returns, instead of leaving it open until garbage collection.
        with open(file_path, 'rb') as pdf_handle:
            file_response = client.files.create(file=pdf_handle, purpose="assistants")
        client.vector_stores.files.create(
            vector_store_id=vector_store_id,
            file_id=file_response.id
        )
        return {"file":file_name, "status":"success"}
    except Exception as e:
        print(f"Error with {file_name}: {str(e)}")
        return {"file":file_name, "status": "failed", "error":str(e)}

In [9]:
def upload_pdf_files_to_vector_store(vector_store_id: str):
    """Upload every PDF found in ``dir_pdfs`` to a vector store in parallel.

    Only regular files whose name ends in ".pdf" (any letter case) are
    uploaded; sub-folders and other files in the directory are ignored.

    Args:
        vector_store_id: ID of the vector store that receives the files.

    Returns:
        A stats dict with "total_files", "successful_uploads",
        "failed_uploads" and "errors" (the result dicts of failed uploads).
    """
    pdf_files = sorted(
        os.path.join(dir_pdfs, f)
        for f in os.listdir(dir_pdfs)
        if f.lower().endswith(".pdf") and os.path.isfile(os.path.join(dir_pdfs, f))
    )
    stats = {"total_files": len(pdf_files), "successful_uploads":0, "failed_uploads": 0, "errors": []}

    print(f"{len(pdf_files)} PDF files to process. Uploading in parallel...")

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(upload_single_pdf, file_path, vector_store_id)
            for file_path in pdf_files
        ]
        # as_completed yields uploads as they finish, so the progress bar
        # advances per completed file rather than in submission order.
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            result = future.result()
            if result["status"] == "success":
                stats["successful_uploads"] += 1
            else:
                stats["failed_uploads"] += 1
                stats["errors"].append(result)

    return stats

In [10]:
def create_vector_store(store_name:str) -> dict:
    """Create a vector store called ``store_name`` and summarise it.

    Returns a dict with the keys "id", "name", "created_at" and "file_count"
    (the store's completed-file count). If anything goes wrong, the error is
    printed and an empty dict is returned instead of raising.
    """
    try:
        store = client.vector_stores.create(name=store_name)
        summary = dict(
            id=store.id,
            name=store.name,
            created_at=store.created_at,
            file_count=store.file_counts.completed,
        )
        print("Vector store created:", summary)
        return summary
    except Exception as err:
        print(f"Error creating vector store: {err}")
        return {}

        

In [11]:
# Create the vector store, then upload the files from the input folder into it.
store_name = "openai_blog_store"
vector_store_details = create_vector_store(store_name)
# create_vector_store returns {} when creation fails, in which case the ["id"]
# lookup below raises KeyError. The returned stats dict is this cell's output.
upload_pdf_files_to_vector_store(vector_store_details["id"])

Vector store created: {'id': 'vs_6877db4d88888191a0847257286b6989', 'name': 'openai_blog_store', 'created_at': 1752685389, 'file_count': 0}
1 PDF files to process. Uploading in parallel...


100%|██████████| 1/1 [00:02<00:00,  2.13s/it]


{'total_files': 1, 'successful_uploads': 1, 'failed_uploads': 0, 'errors': []}

In [12]:
# Question sent both to the direct vector-store search below and to the model call
# in the later cell (spelling corrected from "geneative").
query = "What is generative AI?"
# Search the uploaded documents directly, without involving a chat model.
search_results = client.vector_stores.search(
    vector_store_id=vector_store_details['id'],
    query=query
)

In [None]:
# One summary line per search hit: length of its first content part, the source
# file name and the score reported for the hit.
for result in search_results.data:
    first_part = result.content[0]
    print(f"{len(first_part.text)} of character of content from {result.filename} with a relevant score of {result.score}")

In [14]:
# Ask the model the same question, giving it the file_search tool restricted to
# the vector store created earlier in this notebook.
response = client.responses.create(
    input=query,
    model="gpt-4.1-nano",
    tools=[{
        "type": "file_search",
        "vector_store_ids": [vector_store_details['id']],
    }]
)

In [15]:
# When the model uses file_search, the tool call comes first in response.output
# and the assistant's answer follows it. Locate the answer by item type instead
# of assuming it sits at index 0.
message_items = [item for item in response.output if item.type == "message"]
if not message_items:
    raise ValueError("The response contains no assistant message")
answer = message_items[0].content[0]

# Extract annotations (file citations) from the answer
annotations = answer.annotations

# Names of the files cited by the answer, deduplicated
retrieved_files = {annotation.filename for annotation in annotations}

print(f'Files used: {retrieved_files}')
print('Response:')
print(answer.text)

Files used: set()
Response:
Generative AI refers to a type of artificial intelligence designed to create new content, such as text, images, music, or other data, that is similar to human-produced content. It uses models like neural networks to learn patterns from existing data and then generate new, original output based on that learning . Do you want a more detailed explanation or examples of generative AI?


In [None]:
%pip install langchain-community

In [17]:
#pdf_path ="input-data\introduction-to-genai.pdf"
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Pages for which no text is extracted are skipped. If reading fails, the
    error is printed and whatever text was collected up to that point is
    returned (an empty string when nothing could be read).
    """
    page_texts = []
    try:
        with open(pdf_path, "rb") as pdf_file:
            for page in PyPDF2.PdfReader(pdf_file).pages:
                extracted = page.extract_text()
                if extracted:
                    page_texts.append(extracted)
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    # Pages are joined without a separator, exactly as they were extracted.
    return "".join(page_texts)
    

In [None]:
from pydantic import BaseModel, Field
from instructor import OpenAISchema
import instructor
from langchain_openai import ChatOpenAI  # <-- Add this import
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Schema for a single flashcard; passed as `response_model` in generate_cards.
# NOTE(review): notes are kept in `#` comments rather than a class docstring because
# a pydantic model's docstring is typically included in the generated schema that is
# sent to the model — confirm before adding one.
class Flashcard(BaseModel):
    # Both fields are required strings; the description text accompanies each field.
    question: str = Field(..., description="Challenging question about the content")
    answer: str = Field(..., description="Compact answer explaining the concept")
    # Earlier variants, currently disabled:
    #answer: str = Field(..., description="Detailed answer explaining the concept")
    #difficulty: int = Field(..., ge=1, le=5, description="Difficulty rating 1-5")
# Not used by generate_flashcards, which builds a fresh list on every call;
# retained only in case other cells still reference the name.
all_flashcards = []

def generate_flashcards(pdf_path: str) -> list[tuple[str, str]]:
    """Generate (question, answer) flashcards for one PDF.

    The PDF text is split into overlapping chunks and one flashcard is
    requested per chunk through ``generate_cards``.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        A new list of ``(question, answer)`` tuples for this PDF only. The
        list is empty when no text could be extracted or every chunk failed.

    Errors raised for a chunk are printed. A chunk that fails is skipped;
    processing stops early only when the error mentions "insufficient_quota".
    """
    text = extract_text_from_pdf(pdf_path)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""]
    )

    chunks = text_splitter.split_text(text)

    # A per-call list keeps results from different PDFs (and from re-running
    # the cell) separate instead of accumulating in one shared list.
    pdf_flashcards = []
    for chunk in chunks:
        try:
            card = generate_cards(chunk)
            pdf_flashcards.append((card.question, card.answer))
        except Exception as e:
            print(f"Error generating flashcards for chunk: {e}")
            if "insufficient_quota" in str(e):
                print("You have exceeded your OpenAI API quota. Please check your plan and billing details.")
                break  # Every further request would fail the same way
            # Any other error affects this chunk only; carry on with the next one.
    return pdf_flashcards

def generate_cards(chunk: str) -> Flashcard:
    """Request one structured flashcard for a chunk of text.

    Args:
        chunk: The text the flashcard should be based on.

    Returns:
        The object produced for ``response_model=Flashcard``.

    The request is retried up to ``max_retries=2`` times by the patched
    client. No ``tools`` argument is passed: ``file_search`` is not a Chat
    Completions tool type, and the source text is already in the prompt.
    NOTE(review): presumably instructor builds its own tool definition from
    ``response_model`` — confirm against the installed instructor version.
    """
    # A separate, instructor-patched client; named distinctly so it does not
    # shadow the notebook-level `client`.
    structured_client = instructor.patch(OpenAI())
    return structured_client.chat.completions.create(
        model="gpt-4.1-nano",
        messages=[{
            "role": "user",
            "content": f"Generate flashcards from this text: {chunk}"
        }],
        response_model=Flashcard,
        max_retries=2
    )


In [21]:
# Map each PDF's file name to the flashcards returned by generate_flashcards for it.
flashcards_dict = {}
for pdf_path in pdf_files:
    flashcards = generate_flashcards(pdf_path)
    # Key by base name so the directory prefix is dropped.
    flashcards_dict[os.path.basename(pdf_path)] = flashcards

In [24]:
import pandas as pd

# Records from every PDF, collected so the JSON file is written once after the
# loop instead of being overwritten by each PDF in turn.
saved_records = []
for item in flashcards_dict:
    # Each flashcard is a (question, answer) tuple, extracted by index.
    records = [{"Question": fc[0], "Answer": fc[1]} for fc in flashcards_dict[item]]
    df = pd.DataFrame(records)
    print(f"Generated {len(df)} flashcards:\n")
    print(df.sample(min(3, len(df))).to_markdown())
    saved_records.extend(records)

# Save to JSON. Exact duplicate question/answer pairs are dropped so that a card
# appearing more than once in the collected records is stored a single time.
if flashcards_dict:
    pd.DataFrame(saved_records).drop_duplicates().to_json(
        "flashcards_new.json", orient="records", indent=2, lines=False
    )

Generated 12 flashcards:

|    | Question                                                                   | Answer                                                                                                                                                         |
|---:|:---------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------|
| 11 | What are AI chatbots and give some examples?                               | AI chatbots are computer programs that simulate human conversation, with examples including Google Bard and Microsoft CoPilot.                                 |
|  4 | What was the increase in generative AI models from late 2022 to June 2023? | Generative AI increased from just under 2,000 in late 2022 to nearly 14,000 by June 2023.                                                                  

In [23]:
from pprint import pprint

# Pretty-print the whole mapping of file name -> (question, answer) tuples for inspection.
pprint(flashcards_dict)

{'introduction-to-genai.pdf': [('What is Generative AI and when did it gain '
                                'prominence?',
                                'Generative AI, or Gen AI, is a relatively new '
                                'form of AI that became prominent in late 2022 '
                                'with the release of ChatGPT, a sophisticated '
                                'AI chatbot.'),
                               ('What significant role did AI chatbots like '
                                'ChatGPT, Google Bard, and Microsoft Copilot '
                                'released in 2023 play in public awareness?',
                                'They exposed the power and potential of AI to '
                                'corporate businesses and the general public.'),
                               ('What are some outputs and tasks that '
                                'generative AI foundation models can achieve?',
                                'Out