In [2]:
import os
import snowflake.connector
from pdfminer.high_level import extract_text

# Snowflake connection configuration
SNOWFLAKE_CONFIG = {
    "user": "ssalvi",
    "password": "Suyash!1998",
    "account": "jzsqwus-ywb74626",
    # "warehouse": "YOUR_WAREHOUSE",
    "database": "LEARNING_ASSISTANT",
    "schema": "PUBLIC",
}

# Path to your PDFs folder
PDF_FOLDER = "../content/pdfs/"

# Connect to Snowflake
def connect_to_snowflake():
    return snowflake.connector.connect(**SNOWFLAKE_CONFIG)

# Extract text from a PDF
def extract_text_from_pdf(pdf_path):
    try:
        return extract_text(pdf_path)
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

# Load PDF data into Snowflake
def load_pdfs_to_snowflake():
    conn = connect_to_snowflake()
    cursor = conn.cursor()

    try:
        files = os.listdir(PDF_FOLDER)
        for file in files:
            if file.endswith(".pdf"):
                file_path = os.path.join(PDF_FOLDER, file)
                content = extract_text_from_pdf(file_path)
                
                # Insert into Snowflake
                cursor.execute("""
                    INSERT INTO educational_content (id, title, content)
                    VALUES (%s, %s, %s)
                """, (file, file.replace(".pdf", ""), content))
                print(f"Uploaded: {file}")
    finally:
        cursor.close()
        conn.close()

if __name__ == "__main__":
    load_pdfs_to_snowflake()

Uploaded: Redis Microservices.pdf
Uploaded: Analytics Stack Guidebook.pdf
Uploaded: Node.js Design Patterns 3rd Edition.pdf
Uploaded: Redis For Dummies Limited Edition.pdf
Uploaded: Caching at Scale with Redis Dec 2021.pdf
Uploaded: Caching for Microservices Brief.pdf
Error extracting text from ../content/pdfs/Practical Node.js 2nd Edition.pdf: unpack requires a buffer of 30 bytes
Uploaded: Practical Node.js 2nd Edition.pdf
Uploaded: Designing Data-Intensive Applications.pdf
Uploaded: Hands On JavaScript High Performance.pdf
Uploaded: Microservices Building Scalable Software.pdf
Uploaded: Node.js High Performance.pdf
Uploaded: Mastering Node.js.pdf
Uploaded: Object Oriented Design Course Notes.pdf
Uploaded: System Design Interview An Insider’s Guide by Alex Xu (z-lib.org).pdf
Uploaded: oreilly-technical-guide-understanding-etl.pdf
Uploaded: Mastering Kubernetes 3rd Edition.pdf


In [5]:
from transformers import AutoTokenizer, AutoModel
import torch
import os
from pdfminer.high_level import extract_text
import snowflake.connector
import json

# Snowflake Configuration
SNOWFLAKE_CONFIG = {
    "user": "ssalvi",
    "password": "Suyash!1998",
    "account": "jzsqwus-ywb74626",
    # "warehouse": "YOUR_WAREHOUSE",
    "database": "LEARNING_ASSISTANT",
    "schema": "PUBLIC",
}

# Initialize embedding model
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Generate embeddings
def generate_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings[0].numpy()

# Extract text and generate embeddings for each PDF
PDF_FOLDER = "../content/pdfs/"
# def process_pdfs():
#     conn = snowflake.connector.connect(**SNOWFLAKE_CONFIG)
#     cursor = conn.cursor()

#     files = os.listdir(PDF_FOLDER)
#     for file in files:
#         if file.endswith(".pdf"):
#             file_path = os.path.join(PDF_FOLDER, file)
#             content = extract_text(file_path)

#             # Generate embedding
#             embedding = generate_embedding(content)

#             # Insert into Snowflake
#             cursor.execute("""
#                 INSERT INTO educational_content (id, title, content, embedding)
#                 VALUES (%s, %s, %s, %s)
#             """, (file, file.replace(".pdf", ""), content, embedding.tolist()))

#     cursor.close()
#     conn.close()

import base64
import numpy as np

def extract_text_from_pdf(pdf_path):
    try:
        return extract_text(pdf_path)
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None

def process_pdfs():
    conn = snowflake.connector.connect(**SNOWFLAKE_CONFIG)
    cursor = conn.cursor()

    try:
        files = os.listdir(PDF_FOLDER)
        for file in files:
            if file.endswith(".pdf"):
                file_path = os.path.join(PDF_FOLDER, file)
                content = extract_text_from_pdf(file_path)
                
                # Skip if content extraction failed
                if content is None:
                    print(f"Skipping {file} due to extraction error")
                    continue

                try:
                    # Generate embedding and convert to compressed binary
                    embedding = generate_embedding(content)
                    embedding_bytes = base64.b64encode(embedding.tobytes()).decode('utf-8')

                    # Insert into Snowflake
                    cursor.execute("""
                        INSERT INTO educational_content (id, title, content, embedding)
                        VALUES (%s, %s, %s, %s)
                    """, (file, file.replace(".pdf", ""), content, embedding_bytes))
                    print(f"Processed: {file}")
                except Exception as e:
                    print(f"Error processing {file}: {e}")
                    continue
    finally:
        cursor.close()
        conn.close()
process_pdfs()

Processed: Redis Microservices.pdf
Processed: Analytics Stack Guidebook.pdf
Processed: Node.js Design Patterns 3rd Edition.pdf
Processed: Redis For Dummies Limited Edition.pdf
Processed: Caching at Scale with Redis Dec 2021.pdf
Processed: Caching for Microservices Brief.pdf
Error extracting text from ../content/pdfs/Practical Node.js 2nd Edition.pdf: unpack requires a buffer of 30 bytes
Skipping Practical Node.js 2nd Edition.pdf due to extraction error
Processed: Designing Data-Intensive Applications.pdf
Processed: Hands On JavaScript High Performance.pdf
Processed: Microservices Building Scalable Software.pdf
Processed: Node.js High Performance.pdf
Processed: Mastering Node.js.pdf
Processed: Object Oriented Design Course Notes.pdf
Processed: System Design Interview An Insider’s Guide by Alex Xu (z-lib.org).pdf
Processed: oreilly-technical-guide-understanding-etl.pdf
Processed: Mastering Kubernetes 3rd Edition.pdf
