# Populating Pinecone vector database with pdf data and databse entries

In [1]:
import os
from dotenv import load_dotenv
from dotenv import load_dotenv
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents.base import Document
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Index, Pinecone
from typing import List
import sqlite3

load_dotenv()
pinecone_api_key = os.getenv('PINECONE_API_KEY')
openai_api_key = os.getenv("OPENAI_API_KEY")

  from tqdm.autonotebook import tqdm


## Loading pdfs

In [2]:
load_dotenv
def get_text_from_pdf(pdf_file: str) -> List[Document]:
    """
    Extracts text from a PDF file and returns it as a list of Document objects.

    Args:
        pdf_file (str): Path to the PDF file to be processed.

    Returns:
        List[Document]: A list of Document objects, where each represents a page in the PDF.
    """
    # Initialize the PyMuPDFLoader with the given PDF file
    loader = PyMuPDFLoader(pdf_file)

    # Initialize an empty list to store the pages
    pages: List[Document] = []

    # Iterate over each loaded page and add it to the list
    for page in loader.load():
        pages.append(page)

    # Return the list of extracted pages
    return pages

In [5]:
pdf_files = [f for f in os.listdir("pdfs") if f.endswith(".pdf")]
pdf_files

['13.1_GreenbergSafran_1984_EmotionPsychotherapy.pdf',
 'About Squeak to Speak.pdf',
 'Dangerous routines alternatives.pdf',
 'Science-of-Emotion-Guide-UWA.pdf']

In [20]:
# Initialize a list to store all extracted documents
docs: List[Document] = []

# Extract text from each PDF file and add to the docs list
for pdf_file in pdf_files:
    docs.extend(get_text_from_pdf(pdf_file))

docs

[Document(metadata={'source': '13.1_GreenbergSafran_1984_EmotionPsychotherapy.pdf', 'file_path': '13.1_GreenbergSafran_1984_EmotionPsychotherapy.pdf', 'page': 0, 'total_pages': 11, 'format': 'PDF 1.4', 'title': 'amp44010019.tif', 'author': '', 'subject': '', 'keywords': '', 'creator': 'amp44010019.tif', 'producer': 'PageGenie PDFGenerator', 'creationDate': 'Sat Sep 22 10:00:58 2001', 'modDate': "D:20041004110424-04'00'", 'trapped': ''}, page_content='Emotion in Psychotherapy \nLeslie S. Greenberg \nJeremy D. Safran \nYork University \nClarke Institute of Psychiatry \nABSTRACT." The therapeutic process involves many dif- \nferent types of affective phenomena. No single therapeutic \nperspective has been able to encompass within its own \ntheoretical framework all the ways in which emotion plays \na role in therapeutic change. A comprehensive, constructive \ntheory of emotion helps transcend the differences in the \ntherapeutic schools by viewing emotion as a complex syn- \nthesis of exp

Splitting into chunks before embedding

In [21]:
text_splitter = RecursiveCharacterTextSplitter(
    separators="\n",  # Split on newlines
    chunk_size=1100,  # Maximum size of each chunk
    chunk_overlap=300,  # Overlap between chunks to preserve context
    add_start_index=True,  # Include the starting index of each chunk
)

all_splits = text_splitter.split_documents(docs)

In [22]:
print(all_splits[0].page_content)

Emotion in Psychotherapy 
Leslie S. Greenberg 
Jeremy D. Safran 
York University 
Clarke Institute of Psychiatry 
ABSTRACT." The therapeutic process involves many dif- 
ferent types of affective phenomena. No single therapeutic 
perspective has been able to encompass within its own 
theoretical framework all the ways in which emotion plays 
a role in therapeutic change. A comprehensive, constructive 
theory of emotion helps transcend the differences in the 
therapeutic schools by viewing emotion as a complex syn- 
thesis of expressive motor, schematic, and conceptual in- 
formation that provides organisms with information about 
their responses to situations that helps them orient adap- 
tively in the environment. In addition to improved theory, 
increased precision in the assessment of affective func- 
tioning in therapy, as well as greater specification of dif- 
ferent emotional change processes and means of facili- 
tating these, will allow the role of emotion in change to 
be studi

embedding

In [23]:
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")

embeddings = embeddings_model.embed_documents(
    [
        "Hi there!",
        "Oh, hello!",
        "What's your name?",
        "My friends call me World",
        "Hello World!"
    ]
)

len(embeddings), len(embeddings[0])

(5, 1536)

Inserting pdfs into db

In [24]:
pc = Pinecone()
index: Index = pc.Index("pdf-data")  

In [25]:
vector_store = PineconeVectorStore(
    index=index, embedding=OpenAIEmbeddings(model="text-embedding-3-small")
)

In [26]:
ids = [str(i) for i in range(len(all_splits))]

# Add the documents and their embeddings to the vector store
vector_store.add_documents(documents=all_splits, ids=ids)

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138'

## Loading user journal entries

In [2]:
import sqlite3
import sys
                                             
db_file = 'database\squeaktospeak_db.db'

# Connect to the SQLite database
conn = sqlite3.connect(db_file)

# Create a cursor object to interact with the database
cursor = conn.cursor()
cursor.execute("SELECT * FROM Journal")

# Fetch all rows of the query result
rows = cursor.fetchall()

# Loop through and print each row (if any rows are found)
for row in rows:
    print(row)

# Close the connection
conn.close()

(1, 21, 'Sometimes, it feels like no one understands what I’m going through, but writing helps.', '2024-02-03', '17:28:00', 0)
(2, 26, 'Cooked dinner with my roommates for the first time. It felt nice to share a meal together.', '2024-09-27', '03:13:00', 1)
(3, 6, 'The sunset was breathtaking. I took a moment to just breathe and appreciate the beauty around me.', '2023-08-07', '00:27:00', 1)
(4, 45, 'Feeling proud of the small progress I made today. Every little step counts.', '2023-12-18', '21:56:00', 1)
(5, 11, 'Feeling stuck creatively today. Maybe I need to step away for a bit and recharge.', '2023-02-19', '01:39:00', 0)
(6, 42, 'Had my first therapy session. It feels like a step in the right direction.', '2023-11-02', '16:43:00', 1)
(7, 4, 'Grateful for a surprise call from a friend back home. It made my day!', '2023-02-02', '12:58:00', 1)
(8, 42, 'I keep doubting myself, but I know I have what it takes to succeed.', '2023-09-12', '10:57:00', 1)
(9, 27, 'There’s so much to do, but