In [72]:
from dotenv import load_dotenv
from google import genai
from google.genai import types
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
load_dotenv()

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

In [6]:
client = genai.Client(api_key=GOOGLE_API_KEY)
for m in client.models.list():
    if "embedContent" in m.supported_actions:
        print(f"Model: {m.name}")

Model: models/embedding-001
Model: models/text-embedding-004
Model: models/gemini-embedding-exp-03-07
Model: models/gemini-embedding-exp


### Texts

In [46]:
text = open("data/Psychology2e_WEB_pdfminer_trimmed.txt", "rb").readlines()

In [59]:
sub_heading = set()
is_heading = False
for line in text:
    if line == b"CHAPTER OUTLINE\r\n":
        is_heading = True
    elif line == b"\r\n":
        is_heading = False
    
    if is_heading and line != b"CHAPTER OUTLINE\r\n":
        sub_heading.add(line)


In [74]:
current_heading = b"Introduction to Psychology"
text_dictionary = {current_heading: b""}
for header in sub_heading:
    text_dictionary[header] = header

for line in text:
    if line not in sub_heading: 
        text_dictionary[current_heading] += line
    else: 
        current_heading = line

In [None]:
context_texts = list(text_dictionary.values())
encoded_texts = client.models.embed_content(
    model="models/text-embedding-004",
    contents=context_texts,
    config=types.EmbedContentConfig(task_type="semantic_similarity")
)

In [106]:
pandas_text = pd.DataFrame(text_dictionary.items())
encoded_values = pd.Series([e.values for e in encoded_texts.embeddings])
pandas_text = pd.concat([pandas_text, encoded_values], axis=1, ignore_index=True)
pandas_text.columns = ["heading", "text", "embedding"]
pandas_text

Unnamed: 0,heading,text,embedding
0,b'Introduction to Psychology',b'Introduction to Psychology\r\n\r\n1\r\n\r\nF...,"[-0.07546733, 0.041844938, -0.049030498, -0.00..."
1,b'2.2 Approaches to Research\r\n',"b""2.2 Approaches to Research\r\nLEARNING OBJEC...","[-0.036459424, -0.009044605, -0.05252878, 0.05..."
2,b'15.1 What Are Psychological Disorders?\r\n',"b""15.1 What Are Psychological Disorders?\r\nLE...","[0.000678227, 0.040446203, -0.07473351, -0.025..."
3,b'11.9 Personality Assessment\r\n',b'11.9 Personality Assessment\r\n\r\nThree mon...,"[-0.0014185614, 0.015965123, -0.03640059, 0.00..."
4,b'15.5 Obsessive-Compulsive and Related Disord...,b'15.5 Obsessive-Compulsive and Related Disord...,"[-0.015047858, 0.027823124, -0.030617189, -0.0..."
...,...,...,...
84,b'8.3 Problems with Memory\r\n',"b""8.3 Problems with Memory\r\nLEARNING OBJECTI...","[0.029385079, -0.0054541607, -0.061692953, -0...."
85,b'12.3 Attitudes and Persuasion\r\n',b'12.3 Attitudes and Persuasion\r\nLEARNING OB...,"[-0.01957869, 0.022944044, -0.047297202, 0.009..."
86,b'1.4 Careers in Psychology\r\n',b'1.4 Careers in Psychology\r\n\r\nINTRODUCTIO...,"[0.028797848, 0.00783937, -0.06044895, 0.02265..."
87,b'12.5 Prejudice and Discrimination\r\n',b'12.5 Prejudice and Discrimination\r\nLEARNIN...,"[-0.018282274, -0.008219105, -0.020435264, 0.0..."


```python
changed_text = [""]
index = 0
for line in tqdm(text):
    if line == b"\r\n":
        index += 1
        changed_text.append("")
    else:
        changed_text[index] += line.decode("utf-8").strip() + " "
test_changed = changed_text[0:200]
```

### Queries

In [112]:
queries = pd.read_json("Data/queries.json")
queries

Unnamed: 0,query_id,question
0,1,What is the scientific method in psychology?
1,2,What are the basic parts of a neuron?
2,3,What are the stages of sleep?
3,4,What is operant conditioning?
4,5,What is problem-solving in psychology?
5,6,What are the three stages of memory?
6,7,What are the key components of emotion?
7,8,What are the major personality traits in the F...
8,9,What is social psychology?
9,10,What is the sociocultural model in therapy?


In [113]:
context_queries = list(queries["question"])
encoded_queries = client.models.embed_content(
    model="models/text-embedding-004",
    contents=context_queries,
    config=types.EmbedContentConfig(task_type="semantic_similarity")
)

In [115]:
query_embedding = pd.Series([q.values for q in encoded_queries.embeddings], name="embedding")
query_dataset = pd.concat([queries, query_embedding], axis=1)
query_dataset

Unnamed: 0,query_id,question,embedding
0,1,What is the scientific method in psychology?,"[-0.04588239, -0.0069801663, -0.060324635, 0.0..."
1,2,What are the basic parts of a neuron?,"[-0.012382562, 0.0035106149, -0.06658548, -0.0..."
2,3,What are the stages of sleep?,"[-0.009869246, 0.044050507, -0.07744952, 0.005..."
3,4,What is operant conditioning?,"[-0.0537858, 0.059052367, -0.010571696, 0.0234..."
4,5,What is problem-solving in psychology?,"[-0.025538992, -0.010091021, -0.032606736, -0...."
5,6,What are the three stages of memory?,"[-0.005208647, 0.023910541, -0.06945862, 0.007..."
6,7,What are the key components of emotion?,"[0.02735984, 0.0073419423, -0.019886509, -0.03..."
7,8,What are the major personality traits in the F...,"[-0.02642403, 0.04769592, -0.013481175, -0.013..."
8,9,What is social psychology?,"[-0.03505938, 0.045698382, -0.036667515, -0.00..."
9,10,What is the sociocultural model in therapy?,"[-0.023964964, 0.02008327, -0.043191914, 8.846..."


### Query vs Embedding

In [124]:
query_data = np.array([np.array(q) for q in query_dataset["embedding"]])
text_data = np.array([np.array(t) for t in pandas_text["embedding"]])
similarity = np.dot(query_data, text_data.T)
similarity.shape

(50, 89)

In [None]:
import seaborn as sns

