In [23]:
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding
import openai
import os
from dotenv import load_dotenv
load_dotenv('../.env')
openai.api_key = os.getenv("OPENAI_API_KEY")



In [24]:
df = pd.read_json('./enneads.json')

In [25]:
df.head()

Unnamed: 0,enneadTitle,tractateTitle,sectionTitle,lines
0,THE FIRST ENNEAD,FIRST TRACTATE.,Section 1,"Section 1 \n 1. Pleasure and distress, fear an..."
1,THE FIRST ENNEAD,FIRST TRACTATE.,Section 2,Section 2 \n 2. This first enquiry obliges us ...
2,THE FIRST ENNEAD,FIRST TRACTATE.,Section 3,Section 3 \n 3. We may treat of the Soul as in...
3,THE FIRST ENNEAD,FIRST TRACTATE.,Section 4,"Section 4 \n 4. Let us consider, then, the hyp..."
4,THE FIRST ENNEAD,FIRST TRACTATE.,Section 5,Section 5 \n 5. Now this Animate might be mere...


In [26]:
df["combined"] = (
    "Plotinus:\n" + df.enneadTitle.str.strip() + "\nTractate Title: " + df.tractateTitle.str.strip() 
    # add sectionTitle and lines
    + "\n" + df.lines.str.strip()
)

In [27]:
df.head()

Unnamed: 0,enneadTitle,tractateTitle,sectionTitle,lines,combined
0,THE FIRST ENNEAD,FIRST TRACTATE.,Section 1,"Section 1 \n 1. Pleasure and distress, fear an...",Plotinus:\nTHE FIRST ENNEAD\nTractate Title: F...
1,THE FIRST ENNEAD,FIRST TRACTATE.,Section 2,Section 2 \n 2. This first enquiry obliges us ...,Plotinus:\nTHE FIRST ENNEAD\nTractate Title: F...
2,THE FIRST ENNEAD,FIRST TRACTATE.,Section 3,Section 3 \n 3. We may treat of the Soul as in...,Plotinus:\nTHE FIRST ENNEAD\nTractate Title: F...
3,THE FIRST ENNEAD,FIRST TRACTATE.,Section 4,"Section 4 \n 4. Let us consider, then, the hyp...",Plotinus:\nTHE FIRST ENNEAD\nTractate Title: F...
4,THE FIRST ENNEAD,FIRST TRACTATE.,Section 5,Section 5 \n 5. Now this Animate might be mere...,Plotinus:\nTHE FIRST ENNEAD\nTractate Title: F...


In [28]:
# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191


In [29]:
# subsample to 1k most recent reviews and remove samples that are too long
top_n = 1000

encoding = tiktoken.get_encoding(embedding_encoding)

# omit reviews that are too long to embed
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
without_too_long = df[df.n_tokens <= max_tokens].tail(top_n)

In [30]:
first_ennead = df[df.enneadTitle == "THE FIRST ENNEAD"].head()
first_ennead

Unnamed: 0,enneadTitle,tractateTitle,sectionTitle,lines,combined,n_tokens
0,THE FIRST ENNEAD,FIRST TRACTATE.,Section 1,"Section 1 \n 1. Pleasure and distress, fear an...",Plotinus:\nTHE FIRST ENNEAD\nTractate Title: F...,254
1,THE FIRST ENNEAD,FIRST TRACTATE.,Section 2,Section 2 \n 2. This first enquiry obliges us ...,Plotinus:\nTHE FIRST ENNEAD\nTractate Title: F...,589
2,THE FIRST ENNEAD,FIRST TRACTATE.,Section 3,Section 3 \n 3. We may treat of the Soul as in...,Plotinus:\nTHE FIRST ENNEAD\nTractate Title: F...,459
3,THE FIRST ENNEAD,FIRST TRACTATE.,Section 4,"Section 4 \n 4. Let us consider, then, the hyp...",Plotinus:\nTHE FIRST ENNEAD\nTractate Title: F...,618
4,THE FIRST ENNEAD,FIRST TRACTATE.,Section 5,Section 5 \n 5. Now this Animate might be mere...,Plotinus:\nTHE FIRST ENNEAD\nTractate Title: F...,674


In [31]:
df["embedding"] = df.combined.apply(lambda x: get_embedding(x, engine=embedding_model))
df.to_json("embeddings.json")


In [64]:
df.head()

Unnamed: 0,enneadTitle,tractateTitle,sectionTitle,lines,combined,n_tokens,embedding
0,THE FIRST ENNEAD,FIRST TRACTATE.,Section 1,"Section 1 \n 1. Pleasure and distress, fear an...",Plotinus:\nTHE FIRST ENNEAD\nTractate Title: F...,254,"[-0.005241681821644306, -0.011729962192475796,..."
1,THE FIRST ENNEAD,FIRST TRACTATE.,Section 2,Section 2 \n 2. This first enquiry obliges us ...,Plotinus:\nTHE FIRST ENNEAD\nTractate Title: F...,589,"[-0.004125811625272036, 0.0019692867062985897,..."
2,THE FIRST ENNEAD,FIRST TRACTATE.,Section 3,Section 3 \n 3. We may treat of the Soul as in...,Plotinus:\nTHE FIRST ENNEAD\nTractate Title: F...,459,"[-0.010390679351985455, -0.0018806734587997198..."
3,THE FIRST ENNEAD,FIRST TRACTATE.,Section 4,"Section 4 \n 4. Let us consider, then, the hyp...",Plotinus:\nTHE FIRST ENNEAD\nTractate Title: F...,618,"[-0.010853996500372887, -0.007171030156314373,..."
4,THE FIRST ENNEAD,FIRST TRACTATE.,Section 5,Section 5 \n 5. Now this Animate might be mere...,Plotinus:\nTHE FIRST ENNEAD\nTractate Title: F...,674,"[-0.017799487337470055, -0.0033750722650438547..."
...,...,...,...,...,...,...,...
647,THE SIXTH ENNEAD,NINTH TRACTATE.,Section 6,"Section 6 \n 6. In what sense, then, do we ass...",Plotinus:\nTHE SIXTH ENNEAD\nTractate Title: N...,936,"[-0.005158691667020321, 0.016849299892783165, ..."
648,THE SIXTH ENNEAD,NINTH TRACTATE.,Section 7,Section 7 \n 7. If the mind reels before somet...,Plotinus:\nTHE SIXTH ENNEAD\nTractate Title: N...,468,"[0.009452428668737411, -0.001767597277648747, ..."
649,THE SIXTH ENNEAD,NINTH TRACTATE.,Section 8,Section 8 \n 8. Every soul that knows its hist...,Plotinus:\nTHE SIXTH ENNEAD\nTractate Title: N...,667,"[-0.001969702076166868, 0.0037387553602457047,..."
650,THE SIXTH ENNEAD,NINTH TRACTATE.,Section 9,"Section 9 \n 9. In this choiring, the soul loo...",Plotinus:\nTHE SIXTH ENNEAD\nTractate Title: N...,926,"[-0.0009392640204168856, -0.000636529468465596..."


In [63]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib
import numpy as np


datafile_path = "./embeddings.json"
from_json = pd.read_csv(datafile_path).sample(50)

# Convert to a list of lists of floats
matrix = np.array(from_json.embedding.apply(eval).to_list())

# Create a t-SNE model and transform the data
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
vis_dims = tsne.fit_transform(matrix)
vis_dims.shape


ValueError: a must be greater than 0 unless no samples are taken