In [1]:
import pandas as pd

In [24]:
# Play to process; the input TSV, the embeddings CSV and the FAISS index
# folder names are all derived from this value.
drama = "othello"

In [16]:
# Read the tab-separated speeches of the selected play into a DataFrame.
paragraphs = pd.read_csv(f"{drama}.tsv", delimiter="\t")

In [17]:
# Preview the first five speeches to check the columns that were loaded.
paragraphs.head(n=5)

Unnamed: 0,speech,created_at,character
0,"Tush, never tell me! I take it much unkindly T...",946684800000,#Roderigo_Oth
1,"’Sblood, but you’ll not hear me! If ever I did...",946684800001,#Iago_Oth
2,Thou toldst me thou didst hold him in thy hate.,946684800002,#Roderigo_Oth
3,Despise me If I do not. Three great ones of th...,946684800003,#Iago_Oth
4,"By heaven, I rather would have been his hangman.",946684800004,#Roderigo_Oth


In [18]:
# Start the sentence-level table from an independent copy so that
# `paragraphs` itself is never modified.
sentences = paragraphs.copy(deep=True)

In [19]:
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the Punkt sentence-tokenizer models (skipped when already up to date).
nltk.download('punkt')

# Split every speech into sentences and give each sentence its own row.
# The result is derived from `paragraphs` without any in-place mutation, so the
# cell can be re-run safely; mutating `sentences` in place made a second run
# fail with KeyError because the 'speech' column had already been dropped.
# Rows keep the index label of the speech they came from, so sentences of the
# same speech share an index value.
sentences = (
    paragraphs
    .assign(speech_sent=paragraphs['speech'].apply(sent_tokenize))
    .explode('speech_sent')
    # A speech that yields no sentences explodes to NaN; drop those rows so
    # only real strings reach the embedding step.
    .dropna(subset=['speech_sent'])
    # The full speech text is no longer needed once it has been split.
    .drop(columns=['speech'])
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zsomk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
# Display the exploded table: one row per sentence, with sentences from the
# same speech sharing that speech's index label.
sentences

Unnamed: 0,created_at,character,speech_sent
0,946684800000,#Roderigo_Oth,"Tush, never tell me!"
0,946684800000,#Roderigo_Oth,"I take it much unkindly That thou, Iago, who h..."
1,946684800001,#Iago_Oth,"’Sblood, but you’ll not hear me!"
1,946684800001,#Iago_Oth,"If ever I did dream of such a matter, Abhor me."
2,946684800002,#Roderigo_Oth,Thou toldst me thou didst hold him in thy hate.
...,...,...,...
1181,946684801181,#Lodovico_Oth,"Let it be hid.—Gratiano, keep the house, And s..."
1181,946684801181,#Lodovico_Oth,To Cassio.
1181,946684801181,#Lodovico_Oth,"To you, lord governor, Remains the censure of ..."
1181,946684801181,#Lodovico_Oth,"The time, the place, the torture, O, enforce it."


In [21]:
from langchain.document_loaders.dataframe import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

# Checkpoint used for every embedding in this notebook
# (all-mpnet-base-v2 is the noted alternative).
embedding_model_name = "all-MiniLM-L6-v2"
embedder = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [22]:
# Embed every sentence, producing one vector per row of `sentences`.
# A plain list is passed because `embed_documents` is documented to take a
# list of strings; the Series carries a non-unique index after `explode`,
# which should not be handed to the embedder.
sentence_embeddings = embedder.embed_documents(sentences['speech_sent'].tolist())

In [25]:
# Attach one embedding vector to each sentence row, then persist the table
# (index included) as a CSV named after the play.
sentences = sentences.assign(embeddings=sentence_embeddings)
embeddings_csv_path = f'{drama}_embeddings.csv'
sentences.to_csv(embeddings_csv_path)

In [39]:
# Inspect the embedding vector of the first sentence. Indexing the list
# directly yields the same value as zipping every sentence with its vector and
# taking element [0][1], without materialising the whole list of pairs.
sentence_embeddings[0]

[0.00986724067479372,
 -0.007621891796588898,
 0.03612726926803589,
 -0.01146914716809988,
 0.03965059667825699,
 -0.022295786067843437,
 0.05211757868528366,
 -0.05252056568861008,
 0.012287657707929611,
 0.05216572433710098,
 -0.0008633207180537283,
 -0.10315906256437302,
 0.006650043185800314,
 -0.09787499159574509,
 -0.10406609624624252,
 -0.031761251389980316,
 -0.0020388669800013304,
 -0.07604312151670456,
 -0.005888249259442091,
 -0.01777534931898117,
 -0.047719556838274,
 0.024277588352560997,
 -0.009662052616477013,
 0.005540301091969013,
 -0.020119033753871918,
 -0.03749542310833931,
 0.023493831977248192,
 0.030902134254574776,
 -0.144348606467247,
 -0.05813363194465637,
 0.0007006678497418761,
 0.040237609297037125,
 -0.019360585138201714,
 0.06185862794518471,
 -0.03298897668719292,
 0.05486650392413139,
 0.034325599670410156,
 0.011245517991483212,
 0.037520669400691986,
 0.0002799519570544362,
 -0.05343814939260483,
 -0.004747528117150068,
 0.02648722380399704,
 0.043028

In [45]:

# Pair each sentence with its precomputed vector and build the FAISS index
# from those (text, vector) pairs.
sentence_texts = sentences['speech_sent'].values
text_embedding_pairs = list(zip(sentence_texts, sentence_embeddings))
db = FAISS.from_embeddings(text_embedding_pairs, embedder)

In [15]:
# Alternative construction: wrap each sentence row in a Document and let FAISS
# embed the text through `embedder`.
# DataFrameLoader copies every column other than the page-content column into
# the Document metadata, so the per-row 'embeddings' vectors stored on
# `sentences` are dropped first to keep them out of the metadata.
# errors='ignore' keeps the cell working when that column has not been added.
# NOTE: this embeds every sentence again and rebinds `db`, replacing any index
# previously built from the precomputed vectors.
document_frame = sentences.drop(columns=['embeddings'], errors='ignore')
raw_documents = DataFrameLoader(document_frame, 'speech_sent').load()
db = FAISS.from_documents(raw_documents, embedder)

In [46]:
# Persist the FAISS index to a folder on disk named after the play.
db.save_local(folder_path=f"{drama}.faiss_index")

In [4]:
# Reload the saved index from disk, using the same embedder for query vectors.
# NOTE(review): LangChain's FAISS loader is understood to unpickle the stored
# docstore, so only load index folders that were produced locally and are
# trusted. Confirm against the installed LangChain version.
db = FAISS.load_local(folder_path=f"{drama}.faiss_index", embeddings=embedder)

In [48]:
# Sanity check: query with a sentence that is itself in the index. In the
# recorded output that exact sentence comes back first with a score of ~0,
# so the score presumably is a distance where lower means more similar.
db.similarity_search_with_score(query='If ever I did dream of such a matter, Abhor me.')

[(Document(page_content='If ever I did dream of such a matter, Abhor me.'),
  3.0471697e-13),
 (Document(page_content='’Tis a shrewd doubt, though it be but a dream.'),
  1.0715976),
 (Document(page_content='This accident is not unlike my dream.'), 1.0934726),
 (Document(page_content='Pray heaven it be State matters, as you think, and no conception Nor no jealous toy concerning you.'),
  1.132489)]