In [None]:
from langchain.text_splitter import SpacyTextSplitter
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably these are the API keys read by the embedding/LLM clients below — confirm.
load_dotenv()

# Read the whole input document into memory. The encoding is pinned so the
# result does not depend on the platform's default locale encoding.
# NOTE(review): assumes sample.txt is UTF-8 — confirm.
with open('sample.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [None]:
# Split the document into chunks with a target size of 100 and no overlap.
# max_length is set to the document length
# (NOTE(review): presumably so spaCy accepts the whole text in one pass — confirm).
splitter = SpacyTextSplitter(
    chunk_size=100,
    chunk_overlap=0,
    separator='.',
    max_length=len(text),
)

texts = splitter.split_text(text)


# Embed texts

In [None]:
from langchain_community.embeddings import CohereEmbeddings

# Embedding client used for the leaf texts and cluster summaries in the following cells.
# NOTE(review): constructed without arguments — presumably credentials come from the environment; confirm.
embeddings = CohereEmbeddings()

In [None]:
# Embed only the first 10 chunks; the frame built from `embedded_docs` therefore has 10 rows.
embedded_docs = embeddings.embed_documents(texts[:10])

In [None]:
import pandas as pd
# Pair every embedded chunk with its source text, keyed by position.
d = {idx: [texts[idx], vec] for idx, vec in enumerate(embedded_docs)}

# One row per chunk: the text in 'leaf_text', its vector in 'leaf_embeddings'.
df = pd.DataFrame(d).T.rename(columns={0: 'leaf_text', 1: 'leaf_embeddings'})
df.head()

# Cluster texts

In [None]:
from sklearn.cluster import KMeans
# Group the leaf embeddings into 3 clusters (fixed seed) and store each row's label.
model = KMeans(n_clusters=3, random_state=42).fit(df['leaf_embeddings'].tolist())
df['cluster_1'] = model.labels_
df

# Summarize Clusters

In [None]:
from langchain_community.llms import Cohere

# Completion model used to summarise the clusters in this walkthrough.
llm = Cohere(temperature=0.1)

# Summarisation prompt template; the `{paragraph}` placeholder is filled with
# str.format before the prompt is sent to the model.
prompt = """
            You are an AI assitant. You are helping with the task of summarization. 
            Below you will find a text fragment, Tell me what is this about. Be as concise as possible with your summary.

            ANSWER ONLY WITH THE SUMMARY. DO NOT INCLUDE THE ORIGINAL TEXT, DO NOT ASK QUESTIONS OR SUGGESTIONS.
            ###

            {paragraph}"""


# For every level-1 cluster: join its leaf texts with commas, ask the model for
# a summary, and remember it under the cluster label.
cluster_summaries = {
    label: llm.invoke(
        prompt.format(
            paragraph=','.join(df.loc[df['cluster_1'] == label, 'leaf_text'].tolist())
        )
    )
    for label in df['cluster_1'].unique()
}

# Attach each row's cluster summary.
df['cluster_1_summary'] = df['cluster_1'].map(cluster_summaries)

In [None]:
# Embed each distinct level-1 summary (one request per summary), keyed by the summary text.
cluster_summaries_embedded = {
    summary: embeddings.embed_documents([summary])[0]
    for summary in df['cluster_1_summary'].unique()
}

# Embed Cluster Summary

In [None]:
# Look up every row's summary in the summary -> vector mapping so each leaf row
# carries the embedding of its cluster summary.
df['cluster_1_summary_embeddings'] = df['cluster_1_summary'].map(cluster_summaries_embedded)
df

# Cluster Summaries

In [None]:
# Second level: cluster the rows by the embedding of their level-1 summary (2 clusters, fixed seed).
model = KMeans(n_clusters=2, random_state=42).fit(df['cluster_1_summary_embeddings'].tolist())
df['cluster_2'] = model.labels_
df

In [None]:
# Summarise each level-2 cluster from the level-1 summaries of its members.
# Every leaf row carries its cluster's level-1 summary, so the column repeats the
# same summary once per leaf; duplicates are dropped (first-seen order kept) so
# the model is not fed the same summary several times.
cluster_summaries = {}
for i in df['cluster_2'].unique():
    member_summaries = df.loc[df['cluster_2'] == i, 'cluster_1_summary'].drop_duplicates()
    cluster_contents = ','.join(member_summaries.tolist())
    cluster_summary = llm.invoke(prompt.format(paragraph=cluster_contents))
    cluster_summaries[i] = cluster_summary

# Attach each row's level-2 summary.
df['cluster_2_summary'] = df['cluster_2'].map(cluster_summaries)

In [None]:
# Embed each distinct level-2 summary (one request per summary), then map the
# vectors back onto the rows.
cluster_summaries_embedded = {
    summary: embeddings.embed_documents([summary])[0]
    for summary in df['cluster_2_summary'].unique()
}
df['cluster_2_summary_embeddings'] = df['cluster_2_summary'].map(cluster_summaries_embedded)
df

In [None]:
from sklearn.cluster import KMeans
# Redefines `prompt` with the same wording as the earlier summarisation prompt
# (only the indentation inside the string differs); `{paragraph}` is the
# placeholder filled with str.format.
prompt = """
        You are an AI assitant. You are helping with the task of summarization. 
        Below you will find a text fragment, Tell me what is this about. Be as concise as possible with your summary.

        ANSWER ONLY WITH THE SUMMARY. DO NOT INCLUDE THE ORIGINAL TEXT, DO NOT ASK QUESTIONS OR SUGGESTIONS.
        ###

        {paragraph}"""

In [None]:
def _response_text(response):
    """Return the text of an LLM response.

    Chat models return a message object exposing ``.content``; completion
    models return a plain string. Both are accepted.
    """
    return getattr(response, 'content', response)


def cluster_and_summarize(texts, n_summaries, embeddings, llm, prompt, n_clusters=5):
    """Build a multi-level tree of cluster summaries over ``texts``.

    Level 1 clusters the embeddings of the leaf texts and summarises each
    cluster. Every following level clusters the embeddings of the previous
    level's summaries and summarises those summaries, mirroring the manual
    step-by-step cells of this notebook.

    Args:
        texts: Sequence of leaf text chunks.
        n_summaries: Maximum number of summary levels to build.
        embeddings: Object with ``embed_documents(list[str]) -> list[vector]``.
        llm: Object with ``invoke(str)`` returning either a string or a
            message object with a ``.content`` attribute.
        prompt: Template containing a ``{paragraph}`` placeholder.
        n_clusters: Number of clusters requested at the first level; it is
            reduced towards 2 on each following level.

    Returns:
        A DataFrame with one row per leaf text and the columns ``text``,
        ``embeddings`` and, for each level ``k`` that was built,
        ``cluster_k``, ``cluster_k_summary`` and
        ``cluster_k_summary_embeddings``. Fewer than ``n_summaries`` levels
        are produced when the cluster count stops decreasing. An empty
        ``texts`` yields an empty frame with only ``text`` and ``embeddings``.
    """
    texts = list(texts)
    if not texts:
        # Nothing to embed or cluster.
        return pd.DataFrame({'text': [], 'embeddings': []})

    df = pd.DataFrame({'text': texts,
                       'embeddings': list(embeddings.embed_documents(texts))})

    # Step by which the requested cluster count shrinks from level to level.
    decrement = (n_clusters - 2) / (n_summaries - 1) if n_summaries > 1 else 0

    # Columns holding the texts/vectors that the *current* level clusters.
    text_col, emb_col = 'text', 'embeddings'

    prev_clusters = n_clusters
    for level in range(1, n_summaries + 1):
        cluster_col = f'cluster_{level}'
        summary_col = f'{cluster_col}_summary'
        summary_emb_col = f'{summary_col}_embeddings'

        # KMeans cannot form more clusters than there are distinct items.
        k = max(1, min(n_clusters, df[text_col].nunique()))
        model = KMeans(n_clusters=k, random_state=42).fit(df[emb_col].tolist())
        df[cluster_col] = model.labels_

        cluster_summaries = {}
        for label in df[cluster_col].unique():
            # Above level 1 the same summary appears once per leaf row; keep one copy.
            members = df.loc[df[cluster_col] == label, text_col].drop_duplicates()
            cluster_contents = ','.join(members.tolist())
            response = llm.invoke(prompt.format(paragraph=cluster_contents))
            cluster_summaries[label] = _response_text(response)

        df[summary_col] = df[cluster_col].map(cluster_summaries)

        # Embed all distinct summaries of this level in a single request.
        unique_summaries = df[summary_col].drop_duplicates().tolist()
        summary_vectors = embeddings.embed_documents(unique_summaries)
        df[summary_emb_col] = df[summary_col].map(dict(zip(unique_summaries, summary_vectors)))

        # The next level works on this level's summaries.
        text_col, emb_col = summary_col, summary_emb_col

        n_clusters = max(2, round(n_clusters - decrement))
        if n_clusters == prev_clusters:
            # The cluster count can no longer shrink; further levels would repeat.
            break
        prev_clusters = n_clusters

    return df

In [None]:
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings

# Rebinds `llm` and `embeddings` (previously bound to the Cohere clients) to
# OpenAI clients; any cell executed after this one sees the OpenAI versions.
llm = ChatOpenAI(temperature=0.1,model='gpt-3.5-turbo-16k')
embeddings = OpenAIEmbeddings()

# Run the clustering/summarisation helper on the first 20 chunks, asking for
# up to 5 summary levels with the default starting cluster count.
df2 = cluster_and_summarize(texts[:20], 5, embeddings, llm, prompt)

### Create Document objects with cluster-summary metadata

In [None]:
# Preview the first rows of the frame returned by cluster_and_summarize.
df2.head()

In [48]:

# Columns of df2 that hold text at each tree level: the leaf 'text' column
# followed by every column whose name ends in '_summary'. Computed here so the
# cell does not rely on a name that otherwise only exists as a local variable
# inside preprocess_texts.
text_cols = ['text'] + [col for col in df2.columns if col.split('_')[-1] == 'summary']
text_cols

['text',
 'cluster_1_summary',
 'cluster_2_summary',
 'cluster_3_summary',
 'cluster_4_summary']

In [46]:
from langchain.schema.document import Document

def preprocess_texts(df):
    """Flatten a cluster/summary frame into one Document per tree node.

    The text columns are ``'text'`` (leaf level) followed by every column
    whose name ends in ``'_summary'``, in frame order. For a row, the value
    in text column ``i`` is a node and the value in column ``i + 1`` is its
    parent summary.

    Args:
        df: DataFrame with a ``'text'`` column and zero or more
            ``'*_summary'`` columns.

    Returns:
        A list of ``Document`` objects. Each document's content is the node
        text; its metadata holds ``'cluster_summary'`` (the parent summary)
        and ``'node_position'`` (the level index ``i``). Nodes in the last
        text column have no parent and get ``'root node'`` for both fields.
        A summary shared by several leaf rows is emitted only once.
    """
    text_cols = ['text'] + [col for col in df.columns if col.split('_')[-1] == 'summary']
    last_level = len(text_cols) - 1

    docs = []
    seen = set()  # (content, parent summary, position) of nodes already emitted
    for _, values in df[text_cols].iterrows():
        for i in range(len(text_cols)):
            # The node's own text at this level (not always the leaf text).
            content = values.iloc[i]
            if i < last_level:
                metadata = {'cluster_summary': values.iloc[i + 1],
                            'node_position': i}
            else:
                metadata = {'cluster_summary': 'root node',
                            'node_position': 'root node'}

            key = (content, metadata['cluster_summary'], metadata['node_position'])
            if key in seen:
                # Same summary node reached through another leaf row.
                continue
            seen.add(key)
            docs.append(Document(page_content=content, metadata=metadata))
    return docs

In [62]:
from qdrant_client import QdrantClient
from langchain_community.vectorstores import Qdrant
from langchain.vectorstores.base import VectorStore

# Client for a Qdrant server at http://localhost:6333, wrapped as a LangChain
# vector store bound to the 'test_collection' collection.
# NOTE(review): URL and collection name are hard-coded; `embeddings` is whatever
# the name is bound to when this cell runs.
client = QdrantClient('http://localhost:6333')
vector_db = Qdrant(client,collection_name='test_collection',embeddings=embeddings)

async def embed_docs(vector_db: VectorStore, docs, embeddings) -> None:
    """Embed ``docs`` and add them to the given vector store instance.

    Args:
        vector_db: Existing vector store that receives the documents. Its own
            embedding function and collection settings are used.
        docs: Documents to add; nothing is done when empty.
        embeddings: Unused; kept so existing callers keep working. The store
            embeds with the embedding object it was constructed with.

    Note:
        ``afrom_documents`` is a classmethod that builds a *new* store, so
        calling it on ``vector_db`` ignored this instance's configuration and
        discarded the result. ``aadd_documents`` writes into ``vector_db``
        itself.
        NOTE(review): the target collection presumably has to exist already —
        confirm for the store in use.
    """
    if not docs:
        return
    await vector_db.aadd_documents(docs)

In [63]:
# Turn the summary frame into documents and hand them to embed_docs for storage.
await embed_docs(vector_db, preprocess_texts(df2), embeddings)