In [None]:
from langchain.text_splitter import SpacyTextSplitter
from dotenv import load_dotenv
load_dotenv()
with open('sample.txt', 'r') as file:
    text = file.read()

In [None]:
splitter = SpacyTextSplitter(max_length=len(text),
                             separator='.',
                             chunk_size=100,
                             chunk_overlap=0)

texts = splitter.split_text(text)


# Embed texts

In [None]:
from langchain_community.embeddings import CohereEmbeddings

embeddings = CohereEmbeddings()

In [None]:
embedded_docs = embeddings.embed_documents(texts[:10])

In [None]:
import pandas as pd
d = {}
for i, embedding_vec in enumerate(embedded_docs):
    d[i] = [texts[i], embedding_vec]

df = pd.DataFrame(d).T.rename(columns={1:'leaf_embeddings',0:'leaf_text'})
df.head()

# Cluster texts

In [None]:
from sklearn.cluster import KMeans
model = KMeans(n_clusters=3, random_state=42)
model = model.fit(df['leaf_embeddings'].tolist())
df['cluster_1'] = model.labels_
df

# Summarize Clusers

In [None]:
from langchain_community.llms import Cohere

llm = Cohere(temperature=0.1)

prompt = """
            You are an AI assitant. You are helping with the task of summarization. 
            Below you will find a text fragment, Tell me what is this about. Be as concise as possible with your summary.

            ANSWER ONLY WITH THE SUMMARY. DO NOT INCLUDE THE ORIGINAL TEXT, DO NOT ASK QUESTIONS OR SUGGESTIONS.
            ###

            {paragraph}"""


cluster_summaries = {}
for i in df['cluster_1'].unique():
    cluster_contents = ','.join(df[df['cluster_1'] == i]['leaf_text'].tolist())
    cluster_summary = llm.invoke(prompt.format(paragraph=cluster_contents))
    cluster_summaries[i] = cluster_summary

df['cluster_1_summary'] = df['cluster_1'].map(cluster_summaries)

In [None]:
cluster_summaries_embedded = {}
for summary in df['cluster_1_summary'].unique():
    embedded_summary = embeddings.embed_documents([summary])
    cluster_summaries_embedded[summary] = embedded_summary[0]

# Embed Cluster Summary

In [None]:
df['cluster_1_summary_embeddings'] = df['cluster_1_summary'].map(cluster_summaries_embedded)
df

# Cluster Summaries

In [None]:
model = KMeans(n_clusters=2, random_state=42)
model = model.fit(df['cluster_1_summary_embeddings'].tolist())
df['cluster_2'] = model.labels_
df

In [None]:
cluster_summaries = {}
for i in df['cluster_2'].unique():
    cluster_contents = ','.join(df[df['cluster_2'] == i]['cluster_1_summary'].tolist())
    cluster_summary = llm.invoke(prompt.format(paragraph=cluster_contents))
    cluster_summaries[i] = cluster_summary

df['cluster_2_summary'] = df['cluster_2'].map(cluster_summaries)

In [None]:
cluster_summaries_embedded = {}
for summary in df['cluster_2_summary'].unique():
    embedded_summary = embeddings.embed_documents([summary])
    cluster_summaries_embedded[summary] = embedded_summary[0]
df['cluster_2_summary_embeddings'] = df['cluster_2_summary'].map(cluster_summaries_embedded)
df

In [None]:
from sklearn.cluster import KMeans
prompt = """
        You are an AI assitant. You are helping with the task of summarization. 
        Below you will find a text fragment, Tell me what is this about. Be as concise as possible with your summary.

        ANSWER ONLY WITH THE SUMMARY. DO NOT INCLUDE THE ORIGINAL TEXT, DO NOT ASK QUESTIONS OR SUGGESTIONS.
        ###

        {paragraph}"""

In [55]:
def cluster_and_summarize(texts, n_summaries, embeddings, llm, prompt, n_clusters=5):
    embedded_docs = embeddings.embed_documents(texts)
    d = {i: [texts[i], embedding_vec] for i, embedding_vec in enumerate(embedded_docs)}

    df = pd.DataFrame(d).T.rename(columns={1:'embeddings',0:'text'})

    decrement = (n_clusters - 2) / (n_summaries - 1) if n_summaries > 1 else 0

    prev_clusters = n_clusters
    for i in range(n_summaries):
        model = KMeans(n_clusters=n_clusters, random_state=42)
        model = model.fit(df['embeddings'].tolist())
        df[f'cluster_{i+1}'] = model.labels_

        cluster_summaries = {}
        for j in df[f'cluster_{i+1}'].unique():
            cluster_contents = ','.join(df[df[f'cluster_{i+1}'] == j]['text'].tolist())
            cluster_summary = llm.invoke(prompt.format(paragraph=cluster_contents))
            cluster_summaries[j] = cluster_summary.content

        df[f'cluster_{i+1}_summary'] = df[f'cluster_{i+1}'].map(cluster_summaries)

        cluster_summaries_embedded = {}
        for summary in df[f'cluster_{i+1}_summary'].unique():
            embedded_summary = embeddings.embed_documents([summary])
            cluster_summaries_embedded[summary] = embedded_summary[0]

        df[f'cluster_{i+1}_summary_embeddings'] = df[f'cluster_{i+1}_summary'].map(cluster_summaries_embedded)

        n_clusters = max(2, round(n_clusters - decrement))
        if n_clusters == prev_clusters:
            break
        prev_clusters = n_clusters

    return df

In [56]:
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings

llm = ChatOpenAI(temperature=0.1,model='gpt-3.5-turbo-16k')
embeddings = OpenAIEmbeddings()

df2 = cluster_and_summarize(texts[:20], 5, embeddings, llm, prompt)

In [57]:
df2

Unnamed: 0,text,embeddings,cluster_1,cluster_1_summary,cluster_1_summary_embeddings,cluster_2,cluster_2_summary,cluster_2_summary_embeddings,cluster_3,cluster_3_summary,cluster_3_summary_embeddings,cluster_4,cluster_4_summary,cluster_4_summary_embeddings
0,Creating Possibility Annual Report 2022\n\nFin...,"[-0.014398299777015428, -0.03259191218936567, ...",4,The text fragment is about the financial highl...,"[-0.015863293183037395, -0.019448655854606225,...",1,The text fragment is about the financial highl...,"[-0.015863293183037395, -0.019448655854606225,...",1,The text fragment is about the financial highl...,"[-0.015863293183037395, -0.019448655854606225,...",1,The text fragment is about the financial highl...,"[-0.02292668645458164, -0.009085907842697496, ..."
1,Loans Total assets Deposits Common stockholder...,"[-0.011474125649386758, -0.01552770050571282, ...",4,The text fragment is about the financial highl...,"[-0.015863293183037395, -0.019448655854606225,...",1,The text fragment is about the financial highl...,"[-0.015863293183037395, -0.019448655854606225,...",1,The text fragment is about the financial highl...,"[-0.015863293183037395, -0.019448655854606225,...",1,The text fragment is about the financial highl...,"[-0.02292668645458164, -0.009085907842697496, ..."
2,Refer to Explanation and Reconciliation of the...,"[-0.019256261326194776, 0.012464525877257113, ...",2,The text fragment discusses the use of non-GAA...,"[-0.008381120289931708, 0.0024499414611671663,...",2,The text fragment discusses the use of non-GAA...,"[-0.008381120289931708, 0.0024499414611671663,...",2,The text fragment discusses the use of non-GAA...,"[-0.008381120289931708, 0.0024499414611671663,...",1,The text fragment is about the financial highl...,"[-0.02292668645458164, -0.009085907842697496, ..."
3,(b) Refer to Liquidity Risk Management on page...,"[-0.012130320044567605, -0.022174121601326157,...",2,The text fragment discusses the use of non-GAA...,"[-0.008381120289931708, 0.0024499414611671663,...",2,The text fragment discusses the use of non-GAA...,"[-0.008381120289931708, 0.0024499414611671663,...",2,The text fragment discusses the use of non-GAA...,"[-0.008381120289931708, 0.0024499414611671663,...",1,The text fragment is about the financial highl...,"[-0.02292668645458164, -0.009085907842697496, ..."
4,(c) Refer to Capital Risk Management on pages ...,"[-0.010517001487467372, -0.02907831500825693, ...",2,The text fragment discusses the use of non-GAA...,"[-0.008381120289931708, 0.0024499414611671663,...",2,The text fragment discusses the use of non-GAA...,"[-0.008381120289931708, 0.0024499414611671663,...",2,The text fragment discusses the use of non-GAA...,"[-0.008381120289931708, 0.0024499414611671663,...",1,The text fragment is about the financial highl...,"[-0.02292668645458164, -0.009085907842697496, ..."
5,JPMorgan Chase & Co. (NYSE: JPM) is a leading ...,"[-0.024302320290818046, -0.01623079163083399, ...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.022770403864198912, -0.03368068993756266, ...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.022770403864198912, -0.03368068993756266, ...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.021952704607049768, -0.028613949948288423,...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.02103980663160506, -0.028932936936840573, ..."
6,"The firm is a leader in investment banking, fi...","[-0.010912377237709702, -0.01123408678421719, ...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.022770403864198912, -0.03368068993756266, ...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.022770403864198912, -0.03368068993756266, ...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.021952704607049768, -0.028613949948288423,...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.02103980663160506, -0.028932936936840573, ..."
7,"Under the J.P. Morgan and Chase brands, the fi...","[-0.0058306657755502, -0.011437934284332199, 0...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.022770403864198912, -0.03368068993756266, ...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.022770403864198912, -0.03368068993756266, ...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.021952704607049768, -0.028613949948288423,...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.02103980663160506, -0.028932936936840573, ..."
8,Information about J.P. Morgan’s capabilities c...,"[-0.012019596214541272, -0.010757696117278046,...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.022770403864198912, -0.03368068993756266, ...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.022770403864198912, -0.03368068993756266, ...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.021952704607049768, -0.028613949948288423,...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.02103980663160506, -0.028932936936840573, ..."
9,Information about JPMorgan Chase & Co. is avai...,"[-0.017228708053864873, -0.0020621549895017974...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.022770403864198912, -0.03368068993756266, ...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.022770403864198912, -0.03368068993756266, ...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.021952704607049768, -0.028613949948288423,...",0,JPMorgan Chase & Co. had a successful year in ...,"[-0.02103980663160506, -0.028932936936840573, ..."
