In [10]:
from langchain.text_splitter import SpacyTextSplitter
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment before
# any API client is constructed (presumably the Cohere credentials — confirm).
load_dotenv()

# Read the whole source document into memory. The encoding is pinned because
# open() otherwise falls back to the platform's locale encoding, which would
# garble non-ASCII characters (e.g. curly apostrophes) on some systems.
# NOTE(review): assumes sample.txt is UTF-8 encoded — confirm.
with open('sample.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [None]:
# Splitter configuration, kept in one place so the values are easy to tune.
# max_length is tied to the document length so the full text is accepted in a
# single call (presumably forwarded to spaCy's input-length limit — TODO confirm).
splitter_options = {
    'max_length': len(text),
    'separator': '.',
    'chunk_size': 100,
    'chunk_overlap': 0,
}
splitter = SpacyTextSplitter(**splitter_options)

# List of text chunks produced from the full document.
texts = splitter.split_text(text)


# Embed texts

In [12]:
from langchain_community.embeddings import CohereEmbeddings

# Embedding client reused below for both the text chunks and the cluster
# summaries. Built with default settings; credentials are presumably picked up
# from the environment populated by load_dotenv() — TODO confirm.
embeddings = CohereEmbeddings()

In [15]:
# Embed only the first 10 chunks of the document; the resulting vectors are
# later paired with `texts` by position.
embedded_docs = embeddings.embed_documents(texts[:10])

In [28]:
import pandas as pd

# One row per embedded chunk: the chunk text and its embedding vector.
# The frame is built directly from named columns instead of transposing a
# dict-of-lists frame, so the 'Text' / 'Embeddings' columns exist even when
# there are no embeddings. `texts` is truncated to the number of vectors
# because only a prefix of the chunks was embedded.
df = pd.DataFrame(
    {
        'Text': texts[:len(embedded_docs)],
        'Embeddings': list(embedded_docs),
    }
)
df.head()

Unnamed: 0,Text,Embeddings
0,Creating Possibility Annual Report 2022\n\nFin...,"[0.5649414, -0.45361328, 2.6953125, -0.7851562..."
1,Loans Total assets Deposits Common stockholder...,"[0.20666504, -0.8261719, -0.072509766, 0.23596..."
2,Refer to Explanation and Reconciliation of the...,"[0.09991455, -0.31176758, -0.27978516, 0.58154..."
3,(b) Refer to Liquidity Risk Management on page...,"[-0.31518555, 0.008300781, -0.3034668, 0.74267..."
4,(c) Refer to Capital Risk Management on pages ...,"[-0.46044922, -0.49365234, -0.28686523, 0.5815..."


# Cluster texts

In [38]:
from sklearn.cluster import KMeans

# Collect the per-chunk embedding vectors as a list of rows for the estimator.
chunk_vectors = df['Embeddings'].tolist()

# Group the chunks into 3 clusters; random_state is fixed so the assignment
# is repeatable across runs.
model = KMeans(n_clusters=3, random_state=42).fit(chunk_vectors)

# One cluster label per row, in the same order as the fitted vectors.
df['Cluster'] = model.labels_
df

# Summarize Clusters

In [69]:
from langchain_community.llms import Cohere

# Low temperature keeps the generated summaries close to deterministic.
llm = Cohere(temperature=0.1)

# Summarization prompt; `{paragraph}` is replaced with the cluster's text.
prompt = """
            You are an AI assitant. You are helping with the task of summarization. 
            Below you will find a text fragment, Tell me what is this about. Be as concise as possible with your summary.

            ANSWER ONLY WITH THE SUMMARY. DO NOT INCLUDE THE ORIGINAL TEXT, DO NOT ASK QUESTIONS OR SUGGESTIONS.
            ###

            {paragraph}"""


# Build one summary per cluster: the texts of all member rows are joined with
# commas and sent to the LLM as a single fragment.
cluster_summaries = {}
for cluster_id in df['Cluster'].unique():
    member_texts = df.loc[df['Cluster'] == cluster_id, 'Text'].tolist()
    filled_prompt = prompt.format(paragraph=','.join(member_texts))
    cluster_summaries[cluster_id] = llm.invoke(filled_prompt)

# Attach each row's cluster summary to the frame.
df['Cluster Summary'] = df['Cluster'].map(cluster_summaries)

In [73]:
# Embed every distinct cluster summary and key the vectors by summary text.
# All summaries go out in one batched embed_documents call rather than one
# request per summary; the vectors are matched back to summaries by position.
unique_summaries = df['Cluster Summary'].unique().tolist()
# Skip the API call entirely when there is nothing to embed.
summary_vectors = embeddings.embed_documents(unique_summaries) if unique_summaries else []
cluster_summaries_embedded = dict(zip(unique_summaries, summary_vectors))

# Embed Cluster Summary

In [74]:
# Look up each row's summary text in the summary -> vector mapping so every
# row carries the embedding of its cluster's summary.
df['Cluster Summary Embeddings'] = df['Cluster Summary'].map(cluster_summaries_embedded)
# Display the full frame (all rows).
df

Unnamed: 0,Text,Embeddings,Cluster,Cluster Summary,Cluster Summary Embeddings
0,Creating Possibility Annual Report 2022\n\nFin...,"[0.5649414, -0.45361328, 2.6953125, -0.7851562...",2,Creating Possibility Annual Report 2022 shows...,"[1.7529297, 0.26586914, 2.2890625, 0.64697266,..."
1,Loans Total assets Deposits Common stockholder...,"[0.20666504, -0.8261719, -0.072509766, 0.23596...",2,Creating Possibility Annual Report 2022 shows...,"[1.7529297, 0.26586914, 2.2890625, 0.64697266,..."
2,Refer to Explanation and Reconciliation of the...,"[0.09991455, -0.31176758, -0.27978516, 0.58154...",0,The text refers to the explanation and reconc...,"[0.3857422, -0.18554688, -1.3173828, 0.8598633..."
3,(b) Refer to Liquidity Risk Management on page...,"[-0.31518555, 0.008300781, -0.3034668, 0.74267...",0,The text refers to the explanation and reconc...,"[0.3857422, -0.18554688, -1.3173828, 0.8598633..."
4,(c) Refer to Capital Risk Management on pages ...,"[-0.46044922, -0.49365234, -0.28686523, 0.5815...",0,The text refers to the explanation and reconc...,"[0.3857422, -0.18554688, -1.3173828, 0.8598633..."
5,JPMorgan Chase & Co. (NYSE: JPM) is a leading ...,"[1.3066406, 2.1132812, -0.32495117, 0.81396484...",1,"JPMorgan Chase, a worldwide leading financial...","[1.6777344, 1.4287109, 1.4042969, 0.7504883, 2..."
6,"The firm is a leader in investment banking, fi...","[2.4003906, 1.1074219, 0.09643555, 2.0566406, ...",1,"JPMorgan Chase, a worldwide leading financial...","[1.6777344, 1.4287109, 1.4042969, 0.7504883, 2..."
7,"Under the J.P. Morgan and Chase brands, the fi...","[1.0986328, 3.3242188, 1.1083984, 2.5976562, 0...",1,"JPMorgan Chase, a worldwide leading financial...","[1.6777344, 1.4287109, 1.4042969, 0.7504883, 2..."
8,Information about J.P. Morgan’s capabilities c...,"[-0.4008789, 2.1601562, 0.6503906, 0.31762695,...",1,"JPMorgan Chase, a worldwide leading financial...","[1.6777344, 1.4287109, 1.4042969, 0.7504883, 2..."
9,Information about JPMorgan Chase & Co. is avai...,"[1.1650391, 2.1289062, 0.1282959, 1.9863281, 0...",1,"JPMorgan Chase, a worldwide leading financial...","[1.6777344, 1.4287109, 1.4042969, 0.7504883, 2..."


# Cluster Summaries

In [79]:
# Second-level clustering over the summary embeddings. Each row carries its
# cluster's summary vector, so identical vectors appear once per member row.
summary_vectors_per_row = df['Cluster Summary Embeddings'].tolist()

# Two clusters this time; the seed is fixed for repeatable assignments.
model = KMeans(n_clusters=2, random_state=42).fit(summary_vectors_per_row)

# One second-level label per row.
df['Cluster_2'] = model.labels_
df

Unnamed: 0,Text,Embeddings,Cluster,Cluster Summary,Cluster Summary Embeddings,Cluster_2
0,Creating Possibility Annual Report 2022\n\nFin...,"[0.5649414, -0.45361328, 2.6953125, -0.7851562...",2,Creating Possibility Annual Report 2022 shows...,"[1.7529297, 0.26586914, 2.2890625, 0.64697266,...",1
1,Loans Total assets Deposits Common stockholder...,"[0.20666504, -0.8261719, -0.072509766, 0.23596...",2,Creating Possibility Annual Report 2022 shows...,"[1.7529297, 0.26586914, 2.2890625, 0.64697266,...",1
2,Refer to Explanation and Reconciliation of the...,"[0.09991455, -0.31176758, -0.27978516, 0.58154...",0,The text refers to the explanation and reconc...,"[0.3857422, -0.18554688, -1.3173828, 0.8598633...",0
3,(b) Refer to Liquidity Risk Management on page...,"[-0.31518555, 0.008300781, -0.3034668, 0.74267...",0,The text refers to the explanation and reconc...,"[0.3857422, -0.18554688, -1.3173828, 0.8598633...",0
4,(c) Refer to Capital Risk Management on pages ...,"[-0.46044922, -0.49365234, -0.28686523, 0.5815...",0,The text refers to the explanation and reconc...,"[0.3857422, -0.18554688, -1.3173828, 0.8598633...",0
5,JPMorgan Chase & Co. (NYSE: JPM) is a leading ...,"[1.3066406, 2.1132812, -0.32495117, 0.81396484...",1,"JPMorgan Chase, a worldwide leading financial...","[1.6777344, 1.4287109, 1.4042969, 0.7504883, 2...",1
6,"The firm is a leader in investment banking, fi...","[2.4003906, 1.1074219, 0.09643555, 2.0566406, ...",1,"JPMorgan Chase, a worldwide leading financial...","[1.6777344, 1.4287109, 1.4042969, 0.7504883, 2...",1
7,"Under the J.P. Morgan and Chase brands, the fi...","[1.0986328, 3.3242188, 1.1083984, 2.5976562, 0...",1,"JPMorgan Chase, a worldwide leading financial...","[1.6777344, 1.4287109, 1.4042969, 0.7504883, 2...",1
8,Information about J.P. Morgan’s capabilities c...,"[-0.4008789, 2.1601562, 0.6503906, 0.31762695,...",1,"JPMorgan Chase, a worldwide leading financial...","[1.6777344, 1.4287109, 1.4042969, 0.7504883, 2...",1
9,Information about JPMorgan Chase & Co. is avai...,"[1.1650391, 2.1289062, 0.1282959, 1.9863281, 0...",1,"JPMorgan Chase, a worldwide leading financial...","[1.6777344, 1.4287109, 1.4042969, 0.7504883, 2...",1
