In [6]:
# Read the source documents from the Excel workbook into a dataframe.

import pandas as pd

# Workbook to analyse; later cells derive the output location from this path.
file_path = "SampleDocuments2.xlsx"

file_df = pd.read_excel(io=file_path)
file_df

Unnamed: 0,doc_id,doc_text
0,1,The cat (Felis catus) is a domestic species of...
1,2,It is the only domesticated species in the fam...
2,3,Cats are commonly kept as house pets but can a...
3,4,Domestic cats are valued by humans for compani...


In [2]:
# Only needs to run if package not already installed.

!pip install spacy



In [3]:
# Only needs to run if not already installed.

!python -m spacy download en_core_web_sm

OMP: Error #15: Initializing libiomp5.dylib, but found libomp.dylib already initialized.
OMP: Hint This means that multiple copies of the OpenMP runtime have been linked into the program. That is dangerous, since it can degrade performance or cause incorrect results. The best thing to do is to ensure that only a single OpenMP runtime is linked into the process, e.g. by avoiding static linking of the OpenMP runtime in any library. As an unsafe, unsupported, undocumented workaround you can set the environment variable KMP_DUPLICATE_LIB_OK=TRUE to allow the program to continue to execute, but that may cause crashes or silently produce incorrect results. For more information, please see http://www.intel.com/software/products/support/.


In [7]:
# Replace the text in each doc_text cell with a list of its individual sentences.

import spacy

nlp = spacy.load("en_core_web_sm")


def _sentences_of(text):
    """Return the text of every sentence spaCy detects in `text`, in order."""
    return [span.text for span in nlp(text).sents]


file_df["doc_text"] = file_df["doc_text"].apply(_sentences_of)
file_df

Unnamed: 0,doc_id,doc_text
0,1,[The cat (Felis catus) is a domestic species o...
1,2,[It is the only domesticated species in the fa...
2,3,[Cats are commonly kept as house pets but can ...
3,4,[Domestic cats are valued by humans for compan...


In [8]:
# Reshape the dataframe so that every sentence gets its own row:
#   1. explode the per-document sentence lists, renumbering the rows from 0,
#   2. rename the exploded column from "doc_text" to "sent_text",
#   3. label the new row index "sent_id".

file_df = (
    file_df
    .explode("doc_text", ignore_index=True)
    .rename(columns={"doc_text": "sent_text"})
    .rename_axis(index="sent_id")
)

# Show the contents of the dataframe.

file_df

Unnamed: 0_level_0,doc_id,sent_text
sent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,The cat (Felis catus) is a domestic species of...
1,1,The dog (Canis familiaris[4][5] or Canis lupus...
2,1,A computer is a machine that can be programmed...
3,2,It is the only domesticated species in the fam...
4,2,"Also called the domestic dog, it is derived fr..."
5,2,Modern digital electronic computers can perfor...
6,3,Cats are commonly kept as house pets but can a...
7,3,Dogs were the first species to be domesticated...
8,3,These programs enable computers to perform a w...
9,4,Domestic cats are valued by humans for compani...


In [9]:
# Copy the sentence IDs (the row index) and the sentence text into plain
# Python lists for the processing steps that follow.

sent_id = file_df.index.tolist()
sent_text = file_df["sent_text"].tolist()
sent_id

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]

In [12]:
# Only needs to run if not already installed.

!pip install sentence-transformers



In [10]:
# Load the pretrained model used to compute the sentence embeddings.

from sentence_transformers import SentenceTransformer

# Alternative models are listed at
# https://huggingface.co/models?library=sentence-transformers
embedding_model_name = 'all-mpnet-base-v2'
model = SentenceTransformer(embedding_model_name)

In [11]:
# Create the sentence embeddings by encoding every entry of sent_text with
# `model` (the SentenceTransformer loaded in the model-selection cell).
# NOTE(review): the result is later normalized row-wise, so it is presumably
# a 2-D array with one row per sentence - confirm against the model's docs.

embeddings1 = model.encode(sent_text)

In [12]:
# clustering https://www.youtube.com/watch?v=OlhNZg4gOvA time index 22:00

from sklearn.cluster import KMeans
import numpy as np

# Scale each embedding row to unit length: compute the per-row norm
# (kept as a column so it broadcasts across the row), then divide by it.
row_norms = np.linalg.norm(embeddings1, axis=1, keepdims=True)
embeddings_norm = embeddings1 / row_norms

# Show the normalized embeddings array.
embeddings_norm

array([[-0.00439168,  0.03730019, -0.00832697, ...,  0.06012901,
         0.05588695, -0.03074687],
       [ 0.00139501, -0.01426283, -0.00580778, ...,  0.03672078,
         0.0372525 , -0.00364629],
       [-0.01490693,  0.07147522, -0.01173318, ...,  0.04738557,
         0.01796244, -0.03525241],
       ...,
       [ 0.03970142,  0.00837464,  0.01572062, ...,  0.04809455,
        -0.01922227, -0.00996133],
       [-0.01320439,  0.02343231,  0.00986621, ...,  0.03478407,
         0.00996889, -0.02609491],
       [-0.0422748 , -0.05179688, -0.01553572, ...,  0.01847669,
         0.00551961, -0.03248133]], dtype=float32)

In [13]:
# Import the libraries needed to create the elbow diagram.

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [None]:
# Look at the elbow diagram to help determine appropriate number of clusters to create.

# Largest k to try: one cluster per sample. Taken from the array that is
# actually fitted so k can never exceed the number of samples.
limit = len(embeddings_norm)

# wcss - within cluster sum of squared distances, keyed by k.
wcss = {}

for k in range(2, limit + 1):
    # Use a dedicated name here: `model` is the SentenceTransformer used by the
    # embedding cell and must not be overwritten by a KMeans instance.
    # A fixed random_state makes the curve reproducible between runs.
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(embeddings_norm)
    wcss[k] = kmeans.inertia_

# Plot WCSS against k; the "elbow" of the curve suggests a cluster count.
# The dict views are materialized as lists before being handed to matplotlib.
plt.plot(list(wcss.keys()), list(wcss.values()), 'gs-')
plt.xlabel('Values of "k"')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Create the desired number of clusters. (Update the value for ClusterCount.)

ClusterCount = 3

# k-means starts from random centroids; fixing random_state makes the cluster
# labels reproducible when the notebook is re-run.
clustering_model = KMeans(n_clusters=ClusterCount, random_state=42)
clustering_model.fit(embeddings_norm)

# One cluster label per row of embeddings_norm, in the same order.
cluster_assignment = clustering_model.labels_
print(cluster_assignment)

In [None]:
# Add the cluster assignments to the dataframe in a new column.
# The column name follows ClusterCount ("cluster3" for 3 clusters) so that it
# stays accurate when a different number of clusters is chosen.

file_df[f"cluster{ClusterCount}"] = cluster_assignment
file_df

In [None]:
# Get file path in preparation for saving output to Excel.
# The output workbook sits next to the input file and has the same name
# prefixed with "Clusters_".

import os

# os.path.join inserts the platform's separator only when there is a directory
# part, so a bare file name such as "SampleDocuments2.xlsx" maps to
# "Clusters_SampleDocuments2.xlsx" instead of a path starting with a backslash.
input_dir, input_name = os.path.split(file_path)
OutputFile = os.path.join(input_dir, "Clusters_" + input_name)
OutputFile

In [None]:
# Save the dataframe to the Excel workbook at the OutputFile path.

# Handing the path straight to to_excel lets pandas open, write and close the
# workbook itself. The data goes to "Sheet1", starting at the first row and
# first column (no blank rows or columns are left before it).
file_df.to_excel(OutputFile, sheet_name="Sheet1", startrow=0, startcol=0)

print('Done.')