In [1]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pylcs

In [2]:
def similarity(s1:str, s2:str, model_name='paraphrase-mpnet-base-v2'):
    
    model = SentenceTransformer(model_name)
    
    # encode sentences to get their embeddings
    embedding1 = model.encode(s1, convert_to_tensor=True)
    embedding2 = model.encode(s2, convert_to_tensor=True)
    
    # compute similarity scores of two embeddings
    cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
    
    return cosine_scores.item()

In [3]:
def topic(s1:str):
    # Returns the topic extracted from the entire output.
    
    return s1[:s1.find('.')]

In [4]:
def compare_similarity(s1:str, s2:str):
    
    print(f'String 1 : {s1}')
    print(f'String 2 : {s2}\n')
    
    print('Similarity :',similarity(s1,s2))

In [5]:
txtfile = open('../data/I2_1_1000.txt','r')
biz_ideas = [line.rstrip('\n') for line in txtfile]

In [6]:
for i in range(5,1):
    print(i,topic(biz_ideas[i]))

In [7]:
compare_similarity(topic(biz_ideas[13]), topic(biz_ideas[4]))

String 1 : Concrete Cutting and Grinding business
String 2 : Con-crete Company

Similarity : 0.2693554759025574


In [8]:
def preprocess(data:list):
    
    data = [datum.lower() for datum in data]
    data = [datum for datum in set(data)]
    
    return data

In [9]:
def overlap(s1:str, s2:str):
    
    if len(s1) > len(s2):
        return pylcs.lcs(s1,s2)/float(len(s1))
    else:
        return pylcs.lcs(s1,s2)/float(len(s2))

In [10]:
overlap('beauty salon','a beauty salon')

0.8571428571428571

In [11]:
# Create a list of topics from the corpus

topics_unprocessed = []

for idea in biz_ideas:

    topics_unprocessed.append(topic(idea))

topics = preprocess(topics_unprocessed)

for topic_ in topics[:5]:
    print(topic_)

small business
internet internet service
diving and snorkeling business
grocery and home shopping
web designing service


In [12]:
model = SentenceTransformer('paraphrase-mpnet-base-v2')
topics_embeddings_unprocessed = model.encode(topics_unprocessed)
topics_embeddings = model.encode(topics)

In [13]:
print(topics_embeddings_unprocessed.shape)
print(topics_embeddings.shape)

(1000, 768)
(628, 768)


In [14]:
# Top K similar ideas

topic_query = 'Beauty Parlor'
query_embedding = model.encode(topic_query)

top_k = 10

cos_scores = util.pytorch_cos_sim(query_embedding, topics_embeddings)[0]
top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]

"""
filtered_results = []

for i in range(len(top_results)-1):
    filtered = pylcs.lcs2_of_list(top_results[i], top_results[i:]) / float(len(top_results[i])) < 0.7
    filtered_results.append(top_results[filtered])
"""   

print("Sentence:", topic_query, "\n")
print(f'Top {top_k} most similar sentences in corpus:')

for idx in top_results[0:top_k]:
    print(topics[idx], "(Score: %.4f)" % (cos_scores[idx]))

Sentence: Beauty Parlor 

Top 10 most similar sentences in corpus:
beauty salon (Score: 0.9091)
a beauty salon (Score: 0.9033)
beauty shop (Score: 0.8997)
beauty salons (Score: 0.8859)
hair care salon (Score: 0.7918)
a hair salon (Score: 0.7792)
hair and beauty salons (Score: 0.7688)
hairdressing salon (Score: 0.7607)
hair and grooming salon (Score: 0.7514)
hair and nail salon (Score: 0.7510)


In [15]:
# Clustering : KMeans

def kmeans(num_clusters, data:list):
    
    from sklearn.cluster import KMeans
        
    clustering_model = KMeans(n_clusters = num_clusters)
    clustering_model.fit(data)
    cluster_assignment = clustering_model.labels_

    return cluster_assignment

In [16]:
def optimize_k(data:list):
    
    from sklearn.cluster import KMeans
    import math
    
    dists = []
    K = range(1,70)
        
    for n in K:
        k_model = KMeans(n_clusters = n)
        k_model.fit(data)
        dists.append(k_model.inertia_)
        
    def calc_dist(x1,y1,a,b,c):
        return abs((a*x1 + b*y1 + c))/(math.sqrt(a**2 + b**2))
        
    a = dists[0] - dists[-1]
    b = K[-1] - K[0]
    c1 = K[0] * dists[-1]
    c2 = K[-1] * dists[0]
    c = c1 - c2
        
    dists_line = []

    for k in range(K[-1]):
        dists_line.append(calc_dist(K[k], dists[k], a, b, c))
            
    num_clusters = dists_line.index(max(dists_line))+1
        
    return num_clusters

In [17]:
# Alternative Approach: DBSCAN (Does not seem to work well with semantics)

def dbscan(data:list):
    
    from sklearn.cluster import DBSCAN

    db_default = DBSCAN(eps = 0.0375, min_samples = 3).fit(data)
    cluster_assignment = db_default.labels_

    return cluster_assignment

In [18]:
def reduce_dims(data, alg='tsne', num_components=2):
    
    topics_red = None
    
    if alg == 'tsne':
        
        from sklearn.manifold import TSNE

        topics_red = TSNE(n_components=num_components).fit_transform(data)
        
    elif alg == 'pca':
        
        from sklearn.decomposition import PCA
        topics_red = PCA(n_components=num_components,svd_solver='full').fit_transform(data)
            
    return topics_red

In [19]:
def print_clusters(num_clusters, cluster_assignment, topics):
    
    for i in range(num_clusters):
        print()
        print(f'Cluster {i + 1} contains:')
        clust_sent = np.where(cluster_assignment == i)
        for k in clust_sent[0]:
            print(f'- {topics[k]}')

In [None]:
# K-Means

# Define number of clusters or auto estimate optimum using intertia
num_clusters = optimize_k(topics_embeddings)

# Reduce Dimentions using TSNE or PCA
topics_red = reduce_dims(topics_embeddings,alg='tsne',num_components=2)

# Apply K-Means Clustering
cluster_assignment = kmeans(num_clusters=num_clusters, data=topics_red) # data=topics_embeddings, topics_red

# Print clusters cohesively
print_clusters(num_clusters, cluster_assignment, topics)

In [None]:
# DBScan

# Define number of clusters to show (note number of clusters is automatically determined)
num_clusters = 75

# Reduce Dimentions using TSNE
topics_red = reduce_dims(topics_embeddings,alg='tsne',num_components=3)

# Apply DBScan Clustering
cluster_assignment = dbscan(data=topics_embeddings) # data=topics_embeddings, topics_red

# Print clusters cohesively
print_clusters(num_clusters, cluster_assignment, topics)