In [1]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

In [2]:
# Loaded models keyed by name, so repeated similarity() calls do not reload
# the same SentenceTransformer each time.
_SIMILARITY_MODELS = {}


def similarity(s1:str, s2:str, model_name='paraphrase-mpnet-base-v2'):
    """Return the cosine similarity between the embeddings of two strings.

    Parameters
    ----------
    s1, s2 : str
        The two texts to compare.
    model_name : str
        Name passed to ``SentenceTransformer`` to select the embedding model.

    Returns
    -------
    float
        The cosine similarity score as a plain Python number.
    """
    # Constructing the model is the expensive step; do it once per model name.
    model = _SIMILARITY_MODELS.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _SIMILARITY_MODELS[model_name] = model

    # encode sentences to get their embeddings
    embedding1 = model.encode(s1, convert_to_tensor=True)
    embedding2 = model.encode(s2, convert_to_tensor=True)

    # compute similarity scores of two embeddings
    cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)

    # .item() unwraps the single score into a Python scalar
    return cosine_scores.item()

In [3]:
def topic(s1:str):
    """Return the topic of a generated idea: the text before its first period.

    Parameters
    ----------
    s1 : str
        The full generated output for one idea.

    Returns
    -------
    str
        Everything before the first ``'.'``. When ``s1`` contains no period the
        whole string is returned unchanged.
    """
    # partition() yields the full string as the head when '.' is absent,
    # whereas slicing with find()'s -1 would silently drop the last character.
    head, _, _ = s1.partition('.')
    return head

In [4]:
def compare_similarity(s1:str, s2:str):
    """Print both strings followed by their similarity score.

    The two inputs are echoed first (with a blank line after the second), then
    ``similarity(s1, s2)`` is evaluated and printed. Nothing is returned.
    """
    # Echo the inputs before scoring them.
    header = '\n'.join((f'String 1 : {s1}', f'String 2 : {s2}\n'))
    print(header)

    score = similarity(s1, s2)
    print('Similarity :', score)

In [5]:
# Load the corpus, one entry per line of the text file. The context manager
# closes the file as soon as the lines have been read.
with open('../data/I2_1_1000.txt','r') as txtfile:
    # Strip only the trailing newline so any other whitespace is preserved.
    biz_ideas = [line.rstrip('\n') for line in txtfile]

In [6]:
# Preview the extracted topic of the first 50 ideas. Slicing (rather than
# indexing up to a fixed 50) keeps this working when the corpus is shorter.
for i, idea in enumerate(biz_ideas[:50]):
    print(i,topic(idea))

0 Home Service
1 Custom T-shirt printing
2 A beauty salon
3 Dry Cleaning
4 Con-crete Company
5 Custom Paint shop
6 Personal Care and Service
7 Bikes Rental Shop
8 Food Truck
9 Cleaning service
10 Pizza delivery and catering
11 Barbershops
12 Pet Sitting Business
13 Concrete Cutting and Grinding business
14 Auto Parts Depot
15 Bar
16 Auto shop
17 Beauty salon
18 Pet Sitting Business
19 Golf Course
20 Home Maintenance for Rent
21 Beauty Shop
22 Dental Care
23 Convenience Store
24 Bakery
25 Restaurant
26 Hair and Make-Up Salon
27 Bakery
28 Carpet and T
29 Real Estate
30 Photography Studios
31 Beauty salons
32 Beauty Salon
33 Web design Company
34 Small Retail businesses
35 Video Poker Business
36 Cleaning Supplies
37 Car Wash
38 Personal care services
39 Internet start-up
40 Jewelry Business
41 Pet Sitting
42 Restaurant
43 Construction business
44 Online business
45 Dining Out
46 Food and beverage store
47 Auto body repair
48 Video Productions
49 Cleaner/Dry cleaner


In [7]:
# Spot check: score the topics of ideas 13 and 4 against each other.
compare_similarity(
    s1=topic(biz_ideas[13]),
    s2=topic(biz_ideas[4]),
)

String 1 : Concrete Cutting and Grinding business
String 2 : Con-crete Company

Similarity : 0.2693554759025574


In [8]:
# Create a list of topics from the corpus (one topic per idea, same order)
topics = [topic(idea) for idea in biz_ideas]

# Preview the first ten topics. A plain loop is used so the cell does not
# echo a throwaway list of None values as its result.
for topic_ in topics[:10]:
    print(topic_)

Home Service
Custom T-shirt printing
A beauty salon
Dry Cleaning
Con-crete Company
Custom Paint shop
Personal Care and Service
Bikes Rental Shop
Food Truck
Cleaning service


[None, None, None, None, None, None, None, None, None, None]

In [9]:
# Load the embedding model once; `model` is reused below to embed the query.
model = SentenceTransformer('paraphrase-mpnet-base-v2')
# Embed every topic in a single call; the result is indexed in the same order as `topics`.
topics_embeddings = model.encode(topics)

In [10]:
# Sanity check on the embedding matrix dimensions (presumably one row per topic).
topics_embeddings.shape

(1000, 768)

In [11]:
# Top K similar ideas

topic_query = 'Beauty Parlor'
query_embedding = model.encode(topic_query)

# Never request more results than the corpus contains.
top_k = min(100, len(topics))

# The similarity call returns a matrix with one row per query; take the single row.
cos_scores = util.pytorch_cos_sim(query_embedding, topics_embeddings)[0]

# Rank on an explicit NumPy copy instead of handing a torch tensor to NumPy.
# Negating sorts best-first; the stable sort keeps tied scores in corpus order.
top_results = np.argsort(-cos_scores.cpu().numpy(), kind='stable')[:top_k]

print("Sentence:", topic_query, "\n")
print(f'Top {top_k} most similar sentences in corpus:')

for idx in top_results:
    print(topics[idx], "(Score: %.4f)" % (cos_scores[idx]))

Sentence: Beauty Parlor 

Top 100 most similar sentences in corpus:
Beauty salon (Score: 0.9091)
Beauty Salon (Score: 0.9091)
Beauty salon (Score: 0.9091)
Beauty Salon (Score: 0.9091)
Beauty Salon (Score: 0.9091)
Beauty Salon (Score: 0.9091)
Beauty Salon (Score: 0.9091)
Beauty Salon (Score: 0.9091)
Beauty Salon (Score: 0.9091)
Beauty Salon (Score: 0.9091)
A beauty salon (Score: 0.9033)
A Beauty Salon (Score: 0.9033)
Beauty Shop (Score: 0.8997)
Beauty Shop (Score: 0.8997)
Beauty salons (Score: 0.8859)
Hair Care Salon (Score: 0.7918)
A Hair Salon (Score: 0.7792)
A Hair Salon (Score: 0.7792)
A Hair Salon (Score: 0.7792)
Hair and Beauty salons (Score: 0.7688)
Hairdressing salon (Score: 0.7607)
Hairdressing Salon (Score: 0.7607)
Hair and Grooming Salon (Score: 0.7514)
Hair and Nail Salon (Score: 0.7510)
Hair Salons (Score: 0.7441)
Hair Salons (Score: 0.7441)
Hair Salon (Score: 0.7420)
Hair Salon (Score: 0.7420)
Hair Salon (Score: 0.7420)
Hair Salon (Score: 0.7420)
Hair Salon (Score: 0.7420)

In [12]:
# Clustering : KMeans

def kmeans(num_clusters, data:list, random_state=None):
    """Cluster ``data`` with K-Means and return the label of every sample.

    Parameters
    ----------
    num_clusters : int
        Number of clusters to fit.
    data : array-like
        Samples to cluster, one row per sample (the notebook passes the
        embedding matrix).
    random_state : int or None
        Seed forwarded to ``KMeans``. The default ``None`` keeps the previous
        unseeded behaviour; pass an int to make the assignment reproducible.

    Returns
    -------
    The fitted model's ``labels_``: one cluster index per sample.
    """
    from sklearn.cluster import KMeans

    clustering_model = KMeans(n_clusters=num_clusters, random_state=random_state)
    clustering_model.fit(data)

    return clustering_model.labels_

In [13]:
def optimize_k(data:list, max_k=69, random_state=None):
    """Estimate a good number of K-Means clusters with the elbow heuristic.

    K-Means is fitted for every k from 1 to ``max_k`` and the inertia of each
    fit is recorded. A straight line is drawn between the first and last
    (k, inertia) points; the k whose point lies farthest from that line is
    returned.

    Parameters
    ----------
    data : array-like
        Samples to cluster, one row per sample.
    max_k : int
        Largest k to try. The default of 69 matches the range this function
        always used. It is capped at the number of samples, since K-Means
        cannot fit more clusters than samples.
    random_state : int or None
        Seed forwarded to every ``KMeans`` fit; ``None`` leaves them unseeded.

    Returns
    -------
    int
        The selected number of clusters.

    Raises
    ------
    ValueError
        If there is no candidate k to try (empty data or ``max_k < 1``).
    """
    from sklearn.cluster import KMeans
    import math

    n_samples = data.shape[0] if hasattr(data, 'shape') else len(data)
    upper = min(max_k, n_samples)
    if upper < 1:
        raise ValueError('optimize_k needs at least one sample and max_k >= 1')

    ks = list(range(1, upper + 1))

    inertias = []
    for n in ks:
        k_model = KMeans(n_clusters=n, random_state=random_state)
        k_model.fit(data)
        inertias.append(k_model.inertia_)

    # With a single candidate there is no line to measure against (and the
    # distance formula below would divide by zero).
    if len(ks) == 1:
        return ks[0]

    # Coefficients of the line a*x + b*y + c = 0 through the first and last
    # (k, inertia) points.
    a = inertias[0] - inertias[-1]
    b = ks[-1] - ks[0]
    c = ks[0] * inertias[-1] - ks[-1] * inertias[0]
    norm = math.sqrt(a**2 + b**2)

    # Perpendicular distance of every (k, inertia) point from that line.
    dists_line = [abs(a*k + b*inertia + c) / norm
                  for k, inertia in zip(ks, inertias)]

    # Map the position of the farthest point back to its k value explicitly
    # rather than relying on the candidates starting at 1.
    return ks[dists_line.index(max(dists_line))]

In [14]:
# Alternative Approach: DBSCAN (Does not seem to work well with semantics)

def dbscan(data:list, eps=0.0375, min_samples=3):
    """Cluster ``data`` with DBSCAN and return the label of every sample.

    Parameters
    ----------
    data : array-like
        Samples to cluster, one row per sample.
    eps : float
        Neighbourhood radius passed to ``DBSCAN``. The default is the value
        this notebook previously hard-coded.
    min_samples : int
        Minimum neighbourhood size passed to ``DBSCAN``. The default is the
        previously hard-coded value.

    Returns
    -------
    The fitted model's ``labels_``: one label per sample.
    """
    from sklearn.cluster import DBSCAN

    db_model = DBSCAN(eps=eps, min_samples=min_samples).fit(data)

    return db_model.labels_

In [15]:
def reduce_dims(data, alg='tsne', num_components=2):
    """Project ``data`` to ``num_components`` dimensions.

    Only ``alg='tsne'`` is implemented; it returns the output of
    ``TSNE(...).fit_transform(data)``. Any other ``alg`` value yields ``None``.
    """
    # Unsupported algorithm: nothing to compute.
    if alg != 'tsne':
        return None

    from sklearn.manifold import TSNE

    reducer = TSNE(n_components=num_components)
    return reducer.fit_transform(data)

In [16]:
def print_clusters(num_clusters, cluster_assignment, topics):
    """Print the members of clusters ``0 .. num_clusters - 1``, grouped by cluster.

    Parameters
    ----------
    num_clusters : int
        How many cluster labels to print, starting from label 0. Labels
        outside this range (for example negative ones) are not shown.
    cluster_assignment : array-like of int
        Cluster label of every sample; a NumPy array or a plain sequence.
    topics : sequence of str
        Text of every sample, aligned with ``cluster_assignment``.

    Each cluster is printed as a blank line, a ``Cluster <n> contains:``
    header (numbered from 1) and one ``- <topic>`` line per member.
    """
    # Coerce to an array so `labels == i` compares element-wise. A plain list
    # compared to an int is a single False, which would print empty clusters.
    labels = np.asarray(cluster_assignment)

    for i in range(num_clusters):
        print()
        print(f'Cluster {i + 1} contains:')
        # np.where returns a tuple of index arrays; [0] is the sample indices.
        for k in np.where(labels == i)[0]:
            print(f'- {topics[k]}')

In [17]:
# K-Means

# Choose the number of clusters automatically from the inertia curve
# (assign a fixed integer here instead to skip the search).
num_clusters = optimize_k(data=topics_embeddings)

# Optional: to cluster in a reduced space, compute
# reduce_dims(topics_embeddings, alg='tsne', num_components=3)
# and pass that result to kmeans() in place of topics_embeddings.

# Fit K-Means on the embeddings and collect one label per topic
cluster_assignment = kmeans(num_clusters, topics_embeddings)

# Show the topics grouped by their assigned cluster
print_clusters(
    num_clusters=num_clusters,
    cluster_assignment=cluster_assignment,
    topics=topics,
)


Cluster 1 contains:
- Custom Paint shop
- Concrete Cutting and Grinding business
- Bar
- Carpet and T
- Construction business
- Bar
- Funerary Home
- Construction and demolition
- Painting business
- Painting Businesses
- Painting and Decorating
- Painting Shop
- Painting and Drywall
- Painting Shop
- Electricians
- House Painting
- Electrician
- Home Maintenance Contractor
- Concrete
- Painting and Carpentry
- Painting and Decorating
- Painting
- Painting
- Electrician
- Gas Grill
- Painting and Decorating
- Construction company
- Painting/Carpentry workshop
- Painting Shop
- Carpet & Floor Covering
- Custom Paint Services
- Painting and Tiling
- Home remodeling company
- Painting
- Dumpster Rental
- B&B House
- Construction supplies
- Painting and Decorating
- Painting service
- Construction
- Construction company
- Construction services
- Bar
- Concrete and Masonry services
- House Painting
- Bunkhouse
- Painting Services
- Construction and Landscaping
- Painting Company
- Electric

In [18]:
# DBSCAN

# Upper bound on how many clusters to print. DBSCAN decides the actual number
# of clusters itself, so this is only a display cap.
max_clusters_shown = 75

# Optional: to cluster in a reduced space, compute
# reduce_dims(topics_embeddings, alg='tsne', num_components=3)
# and pass that result to dbscan() in place of topics_embeddings.

# Apply DBSCAN clustering
cluster_assignment = dbscan(data=topics_embeddings)

# DBSCAN numbers its clusters 0..n-1 and labels noise as -1, so the largest
# label plus one is the number of clusters found (0 when everything is noise).
num_found = int(cluster_assignment.max()) + 1 if len(cluster_assignment) else 0

# Print only clusters that exist, up to the display cap, so no empty
# "Cluster N contains:" headers are emitted.
num_clusters = min(max_clusters_shown, num_found)

# Print clusters cohesively
print_clusters(num_clusters, cluster_assignment, topics)


Cluster 1 contains:
- Dry Cleaning
- Dry Cleaning
- Dry Cleaning
- Dry Cleaning
- Dry Cleaning
- Dry Cleaning
- Dry Cleaning
- Dry Cleaning
- Dry Cleaning
- Dry cleaning
- Dry Cleaning
- Dry Cleaning

Cluster 2 contains:
- Food Truck
- Food truck
- Food Truck
- Food Truck
- Food Truck

Cluster 3 contains:
- Cleaning service
- Cleaning service
- Cleaning service

Cluster 4 contains:
- Pet Sitting Business
- Pet Sitting Business
- Pet Sitting Business

Cluster 5 contains:
- Bar
- Bar
- Bar

Cluster 6 contains:
- Auto shop
- Auto Shop
- Auto Shop

Cluster 7 contains:
- Beauty salon
- Beauty Salon
- Beauty salon
- Beauty Salon
- Beauty Salon
- Beauty Salon
- Beauty Salon
- Beauty Salon
- Beauty Salon
- Beauty Salon

Cluster 8 contains:
- Dental Care
- Dental Care
- Dental Care

Cluster 9 contains:
- Convenience Store
- Convenience Store
- Convenience store
- Convenience Store
- Convenience Store
- Convenience store
- Convenience Store

Cluster 10 contains:
- Bakery
- Bakery
- Bakery
- Bak