In [None]:
!pip install sentence-transformers

import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import torch


# Fetch the NLTK 'punkt' tokenizer data; presumably this is what
# nltk.sent_tokenize (called in split_sentence) relies on.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead -- confirm
# against the installed NLTK version.
nltk.download('punkt')
# Name of the SentenceTransformer checkpoint loaded by sent_embedding().
sent_embedding_model = "all-MiniLM-L6-v2"
# Device handed to SentenceTransformer; only used when torch.cuda.is_available().
device_used = "cuda"

Collecting sentence-transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Seed value shared by the RNGs initialised below.
random_seed = 70

random.seed(random_seed)        # Python's stdlib RNG
torch.manual_seed(random_seed)  # PyTorch RNG

# Explicitly seed every CUDA device when a GPU is present.
# NOTE(review): NumPy's global RNG is not seeded in this cell; estimators that
# draw from it (presumably scikit-learn without random_state) stay unseeded.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

In [None]:
def split_sentence(paragraph):
    """Split `paragraph` into sentences using `nltk.sent_tokenize`.

    Args:
        paragraph: Text to segment.

    Returns:
        Whatever `nltk.sent_tokenize` returns for the text (the sentences).
    """
    return nltk.sent_tokenize(paragraph)

# Loaded SentenceTransformer instances keyed by (model name, device) so the
# weights are read once instead of on every sent_embedding() call.
_SENTENCE_MODEL_CACHE = {}


def _get_sentence_model():
    """Return the SentenceTransformer for the current config, loading it once."""
    # Same device rule as before: request `device_used` only when CUDA is
    # available, otherwise let SentenceTransformer pick its default device.
    device = device_used if torch.cuda.is_available() else None
    cache_key = (sent_embedding_model, device)
    if cache_key not in _SENTENCE_MODEL_CACHE:
        if device is None:
            model = SentenceTransformer(sent_embedding_model)
        else:
            model = SentenceTransformer(sent_embedding_model, device=device)
        _SENTENCE_MODEL_CACHE[cache_key] = model
    return _SENTENCE_MODEL_CACHE[cache_key]


def sent_embedding(paragraph):
    """Embed every sentence of `paragraph`.

    The paragraph is segmented with `split_sentence` and the sentences are
    encoded with the model named by `sent_embedding_model`. The model is
    cached after the first call, so repeated calls do not reload it.

    Args:
        paragraph: Text to segment and embed.

    Returns:
        The result of `SentenceTransformer.encode` on the sentence list
        (one embedding per sentence).
    """
    sentences = split_sentence(paragraph)
    return _get_sentence_model().encode(sentences)

In [None]:
# Demo paragraph: adjacent string literals are concatenated by Python into the
# same single string that was previously written on one line.
embeddings = sent_embedding(
    "Machine learning is part of data science. "
    "A movie theater, cinema, or cinema hall, also known as a movie house, picture house, picture theater or simply theater. "
    "Data science is part of the curriculum in many courses. "
    "The film is projected with a movie projector onto a large projection screen at the front of the auditorium while the dialogue, sounds and music are played through speakers. "
    "Movie theatres stand in a long tradition of theaters that could house all kinds of entertainment. "
    "we are doing NLP projeckt"
)

In [None]:
# Display the embedding matrix dimensions; the recorded run shows (6, 384),
# i.e. one row per sentence of the demo paragraph.
embeddings.shape

(6, 384)

In [None]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
import numpy as np
from scipy.spatial.distance import cdist

def elbow(embed, means):
    """Average distance from each embedding to its closest cluster mean.

    Args:
        embed: 2-D array of embeddings, one row per sample.
        means: 2-D array of cluster centres, one row per cluster.

    Returns:
        Sum of each row's minimum `cdist` distance to `means`, divided by
        the number of rows in `embed`.
    """
    nearest_distance = cdist(embed, means).min(axis=1)
    return nearest_distance.sum() / embed.shape[0]

def silhouette(embed, classes):
    """Silhouette score of a labelling, using Euclidean distance.

    Args:
        embed: Embeddings, one row per sample.
        classes: Cluster label assigned to each row of `embed`.

    Returns:
        The value of `silhouette_score(embed, classes, metric="euclidean")`.
    """
    return silhouette_score(embed, classes, metric="euclidean")

#GMM clustering
def clustering(sentence_embeddings, min_clusters=2, max_clusters=4, random_state=None):
    """Cluster sentence embeddings with GMMs and keep the best-scoring labelling.

    One `GaussianMixture` is fitted per candidate cluster count and each
    labelling is scored with `silhouette`; the labelling with the highest
    score wins.

    Args:
        sentence_embeddings: Array of shape (num_samples, num_features).
        min_clusters: Smallest cluster count to try (values below 2 are
            raised to 2, since a silhouette score needs at least 2 labels).
        max_clusters: Largest cluster count to try; capped at
            num_samples - 1, the most labels a silhouette score allows.
        random_state: Seed for `GaussianMixture`. When None, the
            notebook-wide `random_seed` is used so reruns are reproducible.

    Returns:
        A tuple `(labels, prob_array)`. `labels` is the label array of the
        best-scoring candidate. `prob_array` is a list holding the
        `predict_proba` output of every scored candidate (not only the
        best one), in increasing cluster-count order. If no candidate can
        be scored (e.g. fewer than 3 samples), every sample is put in
        cluster 0 and `prob_array` is empty.
    """
    if random_state is None:
        # Fall back to the seed defined in the notebook's seeding cell.
        random_state = random_seed

    n_samples = len(sentence_embeddings)
    lower = max(2, min_clusters)
    upper = min(max_clusters, n_samples - 1)

    scores = []
    labels_array = []
    prob_array = []
    for cluster_count in range(lower, upper + 1):
        gmm = GaussianMixture(n_components=cluster_count, random_state=random_state)
        labels = gmm.fit_predict(sentence_embeddings) # sentence_embedding shape -> (num_samples, num_features)
        if len(np.unique(labels)) < 2:
            # All samples collapsed into one component: silhouette is undefined.
            continue
        prob_array.append(gmm.predict_proba(sentence_embeddings))
        labels_array.append(labels)
        scores.append(silhouette(sentence_embeddings, labels))

    if not scores:
        # Too few samples (or only degenerate fits): treat as a single cluster.
        return np.zeros(n_samples, dtype=int), prob_array

    best = int(np.argmax(scores))

    return labels_array[best], prob_array

# Cluster the demo embeddings, then show the chosen labels, a blank line, and
# the per-candidate membership probabilities.
labels_arr, prob_arr = clustering(embeddings)

print(labels_arr, "", prob_arr, sep="\n")

[2 1 2 1 1 0]

[array([[0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.]]), array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.]]), array([[0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.]])]
