In [None]:
# installing the sentence-transformers library
# It's widely used for sentence embeddings, semantic similarity, clustering, etc.
!pip install -U sentence-transformers -q

In [None]:
# to read and manipulate the data
import pandas as pd
import numpy as np
pd.set_option('max_colwidth', None)    # setting column to the maximum column width as per the data

# to visualise data
import matplotlib.pyplot as plt
import seaborn as sns

# to compute distances
from scipy.spatial.distance import cdist, pdist
from sklearn.metrics import silhouette_score

# importing the PyTorch Deep Learning library
import torch

# to import the model
from sentence_transformers import SentenceTransformer

# to cluster the data
from sklearn.cluster import KMeans

# to compute metrics
from sklearn.metrics import classification_report

# to avoid displaying unnecessary warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from google.colab import files
uploaded = files.upload()


In [None]:
# loading the news articles CSV from an absolute path under /content
# NOTE(review): presumably this is the file uploaded in the previous cell — confirm the uploaded filename matches
reviews = pd.read_csv("/content/news_articles.csv")

In [None]:
data = reviews.copy()

In [None]:
data.loc[1,'Text']

In [None]:
data.head()

In [None]:
data.loc[3,'Text']

In [None]:
data.tail()

In [None]:
data.shape

In [None]:
data.isnull().sum()

In [None]:
data.duplicated().sum()

In [None]:
# removing duplicate rows and renumbering the index from 0, in one chained step
data = data.drop_duplicates().reset_index(drop=True)

In [None]:
data.duplicated().sum()

In [None]:
data.shape

In [None]:
""" hf_xet is a helper package for enhancing file transfers with the Hugging Face Hub.
It integrates Rust-based code for efficient, chunk-based deduplication,
and caching when uploading or downloading large files"""
!pip install hf_xet

In [None]:
from sentence_transformers import SentenceTransformer
#Defining the model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


In [None]:
# setting the device to GPU if available, else CPU
# (the corpus itself is encoded in the next cell; encoding it here as well
#  would run the expensive embedding pass twice for an identical result)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
# encoding the dataset
# passing a plain list of strings: `encode` indexes its input by integer position,
# which on a pandas Series is a label lookup and only works while the index is 0..n-1
embedding_matrix = model.encode(data['Text'].tolist(), show_progress_bar=True, device=device)

In [None]:
# printing the shape of the embedding matrix
embedding_matrix.shape

In [None]:
# printing the length (number of dimensions) of the embedding vector of the first text in the dataset
len(embedding_matrix[0])

In [None]:
a= "i love apple"
b= "apple is a fruit"
c= "i like this table"


In [None]:
# defining a function to compute the cosine similarity between two embedding vectors
def cosine_score(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded with the notebook-level `model`; the score is the
    dot product of the two embeddings divided by the product of their L2 norms.
    """
    # encoding both texts
    vec_first, vec_second = model.encode(text1), model.encode(text2)

    # product of the two L2 norms (the denominator of the cosine formula)
    norm_product = np.linalg.norm(vec_first) * np.linalg.norm(vec_second)

    return np.dot(vec_first, vec_second) / norm_product

In [None]:
# printing the similarity of each sentence pair, in the order (a,b), (b,c), (a,c)
for first_text, second_text in ((a, b), (b, c), (a, c)):
    print(cosine_score(first_text, second_text))

In [None]:
# using the prebuilt cosine-similarity utility from sentence-transformers
from sentence_transformers import util

embeddings1 = model.encode(a)
embeddings2 = model.encode(b)
embeddings3 = model.encode(c)

print(util.cos_sim(embeddings1, embeddings2))
print(util.cos_sim(embeddings2, embeddings3))
print(util.cos_sim(embeddings1, embeddings3))

In [None]:
# defining a function to find the top k similar sentences for a given query
def top_k_similar_sentences(embedding_matrix, query_text, k, encoder=None, corpus=None):
    """Return the k texts whose embeddings are most cosine-similar to a query.

    Parameters
    ----------
    embedding_matrix : array-like of shape (n_texts, dim)
        Row i must be the embedding of the i-th text (by position) in `corpus`.
    query_text : str
        The query to embed and compare against every row.
    k : int
        Number of texts to return.
    encoder : object with an ``encode(text)`` method, optional
        Used to embed the query. Defaults to the notebook-level `model`.
    corpus : pandas.Series, optional
        The texts the rows of `embedding_matrix` were computed from.
        Defaults to ``data['Text']``.

    Returns
    -------
    pandas.Series
        The k best-matching texts, most similar first, keeping their original
        index labels.
    """
    # falling back to the notebook-level objects keeps existing three-argument calls working
    if encoder is None:
        encoder = model
    if corpus is None:
        corpus = data['Text']

    # encoding the query text
    query_embedding = np.asarray(encoder.encode(query_text))
    embedding_matrix = np.asarray(embedding_matrix)

    # a raw dot product equals the cosine similarity only for unit-length vectors,
    # so the scores are divided by the product of the norms
    norm_product = np.linalg.norm(embedding_matrix, axis=1) * np.linalg.norm(query_embedding)
    # an all-zero vector has no direction; scoring it 0 instead of producing nan
    norm_product = np.where(norm_product == 0, 1.0, norm_product)
    score_vector = np.dot(embedding_matrix, query_embedding) / norm_product

    # sorting the scores in descending order and choosing the first k
    top_k_indices = np.argsort(score_vector)[::-1][:k]

    # argsort yields row positions, so the lookup is positional (.iloc) rather than by label (.loc)
    return corpus.iloc[top_k_indices]

In [None]:
# the query to search the articles for
query_text = "Budget for elections"

# fetching the 3 articles closest to the query
top_k_reviews = top_k_similar_sentences(embedding_matrix, query_text, 3)

# printing each match followed by a separator line and blank lines
separator = "*******************************************************************"
for article_text in top_k_reviews:
    print(article_text)
    print(separator)
    print("\n")

In [None]:
# the query to search the articles for
query_text = "High imports and exports"

# fetching the 3 articles closest to the query
top_k_reviews = top_k_similar_sentences(embedding_matrix, query_text, 3)

# printing each match followed by a separator line and blank lines
separator = "*******************************************************************"
for article_text in top_k_reviews:
    print(article_text)
    print(separator)
    print("\n")

In [None]:
# Elbow method: for every candidate k, fit K-Means and record the average
# distortion, i.e. the mean squared Euclidean distance from each embedding
# to its nearest cluster centre.
meanDistortions = []
clusters = range(2, 11)

for k in clusters:
    clusterer = KMeans(n_clusters=k, random_state=1)
    clusterer.fit(embedding_matrix)

    # squared distance from every point to its closest centroid
    nearest_sq_dist = np.min(cdist(embedding_matrix, clusterer.cluster_centers_, "euclidean"), axis=1) ** 2

    # taking the mean (not the sum) so the value matches its "Average Distortion" label;
    # this only rescales the curve, the position of the elbow is unchanged
    distortion = nearest_sq_dist.mean()
    meanDistortions.append(distortion)

    print("Number of Clusters:", k, "\tAverage Distortion:", distortion)

In [None]:
# drawing the elbow curve on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.plot(clusters, meanDistortions, "bx-")
ax.set_xlabel("k")
ax.set_ylabel("Average Distortion")
ax.set_title("Selecting k with the Elbow Method", fontsize=20)
plt.show()

In [None]:
# Silhouette analysis: for every candidate number of clusters, fit K-Means
# and record the silhouette score of the resulting assignment.
sil_score = []
cluster_list = range(2, 10)

for n_clusters in cluster_list:
    clusterer = KMeans(n_clusters=n_clusters, random_state=1)

    # fitting and obtaining the cluster label of every embedding in one call
    preds = clusterer.fit_predict(embedding_matrix)

    score = silhouette_score(embedding_matrix, preds)
    sil_score.append(score)

    # the message previously ended with a stray ")" after the score
    print("For n_clusters = {}, the silhouette score is {}".format(n_clusters, score))

In [None]:
# plotting the silhouette score against the number of clusters;
# axis labels and a title are set so the figure can be read on its own
plt.plot(cluster_list, sil_score, "bx-")
plt.xlabel("k")
plt.ylabel("Silhouette Score")
plt.title("Selecting k with the Silhouette Score", fontsize=20)
plt.show()

In [None]:
# defining the number of clusters/categories
n_categories = 5

# fitting the model
kmeans = KMeans(n_clusters=n_categories, random_state=1).fit(embedding_matrix)

In [None]:
# checking the cluster centers
centers = kmeans.cluster_centers_
centers

In [None]:
# building a copy of the data that carries the cluster label of every row
# in a new 'Category' column (the original `data` frame is left untouched)
clustered_data = data.assign(Category=kmeans.labels_)

clustered_data.head()

In [None]:
# for each cluster, printing up to 5 random news articles
# (iterating over n_categories keeps this loop in step with the fitted model
#  instead of repeating the literal 5)
for i in range(n_categories):
    print("CLUSTER",i)
    cluster_texts = clustered_data.loc[clustered_data.Category == i, 'Text']
    # a cluster can hold fewer than 5 articles; asking .sample() for more rows than exist raises ValueError
    print(cluster_texts.sample(min(5, len(cluster_texts)), random_state=1).values)
    print("*****************************************************************")
    print("\n")

In [None]:
# dictionary of cluster label to category
# NOTE(review): K-Means cluster numbers are arbitrary; this mapping was presumably
# chosen by reading the sampled articles per cluster — re-check it if the data,
# random_state or library version changes.
category_dict = {
    0: 'Sports',
    1: 'Politics',
    2: 'Entertainment',
    3: 'Business',
    4: 'Technology'
}
# mapping cluster labels to categories
# the source is the fitted model's integer labels rather than the 'Category' column itself,
# so re-running this cell gives the same result instead of mapping the already-converted
# strings through the dictionary and turning every value into NaN
clustered_data['Category'] = pd.Series(kmeans.labels_, index=clustered_data.index).map(category_dict)

clustered_data.head()

In [None]:
from google.colab import files
uploaded = files.upload()


In [None]:
# loading the actual labels
labels = pd.read_csv("news_article_labels.csv")

In [None]:
labels.shape

In [None]:
# checking the unique labels
labels['Label'].unique()

In [None]:
# adding the actual categories to our dataframe
# `.values` hands pandas a plain array, so labels are paired with rows purely by position
# NOTE(review): duplicates were dropped from `data` earlier — confirm the labels file has
# the same number of rows, in the same order, as the de-duplicated data
clustered_data['Actual Category'] = labels['Label'].values

In [None]:
print(classification_report(clustered_data['Actual Category'], clustered_data['Category']))

In [None]:
# flagging the rows whose assigned category differs from the actual one,
# then keeping an independent copy of just those rows
is_mismatch = clustered_data['Actual Category'].ne(clustered_data['Category'])
incorrect_category_data = clustered_data.loc[is_mismatch].copy()
incorrect_category_data.shape

In [None]:
incorrect_category_data.head()

In [None]:
# comparing how far one embedding lies from two cluster centres
# (the row position and the cluster numbers 2 and 3 are hard-coded; they index kmeans.cluster_centers_)
idx = 24
sample_vector = embedding_matrix[idx].reshape(1, -1)

for heading, cluster_id in (('Distance from Actual Category', 2), ('Distance from Predicted Category', 3)):
    print(heading)
    print(cdist(sample_vector, kmeans.cluster_centers_[[cluster_id]], "euclidean")[0, 0])

In [None]:
# comparing how far one embedding lies from two cluster centres
# (the row position and the cluster numbers 2 and 4 are hard-coded; they index kmeans.cluster_centers_)
idx = 45
sample_vector = embedding_matrix[idx].reshape(1, -1)

for heading, cluster_id in (('Distance from Actual Category', 2), ('Distance from Predicted Category', 4)):
    print(heading)
    print(cdist(sample_vector, kmeans.cluster_centers_[[cluster_id]], "euclidean")[0, 0])