# Group 6 Project

In [None]:
from datasets import load_dataset
import plotly.express as px
from itertools import chain
from collections import Counter, OrderedDict
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

In [None]:
# Login using e.g. `huggingface-cli login` to access this dataset (run this in a console, with the venv activated)

# The revision hash is passed explicitly so the same dataset snapshot is requested on every run.
dataset = load_dataset(
    "mteb/twentynewsgroups-clustering",
    revision="6125ec4e24fa026cec8a478383ee943acfbd5449",
)
print(dataset)

In [None]:
# List the top-level keys of the loaded dataset (the "test" key is used below).
print(dataset.keys())

### Extracting variables:

In [None]:
# Keep the "test" split and copy its two columns into plain Python lists.
# Each entry of either column is itself a sequence (sentences / labels of one row).
test = dataset["test"]
texts_sep, labels_sep = list(test["sentences"]), list(test["labels"])

## Understanding the Dataset

In [None]:
# Look at the first row once instead of re-reading it for every print.
first_row = test[0]
print(first_row.keys())
print(len(test))

# Show the first three sentences of that row next to their labels.
for position in range(3):
    print(first_row["sentences"][position], '=>', first_row["labels"][position])

# each sentence and which group it belongs to

In [None]:
# Collect every distinct label that appears in any row of the split.
total = set()
for row in test:
    total.update(row["labels"])
total  # there are around 20 groups

In [None]:
# Flatten the per-row sentence lists into one flat list of texts.
texts = list(chain.from_iterable(texts_sep))
print("number of texts: ", len(texts))

In [None]:
# Flatten the per-row label lists so labels_list lines up with the flat text list.
labels_list = [label for row_labels in labels_sep for label in row_labels]

# Count occurrences per label and store them in ascending label order.
_label_counter = Counter(labels_list)
label_counts = OrderedDict(
    (label, _label_counter[label]) for label in sorted(_label_counter)
)
len(labels_list)  # total records in general

In [None]:
# Bar chart of how often each label occurs; labels are stringified for the x axis.
px.bar(
    x=[str(label) for label in label_counts],
    y=label_counts.values(),
    labels={"x": "labels", "y": "count of appearance"},
)

In [None]:
def get_from(x):
    """Return every text whose label equals *x*.

    Reads the module-level ``texts`` and ``labels_list``, which are paired
    position by position.
    """
    return [text for text, label in zip(texts, labels_list) if label == x]


def get_word_cloud(x):
    """Build a word cloud (at most 100 words, default stop words removed) from the string *x*."""
    return WordCloud(stopwords=STOPWORDS, max_words=100).generate(x)

In [None]:
# before preprocessing

# One word cloud over the whole corpus: join every text into a single string first.
corpus = " ".join(texts)
cloud = get_word_cloud(corpus)

plt.title("Dataset (before preprocessing)")
plt.axis("off")
plt.imshow(cloud)

In [None]:
# 6 x 2 grid of word clouds, one per label for labels 0..11.
# NOTE(review): an earlier cell reports around 20 groups, so only the first 12 are drawn here — confirm this is intended.
fig, _axs = plt.subplots(nrows=6, ncols=2, figsize=(16, 16))
fig.subplots_adjust(hspace=0.3)
axs = _axs.flatten()

for label, ax in enumerate(axs[:12]):
    label_text = " ".join(get_from(label))
    ax.imshow(get_word_cloud(label_text))
    ax.set_title(f"Label: {label}")
    ax.axis("off")
fig.tight_layout()
plt.show()

## Preprocessing

In [None]:
from sentence_transformers import SentenceTransformer

# Load the pretrained "all-MiniLM-L6-v2" sentence encoder and embed every text.
# The resulting array is inspected in the next cell and reduced with PCA afterwards.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(texts)

In [None]:
# Sanity check: overall shape, then the first 20 components of the first embedding.
print(embeddings.shape)
print(embeddings[0][:20])

In [None]:
from sklearn.decomposition import PCA

# Reduce the embeddings to 64 principal components (2 ** 6) before clustering.
pca = PCA(n_components=64)
features = pca.fit_transform(embeddings)

In [None]:
# Confirm the shape of the PCA-reduced feature matrix.
print(features.shape)

In [None]:
# Take the first 10000 rows as a smaller working subset and inspect one of them.
test_features = features[:10000]
print(test_features.shape)
print(test_features[9])

In [None]:
from src.DBScan import DBScan

# Cluster the PCA features with the project's own DBScan implementation.
# NOTE(review): `test_features` (first 10000 rows) is built in an earlier cell,
# but the full `features` matrix is clustered here — confirm which was intended.
scanner = DBScan(
    radius=0.6,
    min_dense=10,
)
clusters = scanner.make_clusters(features)

In [None]:
import numpy as np

# Work on an array view so the element-wise comparison below behaves the same
# whether make_clusters returned a list or an ndarray.
cluster_labels = np.asarray(clusters)

# np.unique counts every distinct label value; if the DBScan implementation
# uses a dedicated noise label, that label is included in this total — TODO confirm.
print("Number of clusters produced: ", np.unique(cluster_labels).size)

# Number of points labelled 0. np.where returns a tuple of index arrays, so
# len(np.where(...)) gives the number of dimensions, not the number of matches.
print(np.count_nonzero(cluster_labels == 0))

### Choosing the DBScan radius with the k-distance (elbow) method

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Brute-force k-distance plot, kept for illustration only.
# A pure-Python double loop over all pairs ran for 70+ minutes without finishing.
# The inner loop is vectorized here, but the work is still quadratic in the number
# of points, so prefer the NearestNeighbors-based cell for real runs.

k_nearest_neigbours = 10
all_distances = []

for index, point in enumerate(features):
    # Distances from this point to every row, computed in a single NumPy call.
    distances = np.linalg.norm(features - point, axis=1)
    # Drop only the point itself, by position. Skipping by coordinate equality
    # would also discard genuine duplicates and shift the k-th neighbour.
    distances = np.delete(distances, index)
    # k-th smallest distance; a partial sort is enough to find it.
    kth = k_nearest_neigbours - 1
    all_distances.append(np.partition(distances, kth)[kth])

sorted_distances = sorted(all_distances)
k_distance_plot = plt.plot(sorted_distances)

plt.xlabel(f"Points sorted by distance to their {k_nearest_neigbours}-th nearest neighbor")
plt.ylabel(f"Distance to {k_nearest_neigbours}-th nearest neighbor")
plt.title("K-distance Plot")

plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

# Neighbour rank used for the k-distance curve. It has its own name so the
# integer is no longer overwritten by the estimator, and the axis labels below
# always report the value actually used.
k = 10

# Ask for k + 1 neighbours: the query points are the fitted points themselves,
# so each point's own zero-distance match occupies one of the returned slots.
k_nearest_neigbours = NearestNeighbors(n_neighbors=k + 1)
k_nearest_neigbours.fit(features)

distances, indices = k_nearest_neigbours.kneighbors(features)

# The last column holds the largest of the k + 1 returned distances,
# i.e. the distance to the k-th neighbour other than the point itself.
distances = distances[:, -1]
distances_sorted = np.sort(distances)

plt.plot(distances_sorted)

plt.xlabel(f"Points sorted by distance to their {k}-th nearest neighbor")
plt.ylabel(f"Distance to {k}-th nearest neighbor")
plt.title("K-distance Plot")

plt.show()