In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud
from textwrap import wrap
from textblob import TextBlob

from pytrends.request import TrendReq
from pytrends.exceptions import ResponseError

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, OPTICS
from scipy.cluster.hierarchy import dendrogram

import re
import string
import time
from PIL import Image
from collections import Counter

import plotly.express as px

In [None]:
# Name of the text column to analyse; it is also used to build the input CSV path.
FIELD_NAME = 'title'

In [None]:
# Load the prepared CSV for the selected field (path is relative to the notebook location).
data = pd.read_csv(f'../data/prepared/{FIELD_NAME}/named_entity_extraction_stemmer.csv')
data

In [None]:
# Build a dense document-term matrix: one row per document, one column per word.
cv = CountVectorizer(analyzer='word')
cv_data = cv.fit_transform(data[FIELD_NAME])
dtm_data = pd.DataFrame(
    cv_data.toarray(),
    columns=cv.get_feature_names_out(),
    index=data.index,  # keep the rows aligned with the source frame
)
dtm_data.head(5)

In [None]:
# Transpose the document-term matrix so that rows are words and columns are documents.
# NOTE(review): despite the name, no normalization happens here -- this is only `.T`.
word_freq_data_normalized = dtm_data.T
word_freq_data_normalized

In [None]:
# Sum every word's counts over all documents and expose the result as a (word, freq) table.
word_freq_data = (
    word_freq_data_normalized
    .sum(axis=1)
    .reset_index()
    .rename(columns={'index': 'word', 0: 'freq'})
)
word_freq_data

In [None]:
# Quantile of the word-frequency distribution used as the cut-off between kept and dropped words.
TRIM_QUANTILE = 0.7
# Preview of the words whose total frequency is at or above the cut-off.
word_freq_data[word_freq_data['freq'] >= word_freq_data['freq'].quantile(TRIM_QUANTILE)]

In [None]:
# Words below the frequency cut-off become stop words; the remaining ones form the vocabulary.
stopwords = word_freq_data.loc[
    word_freq_data['freq'] < word_freq_data['freq'].quantile(TRIM_QUANTILE), 'word'
].tolist()
vec_vocab = word_freq_data.loc[
    word_freq_data['freq'] >= word_freq_data['freq'].quantile(TRIM_QUANTILE), 'word'
].tolist()

In [None]:
# TF-IDF vectorizer limited to the retained vocabulary; the dropped low-frequency
# words are additionally passed as stop words.
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words=stopwords, vocabulary=vec_vocab)
tfidf_vectorizer

In [None]:
# Fit the vectorizer on the selected text column and compute the TF-IDF matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(data[FIELD_NAME])
tfidf_matrix

In [None]:
# Dense DataFrame copy of the TF-IDF matrix (column names are assigned in a later cell).
tfidf_data = pd.DataFrame(tfidf_matrix.toarray())
tfidf_data

In [None]:
# Words corresponding to the columns of the TF-IDF matrix.
feature_names = tfidf_vectorizer.get_feature_names_out()
feature_names

In [None]:
# Label the TF-IDF columns with their words (mutates tfidf_data in place).
tfidf_data.columns = feature_names
tfidf_data

# Truncated SVD

### During the experiments, the following configurations were tried:

* 1000 components - 10 clusters

* 100 components - 10 clusters

* 50 components - 10 clusters

* 100 components - 5 clusters

* 100 components - 15 clusters

In [None]:
# Reduce the TF-IDF matrix to 100 SVD components; random_state is fixed so reruns match.
tsvd = TruncatedSVD(n_components=100, random_state=37)
vec_matrix_tsvd = tsvd.fit_transform(tfidf_matrix)
vec_matrix_tsvd

In [None]:
# Tabular view of the reduced matrix.
pd.DataFrame(vec_matrix_tsvd)

# Clustering

## KMeans

In [None]:
# Fit KMeans with 10 clusters on the SVD-reduced vectors (seeded via random_state).
clst10 = KMeans(n_clusters=10, verbose=0, random_state=37)
clst10.fit(vec_matrix_tsvd)

In [None]:
# Store the KMeans cluster id of every row in data['label'].
# NOTE: later clustering sections overwrite this same column.
data['label'] = clst10.labels_
data

In [None]:
# Group by (label, FIELD_NAME). Both selected columns are grouping keys, so the
# result presumably carries only the MultiIndex of distinct pairs -- TODO confirm.
labels_count_data = data[[FIELD_NAME, 'label']].groupby(['label', FIELD_NAME]).count()
labels_count_data

In [None]:
# Share of the (label, FIELD_NAME) groups that falls into each cluster.
assign_ratio = labels_count_data.reset_index()['label'].value_counts() / labels_count_data.shape[0]
assign_ratio

In [None]:
# Sanity check: the per-cluster shares are expected to add up to 1.
assign_ratio.sum()

In [None]:
# Flatten the (label, FIELD_NAME) groups and attach each cluster's share.
flat_labels_count_data = labels_count_data.reset_index()
# assign_ratio is indexed by label, so a vectorised map replaces the per-label loop.
flat_labels_count_data['ratio'] = flat_labels_count_data['label'].map(assign_ratio)
# Show the first five entries of every cluster. The previous `.first(5)` passed 5
# as the `numeric_only` flag of GroupBy.first, not as a row count.
flat_labels_count_data[['label', 'ratio', FIELD_NAME]].groupby('label').head(5)

In [None]:
def to_viz_data(data, tfidf_data, labels, max_words=200):
    """Collect the characteristic words of every cluster for plotting.

    A word is attributed to a cluster when it is the highest-scoring TF-IDF
    term of at least one document assigned to that cluster. Its score is the
    maximum TF-IDF value it reaches among the documents of that cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``label`` column holding the cluster id of each document.
        Its index must align with the index of ``tfidf_data``.
    tfidf_data : pandas.DataFrame
        Document-term TF-IDF table (rows are documents, columns are words).
    labels : iterable
        Cluster ids to extract.
    max_words : int, default 200
        Maximum number of words kept per cluster, strongest first.

    Returns
    -------
    pandas.DataFrame
        Columns ``word``, ``tfidf`` and ``label``. Within a cluster each word
        appears once and rows are ordered by descending ``tfidf``. The frame
        is empty (with the same columns) when nothing can be extracted.
    """
    frames = []
    for label in labels:
        cluster_rows = tfidf_data[data['label'] == label]
        # Documents without any in-vocabulary term are all zeros; idxmax would
        # report the first vocabulary word for them, so they are skipped.
        cluster_rows = cluster_rows[cluster_rows.max(axis=1) > 0]
        if cluster_rows.empty:
            continue
        # Top word of every document, de-duplicated so a word is listed once.
        top_words = cluster_rows.idxmax(axis=1).unique().tolist()
        # Score inside the cluster only, and truncate after ranking so the
        # strongest words are the ones that survive.
        scores = (
            cluster_rows[top_words]
            .max()
            .sort_values(ascending=False, kind='mergesort')
            .head(max_words)
        )
        frames.append(
            pd.DataFrame({
                'word': scores.index.to_numpy(),
                'tfidf': scores.to_numpy(),
                'label': label,
            })
        )
    if not frames:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=['word', 'tfidf', 'label'])
    return pd.concat(frames, ignore_index=True)

In [None]:
# Build the plotting table for every cluster id currently stored in data['label'].
viz_data = to_viz_data(data, tfidf_data, data['label'].unique())
viz_data

## Visualizing the results

In [None]:
# Ten highest-scoring words of every cluster. Sorting by tfidf inside each label
# is required: sorting by label alone leaves head(10) with an arbitrary ten rows.
(
    viz_data
    .sort_values(['label', 'tfidf'], ascending=[True, False])
    .groupby('label')
    .head(10)
    .reset_index(drop=True)
)

In [None]:
def visualize_clusters(viz_data, top_n=10):
    """Draw the strongest words of every cluster as a text scatter chart.

    Parameters
    ----------
    viz_data : pandas.DataFrame
        Frame with the columns ``word``, ``tfidf`` and ``label`` (integer
        cluster ids), e.g. the output of ``to_viz_data``. It is not modified.
    top_n : int, default 10
        Number of highest-``tfidf`` words drawn per cluster.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    viz_data = viz_data.copy()
    # Shift only when negative ids are present so the smallest label becomes 0;
    # labels that are already non-negative stay untouched.
    viz_data['label'] -= min(viz_data['label'].min(), 0)
    # Rank by tfidf inside each label so head(top_n) really keeps the top words.
    viz_data = (
        viz_data
        .sort_values(['label', 'tfidf'], ascending=[True, False])
        .groupby('label')
        .head(top_n)
    )
    fig = px.scatter(viz_data, x='label', y='tfidf', color='label', text='word', hover_data=['label', 'tfidf', 'word'])
    fig.update_layout(
        font=dict(
            family='Courier New, monospace',
            size=18,  # Set the font size here
            color='black'
        ),
        xaxis=dict(
            tickmode='linear',
            tick0=0,
            dtick=1
        ),
        yaxis=dict(
            tickmode='linear',
            tick0=0,
            dtick=0.05
        )
    )
    # Text size scales with the score; the array follows the row order of viz_data.
    fig.update_traces(mode='text', selector=dict(type='scatter'), textfont_size=(viz_data['tfidf'] * 25).values)
    palette = np.array(px.colors.qualitative.Dark24)
    # Wrap around the palette so more clusters than colours cannot raise IndexError.
    fig.for_each_trace(
        lambda t: t.update(
            textfont_color=palette[np.asarray(t.marker.color, dtype=int) % len(palette)],
            textposition='bottom center',
        )
    )
    fig.show()

In [None]:
# Plot the per-cluster words for the KMeans labelling.
visualize_clusters(viz_data)

In [None]:
def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering model.

    The model must expose ``children_``, ``distances_`` and ``labels_``.
    Extra keyword arguments are forwarded unchanged to ``dendrogram``.
    """
    merges = model.children_
    leaf_total = len(model.labels_)

    # Number of original samples contained in the subtree created by each merge.
    subtree_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        # Ids below leaf_total are leaves (size 1); larger ids refer to the
        # subtree produced by an earlier merge.
        subtree_sizes[row] = sum(
            1 if node < leaf_total else subtree_sizes[node - leaf_total]
            for node in pair
        )

    # Linkage layout: [child_a, child_b, distance, sample_count] per merge.
    linkage_matrix = np.column_stack(
        [merges, model.distances_, subtree_sizes]
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)

## Agglomerative (ward)

In [None]:
# Agglomerative clustering with Ward linkage, cut at 10 clusters.
clst_agg_ward = AgglomerativeClustering(n_clusters=10, linkage='ward')
clst_agg_ward.fit(vec_matrix_tsvd)

In [None]:
# Overwrite data['label'] with the Ward-linkage cluster ids.
data['label'] = clst_agg_ward.labels_
data

In [None]:
# Share of rows assigned to each Ward-linkage cluster.
assign_ratio = data['label'].value_counts() / data.shape[0]
assign_ratio

In [None]:
# Rebuild the plotting table from the Ward-linkage labels.
viz_data = to_viz_data(data, tfidf_data, data['label'].unique())
viz_data

In [None]:
# Plot the per-cluster words for the Ward-linkage labelling.
visualize_clusters(viz_data)

In [None]:
# Refit without a fixed cluster count (distance_threshold=0, n_clusters=None) to
# build the full merge tree; plot_dendrogram reads model.distances_.
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None, linkage='ward')
model.fit(vec_matrix_tsvd)
plt.figure(figsize=(18, 12))
# Only the top five levels of the tree are drawn.
plot_dendrogram(model, truncate_mode='level', p=5)
plt.xlabel('Dendrogram agglomerative ward')  # fixed spelling ("Dendogram")
plt.show()

## Agglomerative (complete)

In [None]:
# Agglomerative clustering with complete linkage, cut at 10 clusters.
# NOTE(review): the variable is named *_avg although the linkage used is 'complete'.
clst_agg_avg = AgglomerativeClustering(n_clusters=10, linkage='complete')
clst_agg_avg.fit(vec_matrix_tsvd)

In [None]:
# Overwrite data['label'] with the complete-linkage cluster ids.
data['label'] = clst_agg_avg.labels_
data

In [None]:
# Share of rows assigned to each complete-linkage cluster.
assign_ratio = data['label'].value_counts() / data.shape[0]
assign_ratio

In [None]:
# Rebuild the plotting table from the complete-linkage labels.
viz_data = to_viz_data(data, tfidf_data, data['label'].unique())
viz_data

In [None]:
# Plot the per-cluster words for the complete-linkage labelling.
visualize_clusters(viz_data)

In [None]:
# Refit without a fixed cluster count (distance_threshold=0, n_clusters=None) to
# build the full merge tree; plot_dendrogram reads model.distances_.
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None, linkage='complete')
model.fit(vec_matrix_tsvd)
plt.figure(figsize=(18, 12))
# Only the top five levels of the tree are drawn.
plot_dendrogram(model, truncate_mode='level', p=5)
plt.xlabel('Dendrogram agglomerative complete')  # fixed spelling ("Dendogram")
plt.show()

## DBSCAN

In [None]:
# DBSCAN on the SVD-reduced vectors with eps=0.5; all other parameters keep their defaults.
dbscan = DBSCAN(eps=0.5)
dbscan.fit(vec_matrix_tsvd)

In [None]:
# Overwrite data['label'] with the DBSCAN labels.
# NOTE(review): DBSCAN presumably marks noise points with -1 -- confirm against the scikit-learn docs.
data['label'] = dbscan.labels_
data

In [None]:
# Share of rows carrying each DBSCAN label.
assign_ratio = data['label'].value_counts() / data.shape[0]
assign_ratio

In [None]:
# Rebuild the plotting table from the DBSCAN labels.
viz_data = to_viz_data(data, tfidf_data, data['label'].unique())
viz_data

In [None]:
# Plot the per-cluster words for the DBSCAN labelling.
visualize_clusters(viz_data)

## OPTICS

In [None]:
# OPTICS on the SVD-reduced vectors with min_samples=25; all other parameters keep their defaults.
optics = OPTICS(min_samples=25)
optics.fit(vec_matrix_tsvd)

In [None]:
# Overwrite data['label'] with the OPTICS labels.
data['label'] = optics.labels_
data

In [None]:
# Share of rows carrying each OPTICS label.
assign_ratio = data['label'].value_counts() / data.shape[0]
assign_ratio

In [None]:
# Rebuild the plotting table from the OPTICS labels.
viz_data = to_viz_data(data, tfidf_data, data['label'].unique())
viz_data

In [None]:
# Plot the per-cluster words for the OPTICS labelling.
visualize_clusters(viz_data)