# Clustering

## Fitting two models and then picking the best one based on the Davies-Bouldin index

In [1]:
import pandas as pd

In [2]:
# Read the TF-IDF feature matrix and discard the redundant index column
# ('Unnamed: 0') in a single, non-mutating chain.
tfidf_matrix = pd.read_csv('tf_idf.csv').drop(columns='Unnamed: 0')
tfidf_matrix

Unnamed: 0,10,3d,account,accuracy,action,addition,agent,ai,algebra,algorithm,...,via,wave,way,weak,well,within,without,work,world,zero
0,0.000000,0.0,0.000000,0.120878,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.085752,0.0,0.000000,0.000000,0.0,0.0
1,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.326318,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.097766,0.0,0.0
2,0.200543,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.077948,0.0,0.000000,0.000000,0.0,0.0
3,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
4,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2431230,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.212551,0.000000,0.0,0.0
2431231,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
2431232,0.000000,0.0,0.092492,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
2431233,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0


In [3]:
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import davies_bouldin_score

# Number of clusters used by both models fitted below.
# Chosen as the optimum from silhouette-score and elbow-method analysis
# (that analysis is not part of this section of the notebook).
n_clusters = 13

In [4]:
# Fit standard KMeans with k-means++ seeding and a fixed random seed, and keep
# the cluster label assigned to each row of the TF-IDF matrix.
kmeans = KMeans(
    n_clusters=n_clusters,
    init='k-means++',
    random_state=42,
)
kmeans_clusters = kmeans.fit_predict(tfidf_matrix)



In [5]:
# MiniBatchKMeans is an alternative for KMeans used for massive datasets.
# It is fitted with the same cluster count and random seed as the KMeans model
# so that the two clusterings can be compared directly.
minibatch_kmeans = MiniBatchKMeans(
    random_state=42,
    n_clusters=n_clusters,
)
minibatch_kmeans_clusters = minibatch_kmeans.fit_predict(tfidf_matrix)



In [6]:
# Score each clustering against the same TF-IDF features.
# For the Davies-Bouldin index, lower values indicate better-separated clusters.
db_score_kmeans = davies_bouldin_score(
    tfidf_matrix, kmeans_clusters
)
db_score_minibatch_kmeans = davies_bouldin_score(
    tfidf_matrix, minibatch_kmeans_clusters
)

# Report both scores
print("Davies Bouldin Index:")
print("KMeans:", db_score_kmeans)
print("Davies Bouldin Index (MiniBatchKMeans):", db_score_minibatch_kmeans)

Davies Bouldin Index:
KMeans: 5.3309853037014765
Davies Bouldin Index (MiniBatchKMeans): 5.732249244927979


#### Based on the Davies-Bouldin index, KMeans is better than MiniBatchKMeans (lower is better: 5.33 vs. 5.73)

In [7]:
# Read the cleaned records (title, abstract, categories) and discard the
# redundant index column ('Unnamed: 0') in a single, non-mutating chain.
df = pd.read_csv('df_cleaned2.csv').drop(columns='Unnamed: 0')

df

Unnamed: 0,cleaned_title,cleaned_abstract,cleaned_categories
0,calculation prompt diphoton production cross s...,fully differential calculation perturbative qu...,hep-ph
1,sparsity-certifying graph decomposition,describe new algorithm k ell pebble game...,math.co cs.cg
2,evolution earth-moon system based dark matter ...,evolution earth-moon system described dark mat...,physics.gen-ph
3,determinant stirling cycle number count unlabe...,show determinant stirling cycle number count u...,math.co
4,dyadic lambda alpha lambda alpha,paper show compute lambda alpha norm alp...,math.ca math.fa
...,...,...,...
2431230,origin irreversibility line thin ybacuo7 film ...,report measurement angular dependence irrevers...,supr-con cond-mat.supr-con
2431231,nonlinear response htsc thin film microwave re...,non-linear microwave surface impedance pattern...,supr-con cond-mat.supr-con
2431232,critical state flux penetration linear microwa...,vortex contribution dc field h dependent mic...,supr-con cond-mat.supr-con
2431233,density state nmr relaxation rate anisotropic ...,show density state anisotropic superconductor ...,supr-con cond-mat.supr-con


In [8]:
# Cluster labels taken from the fitted KMeans model (the model that was
# selected on the Davies-Bouldin comparison).
kmeans_clusters = kmeans.labels_

# Attach the labels to both frames as a new 'KMeans_Cluster' column.
# NOTE: this adds a non-feature column to tfidf_matrix in place; re-running the
# fitting or scoring cells after this one would treat the label as a feature.
tfidf_matrix['KMeans_Cluster'] = kmeans_clusters
df['KMeans_Cluster'] = kmeans_clusters

In [9]:
# Display df to check the newly added KMeans_Cluster column
df

Unnamed: 0,cleaned_title,cleaned_abstract,cleaned_categories,KMeans_Cluster
0,calculation prompt diphoton production cross s...,fully differential calculation perturbative qu...,hep-ph,3
1,sparsity-certifying graph decomposition,describe new algorithm k ell pebble game...,math.co cs.cg,2
2,evolution earth-moon system based dark matter ...,evolution earth-moon system described dark mat...,physics.gen-ph,3
3,determinant stirling cycle number count unlabe...,show determinant stirling cycle number count u...,math.co,0
4,dyadic lambda alpha lambda alpha,paper show compute lambda alpha norm alp...,math.ca math.fa,0
...,...,...,...,...
2431230,origin irreversibility line thin ybacuo7 film ...,report measurement angular dependence irrevers...,supr-con cond-mat.supr-con,4
2431231,nonlinear response htsc thin film microwave re...,non-linear microwave surface impedance pattern...,supr-con cond-mat.supr-con,9
2431232,critical state flux penetration linear microwa...,vortex contribution dc field h dependent mic...,supr-con cond-mat.supr-con,4
2431233,density state nmr relaxation rate anisotropic ...,show density state anisotropic superconductor ...,supr-con cond-mat.supr-con,4


In [10]:
# Persist both labelled frames as CSV.
# index=True is the pandas default, spelled out here because it writes the row
# index as an unnamed first column, which pandas reads back as 'Unnamed: 0'.
df.to_csv('df_clustered.csv', index=True)
tfidf_matrix.to_csv('tfidf_matrix_clustered.csv', index=True)