In [1]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import pandas as pd

# KMeans Klassen

In [2]:
# Load the clustering input table; the path is relative to the working directory.
df = pd.read_csv('clusters/clusters - multi.csv')

# Reference label of every row, in row order so it stays aligned with `vectors`.
labels = df['true_label'].tolist()

# Every column from the third one onward becomes the feature matrix.
# NOTE(review): assumes the first two columns hold non-feature metadata — confirm against the CSV.
vectors = df.iloc[:, 2:].to_numpy()

In [29]:
# Best-parameter search.
#
# Fit KMeans once for every (random_state, number of clusters) pair in the grid
# below and score each clustering with the cosine-metric silhouette score. The
# highest-scoring pair is then used to fit the final `kmeans` model.
# NOTE(review): the seed is searched like a hyperparameter, so the reported score
# is the best over many initialisations — confirm this is the intended protocol.
N_RANDOM_STATES = 1000         # seeds 0 .. N_RANDOM_STATES - 1 are tried
CLUSTER_COUNTS = range(2, 10)  # candidate numbers of clusters: 2 .. 9

# Maps (random_state, n_clusters) -> silhouette score of that clustering.
s_score = {}
for random_state in range(N_RANDOM_STATES):
    for clusters in CLUSTER_COUNTS:
        kmeans = KMeans(n_clusters=clusters, random_state=random_state, n_init='auto').fit(vectors)
        s_score[(random_state, clusters)] = silhouette_score(vectors, kmeans.labels_, metric='cosine')

# max() returns the first maximal entry in insertion order — the same entry a
# stable descending sort would place first — without sorting every result.
# `best_params` keeps the shape ((random_state, n_clusters), score).
best_params = max(s_score.items(), key=lambda item: item[1])
(best_random_state, best_n_clusters), best_score = best_params
print(f'Best parameters: random_state: {best_random_state}, number of clusters: {best_n_clusters}, -> silhouette score: {best_score}')

# Refit with the winning configuration; the fixed random_state makes this
# reproduce the clustering that achieved the best score.
kmeans = KMeans(n_clusters=best_n_clusters, random_state=best_random_state, n_init='auto').fit(vectors)

Best paramters: random_state: 6, number of clusters: 3, -> silhouette score: 0.5903504901732399


In [28]:
# Bucket the reference labels by assigned cluster: groups[c] collects the
# entry of `labels` for every row that KMeans placed in cluster c.
n_groups = len(kmeans.cluster_centers_)
groups = [[] for _ in range(n_groups)]
# `labels` and `kmeans.labels_` are row-aligned, so they can be walked in lockstep.
for cluster_id, true_label in zip(kmeans.labels_, labels):
    groups[cluster_id].append(true_label)

# One output line per cluster: its index, then its comma-separated members.
for i, group in enumerate(groups):
    print(f'{i}: {", ".join(group)}')

0: PMD:Methods, NIC:Intervention, NIC:Study Design, NIC:Population, NIC:Other, DRI:Approach, DRI:Futurework, ART:Goal, ART:Object, ART:Experiment, ART:Model, ART:Method
1: PMD:Results, PMD:Conclusions, NIC:Outcome, DRI:Outcome, ART:Hypothesis, ART:Observation, ART:Result, ART:Conclusion
2: PMD:Background, PMD:Objective, NIC:Background, DRI:Background, DRI:Challenge, ART:Background, ART:Motivation


# Arthurs Klassen

In [26]:
# Per-corpus lookup tables: native label of the corpus -> shared generic class.
ART_LABELS_TO_GENERIC = {
    "mask": "mask",
    "Background": "Background",
    "Motivation": "Background",
    "Hypothesis": "Problem",
    "Goal": "Problem",
    "Object": "Problem",
    "Experiment": "Method",
    "Model": "Method",
    "Method": "Method",
    "Observation": "Result",
    "Result": "Result",
    "Conclusion": "Conclusion",
}
DRI_LABELS_TO_GENERIC = {
    "mask": "mask",
    "Background": "Background",
    "Challenge": "Problem",
    "Approach": "Method",
    "Outcome": "Result",
    "FutureWork": "Future Work",
}
PUBMED_LABELS_TO_GENERIC = {
    "mask": "mask",
    "BACKGROUND": "Background",
    "OBJECTIVE": "Problem",
    "METHODS": "Method",
    "RESULTS": "Result",
    "CONCLUSIONS": "Conclusion",
}
NICTA_LABELS_TO_GENERIC = {
    "mask": "mask",
    "background": "Background",
    "intervention": "Method",
    "study design": "Method",
    "population": "Method",
    "outcome": "Result",
    "other": "Method",
}

# Short corpus prefixes and their lookup tables, kept in matching order.
names = ['ART', 'DRI', 'PMD', 'NIC']
datasets = [ART_LABELS_TO_GENERIC, DRI_LABELS_TO_GENERIC, PUBMED_LABELS_TO_GENERIC, NICTA_LABELS_TO_GENERIC]

# Invert the tables: generic class -> every "<prefix>:<native label>" mapped to it.
# Classes appear in the order they are first encountered.
classes = {}
for prefix, mapping in zip(names, datasets):
    for native_label, generic_label in mapping.items():
        classes.setdefault(generic_label, []).append(prefix + ':' + native_label)

# One output line per generic class: running index, then its members.
for i, cl in enumerate(classes.values()):
    print(f'{i}: {", ".join(cl)}')


0: ART:mask, DRI:mask, PMD:mask, NIC:mask
1: ART:Background, ART:Motivation, DRI:Background, PMD:BACKGROUND, NIC:background
2: ART:Hypothesis, ART:Goal, ART:Object, DRI:Challenge, PMD:OBJECTIVE
3: ART:Experiment, ART:Model, ART:Method, DRI:Approach, PMD:METHODS, NIC:intervention, NIC:study design, NIC:population, NIC:other
4: ART:Observation, ART:Result, DRI:Outcome, PMD:RESULTS, NIC:outcome
5: ART:Conclusion, PMD:CONCLUSIONS
6: DRI:FutureWork
