# Inspectable execution of cluster metrics

In [1]:
import pandas as pd
import sklearn as sk

from pv_evaluation.metrics.cluster import (
    cluster_precision,
    cluster_recall,
    cluster_fscore,
    cluster_fowlkes_mallows,
    cluster_homogeneity,
    cluster_completeness,
    cluster_v_measure,
    clusters_count
)

## Test data

In [1]:
# Read the tab-separated inventor sample, index it by mention ID, and drop
# every row that contains a missing value.
inventor_sample = pd.read_csv("tests/resources/raw_inventor_sample.tsv", sep="\t")
data1 = inventor_sample.set_index("mention_id").dropna()

# Bare expression: the notebook renders the frame as this cell's output.
data1

NameError: name 'pd' is not defined

In [3]:
# Two toy labelings of different lengths (20 entries vs. 9 entries).
# NOTE(review): label 0 reappears at position 17 and label 10 is never used;
# possibly intentional -- confirm before changing, the recorded outputs
# depend on these exact values.
prediction = pd.Series(
    [0, 0, 1, 1, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 8, 8, 9, 0, 11, 11]
)
reference = pd.Series([0, 0, 1, 1, 1, 2, 2, 2, 3])

# Column-wise inner join: only index positions present in both series are
# kept, so data2 holds the first nine rows under the dict keys as columns.
data2 = pd.concat(
    {"prediction": prediction, "reference": reference},
    axis="columns",
    join="inner",
)

In [4]:
# Input pairs that the metric cells below unpack positionally into each
# metric function:
#   1. two columns of the inventor sample (name_full, inventor_id),
#   2. the raw toy series, which have different lengths,
#   3. the same toy labels after the inner join in data2.
ARGS = [
    (data1["name_full"], data1["inventor_id"]),
    (prediction, reference),
    (data2["prediction"], data2["reference"]),
]

## Cluster metrics tests

### Precision

In [5]:
# Print cluster_precision for every input pair in ARGS, one value per line.
for predicted, expected in ARGS:
    print(cluster_precision(predicted, expected))

0.9621477251795911
0.36363636363636365
0.8


### Recall

In [6]:
# Print cluster_recall for every input pair in ARGS, one value per line.
for predicted, expected in ARGS:
    print(cluster_recall(predicted, expected))

0.8660184237461617
0.75
0.75


### F-score

In [7]:
# Print cluster_fscore (with beta=1) for every input pair in ARGS.
for predicted, expected in ARGS:
    print(cluster_fscore(predicted, expected, beta=1))

0.9115557214103355
0.48979591836734687
0.7741935483870969


### Fowlkes-Mallows

In [8]:
# Print cluster_fowlkes_mallows for every input pair in ARGS.
for predicted, expected in ARGS:
    print(cluster_fowlkes_mallows(predicted, expected))

0.9128185232405096
0.5222329678670935
0.7745966692414834


### Cluster homogeneity

In [9]:
# Print cluster_homogeneity for every input pair in ARGS.
for predicted, expected in ARGS:
    print(cluster_homogeneity(predicted, expected))

0.9922090512260598
0.8824880799443542
0.8824880799443542


### Cluster completeness

In [10]:
# Print cluster_completeness for every input pair in ARGS.
for predicted, expected in ARGS:
    print(cluster_completeness(predicted, expected))

0.9790038456879436
0.7595437291421123
0.7595437291421123


### Cluster v-measure

In [11]:
# Print cluster_v_measure for every input pair in ARGS.
for predicted, expected in ARGS:
    print(cluster_v_measure(predicted, expected))

0.9855622174524317
0.8164132795177796
0.8164132795177796


### Clusters count

In [12]:
# clusters_count takes a single labeling, so only the first element of each
# ARGS pair is passed; the second element is ignored here.
for predicted, _unused_reference in ARGS:
    print(clusters_count(predicted))

10858
11
5
