# Notebook 4.3 - Actors curation: duplicates

This notebook gathers several checks that can be run together or independently of each other. Together, these checks help moderators curate duplicated actors in the SSH Open Marketplace. 

This notebook is composed of 3 sections:

0. Requirements to run this notebook
1. Get actors 
2. Duplicated actors 
    2.1 Get duplicates for actors
    2.2 Compare duplicated actors
    2.3 Merge duplicated actors


## 0 Requirements to run this notebook

This section gives all the relevant information to "interact" with the MP data.

### 0.1 libraries
*There are a number of external libraries needed to run the notebook* 

*Furthermore, a dedicated SSH Open Marketplace library - sshmarketplacelib - with customised functions has been created and can be imported using the python import commands.* 

*Below are the library imports needed to run this notebook.*

In [None]:
import pandas as pd #to manage dataframes
#import matplotlib.pyplot as plt #to create histograms and images
#import seaborn as sns #to create histograms and images
import numpy as np #to manage json objects
#import the MarketPlace Library 
from sshmarketplacelib import MPData as mpd
from sshmarketplacelib import  eval as eva, helper as hel

In [None]:
# Instantiate the Marketplace helper objects shared by the cells below.
mpdata = mpd()  # used below to fetch items/actors and to post actor merges
utils=hel.Util()  # provides getDuplicates and lists_to_list, used in section 2
check=eva.URLCheck()  # not referenced by any other cell visible in this notebook

In [None]:
# Fetch every item category from the Marketplace, one dataframe per category.
# The second argument (True) presumably requests the flattened representation - TODO confirm
# against sshmarketplacelib. None of these frames is referenced by a later cell visible in
# this notebook.
df_tool_flat =mpdata.getMPItems ("toolsandservices", True)
df_publication_flat =mpdata.getMPItems ("publications", True)
df_trainingmaterials_flat =mpdata.getMPItems ("trainingmaterials", True)
df_workflows_flat =mpdata.getMPItems ("workflows", True)
df_datasets_flat =mpdata.getMPItems ("datasets", True)

## 1. Get actors

In [None]:
# Fetch all actors; this dataframe is the input of the duplicate detection in section 2.
df_actors_flat =mpdata.getMPItems ("actors", True)

In [None]:
# Preview the last rows of the actors dataframe.
df_actors_flat.tail()

## 2. Duplicated actors
    2.1 Get duplicates for actors using *actor.name* and *actor.website* as filters
    2.2 Compare duplicated actors (optional)
    2.3 Merge duplicated actors

### 2.1 Get duplicates for actors using *actor.name* and *actor.website* as filter

In [None]:
# Detect duplicated actors on the (name, website) pair, then keep only the
# duplicates that actually have a website, ordered by name.
utils = hel.Util()
filter_attribute = 'name, website'
df_actor_duplicates = utils.getDuplicates(df_actors_flat, filter_attribute)
has_website = df_actor_duplicates['website'].notnull()
dupl_actor_website = df_actor_duplicates[has_website].sort_values('name')

In [None]:
# Report how many actor rows are duplicates (one row per duplicated actor, not per group).
print (f'Using the attributes "{filter_attribute}" as filter, there are: {dupl_actor_website.shape[0]} duplicated actors')

In [None]:
# One row per (name, website) group; 'idstobemerged' holds the list of actor ids in the group.
ids_per_group = dupl_actor_website.groupby(['name', 'website'])['id'].apply(list)
actorwebsite_tomerge = ids_per_group.reset_index(name='idstobemerged')

In [None]:
# Number of duplicate groups (non-null values per column).
actorwebsite_tomerge.count()

In [None]:
# Number of groups holding more than two ids, i.e. actors with more than one duplicate
has_multiple_duplicates = actorwebsite_tomerge['idstobemerged'].map(len) > 2
actorwebsite_tomerge[has_multiple_duplicates].count()

#### _Identifying actors with the exact same name that were never attached to any items: they can be merged without individual investigation. Example: 1665 and 3210_

In [None]:
# Collect the duplicate groups in which none of the actors is attached to any item.
# Groups are remembered by their row label in actorwebsite_tomerge rather than looked up
# again by name, so two groups sharing a name but not a website are never mixed up.
empty_group_labels = []
for item in actorwebsite_tomerge.itertuples():
    # all() stops at the first actor that has items, which spares needless API calls
    if all(mpdata.getItemsforActor(str(actorid)).empty for actorid in item.idstobemerged):
        empty_group_labels.append(item.Index)

# Selecting by label (instead of concatenating inside the loop) keeps the
# name / website / idstobemerged columns even when no group qualifies, so the
# merges on ['name', 'website'] in the following cells still find their keys.
df_actors_empty_items = actorwebsite_tomerge.loc[empty_group_labels]

In [None]:
# Number of duplicate groups whose actors have no attached items.
df_actors_empty_items.count()

In [None]:
# Anti-join: keep the duplicate groups that are NOT in df_actors_empty_items,
# i.e. the groups in which at least one actor is attached to an item.
dfr = (
    pd.merge(
        actorwebsite_tomerge,
        df_actors_empty_items,
        on=['name', 'website'],
        how="outer",
        indicator=True,
    )
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .rename(columns={'idstobemerged_x': 'idstobemerged'})
)
dfr[['name', 'website', 'idstobemerged']].head()

#### _Different actors with the exact same name that were never attached to the same items => comparison step and further investigation needed before deciding if merging or not._

In [None]:
# Flag the duplicate groups whose actors are not all attached to the same items.
# The items of the first actor of a group act as the reference set; the group is
# flagged as soon as a later actor has an item that the reference set lacks.
# Groups are remembered by their row label in actorwebsite_tomerge rather than looked up
# again by name, so two groups sharing a name but not a website are never mixed up.
# NOTE(review): the comparison is one-directional - items of the first actor that are
# missing from the later actors do not flag the group. Confirm this is intended.
different_group_labels = []
for item in actorwebsite_tomerge.itertuples():
    if len(item.idstobemerged) < 2:
        continue
    tempdf_sn = mpdata.getItemsforActor(str(item.idstobemerged[0])).drop_duplicates('persistentId', keep='first')
    for actorid in item.idstobemerged[1:]:
        actitems = mpdata.getItemsforActor(str(actorid)).drop_duplicates('persistentId', keep='first')
        if actitems.empty:
            # an actor without items cannot contribute an item missing from the reference set
            continue
        if tempdf_sn.empty or not actitems['persistentId'].isin(tempdf_sn['persistentId']).all():
            different_group_labels.append(item.Index)
            break

# Selecting by label (instead of concatenating inside the loop) keeps the
# name / website / idstobemerged columns even when no group is flagged, so the
# merge on ['name', 'website'] in a following cell still finds its keys.
df_actors_with_different_items = actorwebsite_tomerge.loc[different_group_labels]


In [None]:
# Preview the groups flagged as attached to different items.
df_actors_with_different_items.head()

#### _Identifying actors with the exact same name that are attached to the same items: the groups that are neither item-less nor flagged as attached to different items._

In [None]:
# Step 1 - anti-join against the item-less groups: dfni keeps the groups in which
# at least one actor is attached to an item.
dfni = pd.merge(
    actorwebsite_tomerge,
    df_actors_empty_items,
    on=['name', 'website'],
    how="outer",
    indicator=True,
)
dfni = dfni[dfni['_merge'] == 'left_only']

# Step 2 - anti-join against the groups flagged as having different items: what is
# left are the groups whose actors are attached to the same items.
same_items_candidates = pd.merge(
    dfni[['name', 'website', 'idstobemerged_x']],
    df_actors_with_different_items,
    on=['name', 'website'],
    how="outer",
    indicator=True,
)
df_actors_with_same_items = (
    same_items_candidates
    .loc[same_items_candidates['_merge'] == 'left_only', ['name', 'website', 'idstobemerged_x']]
    .rename(columns={'idstobemerged_x': 'idstobemerged'})
)
df_actors_with_same_items.head()

### 2.2 Compare duplicated actors

In [None]:
#id of duplicated actors
ids=[1665, 3210]
compareitems=df_actor_duplicates[df_actor_duplicates.id.isin(ids)]

In [None]:
# Display the selected duplicated actors, one row per actor.
compareitems

In [None]:
# Cell styles for the comparison table: css_equal is applied to the attributes for which
# the styling cell finds a single value, css_diff (light yellow background) to the others.
css_equal="font-size:1.5rem; border: 2px solid silver;background-color: white; padding: 10px 20px"
css_diff="background-color: lightyellow;  font-size:1.5rem; border: 2px solid silver; padding: 10px 20px"

In [None]:
#view items
showdiff = compareitems.T.style.apply(lambda x: [css_equal if ((len(utils.lists_to_list(x.values))==1) ) else css_diff for i in x],
                    axis=1)
showdiff

### 2.3 Merge duplicated actors

POST /api/actors/{id}/merge


In [None]:
# Merge every duplicate group: the first id of a group is the merge target, the
# remaining ids are the actors merged into it.
# NOTE(review): this loops over ALL groups of actorwebsite_tomerge, including those that
# the previous cells flag as needing investigation (df_actors_with_different_items).
# Consider iterating over a vetted subset instead - confirm before running.
#mpdata.postMergedActors('2505', '2266')
for item in actorwebsite_tomerge.itertuples():
    target_id, *duplicate_ids = (str(actorid) for actorid in item.idstobemerged)
    if not duplicate_ids:
        # nothing to merge into the target
        continue
    # Pass plain ids ("3210" or "3210,4587") as in the example call above, not the
    # repr of a Python list ("[3210]").
    # Assumes postMergedActors forwards this value as the ids to merge with - TODO confirm
    merge_ids = ','.join(duplicate_ids)
    print(target_id, merge_ids)
    mpdata.postMergedActors(target_id, merge_ids)