In [1]:
########### Zotero Tag Cleaner: The Regular Clean ###########


import os
import re
import time
from io import BytesIO

import pandas as pd
import requests
from pyzotero import zotero
pd.set_option('display.max_rows', 1000)



## Set up zotero API


# Library ID and API key are taken from the environment variables
# ZOTERO_LIBRARY_ID / ZOTERO_API_KEY so that they do not have to be stored in
# the notebook. When a variable is not set, the placeholder given as default is
# used instead, so the values can still be filled in by hand here.
library_id = os.environ.get("ZOTERO_LIBRARY_ID", "----")
api_key = os.environ.get("ZOTERO_API_KEY", "----")

# If not a group library, change library type accordingly
library_type = "group"

zot = zotero.Zotero(library_id, library_type, api_key)

In [None]:

### Get Tags ###

In [17]:
        
# Fetch the tags of the library.
all_tags = zot.everything(zot.tags())

# Account for duplicates due to zotero bugs: keep each tag once, in first-seen
# order. Building the frame from a dict also guarantees the "Tag" column exists
# when the library has no tags at all.
tags_df = pd.DataFrame({"Tag": list(dict.fromkeys(all_tags))})


# not in whitelist
# Here, a whitelist of approved tags of our research group is used. Use your own whitelist instead!
# to bypass the google oauth process, the link from the browser window can be copied if sharing is activated

sharing_link = "https://docs.google.com/spreadsheets/d/1WfS1IsmEfS1k9l1fUcxAwz_Jv5o_N47htSFd8upycZ4/edit#gid=0"
export_link = sharing_link.replace('/edit#gid=', '/export?format=csv&gid=')
r = requests.get(export_link)
r.raise_for_status()  # stop here on an HTTP error instead of parsing an error page as CSV
data = r.content

df = pd.read_csv(BytesIO(data))

# Approved keywords, trimmed; rows whose keyword contains "Generic" are left out.
whitelist = df["Keyword"].str.strip()
whitelist = whitelist[~whitelist.str.contains("Generic")].reset_index(drop=True)

tags_df = tags_df[~tags_df.Tag.isin(whitelist)]


# not in generica list
# a list of allowed generica is used here. Such a list will be generated further down for future cleans
# `names` must be list-like; a bare string is rejected by current pandas.
generica = pd.read_csv("Digidem_Generica_List_20190612.csv", names=["g"], header=None)

tags_df = tags_df[~tags_df.Tag.isin(generica.g)].reset_index(drop=True)

print(tags_df)

                                                   Tag
0                                              Kittler
1                                                 Rosa
2                                                Keane
3                                              Gauchet
4                                          Rosanvallon
5                                   Fundamental Rights
6                                      user experience
7                                    content diversity
8                                    longitudinal data
9                                          Filterblase
10                                         Grundrechte
11                                          censorship
12                                      intermediaries
13                                   stereotype threat
14                                          censorware
15                                             twitter
16                                       co-regulation
17        

In [11]:

### Make tags_replacement DF ###

In [18]:
# 1) replacements for upper/lower case typos

# Lookup table: lower-cased whitelist keyword next to its approved spelling.
whitelist_lower = dict(Replacement=whitelist, lower=whitelist.str.lower())
whitelist_lower_df = pd.DataFrame(whitelist_lower)

# Same lower-cased key for the tags that are still left over.
tags_lower = dict(Tag=tags_df["Tag"], lower=tags_df["Tag"].str.lower())
tags_lower_df = pd.DataFrame(tags_lower)

# Left join keeps every tag; tags without a case-insensitive match get NaN.
tags_replacement = (
    tags_lower_df
    .merge(whitelist_lower_df, on="lower", how="left")
    .drop(columns="lower")
)

# drop correct tags (no need to replace!)
needs_change = tags_replacement["Tag"] != tags_replacement["Replacement"]
tags_replacement = tags_replacement[needs_change]

print(tags_replacement)

                                                   Tag     Replacement
0                                              Kittler             NaN
1                                                 Rosa             NaN
2                                                Keane             NaN
3                                              Gauchet             NaN
4                                          Rosanvallon             NaN
5                                   Fundamental Rights             NaN
6                                      user experience             NaN
7                                    content diversity             NaN
8                                    longitudinal data             NaN
9                                          Filterblase             NaN
10                                         Grundrechte             NaN
11                                          censorship             NaN
12                                      intermediaries  Intermediaries
13    

In [19]:
# 2) replacement of synonyms [do not run/rerun without running upper/lower replacement first]

# Keyword/synonym columns of the whitelist sheet; rows whose keyword contains
# "Generic" are excluded and the remaining rows renumbered from 0.
keyword_rows = df.loc[:, ["Keyword", "Synonyme"]]
is_generic_row = keyword_rows["Keyword"].str.contains("Generic")
keyword_rows = keyword_rows[~is_generic_row].reset_index(drop=True)

# Split the comma-separated "Synonyme" cell into one entry per synonym, each
# entry keeping the index of the keyword row it came from.
synonym_per_row = (
    keyword_rows["Synonyme"]
    .str.split(",", expand=True)
    .stack()
    .reset_index(level=1, drop=True)
    .rename("Synonyme")
)
synonyms = keyword_rows.drop(columns="Synonyme").join(synonym_per_row)

# Discard entries that are a single blank, then trim the remaining ones.
synonyms = synonyms[synonyms["Synonyme"] != " "]
synonyms = synonyms.assign(Synonyme=synonyms["Synonyme"].str.strip())

# account for case irregularities: the join key is the lower-cased synonym
synonyms_lower = {
    "Replacement": synonyms["Keyword"],
    "lower": synonyms["Synonyme"].str.lower(),
}
synonyms_lower_df = pd.DataFrame(synonyms_lower)

# expand tags_replacement with the same lower-cased join key
tags_lower_2 = {
    "Tag": tags_replacement["Tag"],
    "lower": tags_replacement["Tag"].str.lower(),
    "Replacement": tags_replacement["Replacement"],
}
tags_lower_2_df = pd.DataFrame(tags_lower_2)

merged = tags_lower_2_df.merge(synonyms_lower_df, on="lower", how="left")

# A replacement found in the upper/lower step wins; otherwise take the keyword
# of the matching synonym.
merged["Replacement_x"] = merged["Replacement_x"].combine_first(merged["Replacement_y"])

tags_replacement = (
    merged
    .drop(columns=["lower", "Replacement_y"])
    .rename(columns={"Replacement_x": "Replacement"})
)

print(tags_replacement)

                                                   Tag     Replacement
0                                              Kittler             NaN
1                                                 Rosa             NaN
2                                                Keane             NaN
3                                              Gauchet             NaN
4                                          Rosanvallon             NaN
5                                   Fundamental Rights    Human Rights
6                                      user experience             NaN
7                                    content diversity             NaN
8                                    longitudinal data             NaN
9                                          Filterblase             NaN
10                                         Grundrechte    Human Rights
11                                          censorship             NaN
12                                      intermediaries  Intermediaries
13    

In [36]:
# 3) auto-check for synonyms within Tags
# [greediest method: any whitelist keyword occurring inside a tag counts as a
#  match. Only tags that still have no replacement are filled in; replacements
#  already in place are left untouched.]

def foo(tag):
    """Return the first whitelist keyword contained in `tag`, ignoring case.

    The keywords are tried in the order of the module-level `whitelist`.
    Returns None when no keyword occurs in the tag.
    """
    tag_lower = tag.lower()
    for keyword in whitelist:
        if keyword.lower() in tag_lower:
            return keyword
    return None


# Assign through a single .loc call on the frame itself. The previous chained
# form (frame["Replacement"].loc[mask] = ...) writes to an intermediate object
# and is not guaranteed to update the frame.
missing = tags_replacement["Replacement"].isnull()
tags_replacement.loc[missing, "Replacement"] = tags_replacement.loc[missing, "Tag"].apply(foo)
print(tags_replacement)




                                                   Tag     Replacement
0                                              Kittler            None
1                                                 Rosa            None
2                                                Keane            None
3                                              Gauchet            None
4                                          Rosanvallon            None
5                                   Fundamental Rights    Human Rights
6                                      user experience            None
7                                    content diversity            None
8                                    longitudinal data            None
9                                          Filterblase            None
10                                         Grundrechte    Human Rights
11                                          censorship            None
12                                      intermediaries  Intermediaries
13    

In [50]:
# 4) export tags_replacement for hand-checking
# NOTE: running this cell again overwrites tags_replacement.xlsx, including any
# hand edits already made to it.

# index=False: the row index carries no information and would otherwise come
# back as an extra "Unnamed: 0" column when the file is read again.
tags_replacement.to_excel("tags_replacement.xlsx", index=False)

In [52]:
# 5) re-import hand-checked list

tags_replacement_edit = pd.read_excel("tags_replacement.xlsx")

# Strip whitespace from the hand-entered replacements BEFORE filtering, so that
# a category label typed as e.g. "author " is still recognised below instead of
# being treated as a real replacement tag.
tags_replacement_edit["Replacement"] = tags_replacement_edit["Replacement"].str.strip()

# drop generica (authors, theories, technologies, policy fields stay as they are)
generica_labels = [
    "author",
    "central_actor",
    "concrete_tech",
    "method",
    "policy_field",
    "theory/school",
]
is_generica = tags_replacement_edit["Replacement"].isin(generica_labels)
tags_replacement_edit = tags_replacement_edit[~is_generica].reset_index(drop=True)

print(tags_replacement_edit)

                                                  Tag            Replacement
0                                  Fundamental Rights           Human Rights
1                                     user experience                    NaN
2                                   content diversity                    NaN
3                                   longitudinal data           Panel Survey
4                                         Filterblase          Fragmentation
5                                         Grundrechte           Human Rights
6                                          censorship     Power & Domination
7                                      intermediaries         Intermediaries
8                                   stereotype threat                    NaN
9                                          censorware                    NaN
10                                            twitter                Twitter
11                                      co-regulation             Regulation

In [None]:

### Loop through entries of library and replace Tags ###

In [53]:
# update zot - just in case
zot = zotero.Zotero(library_id, library_type, api_key)

# replace tags: only rows that actually have a replacement are processed
has_replacement = tags_replacement_edit["Replacement"].notnull()
replace = tags_replacement_edit.loc[has_replacement]

for old_tag, new_tag in zip(replace["Tag"], replace["Replacement"]):
    # every item currently carrying the old tag
    tagged_items = zot.everything(zot.items(tag=old_tag))
    # one call per item, so the 50 items-per-call limit is not a concern here
    for entry in tagged_items:
        print("Replace: %s -> %s" % (old_tag, new_tag))
        zot.add_tags(entry, new_tag)
    # short rest between tags to minimize API errors
    time.sleep(2)

Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Interme

Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Interme

Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Interme

Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Intermediaries
Replace: intermediaries -> Interme

In [54]:
# delete old Tags (including the ones with replacement)

delete = tags_replacement_edit["Tag"].tolist()

# report the tags that get deleted without any replacement
lacks_replacement = tags_replacement_edit["Replacement"].isnull()
for dropped_tag in tags_replacement_edit.loc[lacks_replacement, "Tag"]:
    print("Delete: %s" % dropped_tag)


# split deletion of tags into chunks of 50 to adhere to the 50-tags-deleted-per-call API limit
def chunker(seq, size):
    """Lazily yield consecutive slices of `seq`, each at most `size` long.

    The last slice is shorter when len(seq) is not a multiple of `size`.
    """
    offsets = range(0, len(seq), size)
    return (seq[start:start + size] for start in offsets)
# one delete call per batch of at most 50 tags
for tag_batch in chunker(delete, 50):
    zot.delete_tags(*tag_batch)

Delete: user experience
Delete: content diversity
Delete: stereotype threat
Delete: censorware
Delete: Reports ; Internet ; Mass Phenomena; Communication ; Article;
Delete: freedom of speech
Delete: Computer Science - Social and Information Networks
Delete: accountability
Delete: ideology
Delete: search
Delete: bing
Delete: Social sciences in mass media.
Delete: Social sciences.
Delete: Political Science / Censorship
Delete: Belief Gap
Delete: Political Science / Political Ideologies / Democracy
Delete: Political Science / Political Process / Media & Internet
Delete: Communication.
Delete: Misperceptions
Delete: Political Science / Public Policy / General
Delete: Information Deficit
Delete: Media Sociology.
Delete: Popular Science.
Delete: Popular Social Sciences.
Delete: Popular works.
Delete: information cocoon
Delete: self-selected
Delete: Democracy
Delete: Political Discourse
Delete: Hypertext
Delete: Selective Exposure
Delete: Design
Delete: Selective exposure
Delete: manipulation

In [None]:

### Check Back ###

In [59]:
        
# Fetch the tags of the library again, after replacement and deletion.
all_tags = zot.everything(zot.tags())

# Account for duplicates due to zotero bugs: keep each tag once, in first-seen
# order. Building the frame from a dict also guarantees the "Tag" column exists
# when no tags are returned.
tags_df = pd.DataFrame({"Tag": list(dict.fromkeys(all_tags))})


# not in whitelist

# NOTE(review): this spreadsheet ID differs from the one used in the "Get Tags"
# cell at the top of the notebook - confirm which whitelist is the current one.
sharing_link = "https://docs.google.com/spreadsheets/d/18n09G6fduidnWz0wzP1nWqMbXJW-AFkhL-kie-zZIYM/edit#gid=0"
export_link = sharing_link.replace('/edit#gid=', '/export?format=csv&gid=')
r = requests.get(export_link)
r.raise_for_status()  # stop here on an HTTP error instead of parsing an error page as CSV
data = r.content

df = pd.read_csv(BytesIO(data))

# Approved keywords, trimmed; rows whose keyword contains "Generic" are left out.
whitelist = df["Keyword"].str.strip()
whitelist = whitelist[~whitelist.str.contains("Generic")].reset_index(drop=True)

tags_df = tags_df[~tags_df.Tag.isin(whitelist)]


# not in generica list

# `names` must be list-like; a bare string is rejected by current pandas.
generica = pd.read_csv("Digidem_Generica_List_20190612.csv", names=["g"], header=None)

tags_df = tags_df[~tags_df.Tag.isin(generica.g)].reset_index(drop=True)

print(tags_df)

           Tag
0      Kittler
1         Rosa
2        Keane
3      Gauchet
4  Rosanvallon
5       Butler


In [None]:

### Add new Generica ###

In [62]:
# Re-read the hand-checked sheet and trim the hand-entered replacement labels.
checked_sheet = pd.read_excel("tags_replacement.xlsx")
checked_sheet = checked_sheet.assign(Replacement=checked_sheet["Replacement"].str.strip())

# get generica (authors, theories, technologies, policy fields stay as they are):
# rows whose replacement is one of the category labels rather than a real tag
generica_labels = [
    "author",
    "central_actor",
    "concrete_tech",
    "method",
    "policy_field",
    "theory/school",
]
is_generica = checked_sheet["Replacement"].isin(generica_labels)
new_generica = checked_sheet[is_generica].reset_index(drop=True)

print(new_generica["Tag"])

0        Kittler
1           Rosa
2          Keane
3        Gauchet
4    Rosanvallon
5         Butler
Name: Tag, dtype: object


In [77]:
# Existing generica followed by the newly classified ones, renumbered from 0.
# Series.append was removed in pandas 2.0; pd.concat gives the same result and
# is available in old and new pandas versions alike.
generica_update = pd.concat([generica["g"], new_generica["Tag"]], ignore_index=True)
print(generica_update)



0                              Adorno
1                              Anders
2                           Appelbaum
3                              Arendt
4                               Bacon
5                               Barad
6                                Beck
7                            Benhabib
8                            Benjamin
9                             Bentham
10                             Berger
11                          Boltanski
12                           Bourdieu
13                              Buber
14                              Cohen
15                            Coleman
16                           Dahlgren
17                         Dahrendorf
18                             Debord
19                            Deleuze
20                            Derrida
21                              Dewey
22                            Drexler
23                            Dworkin
24                              Elias
25                           Foucault
26          

In [78]:
# Date stamp (YYYYMMDD) for the file name of the updated generica list.
date = time.strftime("%Y%m%d")

# Write a bare one-column file without index and without header row. The list
# is read back with pd.read_csv(..., header=None), so a header line would be
# picked up as if it were a tag.
generica_update.to_csv("Digidem_Generica_List_" + date + ".csv", index=False, header=False)