In [1]:
import csv
import html
import re
import unicodedata

import numpy as np
import pandas as pd
import plotly.express as px
from abydos.fingerprint import QGram, String
from gensim.parsing.preprocessing import remove_stopwords
from Levenshtein import distance


# Load dataset

<div class="alert alert-block alert-info">
<b>GHP:</b> 
I'm adding a column to the dataframe to help generate a concordance that we can then use during data migrations to transform the tags as necessary. The idea is for target to have one of three possible values:<br/>
<ul>
<li><b>None</b>: delete this tag</li>
<li><b>integer</b>: a database <b>id</b>, i.e. replace tag with supplied id</li>
<li><b>string</b>: the actual name of the tag, whether original or corrected</li>
</ul>
<br/>
</div>

In [19]:
# load data
dataset = pd.read_csv('islamiclawblog_tags_categories_convert.csv')
dataset = dataset.replace({np.nan: None})
# insert column to keep track of targets
dataset['target'] = None

dataset.head()


Unnamed: 0,id,name,slug,description,taxonomy,parent_id,parent,count,target
0,1605,AAOIFI,aaoifi,,category,1454,Islamic Finance,3,
1,1358,Academic freedom,academic-freedom,,category,0,,6,
2,1606,ACLU,aclu,,category,1587,U.S.,2,
3,1607,administrative state,administrative-state,,category,1566,State's role,9,
4,1359,Afghanistan,afghanistan,,category,0,,143,


# Fix HTML-escaped characters

<div class="alert alert-block alert-info">
<b>GHP:</b> I figured we might as well just get this out of the way (I don't think we want HTML encoding in tags anyway).</div>

In [20]:
sample = dataset[dataset['name'].str.contains('&a')].iloc[0:5]
display(sample)

for col in ['name', 'parent']:
    dataset[col] = [html.unescape(i) if i else i for i in dataset[col]]

print('\nRESULT:')
display(dataset.iloc[sample.index])


Unnamed: 0,id,name,slug,description,taxonomy,parent_id,parent,count,target
61,3900,Courts &amp; Canons,courts-and-canons,Courts&amp;Canons (CnC) is a suite of AI and o...,category,0,,12,
164,1456,Islamic Law &amp; Law of the Muslim World eJou...,islamic-law-law-of-the-muslim-world-ejournal,,category,0,,19,
403,4910,2022 Global Meeting on Law &amp; Society,2022-global-meeting-on-law-society,,post_tag,0,,1,
408,4512,32 B.C.J.L. &amp; Soc. Just. (2012),32-b-c-j-l-soc-just-2012,,post_tag,0,,1,
412,5291,7th Global Meeting on Law &amp; Society,7th-global-meeting-on-law-society,,post_tag,0,,1,



RESULT:


Unnamed: 0,id,name,slug,description,taxonomy,parent_id,parent,count,target
61,3900,Courts & Canons,courts-and-canons,Courts&amp;Canons (CnC) is a suite of AI and o...,category,0,,12,
164,1456,Islamic Law & Law of the Muslim World eJournal,islamic-law-law-of-the-muslim-world-ejournal,,category,0,,19,
403,4910,2022 Global Meeting on Law & Society,2022-global-meeting-on-law-society,,post_tag,0,,1,
408,4512,32 B.C.J.L. & Soc. Just. (2012),32-b-c-j-l-soc-just-2012,,post_tag,0,,1,
412,5291,7th Global Meeting on Law & Society,7th-global-meeting-on-law-society,,post_tag,0,,1,


# Find duplicates

In [21]:
tags = list(dataset['name'])
u_tags = list(set(tags))

print(f'Total tags: {len(tags)}')
print(f'Unique tags: {len(u_tags)}')
print(f'Duplicate tags: {len(tags) -len(u_tags)}\n')

dupes = [i for i in u_tags if tags.count(i) > 1]
dupes.sort()
print(', '.join(dupes))
print(f'\nUnique duplicates: {len(dupes)} (some may repeat more than once)')


Total tags: 6552
Unique tags: 6391
Duplicate tags: 161

AAOIFI, ACLU, Afghanistan, Africa, Australia, Bahrain, Bangladesh, Belgium, Brazil, Brunei, Canada, China, Courts & Canons, Denmark, Djibouti, Egypt, Establishment Clause, Ethiopia, Europe, European Union, Federal Shariat Court, Field Guide, Field Guide Roundup, First Amendment, France, Germany, Governance, Greece, Guardianship, ISIS, India, Indian Supreme Court, Indonesia, Iran, Iraq, Islamic Law Lexicon, Islamic Law Teaching, Islamic constitutionalism, Islamic criminal law, Islamic jurisprudence, Islamic law, Islamic legal authority, Islamic legal history, Islamic legal theory, Islamic marriage, Islamization, Italy, Japan, Jordan, Journal of Islamic Law, Kashmir, Kenya, Khizr Khan, Kuwait, Lebanon, Libya, Malaysia, Maldives, Mali, Morocco, Muhammad Ali, Muslim minorities, Māwardī, New Zealand, News, News mentions of Covid-19 and Islamic Law, Oman, Ottoman Empire, PIL News, Pakistan, Philippines, Prophet Muḥammad, Qarāfī, Qatar, 

In [22]:
# resolve duplicates by assigning first occurrence id to all others
for d in dupes:
    filtered = dataset.loc[dataset['name'] == d]
    id_list = list(filtered['id'])
    dataset.loc[filtered.index, 'target'] = min(id_list)

tags = list(dataset.loc[dataset['target'].isnull()]['name'])
u_tags = list(set(tags))

print(f'Total tags: {len(tags)}')
print(f'Unique tags: {len(u_tags)}')
print(f'Duplicate tags: {len(tags) -len(u_tags)}\n')

dataset.head()


Total tags: 6234
Unique tags: 6234
Duplicate tags: 0



Unnamed: 0,id,name,slug,description,taxonomy,parent_id,parent,count,target
0,1605,AAOIFI,aaoifi,,category,1454,Islamic Finance,3,1605.0
1,1358,Academic freedom,academic-freedom,,category,0,,6,
2,1606,ACLU,aclu,,category,1587,U.S.,2,1606.0
3,1607,administrative state,administrative-state,,category,1566,State's role,9,
4,1359,Afghanistan,afghanistan,,category,0,,143,1359.0


# Find occurences of commas, semicolon, slash, other delimiters

In [23]:
tags = list(dataset.loc[dataset['target'].isnull()]['name'])
tags_ugly_str = ''.join(tags)
print(''.join(set(re.findall(r'\W', tags_ugly_str))))

"'[῾–+-](̣’?,/: ̄&͑.​ )#=̲‘`̈


Tentatively, &, , ;, ?, _, +, and / may be of interest

In [24]:
poss_breakers = r'[,&;\?\+_/]'
poss_mult_tags = []

for t in tags:
    if(re.search(poss_breakers, t)):
        poss_mult_tags.append(t)

for e in poss_mult_tags:
    print (e)

Central Bank of Morocco/Bank al-Maghrib
Commanding right/forbidding wrong
gharar (indefiniteness / risk)
hadith/Ḥadīth
ḥijāb / headscarf debates
ḥudūd / hudood
ISIS/Islamic State
Islamic Law & Law of the Muslim World eJournal
Science, Technology, and the Law
talaq/ṭalāq
United Arab Emirates/UAE
2022 Global Meeting on Law & Society
32 B.C.J.L. & Soc. Just. (2012)
7th Global Meeting on Law & Society
9/11
Cases & Fatwās
Danial Latifi & Anr v. Union of India
Dubai Financial Services Authority/DFSA
Dubai International Finance Centre/DIFC
Flora & Noor
Ghuman & Safi v. State
Global Imams & Scholars Network
Javed & Ors. v. State of Haryana & Ors.
Journal of Church & State
K&L Gates LLP
Law & Society: International & Comparative Law eJournal
Legal Databases by Subject: Religion & Law
LGBTQI+ rights
LSEG Data & Analytics: Islamic market data
Middle East & Islamic Studies Databases for Research: Islamic Studies Databases
Program in Law & History
Program on Law & Society in the Muslim World
Pursui

Proposals:

- ~~Standardize `&amp;` to &~~
- ~~Standardize `&#039;` to '~~
- Delete `,` in all tags with it, possibly to be replaced by - or just a space
- For slashes:
  - When two forms (scientific / non-scientific transliteration, transliteration / TL, multiple TLs), prioritize scientific transliteration
  - For states: TBD based on most common
  - When one is an abbreviation, prioritize non-abbreviated form or split
  - Delete tag `void/non-marriage`
  - Keep all others

In the future, prohibit `,` in tags and encourage just using scientific transliterations, no TL or non-scientific ones in same tag

**GHP**:
I agree with the recommendations. Lots of rubbish here.

- `William & Marry` -> I'm guessing `William and Mary`
- `S&P 500 Shariah` -> Should these be separate tags?
- `Middle East & Islamic Studies Databases for Research: Islamic Studies Databases` -> Are these sentence-long tags even useful?



# Find repeats using string transforms

<div class="alert alert-block alert-info">
<b>GHP:</b>I expanded your approach here with a few more transformations that seemed useful to me, namely full-on string fingerprinting, stop word removal, and bi-gram fingerprinting. The results are grouped into "merge clusters", like you had them, but I added some code to abstract them at the level of ids to make it easier to apply the merges. I also pulled some stats to see how the different approaches perform just for fun.

PS: If you haven't encountered the [Abydos package](https://abydos.readthedocs.io/en/latest/abydos.distance.html) before, it's totally worth a look!</div>



In [8]:
sf = String()
qg = QGram()

def normalize(tag):
    """Return normalized tag (ascii encoded and lowercase)."""
    return unicodedata.normalize('NFKD', tag).encode('ascii', 'ignore').lower()

def fingerprint(tag):
    """Return string fingerprint of tag."""
    tag = sf.fingerprint(tag)
    return remove_stopwords(tag) # also remove stop words

def ngram(tag):
    """Return bigram fingerprint of tag."""
    return qg.fingerprint(tag)

# I wrote the code in the rest of this section to be very flexible
# so, if you think of any other transforms you want to try, just write a function to apply it, as above
# and add it to the list below. Everything else should just work!
transforms = [
    ('Normalize', normalize),
    ('Fingerprint', fingerprint),
    ('N-Gram', ngram),
]

def apply_transforms(tag_tuple):
    """Return tuple consisting of tag id, tag name, and transformed versions of tag as defined in `transforms`."""
    if not tag_tuple or not isinstance(tag_tuple, tuple):
        return [None] * (len(transforms) + 2)

    # we include the result AND the name of the transform in a tuple (the latter is for statistical purposes)
    return (*tag_tuple, *[(i[0], i[1](tag_tuple[1])) for i in transforms])


In [10]:
# get unprocessed tags as (id, name) tuples
tags = list(dataset.loc[dataset['target'].isnull()][['id', 'name']].itertuples(index=False, name=None))
pairs = [] # to store pairs of matched tags
clusters = [] # to store groups of tags matched to each other
methods = {} # to keep track of which methods match what for stats
transformed_tuples = [apply_transforms(i) for i in tags]

# compute basic statistics
for i, transform in enumerate(transforms):
    transformed_tags = [t[i+2] for t in transformed_tuples]
    u_transformed_tags = list(set(transformed_tags))
    print(f'{transform[0]} matched {len(transformed_tags)-len(u_transformed_tags)} tags.')

# generate matched pairs
for tag in transformed_tuples:
    test_list = [i for i in transformed_tuples if i[0] != tag[0]]  # remove current tag from contention
    for test in test_list:
        # if they are the same tag, then at least one of the transforms should match
        test_set = set(test[2:]).intersection(tag[2:])
        if test_set:
            # we store pairs of matched tags as sets because sets
            # are unordered, so we can test for their presence regardless of which tag comes first
            result = {tag[0], test[0]}
            for t in test_set:
                if result not in pairs:
                    pairs.append(result)
                if tuple(result) in methods:
                    methods[tuple(result)].append(t[0])
                else:
                    methods[tuple(result)] = [t[0]]

# cluster matched pairs
for p in pairs:
    for c in clusters:
        if c.intersection(p):
            c.update(p)
            break
    else:
        clusters.append(p)

print(f'Grouped {len(set().union(*clusters))} tags into {len(clusters)} merge clusters.')

# compile stats
stats = []
t_list = [i[0] for i in transforms]
for t in t_list:
    matching_pairs = [v for _, v in methods.items() if t in v]
    rest = t_list.copy()
    rest.remove(t)
    overlaps = len([i for i in matching_pairs if set(rest).intersection(i)])
    stats.append(['Overlaps', t, overlaps])
    stats.append(['Unique IDs', t, len(matching_pairs) - overlaps])

stats_df = pd.DataFrame(stats)
stats_df.columns =['Type', 'Transform', 'Count']

# show contribution of each transform
fig = px.bar(
    stats_df,
    x='Transform',
    y='Count',
    color='Type',
    title='Unique Contribution and Overlap per Transform',
)
fig.show()



Normalize matched 82 tags.
Fingerprint matched 170 tags.
N-Gram matched 97 tags.
Grouped 355 tags into 165 merge clusters.


In [11]:
# show proposed merge clusters
for c in clusters:
    tags = list(dataset.loc[dataset.id.isin(c)]['name'])
    print(f'{c}: {", ".join(tags)}')


{6462, 1358}: Academic freedom, academic freedom
{1362, 2607}: Anglo-Muhammadan Law, Anglo Muhammadan law
{1617, 7283}: Call for papers, Call for Papers
{1385, 1741}: Civil rights, Civil Rights
{1387, 6886}: Clerics, clerics
{2465, 1388}: Comparative law, comparative law
{6841, 2972, 1391}: Constitutional law, constitutional law, U.S. constitutional law
{1392, 1746}: constitutions, constitution
{1393, 7303}: Contract law, contract law
{5225, 1397}: Courts, courts
{2624, 1398}: Criminal law, criminal law
{1625, 2605, 5943}: darul-qaza, dar-ul-qaza system, Darul Qaza
{3409, 1626}: Data, data
{2728, 1404}: Digital Humanities, digital humanities
{4457, 1405}: digital Islamic humanities, Islamic digital humanities
{2929, 1406}: Digital Islamic law, Digital Islamic Law
{2816, 2817}: Digital Islamic Law Collection, digital islamic law collection
{2837, 2606}: Digital Islamic Law Lab, digital islamic law lab
{2452, 1629}: divorce, Divorce
{4276, 1413}: Employment law, employment law
{4433, 141

## Tentative Proposals:

- Use scientific transliteration (exact schema TBD, I recommend marking long vowels, emphatic h, s, t, d, z, q for ق, th for ث, sh for ش, mark 'ayn and final hamza but not initial) except for "Islam" and "Muhammad" standardize all forms to above
    - When multiple unicode renderings of same exist, standardize them all to NFC
    - Same for other loans like laïcité

- Standardize lower-case except for proper nouns, or just make all lower-case
- Can use above dict, method, or output to identify and collapse these

# Identifying possible repeats with Levenshtein distance

In [27]:
# Use normalized set to minimize other repeats
tags = list(dataset.loc[dataset['target'].isnull()]['name'])
normalized = [normalize(i) for i in tags]

doubles = {} #K is a tag, V is list of tags within one LD of it

i = 0
while i < len(normalized) - 1:
    j = i + 1
    while j < len(normalized):
        dist = distance(normalized[i], normalized[j])
        if dist <= 1:
            if(doubles.get(normalized[i])):
                doubles[normalized[i]].append(normalized[j])
            else:
                doubles[normalized[i]] = [normalized[j]]
            if(doubles.get(normalized[j])):
                doubles[normalized[j]].append(normalized[i])
            else:
                doubles[normalized[j]] = [normalized[i]]
        j += 1
    i+= 1

print(len(doubles.keys()))

for k, v in doubles.items():
    print(k)
    print(f"\t{', '.join([str(var) for var in v])}")

502
b'academic freedom'
	b'academic freedom', b'academic freedom'
b'anglo-muhammadan law'
	b'anglo muhammadan law'
b'anglo muhammadan law'
	b'anglo-muhammadan law'
b'call for papers'
	b'call for papers', b'call for papers'
b'civil rights'
	b'civil rights', b'civil rights'
b'clerics'
	b'clerics', b'clerics'
b'comparative law'
	b'comparative law', b'comparative law'
b'conferences'
	b'conference'
b'conference'
	b'conferences'
b'constitutional law'
	b'constitutional law', b'constitutional law'
b'constitutions'
	b'constitution'
b'constitution'
	b'constitutions'
b'contract law'
	b'contract law', b'contract law'
b'country profile'
	b'country profiles'
b'country profiles'
	b'country profile'
b'courts'
	b'courts', b'courts'
b'criminal law'
	b'criminal law', b'criminal law'
b'darul-qaza'
	b'darul qaza'
b'darul qaza'
	b'darul-qaza'
b'data'
	b'data', b'data', b'hata', b'hata'
b'hata'
	b'data', b'data'
b'digital humanities'
	b'digital humanities', b'digital humanities'
b'digital islamic law'
	b'dig

I'll write the above into a .CSV and manually decide on the rest.

In [None]:
with open('ILB Tags_Normalized_LD 1.tsv', 'w') as f:
    w = csv.writer(f, delimiter='\t')
    for k, v in doubles.items():
        w.writerow([str(k, 'utf-8')] + [str(a, 'utf-8') for a in v])

# Analyzing tags that differ by one word

We can also use "word-level" Lev distance to identify potential overlaps and typos. We'll repeat the above process for this and write a new .tsv.

We'll start with an adaptation of the SeqAlign method I've been developing.

In [None]:
def global_align_output(s, words1, words2, tolerance = 1, match = 8, mismatch = -8, ins = -5, delete = -5):

    alignments = []

    i = len(words1)
    j = len(words2)

    while i > 0 or j > 0:
    
        if i == 0 and j > 0:
            while j > 0:
                alignments += [("--", j-1, 'D')]
                j = j-1
            continue
        if j == 0 and i > 0:
            while i > 0:
                alignments += [(i-1, "--", 'D')]
                i = i - 1
            continue
        
        if distance(words2[j-1], words1[i-1]) <= tolerance:
            #print(f"Match found between{words2[j-1]} and {words1[i-1]}")
            score = match
        else:
            score = mismatch

        if score == match and (s[i][j] == s[i-1][j-1] + score):
            alignments += [(i-1, j-1, 'M')]
            i = i - 1
            j = j - 1
        
        elif s[i][j] == s[i-1][j] + ins:
            alignments += [(i-1, "--", 'D')]
            i = i - 1
        
        elif s[i][j] == s[i][j-1] + delete:
            alignments += [("--", j-1, 'D')]
            j = j - 1
        else:
            alignments += [(i-1, j-1, 'N')]
            i = i - 1
            j = j - 1
    alignments.reverse()
    return alignments

def seq_align_matrix_no_merge (words1, words2, tolerance = 1, match = 8, mismatch = -8, ins = -5, delete = -5):

    s = np.zeros((len(words1)+1, len(words2)+1))
    for j in range(1, len(words2)+1):
        s[0][j] = ins*j
    i = 1
    while i < len(words1)+1:
        s[i][0] = delete*i
        j = 1
        while j < len(words2)+1:

            cur_tolerance = tolerance
            if max(len(words2[j-1]), len(words1[i-1])) < cur_tolerance:
                cur_tolerance = 0

            cur_distance = distance(words2[j-1], words1[i-1])

            if cur_distance <= cur_tolerance:
                score = match
            else:
                score = mismatch

            s[i][j] = max((s[i-1][j-1] + score), (s[i-1][j] + delete), (s[i][j-1] + ins))
            j += 1

        #print(i)
        i += 1
    
    #print(alignments)

    #for a in alignments:
        #print(f'{words1[a[0]] if type(a[0]) != str else "--"} / {words2[a[1]] if type(a[1]) != str else "--"}')

    return (s)

def get_distance(output):
    diff_string = "".join([a[2] for a in output])
    diff_string = re.sub('M', "", diff_string)
    return(len(diff_string))


# Additional rules based on survey of Lev. distance stuff

* When two forms differ by only one or two letters because one is a plural, and the other a common noun, prioritize the singular
  * Exception: "rights"
* ta marbutaa should consistently be transliterated as -a, not -ah or -at, unless the other form is in very widespread usage in English
* Go with English versions of countries
* Avoid hyphenated English compound words when possible, but choose hyphens over spaces or underscores
* US Spelling unless CLEARLY used in the name of an organization itself
* Names should be how the namee is known to transliterate it, otherwise scientific transliteration
* No hashtags

Word-level distance led to more false equivalences but here are some rules I implemented:
* If both versions of a name are in the DB, prioritize the one with a middle initial
* Parenthetical abbreviations should not be included. Parenthetical disambiguations should be included only when absolutely necessary

# Visualize tag hierarchy

<div class="alert alert-block alert-info">
<b>GHP:</b> I can't remember whether we need to worry about the tag nesting/hierarchy or not, but just in case, I threw together a quick treemap chart to help us visualize it (it's dynamic, you can click on things).</div>

In [None]:
nested = dataset.loc[dataset['parent'].isnull() == False][['parent', 'name']]  # noqa: E712

fig = px.treemap(nested, path=[px.Constant('all'), 'parent', 'name'])
fig.update_traces(root_color='lightgrey')
fig.update_layout(margin = {'t': 50, 'l': 25, 'r': 25, 'b': 25})
fig.show()
