## Calculate Token Similarity for Bloom and XGLM Tokenizers

In [23]:
# Bloom tokenizer
from transformers import BloomTokenizerFast
import pandas as pd
from multiprocessing import Pool
import os

# Tokenizer-level parallelism is switched off through the environment.
# TOKENIZERS_PARALLELISM is an environment variable read by the `tokenizers`
# backend; it is not a from_pretrained() option, so it is set only here.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load the Bloom tokenizer. No parallelism keyword is passed: such a keyword
# would not change how the tokenizer runs and would contradict the setting
# above.
word_tokenize = BloomTokenizerFast.from_pretrained('bigscience/bloom-560m')

def jaccard_similarity(args):
    """Return the Jaccard similarity of the token-id sets of two texts.

    ``args`` is a ``(text1, text2)`` pair, packed into one argument so the
    function can be passed directly to ``Pool.map``. Each text is lower-cased
    and encoded with the module-level ``word_tokenize`` tokenizer; the score
    is the number of distinct token ids the texts share divided by the number
    of distinct token ids found in either text. Returns 0 when neither text
    produces any token id.
    """
    first_text, second_text = args

    # Distinct token ids of each (lower-cased) text
    ids_a = set(word_tokenize(first_text.lower())['input_ids'])
    ids_b = set(word_tokenize(second_text.lower())['input_ids'])

    shared = len(ids_a & ids_b)
    combined = len(ids_a | ids_b)

    # Guard against dividing by zero when both id sets are empty
    if combined == 0:
        return 0
    return shared / combined

# Language inventory; the 'Folder Name' column holds the SIB-200 directory
# name of every language.
df = pd.read_excel("SIB-200 languages.xlsx")

# Read the data for all languages and tokenize every text exactly once.
# Tokenizing inside the pairwise step would encode each text again for every
# one of the len(languages)**2 pairs.
languages = df['Folder Name']
data = {}
token_sets = {}

for lang in languages:
    train_df = pd.read_csv(f'datasets/sib-200/data/annotated/{lang}/train.tsv', sep='\t')
    data[lang] = " ".join(train_df.text)
    # Distinct token ids of the lower-cased text (same preprocessing as
    # jaccard_similarity)
    token_sets[lang] = set(word_tokenize(data[lang].lower())['input_ids'])

# Jaccard similarity |A & B| / |A | B| for every ordered language pair,
# one matrix row per language (0 when both token sets are empty)
rows = []
for lang1 in languages:
    tokens1 = token_sets[lang1]
    row = []
    for lang2 in languages:
        tokens2 = token_sets[lang2]
        union = len(tokens1 | tokens2)
        row.append(len(tokens1 & tokens2) / union if union != 0 else 0)
    rows.append(row)

# Label rows and columns with the language folder names
similarity_matrix = pd.DataFrame(rows, index=languages, columns=languages)

similarity_matrix.to_csv('jaccard_matrix_bloom.csv')

In [187]:
# XGLM tokenizer
from transformers import XGLMTokenizer
import pandas as pd
from multiprocessing import Pool
import os

# Tokenizer-level parallelism is switched off through the environment.
# TOKENIZERS_PARALLELISM is an environment variable read by the `tokenizers`
# backend; it is not a from_pretrained() option, so it is set only here.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load the XGLM tokenizer. No parallelism keyword is passed: such a keyword
# would not change how the tokenizer runs and would contradict the setting
# above.
word_tokenize = XGLMTokenizer.from_pretrained('facebook/xglm-564M')

def jaccard_similarity(args):
    """Return the Jaccard similarity of the token-id sets of two texts.

    ``args`` is a ``(text1, text2)`` pair, packed into one argument so the
    function can be passed directly to ``Pool.map``. Each text is lower-cased
    and encoded with the module-level ``word_tokenize`` tokenizer, truncated
    to at most 100000 tokens; the score is the number of distinct token ids
    the texts share divided by the number of distinct token ids found in
    either text. Returns 0 when neither text produces any token id.
    """
    first_text, second_text = args

    # Distinct token ids of each (lower-cased, length-capped) text
    encoded_a = word_tokenize(first_text.lower(), max_length=100000, truncation=True)
    ids_a = set(encoded_a['input_ids'])
    encoded_b = word_tokenize(second_text.lower(), max_length=100000, truncation=True)
    ids_b = set(encoded_b['input_ids'])

    shared = len(ids_a & ids_b)
    combined = len(ids_a | ids_b)

    # Guard against dividing by zero when both id sets are empty
    if combined == 0:
        return 0
    return shared / combined

# Language inventory; the 'Folder Name' column holds the SIB-200 directory
# name of every language.
df = pd.read_excel("SIB-200 languages.xlsx")

# Read the data for all languages and tokenize every text exactly once.
# Tokenizing inside the pairwise step would encode each text again for every
# one of the len(languages)**2 pairs.
languages = df['Folder Name']
data = {}
token_sets = {}

for lang in languages:
    train_df = pd.read_csv(f'datasets/sib-200/data/annotated/{lang}/train.tsv', sep='\t')
    data[lang] = " ".join(train_df.text)
    # Distinct token ids of the lower-cased text, capped at 100000 tokens
    # (same preprocessing as jaccard_similarity)
    encoded = word_tokenize(data[lang].lower(), max_length=100000, truncation=True)
    token_sets[lang] = set(encoded['input_ids'])

# Jaccard similarity |A & B| / |A | B| for every ordered language pair,
# one matrix row per language (0 when both token sets are empty)
rows = []
for lang1 in languages:
    tokens1 = token_sets[lang1]
    row = []
    for lang2 in languages:
        tokens2 = token_sets[lang2]
        union = len(tokens1 | tokens2)
        row.append(len(tokens1 & tokens2) / union if union != 0 else 0)
    rows.append(row)

# Label rows and columns with the language folder names
similarity_matrix = pd.DataFrame(rows, index=languages, columns=languages)

similarity_matrix.to_csv('jaccard_matrix_xglm.csv')

# Get information from Glottolog

In [54]:
from pyglottolog import Glottolog
import pandas as pd

df = pd.read_excel("SIB-200 languages.xlsx")
# Add the three columns that are filled in from Glottolog below
df['latitude'] = ''
df['longitude'] = ''
df['countries'] = ''

glottolog = Glottolog('glottolog-v5.0')

# ISO 639-3 codes present in the sheet, collected once so the membership test
# inside the loop does not rescan the whole column for every languoid
wanted_iso_codes = set(df['ISO 639 - 3'])

# Walk every Glottolog languoid and copy its coordinates and country names
# onto all sheet rows that carry the same ISO 639-3 code
for lang in glottolog.languoids():
    if lang.iso_code not in wanted_iso_codes:
        continue
    matching_rows = df['ISO 639 - 3'] == lang.iso_code
    df.loc[matching_rows, 'latitude'] = lang.latitude
    df.loc[matching_rows, 'longitude'] = lang.longitude
    df.loc[matching_rows, 'countries'] = str([country.name for country in lang.countries])

# index=False: the sheet is overwritten in place, and writing the row index
# would add another "Unnamed: 0" column each time the file is read back and
# saved again.
df.to_excel("SIB-200 languages.xlsx", index=False)
df

Unnamed: 0,Folder Name,Language Name,ISO 639 - 1,ISO 639 - 2 (B),ISO 639 - 2 (T),ISO 639 - 3,Script (ISO 15924),Language Family,Population,Language Vitality,...,bloom-3b scbleu 2s,bloom-7b1 scbleu 2s,bloomz-560m scbleu 2s,bloomz-1b1 scbleu 2s,bloomz-1b7 scbleu 2s,bloomz-3b scbleu 2s,bloomz-7b1 scbleu 2s,latitude,longitude,countries
0,ace_Arab,Aceh,,ace,ace,ace,Arab,Austronesian,1 million to 1 billion,Endangered,...,0.7,1.1,0.2,0.3,0.4,0.2,1.1,3.90757,96.6032,['Indonesia']
1,ace_Latn,Aceh,,ace,ace,ace,Latn,Austronesian,1 million to 1 billion,Endangered,...,3.4,5.5,1.9,2.9,3.8,5.7,12.3,3.90757,96.6032,['Indonesia']
2,acm_Arab,Mesopotamian Spoken Arabic,,,,acm,Arab,Afro-Asiatic,1 million to 1 billion,Institutional,...,18.6,23.1,5.0,14.6,25.4,33.8,54.0,33.8178,43.1095,"['Iraq', 'Iran, Islamic Republic of', 'Jordan'..."
3,acq_Arab,"Arabic, Ta’izzi-Adeni Spoken",,,,acq,Arab,Afro-Asiatic,1 million to 1 billion,Institutional,...,22.6,26.1,6.7,17.6,32.3,40.8,52.7,13.6555,45.6411,"['Eritrea', 'Saudi Arabia', 'Yemen']"
4,aeb_Arab,Tunisian Spoken Arabic,,,,aeb,Arab,Afro-Asiatic,1 million to 1 billion,Institutional,...,15.2,20.5,4.0,13.0,22.5,28.4,44.0,33.7966,9.56156,"['Algeria', 'Libya', 'Tunisia']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,yue_Hant,Yue Chinese,,,,yue,Hant,Sino-Tibetan,1 million to 1 billion,Institutional,...,19.5,23.8,9.1,16.3,33.8,42.7,51.8,23.0,113.0,"['China', 'Viet Nam']"
200,zho_Hans,Chinese,zh,chi,zho,zho,Hans,Sino-Tibetan,1 billion plus,Institutional,...,18.3,23.0,0.4,16.6,29.3,41.7,50.9,,,
201,zho_Hant,Chinese,zh,chi,zho,zho,Hant,Sino-Tibetan,1 billion plus,Institutional,...,18.3,21.4,0.4,14.3,30.9,43.9,51.3,,,
202,zsm_Latn,Standard Malay,ms,,,zsm,Latn,Austronesian,,Institutional,...,28.2,33.0,6.6,20.5,33.4,49.5,61.4,3.08896,101.699,"['Brunei Darussalam', 'Malaysia', 'Singapore']"


# Calculate language geographical proximity

In [63]:
import pandas as pd
from multiprocessing import Pool
from geopy.distance import geodesic

# Load the language sheet and keep only the rows that have both coordinates
# (rows with a missing/NaN latitude or longitude are dropped)
df = pd.read_excel("SIB-200 languages.xlsx").dropna(subset=['latitude', 'longitude'])

# Calculate the geographical distance between languages
def distance(args):
    """Return the geodesic distance in kilometres between two points.

    ``args`` is a ``(point1, point2)`` pair, packed into one argument so the
    function can be passed directly to ``Pool.map``. Each point unpacks to
    ``(latitude, longitude)``.
    """
    (lat_a, lon_a), (lat_b, lon_b) = args
    return geodesic((lat_a, lon_a), (lat_b, lon_b)).kilometers

# Look up every language's coordinates once. Filtering the DataFrame with a
# boolean mask inside the pairwise comprehension would rescan the whole table
# twice for each of the len(df)**2 pairs.
folder_names = df['Folder Name']
coordinates = {}
for name, lat, lon in zip(folder_names, df['latitude'], df['longitude']):
    # setdefault keeps the first row seen for a folder name
    coordinates.setdefault(name, (lat, lon))

# Prepare arguments for parallel processing: one (point1, point2) entry per
# ordered language pair, in row-major order
args_list = [(coordinates[lang1], coordinates[lang2])
             for lang1 in folder_names for lang2 in folder_names]
# Use multiple CPU cores for computation
with Pool() as pool:
    distances = pool.map(distance, args_list)

# Reshape the flat result list into a square matrix, one row per language
n_langs = len(df)
distance_matrix = pd.DataFrame([distances[i:i+n_langs] for i in range(0, len(distances), n_langs)],
                               index=folder_names, columns=folder_names)
distance_matrix.to_csv('distances/geographical_distance.csv')