# "Progressiveness" metric
Based on document distance between treaties per year


## Imports

In [5]:
# Must be cleaned
import re
import os
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import random
from sklearn.decomposition import PCA
import nltk
from nltk.corpus import stopwords # Import the stop word list
import gensim
import gensim.downloader as api
from gensim.parsing.preprocessing import strip_short
from pyemd import emd
import numpy as np
from collections import Counter
import pickle

## Preprocessing

In [14]:
fileObject = open('../Visualisations/Stopwords_law.pkl','rb')  
stopwords = pickle.load(fileObject)

In [20]:
#Function for preprocessing text
def txt_cleaner(text):
    meaningful_words = []
    text = text.lower() #make all lowercase
    #text = strip_short(text, minsize=4) #remove short words
    tokens = word_tokenize(text) # returns list of words
    tokens = [w for w in tokens if w.isalpha()] #remove punctuation, also numbers
    stops = [stopwords][0]
    tokens = [w for w in tokens if not w in stops] # remove stop words
    for item in tokens: #filter short and long words
        if len(item) >= 3 and len(item) < 30:
            meaningful_words.append(item)
    #count = Counter(meaningful_words) # Count most recurrent words
    #most_occur = count.most_common(10) # Make list with n most recurrent
    #most_occur = [item[0] for item in most_occur] # Get rid of the counter number
    #meaningful_words = [w for w in meaningful_words if not w in most_occur] # remove most recurrent
    return meaningful_words

In [21]:
# Search directory
directory_in_str = "../xml/"
directory = os.fsencode(directory_in_str)

In [22]:
# Generate base data frame
print("Preprocessing files to dataframe..")
text_df = pd.DataFrame(columns=['filename', 'id','type', 'year_signed', 'month', 'day', 'text'])
for n,file in enumerate(os.listdir(directory)):
    new_row = []
    filename = os.fsdecode(file)
    if filename.endswith('.XML') or filename.endswith('.xml'):
        new_row.append(filename)
        tree = ET.parse(directory_in_str + filename)
        root = tree.getroot()
        meta = root[0]
        body = root[1]
        if meta.find('language').text == 'en': 
            new_row.append(meta.find('treaty_identifier').text)
            new_row.append(meta.find('type').text)
            new_row.append(int(meta.find('date_signed').text.split('-')[0]))
            new_row.append(int(meta.find('date_signed').text.split('-')[1]))
            new_row.append(int(meta.find('date_signed').text.split('-')[2]))
            text_raw = ""
            for chapter in body.iter(): 
                text_raw += chapter.text
            text_clean = txt_cleaner(text_raw)
            new_row.append(text_clean)
            text_df.loc[n] = new_row
text_df.head(5)

Preprocessing files to dataframe..


Unnamed: 0,filename,id,type,year_signed,month,day,text
0,pta_218.xml,218,Free Trade Agreement,1975,4,28,"[denmark, ireland, great, britain, northern, i..."
1,pta_230.xml,230,Free Trade Agreement,1972,7,22,"[portuguese, desiring, consolidate, enlargemen..."
2,pta_224.xml,224,Free Trade Agreement,1977,5,3,"[lebanon, overall, contributing, lebanon, help..."
3,pta_378.xml,378,Partial Scope Agreement,2007,7,30,"[mauritius, islamic, pakistan, islamic, pakist..."
4,pta_344.xml,344,Free Trade Agreement,2009,7,14,"[chile, chile, hereinafter, chile, desirous, f..."


In [23]:
# Convert 'date_signed' column to datetime object
#text_df['date_signed'] = pd.to_datetime(text_df.date_signed)

# Sort items by date
text_df.sort_values(by='year_signed', inplace=True)

# Set 'id' as index (verify integrity in case of duplicates)
text_df = text_df.set_index(['id'],verify_integrity=True)
text_df.head(5)

Unnamed: 0_level_0,filename,type,year_signed,month,day,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
310,pta_310.xml,Customs Union,1948,12,6,"[informationof, south, africa, govern, ment, s..."
254,pta_254.xml,Free Trade Agreement,1951,3,9,"[republics, nicaragua, salvador, nicaragua, sa..."
110,pta_110.xml,Customs Union & Economic Integration Agreement,1957,3,25,"[consolidated, version, majesty, king, belgian..."
188,pta_188.xml,Free Trade Agreement,1958,6,10,"[text, xxviiiof, freetrade, intigration, afric..."
255,pta_255.xml,Free Trade Agreement,1959,6,23,"[congo, gabon, chad, merchandise, equatorial, ..."


In [356]:
#Save resulting dataframe to csv file
#out_dir = "saved_csv/glove-wiki-gigaword-300/"

#csv_out = "text_df.csv"
#text_df.to_csv(out_dir + csv_out, index = True, header=True)

## Selection
Here we define the selection of treaties we want to compare against each other

In [307]:
# Get list of unique years
year_unique = text_df.year_signed.unique()
# Get list of treaty types
type_unique = text_df.type.unique()

# Separate treaties by decade
decades_slices =[[0,5],[5,13],[13,21],[21,28],[28,37],[37,47],[47,54]]

# List of dataframes to iterate over
decades_df = []
types_df = []

# Append 'decades' slices to list of dataframes
for item in decades_slices:
    decades_df.append(text_df[text_df.year_signed.isin(year_unique[item[0]:item[1]])])

# Append treaties by type to list of dataframes
for item in type_unique:
    types_df.append(text_df[text_df.type.isin([item])])

In [354]:
text_df[text_df.type.isin([type_unique[2]])]

Unnamed: 0_level_0,filename,type,year_signed,month,day,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
110,pta_110.xml,Customs Union & Economic Integration Agreement,1957,3,25,"[consolidated, version, majesty, king, belgian..."
128,pta_128.xml,Customs Union & Economic Integration Agreement,1973,7,4,"[consolidate, bonds, historically, existed, pe..."
119,pta_119.xml,Customs Union & Economic Integration Agreement,1991,3,26,"[mercosur, southern, mercosur, argentine, fede..."
85,pta_85.xml,Customs Union & Economic Integration Agreement,1999,11,30,"[logo, east, east, august, whereas, uganda, ke..."
441,pta_441.xml,Customs Union & Economic Integration Agreement,2014,10,10,"[treatyon, armenia, eurasian, dated, minsk, oc..."
444,pta_444.xml,Customs Union & Economic Integration Agreement,2014,12,23,"[nazira, translationtreatyon, kyrgyz, republic..."
440,pta_440.xml,Customs Union & Economic Integration Agreement,2014,5,29,"[eurasian, belarus, kazakhstan, russian, feder..."


## Distance calculations
Compute distance matrices

In [26]:
# Import pretrained model
model = api.load('glove-wiki-gigaword-300')

### Measure distance between treaties in same decade

In [316]:
decades_dist = [] #this is where we will store all our distance matrices
for dfx in decades_df:
    print(f'Processing from: {dfx.year_signed.min()} to {dfx.year_signed.max()}')
    labels = list(dfx.index.values) #labels for dataframe
    labels = ['id'] + labels #append 'id'
    distances = []
    for x in dfx.iterrows():
        #print(f'Row: {x[0]}')
        dist_tmp = [] #temporal list for current row
        t1 = [words for sublist in x[1][5] for words in sublist]
        dist_tmp.append(x[0]) #append index
        for y in dfx.iterrows():
            t2 = [words for sublist in y[1][5] for words in sublist]
            d = model.wmdistance(t1, t2)
            dist_tmp.append(d)
        distances.append(dist_tmp) #once done with row, append to total distances
    df = pd.DataFrame.from_records(distances, columns=labels, index=['id']) #once all distances for current df are coll
    decades_dist.append(df)
decades_dist[0].head()

Processing from: 1948 to 1959
Processing from: 1960 to 1969
Processing from: 1970 to 1979
Processing from: 1980 to 1988
Processing from: 1991 to 1999
Processing from: 2000 to 2009
Processing from: 2010 to 2016


Unnamed: 0_level_0,310,254,110,188,255
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
310,0.0,0.524973,0.592487,0.631757,0.914906
254,0.524973,0.0,0.460972,0.362904,0.643523
110,0.592487,0.460972,0.0,0.467749,0.793777
188,0.631757,0.362904,0.467749,0.0,0.785606
255,0.914906,0.643523,0.793777,0.785606,0.0


In [321]:
#Save resulting dataframe to csv file
#out_dir = "saved_csv/glove-wiki-gigaword-300/decades_ollie_stopwords/"

#for n,item in enumerate(decades_dist):
#    csv_out = "decades_dist_"+str(n)+".csv"
#    item.to_csv(out_dir + csv_out, index = True, header=True)

### Measure distance between treaties of same type

In [323]:
types_dist = [] #this is where we will store all our distance matrices
for dfx in types_df:
    #print(f'Processing: {dfx.type}')
    labels = list(dfx.index.values) #labels for dataframe
    labels = ['id'] + labels #append 'id'
    distances = []
    for x in dfx.iterrows():
        #print(f'Row: {x[0]}')
        dist_tmp = [] #temporal list for current row
        t1 = [words for sublist in x[1][5] for words in sublist]
        dist_tmp.append(x[0]) #append index
        for y in dfx.iterrows():
            t2 = [words for sublist in y[1][5] for words in sublist]
            d = model.wmdistance(t1, t2)
            dist_tmp.append(d)
        distances.append(dist_tmp) #once done with row, append to total distances
    df = pd.DataFrame.from_records(distances, columns=labels, index=['id']) #once all distances for current df are coll
    types_dist.append(df)
types_dist[0].head()

Unnamed: 0_level_0,310,219,164,172,226,210,243,267,116,100,322,203,111,32,109,429,97,15,6,426
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
310,0.0,1.129512,0.639307,0.864257,0.696658,0.794314,0.713057,0.555742,0.711764,1.084049,0.87665,0.748875,0.540154,0.587584,0.73277,0.634566,0.509592,0.860832,0.614681,0.909509
219,1.129512,0.0,0.971891,1.319841,1.085745,0.758499,0.84179,0.900157,0.917413,0.966445,1.261078,0.928909,0.920413,1.016011,0.910685,1.027017,0.998739,1.054567,0.946026,1.456037
164,0.639307,0.971891,0.0,0.643035,0.454788,0.844805,0.769853,0.648314,0.628061,0.918376,0.792682,0.802896,0.477872,0.659373,0.559694,0.610609,0.621837,0.699382,0.398657,0.826577
172,0.864257,1.319841,0.643035,0.0,0.69416,0.985879,0.949693,0.735017,0.736948,0.911147,0.910222,0.985872,0.59494,0.639184,0.68379,0.622057,0.64937,0.596922,0.864461,0.726365
226,0.696658,1.085745,0.454788,0.69416,0.0,0.976802,0.759234,0.802511,0.775029,1.022828,0.809914,0.875487,0.540154,0.726289,0.670844,0.639222,0.742448,0.800142,0.631551,0.648353


In [324]:
#Save resulting dataframe to csv file
out_dir = "saved_csv/glove-wiki-gigaword-300/types/"

for n,item in enumerate(types_dist):
    csv_out = "types_dist_"+str(n)+".csv"
    item.to_csv(out_dir + csv_out, index = True, header=True)

### Measure distance between all treaties

In [143]:
total_dist_df = [] #this is where we will store all our distance matrices

#print(f'Processing from: {dfx.year_signed.max()} to {dfx.year_signed.min()}')
labels = list(text_df.index.values) #labels for dataframe
labels = ['id'] + labels #append 'id'
distances = []
for x in text_df.iterrows():
    print(f'Row: {x[0]}')
    dist_tmp = [] #temporal list for current row
    t1 = [words for sublist in x[1][5] for words in sublist]
    dist_tmp.append(x[0]) #append index
    for y in text_df.iterrows():
        t2 = [words for sublist in y[1][5] for words in sublist]
        d = model.wmdistance(t1, t2)
        dist_tmp.append(d)
    distances.append(dist_tmp) #once done with row, append to total distances
total_dist_df = pd.DataFrame.from_records(distances, columns=labels, index=['id']) #once all distances for current df are coll

Row: 310
Row: 254
Row: 110
Row: 188
Row: 255
Row: 138
Row: 275
Row: 266
Row: 175
Row: 219
Row: 164
Row: 242
Row: 316
Row: 172
Row: 274
Row: 174
Row: 187
Row: 136
Row: 173
Row: 227
Row: 240
Row: 135
Row: 317
Row: 241
Row: 222
Row: 134
Row: 237
Row: 226
Row: 133
Row: 131
Row: 130
Row: 238
Row: 207
Row: 230
Row: 213
Row: 210
Row: 128
Row: 129
Row: 217
Row: 243
Row: 127
Row: 318
Row: 218
Row: 126
Row: 206
Row: 125
Row: 224
Row: 214
Row: 239
Row: 253
Row: 319
Row: 124
Row: 267
Row: 122
Row: 320
Row: 121
Row: 116
Row: 132
Row: 186
Row: 229
Row: 211
Row: 118
Row: 220
Row: 100
Row: 216
Row: 228
Row: 119
Row: 322
Row: 120
Row: 221
Row: 420
Row: 431
Row: 280
Row: 422
Row: 108
Row: 281
Row: 423
Row: 354
Row: 43
Row: 94
Row: 251
Row: 432
Row: 105
Row: 279
Row: 259
Row: 276
Row: 262
Row: 424
Row: 114
Row: 115
Row: 263
Row: 277
Row: 278
Row: 203
Row: 260
Row: 258
Row: 264
Row: 112
Row: 106
Row: 250
Row: 189
Row: 245
Row: 244
Row: 208
Row: 247
Row: 234
Row: 421
Row: 232
Row: 209
Row: 233
Row: 212
Row

In [147]:
#Save resulting dataframe to csv file
#out_dir = "saved_csv/glove-wiki-gigaword-300/total_ollie_stopwords/"

#csv_out = "total_dist_df.csv"
#total_dist_df.to_csv(out_dir + csv_out, index = True, header=True)