# "Progressiveness" metric
Based on document distance between treaties per year


## Imports

In [5]:
# Must be cleaned
import re
import os
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import random
from sklearn.decomposition import PCA
import nltk
from nltk.corpus import stopwords # Import the stop word list
import gensim
import gensim.downloader as api
from gensim.parsing.preprocessing import strip_short
from pyemd import emd
import numpy as np
from collections import Counter
import pickle

## Preprocessing

In [14]:
fileObject = open('../Visualisations/Stopwords_law.pkl','rb')  
stopwords = pickle.load(fileObject)

In [20]:
#Function for preprocessing text
def txt_cleaner(text):
    meaningful_words = []
    text = text.lower() #make all lowercase
    #text = strip_short(text, minsize=4) #remove short words
    tokens = word_tokenize(text) # returns list of words
    tokens = [w for w in tokens if w.isalpha()] #remove punctuation, also numbers
    stops = [stopwords][0]
    tokens = [w for w in tokens if not w in stops] # remove stop words
    for item in tokens: #filter short and long words
        if len(item) >= 3 and len(item) < 30:
            meaningful_words.append(item)
    #count = Counter(meaningful_words) # Count most recurrent words
    #most_occur = count.most_common(10) # Make list with n most recurrent
    #most_occur = [item[0] for item in most_occur] # Get rid of the counter number
    #meaningful_words = [w for w in meaningful_words if not w in most_occur] # remove most recurrent
    return meaningful_words

In [21]:
# Search directory
directory_in_str = "../xml/"
directory = os.fsencode(directory_in_str)

In [22]:
# Here, the text for each treaty will be in one single list

# Generate base data frame
# The 'text' column will contain list with sublists, where every sublist will contain the tokenized text of every chapter
print("Preprocessing files to dataframe..")
text_df = pd.DataFrame(columns=['filename', 'id','type', 'year_signed', 'month', 'day', 'text'])
for n,file in enumerate(os.listdir(directory)):
    new_row = []
    filename = os.fsdecode(file)
    if filename.endswith('.XML') or filename.endswith('.xml'):
        new_row.append(filename)
        tree = ET.parse(directory_in_str + filename)
        root = tree.getroot()
        meta = root[0]
        body = root[1]
        if meta.find('language').text == 'en': 
            new_row.append(meta.find('treaty_identifier').text)
            new_row.append(meta.find('type').text)
            new_row.append(int(meta.find('date_signed').text.split('-')[0]))
            new_row.append(int(meta.find('date_signed').text.split('-')[1]))
            new_row.append(int(meta.find('date_signed').text.split('-')[2]))
            text_raw = ""
            for chapter in body.iter(): 
                text_raw += chapter.text
            text_clean = txt_cleaner(text_raw)
            new_row.append(text_clean)
            text_df.loc[n] = new_row
text_df.head(5)

Preprocessing files to dataframe..


Unnamed: 0,filename,id,type,year_signed,month,day,text
0,pta_218.xml,218,Free Trade Agreement,1975,4,28,"[denmark, ireland, great, britain, northern, i..."
1,pta_230.xml,230,Free Trade Agreement,1972,7,22,"[portuguese, desiring, consolidate, enlargemen..."
2,pta_224.xml,224,Free Trade Agreement,1977,5,3,"[lebanon, overall, contributing, lebanon, help..."
3,pta_378.xml,378,Partial Scope Agreement,2007,7,30,"[mauritius, islamic, pakistan, islamic, pakist..."
4,pta_344.xml,344,Free Trade Agreement,2009,7,14,"[chile, chile, hereinafter, chile, desirous, f..."


In [23]:
# Convert 'date_signed' column to datetime object
#text_df['date_signed'] = pd.to_datetime(text_df.date_signed)

# Sort items by date
text_df.sort_values(by='year_signed', inplace=True)

# Set 'id' as index (verify integrity in case of duplicates)
text_df = text_df.set_index(['id'],verify_integrity=True)
text_df.head(5)

Unnamed: 0_level_0,filename,type,year_signed,month,day,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
310,pta_310.xml,Customs Union,1948,12,6,"[informationof, south, africa, govern, ment, s..."
254,pta_254.xml,Free Trade Agreement,1951,3,9,"[republics, nicaragua, salvador, nicaragua, sa..."
110,pta_110.xml,Customs Union & Economic Integration Agreement,1957,3,25,"[consolidated, version, majesty, king, belgian..."
188,pta_188.xml,Free Trade Agreement,1958,6,10,"[text, xxviiiof, freetrade, intigration, afric..."
255,pta_255.xml,Free Trade Agreement,1959,6,23,"[congo, gabon, chad, merchandise, equatorial, ..."


## Selection
Here we define the selection of treaties we want to compare against each other

In [307]:
# Get list of unique years
year_unique = text_df.year_signed.unique()
# Get list of treaty types
type_unique = text_df.type.unique()

# Separate treaties by decade
decades_slices =[[0,5],[5,13],[13,21],[21,28],[28,37],[37,47],[47,54]]

# List of dataframes to iterate over
decades_df = []
types_df = []

# Append year slices to list of dataframes
for item in decades_slices:
    decades_df.append(text_df[text_df.year_signed.isin(year_unique[item[0]:item[1]])])

for item in type_unique:
    types_df.append(text_df[text_df.type.isin([item])])


In [312]:
types_df[4]

Unnamed: 0_level_0,filename,type,year_signed,month,day,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
136,pta_136.xml,Partial Scope Agreement,1967,12,23,"[india, arab, socialist, federal, yugoslavia, ..."
133,pta_133.xml,Partial Scope Agreement,1971,12,8,"[auspices, lxxj, geneva, authentic, texts, fre..."
127,pta_127.xml,Partial Scope Agreement,1975,7,31,"[asia, pacific, bangkok, recognizing, urgent, ..."
124,pta_124.xml,Partial Scope Agreement,1980,7,14,"[theegovernments, australia, cook, islands, fi..."
132,pta_132.xml,Partial Scope Agreement,1988,4,13,"[recognizing, key, element, strategy, collecti..."
120,pta_120.xml,Partial Scope Agreement,1991,6,20,"[thailand, lao, people, democratic, thailand, ..."
89,pta_89.xml,Partial Scope Agreement,1993,7,22,"[melanesian, spearhead, papua, guinea, solomon..."
113,pta_113.xml,Partial Scope Agreement,1993,4,11,"[saarc, sapta, people, bangladesh, bhutan, ind..."
117,pta_117.xml,Partial Scope Agreement,2003,7,17,"[eco, ecota, hereinafter, eco, islamic, afghan..."
435,pta_435.xml,Partial Scope Agreement,2003,10,9,"[india, thailand, india, thailand, hereinafter..."


## Distance calculations
+ Reduce amount of words(by substracting the most common words)
+ Convert text from list of list of words to list of words

In [20]:
# Import pretrained model
# model = gensim.models.KeyedVectors.load_word2vec_format('../../../GoogleNews-vectors-negative300.bin.gz', binary=True)
# Notice this vectors are already normalized

In [26]:
# Import pretrained model
# Had to switch to smaller model
model = api.load('glove-wiki-gigaword-300')

By time slices

In [27]:
dist_df = [] #this is where we will store all our distance matrices
for dfx in list_df:
    print(f'Processing from: {dfx.year_signed.max()} to {dfx.year_signed.min()}')
    labels = list(dfx.index.values) #labels for dataframe
    labels = ['id'] + labels #append 'id'
    distances = []
    for x in dfx.iterrows():
        print(f'Row: {x[0]}')
        dist_tmp = [] #temporal list for current row
        t1 = [words for sublist in x[1][5] for words in sublist]
        dist_tmp.append(x[0]) #append index
        for y in dfx.iterrows():
            t2 = [words for sublist in y[1][5] for words in sublist]
            d = model.wmdistance(t1, t2)
            dist_tmp.append(d)
        distances.append(dist_tmp) #once done with row, append to total distances
    df = pd.DataFrame.from_records(distances, columns=labels, index=['id']) #once all distances for current df are coll
    dist_df.append(df)

Processing from: 1961 to 1948
Row: 310
Row: 254
Row: 110
Row: 188
Row: 255
Row: 138
Row: 275
Row: 266
Row: 175
Row: 219
Processing from: 1970 to 1962
Row: 164
Row: 242
Row: 316
Row: 172
Row: 274
Row: 174
Row: 187
Row: 136
Row: 173
Row: 227
Row: 240
Row: 135
Row: 317
Row: 241
Row: 222
Row: 134
Row: 237
Row: 226
Processing from: 1979 to 1971
Row: 133
Row: 131
Row: 130
Row: 238
Row: 207
Row: 230
Row: 213
Row: 210
Row: 128
Row: 129
Row: 217
Row: 243
Row: 127
Row: 318
Row: 218
Row: 126
Row: 206
Row: 125
Row: 224
Row: 214
Row: 239
Row: 253
Row: 319
Processing from: 1988 to 1980
Row: 124
Row: 267
Row: 122
Row: 320
Row: 121
Row: 116
Row: 132
Row: 186
Processing from: 1992 to 1992
Row: 420
Row: 431
Row: 280
Row: 422
Row: 108
Row: 281
Row: 423
Row: 354
Row: 43
Row: 94
Row: 251
Row: 432
Row: 105
Row: 279
Row: 259
Row: 276
Row: 262
Row: 424
Row: 114
Row: 115
Row: 263
Row: 277
Row: 278
Row: 203
Row: 260
Row: 258
Row: 264
Row: 112
Row: 106
Row: 250
Row: 189
Row: 245
Processing from: 1993 to 1993
Row

Distance measured between all treaties

In [143]:
total_dist_df = [] #this is where we will store all our distance matrices

#print(f'Processing from: {dfx.year_signed.max()} to {dfx.year_signed.min()}')
labels = list(text_df.index.values) #labels for dataframe
labels = ['id'] + labels #append 'id'
distances = []
for x in text_df.iterrows():
    print(f'Row: {x[0]}')
    dist_tmp = [] #temporal list for current row
    t1 = [words for sublist in x[1][5] for words in sublist]
    dist_tmp.append(x[0]) #append index
    for y in text_df.iterrows():
        t2 = [words for sublist in y[1][5] for words in sublist]
        d = model.wmdistance(t1, t2)
        dist_tmp.append(d)
    distances.append(dist_tmp) #once done with row, append to total distances
total_dist_df = pd.DataFrame.from_records(distances, columns=labels, index=['id']) #once all distances for current df are coll

Row: 310
Row: 254
Row: 110
Row: 188
Row: 255
Row: 138
Row: 275
Row: 266
Row: 175
Row: 219
Row: 164
Row: 242
Row: 316
Row: 172
Row: 274
Row: 174
Row: 187
Row: 136
Row: 173
Row: 227
Row: 240
Row: 135
Row: 317
Row: 241
Row: 222
Row: 134
Row: 237
Row: 226
Row: 133
Row: 131
Row: 130
Row: 238
Row: 207
Row: 230
Row: 213
Row: 210
Row: 128
Row: 129
Row: 217
Row: 243
Row: 127
Row: 318
Row: 218
Row: 126
Row: 206
Row: 125
Row: 224
Row: 214
Row: 239
Row: 253
Row: 319
Row: 124
Row: 267
Row: 122
Row: 320
Row: 121
Row: 116
Row: 132
Row: 186
Row: 229
Row: 211
Row: 118
Row: 220
Row: 100
Row: 216
Row: 228
Row: 119
Row: 322
Row: 120
Row: 221
Row: 420
Row: 431
Row: 280
Row: 422
Row: 108
Row: 281
Row: 423
Row: 354
Row: 43
Row: 94
Row: 251
Row: 432
Row: 105
Row: 279
Row: 259
Row: 276
Row: 262
Row: 424
Row: 114
Row: 115
Row: 263
Row: 277
Row: 278
Row: 203
Row: 260
Row: 258
Row: 264
Row: 112
Row: 106
Row: 250
Row: 189
Row: 245
Row: 244
Row: 208
Row: 247
Row: 234
Row: 421
Row: 232
Row: 209
Row: 233
Row: 212
Row

In [147]:
#Save resulting dataframe to csv file
#out_dir = "saved_csv/glove-wiki-gigaword-300/total_ollie_stopwords/"

#csv_out = "total_dist_df.csv"
#total_dist_df.to_csv(out_dir + csv_out, index = True, header=True)

## Graphs

Force directed
+ Based on this: https://stackoverflow.com/questions/13513455/drawing-a-graph-or-a-network-from-a-distance-matrix
+ Uses NEATO for finding the layout of nodes https://www.graphviz.org/pdf/neatoguide.pdf

In [182]:
import json
import networkx as nx
import string
import pygraphviz
from networkx.readwrite import json_graph
from networkx.drawing.nx_pydot import write_dot

In [151]:
# Visualization directory output
out_dir_vis = "vis/glove-wiki-gigaword-300/total/"
file_out = "total_1.png"

In [218]:
dt = [('len', float)]
A = total_dist_df.values*40
A = A.view(dt)

G = nx.from_numpy_matrix(A)
G = nx.relabel_nodes(G, dict(zip(range(len(G.nodes())),list(dist_df[1].columns.values))))  #Rename node to treaty id
G = nx.drawing.nx_agraph.to_agraph(G) # Create pygraphviz

G.node_attr.update(fillcolor="transparent", shape="circle", style="filled", width="0.5")
G.edge_attr.update(color="transparent", width="2.0", len="10")
G.graph_attr.update(size="10",dpi='300')
G.graph_attr['label']='Name of graph'

G.draw(out_dir_vis + file_out, format='png', prog='neato')

In [205]:
import matplotlib.pyplot as plt
import networkx as nx
import string
import pygraphviz
from networkx.readwrite import json_graph
import numpy as np
from scipy.spatial import Voronoi, voronoi_plot_2d
from networkx.drawing.nx_pydot import write_dot

In [195]:
G.layout(prog = 'neato')
G.draw('file.png')

In [219]:
node=G.get_node('200')

In [222]:
node

'200'

In [200]:
s = s.split()

In [203]:
with open("Output.txt", "w") as text_file:
    print(s, file=text_file)