## Document Distance

In [1]:
# Imports 
import os
import xml.etree.ElementTree as et
import pandas as pd
from itertools import chain

import numpy as np
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD as LSA
import random
random.seed(13)


#visualisation stuff
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
%matplotlib inline


# import mpld3
# mpld3.enable_notebook()

import spacy
from collections import Counter
from spacy.tokenizer import Tokenizer
nlp = spacy.load('en')
tokenizer = Tokenizer(nlp.vocab)
import networkx as nx

import gensim
import gensim.downloader as api
from gensim.parsing.preprocessing import strip_short

In [2]:
# Load data 
PATH = "../xml/"
extension = ".xml"

def load_order(PATH, extension):
    """
    Custom function for loading and sorting the xml files from the TOTA dataset.
    Returns an array of filenames sorted by the number in the file name.
    """
    files = []
    for filename in os.listdir(PATH):
        if filename == ".ipynb_checkpoints":
            continue
        files.append(filename.split(".")[0]) # remove .xml
    files.sort(key=lambda x:int(x.split("_")[1])) # sort files based on file number 

    files = [file + extension for file in files]
#     print(files)
    return files

files = load_order(PATH, extension)

In [33]:
def xml2list(file):
    """
    Custom function for parsing and outputing the data into arrays.
    Returns two array's the extracted column names and content for each column.
    """
    parsed_xml = et.parse(file)

    root = parsed_xml.getroot()

    meta_column_names = []
    meta_column_content = []

    for item in root[0]:
        grandchild = item.findall(".//")
        if grandchild:
            grandstore = []
            #print(item.tag)
            meta_column_names.append(item.tag)
            for x in grandchild:
                grandstore.append(int(x.attrib.get('n')))
                #print(x.tag +":", x.attrib)
            meta_column_content.append(grandstore)

        else:
            #print(item.tag +":", item.text)
            meta_column_names.append(item.tag)
            meta_column_content.append(item.text)
            
    return meta_column_names, meta_column_content
    
    
def xml2list_text(file):
    """
    Custom function for parsing and outputing the data into arrays.
    Returns two array's the extracted column names and content for each column.
    """

    parsed_xml = et.parse(file)
    root = parsed_xml.getroot()
    chapters = []
    article_store = []
    
    treaty_text = ''
    
    for item in root[1]:
        chapter_name = item.attrib.get('name')
#         print("chapter: " + item.attrib.get('name'))
        chapters.append(chapter_name)
        if chapter_name == None:
            chapter_name = ''
        treaty_text += chapter_name + " "
        
        articles = item.findall("article")
        for article in articles:
#             print(article.attrib.get('name'))
            art_no = article.attrib.get('number')
            art_text =  article.text
            article_store.append(art_no)
            if art_no == None:
                continue
#             print(art_no)
            treaty_text += art_no + " "
            treaty_text += art_text + " "
        
            
    return len(chapters), len(article_store), treaty_text

In [35]:
textdf = pd.DataFrame()

chapter_numbers = []
article_numbers = []
treaty_text = []

for i in range(0, len(files)):
    chapter_no, article_no, text = xml2list_text(PATH + files[i])
    chapter_numbers.append(chapter_no)
    article_numbers.append(article_no)
    tokens = tokenizer(text)
    treaty_text.append(tokens.text.lower())

textdf["chapter_number"] = chapter_numbers
textdf["article_number"] = article_numbers
textdf["treaty_text"] = treaty_text

In [36]:
with open("alltext.txt") as f:
    wordcount = Counter(f.read().split())

stop_words_law = []
for item, frequency in wordcount.most_common(2000):
    stop_words_law.append(item)
stop_words_law.append('\n')

In [37]:
dfcols, null = xml2list(PATH + files[3])
totadf = pd.DataFrame(columns=dfcols)

for i in range(0, len(files)):
    dfcolstmp, datatmp = xml2list(PATH + files[i])
    tmp_df = pd.DataFrame([datatmp], columns=dfcolstmp)
    tmp_df = tmp_df.loc[:,~tmp_df.columns.duplicated()] #very important! removes duplicate columns note that it removes the multiple "source" columnsns
    totadf = totadf.loc[:,~totadf.columns.duplicated()]#very important! removes duplicate columns note that it removes the multiple "source" columnsns 
    totadf = pd.concat([totadf,tmp_df], axis=0, sort=False)
totadf = totadf.reset_index(drop=True)
totadf = pd.concat([totadf, textdf], axis=1)

In [38]:
totadf['year_signed'] = totadf.date_signed.map( lambda x: pd.to_datetime(x).year)
totadf = totadf[totadf.language == 'en']
new = ['treaty_identifier', 'chapter_number', 'article_number', 'treaty_text', 'year_signed', 'type', 'region', 'name']

totadf_emb = totadf[new]
totadf_emb = totadf_emb.reset_index()
totadf_emb.head()

Unnamed: 0,index,treaty_identifier,chapter_number,article_number,treaty_text,year_signed,type,region,name
0,0,1,17,175,preamble general provisions article 1 the obje...,2007,Free Trade Agreement & Economic Integration Ag...,East Asia,Japan - Thailand
1,1,2,8,40,preamble article 1 the parties shall gradually...,2005,Free Trade Agreement,Africa; Europe,Egypt - Turkey
2,2,3,21,201,preamble general provisions article 1 the part...,2007,Free Trade Agreement & Economic Integration Ag...,South America; East Asia,Chile - Japan
3,3,4,3,6,"article 1 the republic of albania, bosnia an...",2006,Free Trade Agreement,Europe; Commonwealth of Independent States (CIS),Central European Free Trade Agreement (CEFTA) ...
4,4,5,11,52,preamble general provisions article 1 1. egypt...,2007,Free Trade Agreement,Africa; Europe,EFTA - Egypt


In [17]:
model = api.load("word2vec-google-news-300") 



In [39]:
# totadf_emb[totadf_emb['treaty_text'] == '']

In [40]:
distance = model.wmdistance(totadf_emb['treaty_text'][0], totadf_emb['treaty_text'][3])
distance

0.21195714679909775

In [None]:
For every treaty in dataframe compute distance to all other treaties 

for text in totadf_emb['treaty_text']:
    distance = []
    

In [41]:
t_d = []
for item in totadf_emb.iterrows():
    print(item[1][0])
    d = []
    for x,n in enumerate(range(0,len(item[1][4]))):
        t1 = item[1][4][0]
        t2 = item[1][4][n]
        distance = model.wmdistance(t1, t2)
        d.append(distance)
    t_d.append([d])

0
1
2
3
4
5
6
7
8
9
10
11
12


KeyboardInterrupt: 