In [1]:
import gensim
from gensim.models import Word2Vec
import os
import pickle
import nltk
from nltk.corpus import stopwords
import time

start = time.time()

print("Jarowinkler")
print("-----------")
print()

print("* Building index of documents...")

# List all documents in directory
path = "../inputdata/full_texts_all_cases/"

# Import stopwords           
stopwordsfile = "../script_resources/stopwords.pickle"
stopwords_full = []
with open(stopwordsfile, "rb") as f:
    tmp = pickle.load(f)
    stopwords_full.extend(list(tmp))
    stopwords_full.extend(stopwords.words('english'))
    
stopwords_full = list(set(stopwords_full))

# Only keep celex number from filename
def cleanfilename(name):
    result = ""
    result = name.replace("full_text_","")
    result = result.replace(".txt","")
    return result

def removeStopWords(text, stopwords_list):
    text = text.lower()
    for item in stopwords_list:
        text = text.replace(" " + item.lower() + " "," ")
        text = text.replace(" " + item.lower() + ","," ")
        text = text.replace(" " + item.lower() + "."," ")
        text = text.replace(" " + item.lower() + ";"," ")
    text = text.replace("+","")
    return text

# Import files and define mapping between case IDS and full texts   
files = []
index_to_celex = {}
index_to_value = {}
celex_to_value = {}
datafortraining = []
index = 0
for r, d, f in os.walk(path):
    for file in f:
        if '.txt' in file:
            files.append(os.path.join(r, file))
            celexnum = cleanfilename(os.path.basename(file))
            with open (path+file, "r", encoding="utf-8") as myfile:
                data = myfile.read().replace('\n', '')
                data = removeStopWords(data,stopwords_full)
                datafortraining.append(data)
                index_to_celex[index] = file
                index_to_value[index] = data
                celex_to_value[celexnum] = data
                index += 1

print(" Index successfully built!")
print()



Jarowinkler
-----------

* Building index of documents...
 Index successfully built!



In [2]:
# Import sample cases
import pandas as pd

# Fetch sample cases from file
def get_sample_cases(topic):
    data = pd.read_csv("../inputdata/sampled_cases.csv")
    relevant_rows = data[data['topic'] == topic]
    return relevant_rows['source'].tolist()

print("* Importing sample cases...")
# Celex numbers of reference cases
publichealth = get_sample_cases('public health')
socialpolicy = get_sample_cases('social policy')
dataprotection = get_sample_cases('data protection')
print(" Successfully imported sample cases!")
print()

* Importing sample cases...
 Successfully imported sample cases!



In [3]:
# Import citations
print("* Import citations for cases...")
citations = pd.read_csv('../inputdata/all_cases_citations.csv')
print(" Successfully imported citations!")

def find_cited_cases(celexnumber):
    global citations
    relevantsource = citations[citations['source'] == celexnumber]
    return relevantsource['target'].tolist()

def exists_citation_link_between(celexnumber1,celexnumber2):
    global citations
    relevantsource1 = citations[citations['source'] == celexnumber1]
    relevantsource2 = citations[citations['source'] == celexnumber2]
    if celexnumber2 in relevantsource1['target'].tolist() or celexnumber1 in relevantsource2['target'].tolist():
        return True
    return False

* Import citations for cases...
 Successfully imported citations!


In [4]:
unique_celex = []
unique_values = []
for k,v in celex_to_value.items():
    unique_celex.append(k)
    unique_values.append(v)


#unique_celex = list(set(unique_celex))

In [16]:
import numpy as np    
from scipy.spatial.distance import pdist, squareform
from similarity.jarowinkler import JaroWinkler
import operator
jarowinkler = JaroWinkler()

start = time.time()
transformed_docs = np.array(unique_values[0:20]).reshape(-1,1)
similarity_matrix = pd.DataFrame(
    squareform(pdist(transformed_docs,lambda x,y: jarowinkler.similarity(x[0], y[0]))),
    columns = unique_celex[0:20],
    index = unique_celex[0:20]
)
end = time.time()

print(end-start, "s")

548.935001373291 s


In [15]:
print(similarity_matrix)

             61954CJ0001  61954CJ0002  61954CJ0003  61954CJ0004  61954CJ0006  \
61954CJ0001     0.000000     0.785417     0.598043     0.609698     0.705911   
61954CJ0002     0.785417     0.000000     0.581678     0.592110     0.675077   
61954CJ0003     0.598043     0.581678     0.000000     0.808137     0.657010   
61954CJ0004     0.609698     0.592110     0.808137     0.000000     0.675019   
61954CJ0006     0.705911     0.675077     0.657010     0.675019     0.000000   
61954CJ0007     0.760119     0.809785     0.575422     0.584460     0.661373   
61954CJ0008     0.569639     0.557477     0.729361     0.705359     0.608871   
61954CO0007     0.535537     0.533981     0.605258     0.594394     0.556585   
61954CO0008     0.537957     0.533980     0.607488     0.597535     0.556175   
61955CJ0001     0.695598     0.662491     0.662850     0.682890     0.797733   

             61954CJ0007  61954CJ0008  61954CO0007  61954CO0008  61955CJ0001  
61954CJ0001     0.760119     0.569639   

In [6]:
#started 20:58
#end ?
results = []




def lookup_similar_cases(sample_cases, n, topic):
    global results
    global celex_to_value
    global jarowinkler
    count = 1
    num = len(sample_cases)
    for item in sample_cases:
        print(count,"/",num,item)
        count+=1
        current_dict = {}
        for k,v in celex_to_value.items():
            if k != item:
                current_sim_val = jarowinkler.similarity(celex_to_value[item], v)
                current_dict[k] = current_sim_val
                print((k,current_sim_val))
        sorted_dict = sorted(current_dict.items(), key=operator.itemgetter(1))
        topn = sorted_dict[-n:]
        for reference in topn:
            results.append([item,reference[0],reference[1],'jaro-winkler',exists_citation_link_between(item,reference[0]),topic])

print("* Computing similar cases...")
print("* Public health")
# 1. Public Health
lookup_similar_cases(publichealth,20,'public health')
print("* Social policy")
# 2. Social Policy
lookup_similar_cases(socialpolicy,20,'social policy')
# 3. Data Protection
print("* Data protection")
lookup_similar_cases(dataprotection,20,'data protection')

print(" Successfully computed similar cases!")
print()

* Computing similar cases...
* Public health
1 / 63 62003CJ0453
('61954CJ0001', 0.6430777342851712)
('61954CJ0002', 0.6717938837032017)
('61954CJ0003', 0.5449067250804195)
('61954CJ0004', 0.5497137992113789)
('61954CJ0006', 0.5890921975639446)
('61954CJ0007', 0.6847210403401421)
('61954CJ0008', 0.5322464658723729)
('61954CO0007', 0.5202898844878939)
('61954CO0008', 0.5201663423094777)
('61955CJ0001', 0.5848253168585886)
('61955CJ0005', 0.5553991136939262)
('61955CJ0008(01)', 0.6686321723188097)
('61955CJ0008', 0.5743178271498697)
('61955CJ0009', 0.6324760038215765)
('61955CJ0010', 0.5781535199188118)
('61956CJ0001', 0.5764151219918593)
('61956CJ0002', 0.6048380158325979)
('61956CJ0007', 0.684003900973894)
('61956CJ0008', 0.5475208488237416)
('61956CJ0009', 0.6900537299315248)
('61956CJ0010', 0.6703604580222593)
('61957CJ0001', 0.5623044157310709)
('61957CJ0002', 0.5656529197961095)
('61957CJ0008', 0.6703403394741333)
('61957CJ0009', 0.6592049008879245)
('61957CJ0010', 0.662623182010350

KeyboardInterrupt: 

In [None]:
print("* Writing results to file...")
import csv
import os.path

if os.path.exists('../outputdata/results_jaro.csv') == False:
    results.insert(0,['source_case','similar_case','similarity_score','method','citation_link','source_case_topic'])
    
with open('../outputdata/results_jaro.csv', 'a', newline='') as outfile:
    writer = csv.writer(outfile, delimiter=',')
    writer.writerows(results)
    
end = time.time()

print(" Successfully wrote results to file!")
print()
print(" Done!")
print()
print("* Time taken:",(end-start),"s")