In [1]:
#   Philip Tsvetanov
#   13.01.2021

In [2]:
import re
from lxml import etree
import os
cwd = os.getcwd()
# for me, the corpus folder in in the same location as this file
datasetPath = cwd + '\corpus\\fulltext\\'

In [4]:
# for each file in the folder: open the file and read its lines to a list
for filename in os.listdir(datasetPath):
    lines = []
    with open(datasetPath + '\\' + filename,'r') as file:
        lines = file.readlines()
                
    # then rewrite the lines that we need, and add any new ones
    with open(datasetPath + '\\' + filename,'w') as file:
        for line in lines :
            # <catchphrase "id=c2"> is invalid XML, I believe (should be id="c2"), so we will have to remove these:
            # XMLSyntaxError: error parsing attribute name, line 6, column 14
            if 'catchphrase' not in line:  
                file.write(line)
            # however, some files have french phrases like vis-à-vis (because of course they do), but this is problematic
            # seems like whoever exported the XMLs did not provide any way of our parser to read the french symbols, see:
            # https://www.ou.edu/research/electron/internet/special.shtml - in vis-à-vis we have a &agrave; that can not be read (file 06_60)
            # so we add a DTD, as explained here: https://github.com/PerseusDL/lexica/issues/31
            # Using the same line from here: https://raw.githubusercontent.com/PerseusDL/lexica/master/CTS_XML_TEI/perseus/pdllex/lat/ls/lat.ls.perseus-eng1.xml
            if "?xml version=\"1.0\"" in line:
                file.write(r'<!DOCTYPE TEI.2 PUBLIC "-//TEI P4//DTD Main DTD Driver File//EN" "http://www.tei-c.org/Vault/P4/xml/schema/dtd/tei2.dtd" [')
                file.write('\n')
                file.write(r'<!ENTITY % TEI.XML "INCLUDE">')
                file.write('\n')
                file.write(r'<!ENTITY % PersDict PUBLIC "-//Perseus P4//DTD Perseus Dictionaries//EN" "http://www.perseus.tufts.edu/DTD/1.0/PersDict.dtd">')
                file.write('\n')
                file.write(r'%PersDict;')
                file.write('\n')
                file.write(r']>')
                file.write('\n')

In [5]:
# checking if any catchphrases are left just in case
for filename in os.listdir(datasetPath):
    with open(datasetPath + '\\' + filename,'r') as file:
        lines = file.read().splitlines()
        for line in lines:
            if 'catchphrase' in line:
                print(line)

In [6]:
# testing to see if the case sentences can be accessed 
parser = etree.XMLParser(no_network=False, load_dtd=True)
tree = etree.parse(r'C:\Users\Philip\--- Data Science\corpus\fulltext\06_60.xml', parser)
root = tree.getroot()

sentences = root[2]
print(sentences[0].text)




 Contents 
 Par 
 Ms Campbell's complaints ...........................................................................


In [7]:
def getName(XMLRoot):
    return XMLRoot[0].text

In [8]:
def getCitations(XMLRoot):
    sentences = XMLRoot[2]
    citations = []
    for sentence in sentences:
        # Basically this regex searches for a number surounded by square brackets like such: [2004]
        # From the examples I looked at, all cited cases use this year format when citing another case. Massive assumption - I might be wrong!
        # It searches for numbers that are in the range 1500-2999: although this can be easily be changed
        # It seems to work pretty well, let's hope no one cites Magna Carta! (https://www.bbc.com/news/uk-england-leeds-55121434) 
        # My main concern is the lower border is too low, it might pick up some references like this one: ' ... at [20]-[24] and [38]'
        if re.search("(?=.*[\[])(?=.*[1][5-9][0-9][0-9]|[2][0-9][0-9][0-9])(?=.*[\]])",sentence.text) is not None:
            # Should remove empty lines, although some are still left for some reason.. no time to debug now
            # https://stackoverflow.com/questions/31412765/regex-to-remove-white-spaces-blank-lines-and-final-line-break-in-javascript
            citation = re.sub(r'/^(?=\n)$|^\s*|\s*$|\n\n+/gm', '', sentence.text)
            citations.append(citation)
    return citations

In [9]:
# just to have a look at the citations
#for citation in getCitations(root):
#    print(citation+"\n---------------")

In [10]:
def getCounselA(XMLRoot):
    sentences = XMLRoot[2]
    caseSize = len(sentences.findall('sentence'))
    
    counsels = []
    lines = sentences[caseSize-1].text.splitlines()
    # for each line in the last sentence, check for specific words that can identify which line holds what information
    for i in range(0,len(lines)):
        line = lines[i]
        # some people represent themselves. I think this might only be true for "counsel", but I left it in solicitor as well (jsut in case)
        if "in person" in line and "Counsel" in line:
            counsels.append("In person")
        # if the line has "counsel" and "applicant" or "appellant"
        elif "Counsel" in line:
            if "Applicant" in line or "Appellant" in line:
                #print(line)
                # remove everything before the colon clear whitespace and split the string (in case of multiple people)
                # with semi-colons, commas and the word "and"
                theRest = re.sub(r'^.*?:', '', line)
                theRest = theRest.strip()
                theRest = re.sub(r' and +?', ', ', theRest)
                #print(theRest)
                for person in re.split('; |, ', theRest):
                    counsels.append(person)
                
                # In case there are extra names on the next row, I saw it occur in a couple of files, not sure why
                # sometimes this might pick up some extra information, that is not a counsel, some more tweaks might be needed
                if lines[i+1].strip():
                    extra = re.sub(r'^.*?:', '', lines[i+1])
                    extra = re.sub(r' and +?', ', ', extra)
                    counsels.append(extra)
    return(counsels)
    
def getSolicitorA(XMLRoot):
    sentences = XMLRoot[2]
    caseSize = len(sentences.findall('sentence'))
    
    solicitors = []
    lines = sentences[caseSize-1].text.splitlines()
    for i in range(0,len(lines)):
        line = lines[i]
        if "in person" in line and "Solicitor" in line:
            solicitors.append("In person")
        elif "Solicitor" in line:
            if "Applicant" in line or "Appellant" in line:
                #print(line)
                theRest = re.sub(r'^.*?:', '', line)
                theRest = theRest.strip()
                theRest = re.sub(r' and +?', ', ', theRest)
                #print(theRest)
                for person in re.split('; |, ', theRest):
                    solicitors.append(person)
                
                if lines[i+1].strip():
                    extra = re.sub(r'^.*?:', '', lines[i+1])
                    extra = re.sub(r' and +?', ', ', extra)
                    solicitors.append(extra)
    return(solicitors)
    
def getCounselR(XMLRoot):
    sentences = XMLRoot[2]
    caseSize = len(sentences.findall('sentence'))
    
    counsels = []
    lines = sentences[caseSize-1].text.splitlines()
    for i in range(0,len(lines)):
        line = lines[i]
        if "in person" in line and "Counsel" in line:
            counsels.append("In person")
        elif "Counsel" in line:
            if "Respondent" in line:
                #print(line)
                theRest = re.sub(r'^.*?:', '', line)
                theRest = theRest.strip()
                theRest = re.sub(r' and +?', ', ', theRest)
                #print(theRest)
                for person in re.split('; |, ', theRest):
                    counsels.append(person)
                
                if lines[i+1].strip():
                    extra = re.sub(r'^.*?:', '', lines[i+1])
                    extra = re.sub(r' and +?', ', ', extra)
                    counsels.append(extra)
    return(counsels)
    
def getSolicitorR(XMLRoot):
    sentences = XMLRoot[2]
    caseSize = len(sentences.findall('sentence'))
    
    solicitors = []
    lines = sentences[caseSize-1].text.splitlines()
    for i in range(0,len(lines)):
        line = lines[i]
        if "in person" in line and "Solicitor" in line:
            solicitors.append("In person")
        elif "Solicitor" in line:
            if "Respondent" in line:
                #print(line)
                theRest = re.sub(r'^.*?:', '', line)
                theRest = theRest.strip()
                theRest = re.sub(r' and +?', ', ', theRest)
                #print(theRest)
                for person in re.split('; |, ', theRest):
                    solicitors.append(person)
                
                if lines[i+1].strip():
                    extra = re.sub(r'^.*?:', '', lines[i+1])
                    extra = re.sub(r' and +?', ', ', extra)
                    solicitors.append(extra)
    return(solicitors)

In [12]:
# testing the functions
parser = etree.XMLParser(no_network=False, load_dtd=True)
tree = etree.parse(datasetPath + '\\' + '06_60.xml', parser)
root = tree.getroot()
print(getCounselA(root))
print(getSolicitorA(root))
print(getCounselR(root))
print(getSolicitorR(root))

['Peter Hanks QC', 'Kate Eastman', 'Stephen Donaghue']
['Phillips Fox']
['Stephen Estcourt QC', 'Duncan Kerr SC', 'Greg Barns']
['Simmons Wolfgagen', 'Counsel for the State of Tasmania intervening']


In [None]:
# to check if the above is true
#print(sentences[size-1].text)

In [13]:
# this just works by picking up specific words in the last five rows of the case file that might indicate a decision
# I noticed that that's the place where the decision usually is. Not perfect, since sometimes it is hard to understand what 
# the decision is even if I read the file, let alone for an A.I. agent (e.g. when judges don't use specific words lile the ones in the if-statements)
def getDecision(XMLRoot):
    sentences = XMLRoot[2]
    caseSize = len(sentences.findall('sentence'))
    for i in range (caseSize-3, caseSize-8, -1):
        if "dismiss" in sentences[i].text:
            return "Case dismissed"
        elif "refused" in sentences[i].text:
            return "In favor of respondent"
        elif "granted" in sentences[i].text or "reconsider" in sentences[i].text or "acceede" in sentences[i].text:
            return "In favor of applicant"
        elif "allowed" in sentences[i].text and "appeal" in sentences[i].text:
            return "In facor of appellant"
    #if all else fails - unkown
    return "Unknown"

In [15]:
# testing
#print(getDecision(root))

In [14]:
# combining all the functions to create a dataFrame entry
def extractCase(parser, filename):
    tree = etree.parse(datasetPath + '\\' + filename, parser)
    root = tree.getroot()
    
    return({'File name' : filename, 'Case name' : getName(root), 'Citations' : getCitations(root), 
                    'Applicant\'s Councel': getCounselA(root), 'Applicant\'s Solicitors': getSolicitorA(root), 
                    'Respondent\'s Councel': getCounselR(root), 'Respondent\'s Solicitors': getSolicitorR(root), 
                    'Decision': getDecision(root)}) 

In [19]:
import datetime
import pandas as pd
import numpy as np

# creating the columns for a dataFrame entry
columns = ['File name','Case name', 'Citations', 'Applicant\'s Councel', 'Applicant\'s Solicitors', 'Respondent\'s Councel', 'Respondent\'s Solicitors', 'Decision']

# create a new dataframe and parser
df = pd.DataFrame(columns=columns)
parser = etree.XMLParser(no_network=False, load_dtd=True)

# for each file call the above function and save the output to the database
# THINGS TO NOTE
# 1) I'm only doing it for 100 entries, since this part is bottlenecked somehow, I believe it's making a lot of calls to the online DTD
# despite that the code above being pretty efficient and overall this is scalable, but a smarter way than the DTD should be implemented
# 2) There are *STILL* some XML entities that are not being recognised, even with the DTD, that's why there is a try-catch
# In this case, the tm symbol (tm); on top of getting a "This is not proper UTF-8" errors
for i in range(0, 100): #number of files to go though
    try:
        filename = os.listdir(datasetPath)[i]
        df = df.append(extractCase(parser, filename), ignore_index=True)
    except etree.XMLSyntaxError:
        continue
df.head(10)

Unnamed: 0,File name,Case name,Citations,Applicant's Councel,Applicant's Solicitors,Respondent's Councel,Respondent's Solicitors,Decision
0,06_1.xml,Sharman Networks Ltd v Universal Music Austral...,"[In the meantime, Ms Hemming had filed two dis...",[J M Ireland QC],[Clayton Utz],"[R Cobden SC, J M Hennessy]",[Gilbert + Tobin],Case dismissed
1,06_100.xml,Lawrance v Human Rights and Equal Opportunity ...,[],[],[],[K Eastman],"[Human Rights, Equal Opportunity Commission, A...",Unknown
2,06_1001.xml,Citrus Queensland Pty Ltd v Sunstate Orchards ...,"[' \n (ER 433, cf more recently von Doussa J i...",[Mr RA Perry SC],[Lynch & Co],[Mr PP McQuade],[Mc Cullough Robertson],Unknown
3,06_1004.xml,Martech International Pty Ltd v Energy World C...,[In Foakes v Beer [1884] 9 App Cas 605 the Hou...,[Mr DM Stone],[Williams & Hughes],[Mr P McGowan],[Christensen Vaughan],Unknown
4,06_1005.xml,Commissioner of Taxation v Milne (with Corrige...,[Thus for instance in NSW Associated Blue-Meta...,"[S Rushton SC, J D Smith]",[Australian Government Solicitor],"[B J Sullivan, C G Catt]",[NOT Lawyers],Case dismissed
5,06_1006.xml,SZCCX v Minister for Immigration & Multicultur...,[],[],[],[],[Sparke Helmore],Case dismissed
6,06_1015.xml,Douglas v Queensland [2006] FCA 1015 (8 August...,"[As was pointed out by Dawson, Gaudron and McH...",[D O'Gorman],[Robert Bax & Associates],"[JE Murdoch SC, CJ Murdoch]",[Crown Law],Unknown
7,06_1017.xml,Regional Publishers Pty Limited v Elkington [2...,[Having placed itself in the position of such ...,[],[],[],[],Unknown
8,06_1018.xml,SZFBU v Minister for Immigration and Multicult...,[9 The respondent submitted that even if a 'sp...,[],[],[Geoffrey Kennett],[Phillips Fox],In favor of applicant
9,06_102.xml,"Gidley, in the matter of Aliance Motor Body Pt...","[4 The nature, effect, and limit upon directio...",[IM Jackman SC],[Minter Ellison],[],[],Unknown


In [25]:
# if you want to have a look if the citations reference any actual cases:
for row in df['Citations']:
    for item in row:
        print(item)
        print('\n-------------------------')

In the meantime, Ms Hemming had filed two disclosure affidavits pursuant to Wilcox J's orders of 22 March 2005 whilst Sharman License and Sharman Networks had unsuccessfully sought several stays on various grounds of that same order insofar as it applied to them (see Universal Music Australia Pty Ltd v Sharman License Holdings Ltd [2005] FCA 406 per Hely J, delivered 8 April 2005; Universal Music Australia Pty Ltd v Sharman License Holdings Ltd [2005] FCA 441 per Wilcox J, delivered 15 April 2005 and Sharman License Holdings Ltd v Universal Music Australia Pty Ltd [2005] FCA 505 per Moore J, delivered 28 April 2005).

-------------------------
Sharman License and Sharman Networks had also unsuccessfully sought an enlargement of time in which to file an application for leave to appeal from Wilcox J's orders of 22 March 2005 (see Sharman License Holdings Ltd v Universal Music Australia Pty Ltd [2005] FCA 802 per Lindgren J, delivered on 17 June 2005).

-------------------------
3 On 24 M