#### Master of Data Science (Digital Humanities)
#### Final Project Notebooks

### DATA40345 Data Science Research Project

#### NLP Esquire: A Data-Driven Analysis and Categorisation of the Judgments of the United Kingdom Supreme Court

## Notebook 1: Data Collection and Data Preparation

### DATA COLLECTION
Scraping UK Supreme Court judgements from The Supreme Court website
(https://www.supremecourt.uk/decided-cases/index.html)

In [1]:
#importing required libraries

import requests
from bs4 import BeautifulSoup
import time #to add sleep

In [2]:
#Judgements from 2009-2023 available online
#for loop to get links to the pages with the judgments (for each year)

urlWebsite = "https://www.supremecourt.uk/decided-cases/" 
urls = [] #empty list to store full url to each year

url_index = "https://www.supremecourt.uk/decided-cases/index.html" #includes the latest judgments

for i in range(2009,2023):
    strUrl = urlWebsite + str(i) + ".html"
    print(strUrl)
    urls.append(strUrl)

https://www.supremecourt.uk/decided-cases/2009.html
https://www.supremecourt.uk/decided-cases/2010.html
https://www.supremecourt.uk/decided-cases/2011.html
https://www.supremecourt.uk/decided-cases/2012.html
https://www.supremecourt.uk/decided-cases/2013.html
https://www.supremecourt.uk/decided-cases/2014.html
https://www.supremecourt.uk/decided-cases/2015.html
https://www.supremecourt.uk/decided-cases/2016.html
https://www.supremecourt.uk/decided-cases/2017.html
https://www.supremecourt.uk/decided-cases/2018.html
https://www.supremecourt.uk/decided-cases/2019.html
https://www.supremecourt.uk/decided-cases/2020.html
https://www.supremecourt.uk/decided-cases/2021.html
https://www.supremecourt.uk/decided-cases/2022.html


In [72]:
print(urls)

['https://www.supremecourt.uk/decided-cases/2009.html', 'https://www.supremecourt.uk/decided-cases/2010.html', 'https://www.supremecourt.uk/decided-cases/2011.html', 'https://www.supremecourt.uk/decided-cases/2012.html', 'https://www.supremecourt.uk/decided-cases/2013.html', 'https://www.supremecourt.uk/decided-cases/2014.html', 'https://www.supremecourt.uk/decided-cases/2015.html', 'https://www.supremecourt.uk/decided-cases/2016.html', 'https://www.supremecourt.uk/decided-cases/2017.html', 'https://www.supremecourt.uk/decided-cases/2018.html', 'https://www.supremecourt.uk/decided-cases/2019.html', 'https://www.supremecourt.uk/decided-cases/2020.html', 'https://www.supremecourt.uk/decided-cases/2021.html', 'https://www.supremecourt.uk/decided-cases/2022.html']


In [None]:
#to obtain the judgments in each case:
#obtaining the relative url to each case in each year

case_links = [] #empty list to store the (relative) links to each case in each year

for i in urls:
    page = requests.get(i)
    soup = BeautifulSoup(page.content,"html.parser")

    test_links = soup.find_all(class_="fourthColumn")
    #print(test_links)

    for link in test_links:
        case = link.find("a", class_="more") #get relative link
        if (case != None):
            #print(case["href"])
        case_links.append(case["href"])
        time.sleep(2)

#all webpages with specific years (2022-2009) have identical html.

In [6]:
len(case_links)

1031

While 2009-2022 judgments are in webpages that have identical hmtl coding, the most recent judgments i.e.2023 judgments are in a webpage with different html. Specifically class="fifthColumn" and not "fourthColumn". Therefore, 2023 needs to be scraped separately. 

In [None]:
#the latest judgments i.e. 2023 judgments are in a webpage with different html.
#specifically class="fifthColumn" and not "fourthColumn"
#therefore, 2023 needs to be scraped separately.
#get links for index page
url_index = "https://www.supremecourt.uk/decided-cases/index.html" #includes the latest judgments

#to obtain the judgments in each case in index year:
#obtaining the relative url to each case:

page = requests.get(url_index)
soup = BeautifulSoup(page.content,"html.parser")

test_links = soup.find_all(class_="fifthColumn")

for link in test_links:
    case = link.find("a", class_="more") #get relative link
    if (case != None):
        #print(case["href"])
    case_links.append(case["href"])
    time.sleep(2)

In [8]:
len(case_links)

1061

In [None]:
#preparing the global url for each case

globalurls = [] #empty list to store the global urls
globurl = "https://www.supremecourt.uk"

for i in case_links:
    caseLink = globurl + i #appending the relative links to the global url
    #print(caseLink)
    globalurls.append(caseLink) #appending all global urls to one list

In [11]:
len(globalurls)

1061

Obtaining the Judgment Links

In [None]:
judgment_links = []
for i in globalurls:
    page = requests.get(i)
    soup = BeautifulSoup(page.content,"html.parser")

    text = soup.find("a", title="Judgment (PDF)")
    if (text != None):
        #print(text["href"]) 
        judgment_links.append(text["href"])
    else:
        print("The Judgment in", i, "is not available")
    time.sleep(3) #sleep is used to provide sufficient time between requests to not overload the server

In [6]:
len(judgment_links)

1024

In [24]:
#standardising links

a = "https://www.supremecourt.uk/cases/"
b = "https://www.supremecourt.uk/"

#preparing the global urls for the judgments
judgmentUrls = [] #list to store the global urls to all the judgement PDFs

for i in judgment_links:
    if "/cases/" in i:
        judgmentUrls.append(b + i)
    else:
        judgmentUrls.append(a + i)

In [25]:
len(judgmentUrls)

1054

In [3]:
len(judgmentUrls)

1054

In [8]:
#issues noted during scraping:
    #some of the urls have "/cases/docs/" while others have only "docs/"
    #one link is scraped with spaces
    #therefore, the relative url is standardised below

standardised_links = []

for i in judgmentUrls:
    #standardised_link = i.replace("/cases/", "")
    standardised_link2 = i.replace("https://www.supremecourt.uk/cases/docs/uksc-2018-0091-judgment minus restrictions.pdf","https://www.supremecourt.uk/cases/docs/uksc-2018-0091-judgment%20minus%20restrictions.pdf" )
    standardised_links.append(standardised_link2)

In [9]:
len(standardised_links)

1054

In [None]:
#preparing the global urls for the judgments

global_judgementUrl = "https://www.supremecourt.uk/cases/"

judgmentUrls = [] #list to store the global urls to all the judgement PDFs

for i in standardised_links:
    judgmentLink = global_judgementUrl + i #appending the relative links to the global url
    judgmentUrls.append(judgmentLink) #appending all global urls to one list

In [None]:
#for loop to save the pdf files 

import urllib.request
import os

uksc_folder = "./NEW UKSC PDFMINER"
if not os.path.exists(uksc_folder): #check if "Dataset_UKSC folder exists"
    os.makedirs(uksc_folder) #if it does not, create a new folder called titled Dataset_UKSC

for i in standardised_links:
    filename = i.split("/")[-1] #[-1 indicates that the last portion divided by "/" is to be used to name]
    filepath = os.path.join(uksc_folder, filename)
    urllib.request.urlretrieve(i, filepath)
    time.sleep(3)

#https://www.tutorialspoint.com/downloading-files-from-web-using-python
#used to identify how to extract filename from link

#https://docs.python.org/3/library/urllib.request.html

#### Scraping metadata of UK Supreme Court judgements

In [None]:
#extract data for quantitative analysis
#the html coding for index year (2023), 2022, and 2021 are identical
#it also has the same information.
#subsequent years do not have as much information.
#therefore, the case links for each case 2023-2021 are first extracted

In [5]:
#extracting links for 2022 and 2021 first as 2023(index) has a different html structure

#for loop to get links to the pages with the judgments (for each year)

urlWebsite = "https://www.supremecourt.uk/decided-cases/" 
urls_22_21 = [] #empty list to store full url to each year

#url_index = "https://www.supremecourt.uk/decided-cases/index.html" #includes the latest judgments

for i in range(2021,2023):
    strUrl = urlWebsite + str(i) + ".html"
    print(strUrl)
    urls_22_21.append(strUrl)

https://www.supremecourt.uk/decided-cases/2021.html
https://www.supremecourt.uk/decided-cases/2022.html


In [6]:
len(urls_22_21)

2

In [None]:
#to obtain the judgments in each case in 2022 and 2021
#obtaining the relative url to each case in each year

case_links_22_21 = [] #empty list to store the (relative) links to each case in each year

for i in urls_22_21:
    page = requests.get(i)
    soup = BeautifulSoup(page.content,"html.parser")

    test_links = soup.find_all(class_="fourthColumn")
    #print(test_links)

    for link in test_links:
        case = link.find("a", class_="more") #get relative link
        if (case != None):
            #print(case["href"])
        case_links_22_21.append(case["href"])
        time.sleep(2)

#all webpages with specific years (2022-2009) have identical html.

In [8]:
len(case_links_22_21)

120

In [10]:
#the latest judgments i.e. 2023 judgments are in a webpage with different html.
#specifically class="fifthColumn" and not "fourthColumn"
#therefore, 2023 needs to be scraped separately.
#get links for index page
url_index = "https://www.supremecourt.uk/decided-cases/index.html" #includes the latest judgments

#to obtain the judgments in each case in index year:
#obtaining the relative url to each case:

page = requests.get(url_index)
soup = BeautifulSoup(page.content,"html.parser")

test_links = soup.find_all(class_="fifthColumn")

for link in test_links:
    case = link.find("a", class_="more") #get relative link
    if (case != None):
        print(case["href"])
    case_links_22_21.append(case["href"])
    time.sleep(2)

/cases/uksc-2021-0078.html
/cases/uksc-2021-0087.html
/cases/uksc-2021-0149.html
/cases/uksc-2022-0075.html
/cases/uksc-2020-0208.html
/cases/uksc-2021-0195.html
/cases/uksc-2021-0159.html
/cases/uksc-2021-0125.html
/cases/uksc-2021-0038.html
/cases/uksc-2021-0144.html
/cases/uksc-2021-0138.html
/cases/uksc-2021-0056.html
/cases/uksc-2021-0050.html
/cases/uksc-2021-0089.html
/cases/uksc-2022-0056.html
/cases/uksc-2022-0052.html
/cases/uksc-2021-0216.html
/cases/uksc-2018-0192.html
/cases/uksc-2018-0191.html
/cases/uksc-2021-0019.html
/cases/uksc-2021-0188.html
/cases/uksc-2021-0031.html
/cases/uksc-2021-0047.html
/cases/uksc-2021-0059.html
/cases/uksc-2022-0089.html
/cases/uksc-2022-0093.html
/cases/uksc-2020-0056.html
/cases/uksc-2020-0002.html
/cases/uksc-2021-0027.html
/cases/uksc-2021-0028.html


In [11]:
len(case_links_22_21)

150

In [12]:
#preparing the global url for each case

globalurls1 = [] #empty list to store the global urls
globurl = "https://www.supremecourt.uk"

for i in case_links_22_21:
    caseLink = globurl + i #appending the relative links to the global url
    globalurls1.append(caseLink) #appending all global urls to one list

In [13]:
len(globalurls1)

150

In [None]:
#the years 2023-2021 have the same html code + information.
#globalurls1 contains the case links to all cases in 2023-2021

case_ref = [] 
first_part = "https://www.supremecourt.uk/cases/"
last_part = ".html"

case_name = [] #empty list to store the names of the parties
case_ID = [] #empty list to store the Case ID (i.e. the case in which the judgment is delivered)
issue = []
judgment_appealed = []
justices = []
hearing_start = []
hearing_finish = []
judgment_date = [] #empty list to store the date of delivery of judgment
citation = []
press_summary = []

for i in globalurls1:
    page = requests.get(i)
    soup = BeautifulSoup(page.content,"html.parser")

    mod_ref = i.replace(first_part, "").replace(last_part, "")
    print(mod_ref)
    case_ref.append(mod_ref)
    
    parties = soup.find("h2")
    if(parties !=None):
        for name in parties:
            name1 = name.text.strip()
            #print(name1)
            case_name.append(name1)
            time.sleep(2)
    else:
        case_name.append("N/A")
    
    ids = soup.find("h3", class_="sc-access")
    if (ids !=None):
        for x in ids:
            ids1 = x.text.strip()
            #print(ids1)
            case_ID.append(ids1)
            time.sleep(2)
    else:
        case_ID.append("N/A")
    
    issues = soup.find("h4", text="Issue")
    if(issues !=None):
        for x in issues:
            para = issues.find_next_sibling("p")
            text = para.text
            #print(text)    
            issue.append(text)
            time.sleep(2)
    else:
        issue.append("N/A")
    
    appeal = soup.find("h4", text="Judgment appealed")
    if(appeal !=None):
        for x in appeal:
            para = appeal.find_next_sibling("p")
            text = para.text
            #print(text)    
            judgment_appealed.append(text)
            time.sleep(2)
    else:
        judgment_appealed.append("N/A")
           
    judges = soup.find("h4", text="Justices")
    if(judges !=None):
        for x in judges:
            para = judges.find_next_sibling("p")
            text = para.text
            #print(text)    
            justices.append(text)
            time.sleep(2)
    else:
        justices.append("N/A")
    
    start = soup.find("h4", text="Hearing start date")
    if(start !=None):
        for x in start:
            para = start.find_next_sibling("p")
            text = para.text
            #print(text)    
            hearing_start.append(text)
            time.sleep(2)
    else:
        hearing_start.append("N/A")
    
    finish = soup.find("h4", text="Hearing finish date")
    if(finish !=None):
        for x in finish:
            para = finish.find_next_sibling("p")
            text = para.text
            #print(text)    
            hearing_finish.append(text)
            time.sleep(2)
    else:
        hearing_finish.append("N/A")
    
    judgment = soup.find("h4", text="Judgment date")
    if(judgment !=None):
        for x in judgment:
            para = judgment.find_next_sibling("p")
            text = para.text
            #print(text)    
            judgment_date.append(text)
            time.sleep(2)
    else:
        judgment_date.append("N/A")

    cite = soup.find("h4", text="Neutral citation")
    if(cite !=None): 
        for x in cite:
            para = cite.find_next_sibling("p")
            text = para.text
            #print(text)    
            citation.append(text)
            time.sleep(2)
    else:
        citation.append("N/A")

In [197]:
#ensure that the same no.of data for each list has been scraped.
print("No.of Case References: ", len(case_ref))
print("No.of Case Names: ", len(case_name))
print("No.of Case IDs: ", len(case_ID))
print("No.of Case Issues: ", len(issue))
print("No.of Judgments Appealed: ", len(judgment_appealed))
print("No.of Case Judges: ", len(justices))
print("No.of Hearing Start Dates: ", len(hearing_start))
print("No.of Hearing End Dates: ", len(hearing_finish))
print("No.of Judgement Delivery Dates: ", len(judgment_date))
print("No.of Case Neutral Citations: ", len(citation))

No.of Case References:  148
No.of Case Names:  148
No.of Case IDs:  148
No.of Case Issues:  148
No.of Judgments Appealed:  148
No.of Case Judges:  148
No.of Hearing Start Dates:  148
No.of Hearing End Dates:  148
No.of Judgement Delivery Dates:  148
No.of Case Neutral Citations:  148


In [255]:
#ensure that the same no.of data for each list has been scraped.
print("No.of Case References: ", len(case_ref))
print("No.of Case Names: ", len(case_name))
print("No.of Case IDs: ", len(case_ID))
print("No.of Case Issues: ", len(issue))
print("No.of Judgments Appealed: ", len(judgment_appealed))
print("No.of Case Judges: ", len(justices))
print("No.of Hearing Start Dates: ", len(hearing_start))
print("No.of Hearing End Dates: ", len(hearing_finish))
print("No.of Judgement Delivery Dates: ", len(judgment_date))
print("No.of Case Neutral Citations: ", len(citation))

No.of Case References:  148
No.of Case Names:  148
No.of Case IDs:  148
No.of Case Issues:  148
No.of Judgments Appealed:  148
No.of Case Judges:  148
No.of Hearing Start Dates:  148
No.of Hearing End Dates:  148
No.of Judgement Delivery Dates:  148
No.of Case Neutral Citations:  148


In [256]:
#store data of 2021-2023 in a df (df1)
import pandas as pd
df1 = pd.DataFrame(list(zip(case_ref, case_name, case_ID, issue, judgment_appealed, justices, hearing_start, hearing_finish, judgment_date, citation)),
                         columns = ["Case Ref", "Parties", "ID", "Issue", "Appealed Judgment", "Names of Judges", "Hearing Start Date", "Hearing End Data", "Date of Judgment", "Citation"])

In [257]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148 entries, 0 to 147
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Case Ref            148 non-null    object
 1   Parties             148 non-null    object
 2   ID                  148 non-null    object
 3   Issue               148 non-null    object
 4   Appealed Judgment   148 non-null    object
 5   Names of Judges     148 non-null    object
 6   Hearing Start Date  148 non-null    object
 7   Hearing End Data    148 non-null    object
 8   Date of Judgment    148 non-null    object
 9   Citation            148 non-null    object
dtypes: object(10)
memory usage: 11.7+ KB


In [258]:
#save dataframe to csv for subsequent use
df1.to_csv("UKSC 2021-23 Additional Data.csv", encoding="utf-8", index=False) #removes the index column

In addition to scraping data from the individual judgment pages, data is also scraped from the press summaries as these provide the decision clearly.

In [14]:
first_part = "https://www.supremecourt.uk/cases/"
last_part = ".html"

In [None]:
#getting the links to the press summary webpages
case_ref4 = []
press_summary = []

for i in globalurls1:
    page = requests.get(i)
    soup = BeautifulSoup(page.content,"html.parser")

    mod_ref = i.replace(first_part, "").replace(last_part, "")
    print(mod_ref)
    case_ref4.append(mod_ref)

#the press summary is coded in html in 4 different variations.
#the following if-else is created to account for all 4 variations
    press = soup.find("a", title="Press Summary")
    if (press != None):
        #print(press["href"])
        press_link = globurl + press["href"]
        #print(press_link)
        press_summary.append(press_link)
        time.sleep(1)
    else:
        press1 = soup.find("a", title="Press summary (HTML Version)")
        if (press1 != None):
            press_link = globurl + press1["href"]
            #print(press_link)
            press_summary.append(press_link)
            time.sleep(1)
        else: 
            press2 = soup.find("a", title="Press summary HTML version")
            if (press2 !=None):
                press_link = globurl + press2["href"]
                #print(press_link)
                press_summary.append(press_link)
                time.sleep(1)
            else:
                press3 = soup.find("a", title="Press Summary HTML version")
                if (press3 !=None):
                    press_link = globurl + press3["href"]
                    #print(press_link)
                    press_summary.append(press_link)
                    time.sleep(1)
                else:
                    print("Press Summary for Case", mod_ref, "is not available")
                    time.sleep(1)

In [21]:
len(press_summary)

149

In [None]:
#extract only the decision from press summary html 2021-2023
case_ref_decision = [] 
first_part1 = "https://www.supremecourt.uk/press-summary/"
decision_only = []

for i in press_summary:
    page = requests.get(i)
    soup = BeautifulSoup(page.content,"html.parser")

    mod_caseref = i.replace(first_part1, "").replace(last_part, "")
    #print(mod_caseref)
    case_ref_decision.append(mod_caseref)
    time.sleep(1)
    
    decision = soup.find(class_="sc-access caption", text="Judgment")
    if (decision !=None):
        for x in decision:
            para = decision.find_next_sibling("p")
            text = para.text
            #print("THIS IS THE DECISION IN", mod_caseref, ":", text)
            decision_only.append(text)
            time.sleep(2)
    else:
        decision_only.append("N/A")

In [23]:
len(case_ref_decision)

149

In [24]:
len(decision_only)

149

In [25]:
#store decision in press summaries 2020-2023 in df6
import pandas as pd
df6 = pd.DataFrame(list(zip(case_ref_decision, decision_only)),
                         columns = ["Case Ref", "Decision"])

In [26]:
df6.head(5)

Unnamed: 0,Case Ref,Decision
0,uksc-2021-0160,"Jointly, Lord Briggs, Lord Kitchin, Lord Burro..."
1,uksc-2020-0195,The Supreme Court unanimously allows the appea...
2,uksc-2020-0081,\r\nThe Supreme Court unanimously dismisses th...
3,uksc-2020-0029,The Supreme Court allows the appeals by the Ch...
4,uksc-2020-0029,The Supreme Court allows the appeals by the Ch...


In [29]:
#save dataframe to csv for subsequent use
df6.to_csv("UKSC 2021-23 Decision Data.csv", encoding="utf-8", index=False) #removes the index column

In [250]:
#store data of press summaries 2020-2023 in df5
import pandas as pd
df5 = pd.DataFrame(list(zip(case_ref2, press_judges, background_info, reasons, decisions)),
                         columns = ["Case Ref", "Names of Judges", "Background", "Reasons", "Decision"])

In [251]:
df5

Unnamed: 0,Case Ref,Names of Judges,Background,Reasons,Decision
0,uksc-2021-0160,"Lord Briggs, Lady Arden, Lord Kitchin, Lord Bu...",Mr Crosland disclosed the outcome of a judgmen...,Jurisdiction,"Jointly, Lord Briggs, Lord Kitchin, Lord Burro..."
1,uksc-2020-0195,"Lord Reed (President), Lord Hodge (Deputy Pres...","The following issues arise. First, whether the...",The majority holds that section 13 of the Admi...,The Supreme Court unanimously allows the appea...
2,uksc-2020-0081,"Lord Reed (President), Lord Lloyd-Jones, Lady ...","Jointly, Lord Briggs, Lord Kitchin, Lord Burro...",Lady Arden considers that the Supreme Court do...,\r\nThe Supreme Court unanimously dismisses th...
3,uksc-2020-0029,"Lord Hodge (Deputy President), Lord Lloyd-Jone...",Jurisdiction,Merits,The Supreme Court allows the appeals by the Ch...
4,uksc-2020-0029,"Lord Hodge (Deputy President), Lord Lloyd-Jone...",The majority holds that section 13 of the Admi...,The First Instance Panel made no material erro...,The Supreme Court allows the appeals by the Ch...
...,...,...,...,...,...
101,uksc-2020-0206,"Lord Hodge (Deputy President), Lord Briggs, Lo...","The other appeal (the ""Hooded Men case"") relat...",Lord Stephens considers each of the grounds of...,The Supreme Court unanimously allows FAAN’s ap...
102,uksc-2020-0138,"Lord Reed (President), Lord Lloyd-Jones, Lady ...","In 2014, the Irish national broadcaster, RTÈ, ...",Ground 1 [86 – 91],The Supreme Court unanimously allows the appea...
103,uksc-2020-0113,"Lord Hodge (Deputy President), Lord Sales, Lor...",The Supreme Court allows the appeals by the Ch...,The Appellant argued that it was wrong to reca...,The Supreme Court unanimously allows the appea...
104,uksc-2020-0113,"Lord Hodge (Deputy President), Lord Sales, Lor...",The first issue in both appeals concerns the t...,The Court rejected this interpretation of the ...,The Supreme Court unanimously allows the appea...


In [254]:
#save dataframe to csv for subsequent use
df5.to_csv("UKSC 2021-23 Additional Press Data.csv", encoding="utf-8", index=False) #removes the index column

Next, it is necessary to adjust the code to continue collecting data. For years 2017-2020 (4 years) much of the metadata is not available. Only the following are available:
1. Judgment date
2. Neutral citation number
3. Case ID
4. Justices
5. Judgment PDF
6. Press Summary (PDF) - not available in HTML unlike in 2021-2023

However, the hearing dates can be extracted as the dates are provided along with the video footage.

In [1]:
#adjusting the code to extract links for 2017-2020
#for loop to get links to the pages with the judgments (for each year)

urlWebsite = "https://www.supremecourt.uk/decided-cases/" 
urls_20_17 = [] #empty list to store full url to each year

#url_index = "https://www.supremecourt.uk/decided-cases/index.html" #includes the latest judgments

for i in range(2017,2021):
    strUrl = urlWebsite + str(i) + ".html"
    print(strUrl)
    urls_20_17.append(strUrl)

https://www.supremecourt.uk/decided-cases/2017.html
https://www.supremecourt.uk/decided-cases/2018.html
https://www.supremecourt.uk/decided-cases/2019.html
https://www.supremecourt.uk/decided-cases/2020.html


In [2]:
len(urls_20_17)

4

In [None]:
#to obtain the judgments in each case in 2017-2020
#obtaining the relative url to each case in each year

case_links_20_17 = [] #empty list to store the (relative) links to each case in each year

for i in urls_20_17:
    page = requests.get(i)
    soup = BeautifulSoup(page.content,"html.parser")

    test_links = soup.find_all(class_="fourthColumn")
    #print(test_links)

    for link in test_links:
        case = link.find("a", class_="more") #get relative link
        if (case != None):
            #print(case["href"])
        case_links_20_17.append(case["href"])
        time.sleep(2)

#all webpages with specific years (2022-2009) have identical html.

In [5]:
len(case_links_20_17)

317

In [7]:
#preparing the global url for each case

globalurls2 = [] #empty list to store the global urls
globurl = "https://www.supremecourt.uk"

for i in case_links_20_17:
    caseLink = globurl + i #appending the relative links to the global url
    globalurls2.append(caseLink) #appending all global urls to one list

In [8]:
len(globalurls2)

317

In [None]:
globalurls2

Extracting data available for the years 2017-2020.

In [None]:
#the years 2017-2020 have the same html code + information.
#globalurls2 contains the case links to all cases in 2017-2020

case_ref1 = [] 
first_part = "https://www.supremecourt.uk/cases/"
last_part = ".html"

case_name1 = [] #empty list to store the names of the parties
case_ID1 = [] #empty list to store the Case ID (i.e. the case in which the judgment is delivered)
#issue = []
#judgment_appealed = []
justices1 = []
hearing_start1 = []
hearing_finish1 = []
judgment_date1 = [] #empty list to store the date of delivery of judgment
citation1 = []
press_summary1 = []

for i in globalurls2:
    page = requests.get(i)
    soup = BeautifulSoup(page.content,"html.parser")

    mod_ref = i.replace(first_part, "").replace(last_part, "")
    print(mod_ref)
    case_ref1.append(mod_ref)
    
    ids = soup.find("h5", text="Case ID")
    if (ids !=None):
        for x in ids:
            para = ids.find_next_sibling("p")
            #ids1 = x.text.strip()
            text = para.text
            #print("Case ID of", mod_ref, "is", text)
            case_ID1.append(text)
            time.sleep(1)
    else: 
        ids1 = soup.find("h3", class_="sc-access", text="Case ID")
        if (ids1 !=None):
            for x in ids1:
                para = ids1.find_next_sibling("p")
                text = para.text
                #print("Case ID of", mod_ref, "is", text)
                case_ID1.append(text)
                time.sleep(1)
            else:
                case_ID1.append("N/A")
    time.sleep(2)
        
    judges1 = soup.find("h3", class_="sc-access", text="Justices")
    if (judges1 !=None):
        for x in judges1:
            para = judges1.find_next_sibling("p")
            text = para.text
            #print("Justices names in", mod_ref, "are", text)
            justices1.append(text)
            time.sleep(1)
    else:
        judges = soup.find("h5", text="Justices")
        if(judges !=None):
            for x in judges:
                para = judges.find_next_sibling("p")
                text = para.text
                #print("Justices names in", mod_ref, "are", text)    
                justices1.append(text)
                time.sleep(1)
        else:             
            justices1.append("N/A")
    time.sleep(2)

    start = soup.find("table", class_="listing video-hearing")
    if(start !=None):
        tr = start.find("tr", class_="odd")
        if(tr !=None):
            td = tr.find("td")
            if (td !=None):
                text = td.text
                #print("Hearing start date in", mod_ref, "is", text)    
                hearing_start1.append(text)
                time.sleep(1)
            else:
                hearing_start1.append("N/A")
        else:
            hearing_start1.append("N/A")
    else:
        hearing_start1.append("N/A")
    time.sleep(2)
    
    finish = soup.find("table", class_="listing video-hearing")
    if(finish !=None):
        tr = finish.find_all("tr", class_="odd")
        if len(tr) >=2: #if there is more than 1 <tr> tag (which would be the case if the hearing went on for more than one day)
            tr2 = tr[1] #extract the second <tr> tag
            td = tr2.find("td")
            if (td !=None):
                for m in td:
                    text = m.text
                    #print("Hearing finish date in", mod_ref, "is", text)    
                    hearing_finish1.append(text)
                    time.sleep(1)
            else:
                hearing_finish1.append("N/A")
        else:
            hearing_finish1.append("N/A")
    else:
        hearing_finish1.append("N/A")
    time.sleep(2)
    
    judgment1 = soup.find("h3", class_="sc-access", text="Judgment date")
    if (judgment1 != None):
        for x in judgment1:
            para = judgment1.find_next_sibling("p")
            text = para.text
            #print("Judgment date in", mod_ref, "is", text)
            judgment_date1.append(text)
            time.sleep(1)
    else:            
        judgment = soup.find("h5", text="Judgment date")
        if(judgment !=None):
            for x in judgment:
                para = judgment.find_next_sibling("p")
                text = para.text
                #print("Judgment date in", mod_ref, "is", text)    
                judgment_date1.append(text)
                time.sleep(1)
        else:
            judgment_date1.append("N/A")
    time.sleep(2)
        
    cite = soup.find("h5", text="Neutral citation number") #text is different
    if(cite !=None): 
        for x in cite:
            para = cite.find_next_sibling("p")
            text = para.text
            #print("Neutral citation in", mod_ref, "is", text)    
            citation1.append(text)
            time.sleep(1)
    else:
        cite1 = soup.find("h3", class_="sc-access", text="Neutral citation number")
        if (cite1 != None):
            para = cite1.find_next_sibling("p")
            text = para.text
            #print("Neutral citation in", mod_ref, "is", text)
            citation1.append(text)
            time.sleep(1)
        else:
            citation1.append("N/A")
    time.sleep(2)
    
    press = soup.find("a", title="Press Summary (PDF)")
    if (press != None):
        #print(press["href"])
        press_link = globurl + press["href"]
        #print(press_link)
        press_summary1.append(press_link)
    else:
        press_summary1.append("N/A")
    time.sleep(2)

In [10]:
#ensure that the same no.of data for each list has been scraped.
print("No.of Case References: ", len(case_ref1))
print("No.of Case IDs: ", len(case_ID1))
print("No.of Case Judges: ", len(justices1))
print("No.of Hearing Start Dates: ", len(hearing_start1))
print("No.of Hearing End Dates: ", len(hearing_finish1))
print("No.of Judgement Delivery Dates: ", len(judgment_date1))
print("No.of Case Neutral Citations: ", len(citation1))
print("No.of Press Summary Links: ", len(press_summary1))

No.of Case References:  317
No.of Case IDs:  374
No.of Case Judges:  317
No.of Hearing Start Dates:  317
No.of Hearing End Dates:  317
No.of Judgement Delivery Dates:  317
No.of Case Neutral Citations:  317
No.of Press Summary Links:  317


Extracting party names 2017-2020

In [5]:
print(urls_20_17)

['https://www.supremecourt.uk/decided-cases/2017.html', 'https://www.supremecourt.uk/decided-cases/2018.html', 'https://www.supremecourt.uk/decided-cases/2019.html', 'https://www.supremecourt.uk/decided-cases/2020.html']


In [None]:
#extract party names in each case in each year 2017-2020
party_names1 = []
case_ref4 = []
first_part1 = "/cases/"
last_part1 = ".html"

for i in urls_20_17:
    page = requests.get(i)
    soup = BeautifulSoup(page.content,"html.parser")
    
    names = soup.find_all(class_="fourthColumn")
    for name in names:
        name1 = name.text.strip()
        #print(name1)
        party_names1.append(name1)
        time.sleep(2)
    time.sleep(1)

In [None]:
#extract case reference
for i in urls_20_17:
    page = requests.get(i)
    soup = BeautifulSoup(page.content,"html.parser")
    
    test_links = soup.find_all(class_="fourthColumn")
    
    for link in test_links:
        case = link.find("a", class_="more") #get relative link
        if (case != None):
            case2 = case["href"]
            #print(case["href"])
            mod_ref = case2.replace(first_part1, "").replace(last_part1, "")
            #print(mod_ref)
            case_ref4.append(mod_ref)
        time.sleep(2)

In [12]:
#ensure that the same no.of data for each list has been scraped.
#print("No.of Case References: ", len(case_ref4))
print("No.of Party Names: ", len(party_names1))

No.of Party Names:  317


In [20]:
#store data of party names + case references of 2017-2020 in a df (df1)
import pandas as pd

#store data of 2017-2020 in a df (df1)
import pandas as pd
df6 = pd.DataFrame(list(zip(case_ref4, party_names1)),
                         columns = ["Case Ref", "Party Names"])

In [21]:
df6.head(10)

Unnamed: 0,Case Ref,Party Names
0,uksc-2016-0209,R (on the application of Hysaj and others) (Ap...
1,uksc-2016-0070,R (on the application of Black) (Appellant) v ...
2,uksc-2016-0045,Four Seasons Holdings Incorporated (Respondent...
3,uksc-2015-0175,Four Seasons Holdings Incorporated (Appellant)...
4,uksc-2016-0190,CPRE Kent (Respondent) v China Gateway Interna...
5,uksc-2016-0188,Dover District Council (Appellant) v CPRE Kent...
6,uksc-2016-0174,O'Connor (Appellant) v Bar Standards Board (Re...
7,uksc-2016-0156,Tiuta International Limited (in liquidation) (...
8,uksc-2017-0025,Scotch Whisky Association and others (Appellan...
9,uksc-2016-0142,"Gordon and others, as the Trustees of the Inte..."


In [13]:
#store data of 2017-2020 in a df (df3)
import pandas as pd
df3 = pd.DataFrame(list(zip(case_ref1, case_ID1, party_names1, justices1, hearing_start1, hearing_finish1, judgment_date1, citation1)),
                         columns = ["Case Ref", "ID", "Party Names", "Names of Judges", "Hearing Start Date", "Hearing End Data", "Date of Judgment", "Citation"])

In [17]:
#save dataframe to csv for subsequent use
df3.to_csv("UKSC 2017-20 Additional Complete Data.csv", encoding="utf-8", index=False) #removes the index column

### Obtaining the Press Summaries in PDF

In [2]:
#Judgements from 2009-2023 available online
#for loop to get links to the pages with the judgments (for each year)

urlWebsite = "https://www.supremecourt.uk/decided-cases/" 
urls = [] #empty list to store full url to each year

#url_index = "https://www.supremecourt.uk/decided-cases/index.html" #includes the latest judgments

for i in range(2009,2021):
    strUrl = urlWebsite + str(i) + ".html"
    print(strUrl)
    urls.append(strUrl)

https://www.supremecourt.uk/decided-cases/2009.html
https://www.supremecourt.uk/decided-cases/2010.html
https://www.supremecourt.uk/decided-cases/2011.html
https://www.supremecourt.uk/decided-cases/2012.html
https://www.supremecourt.uk/decided-cases/2013.html
https://www.supremecourt.uk/decided-cases/2014.html
https://www.supremecourt.uk/decided-cases/2015.html
https://www.supremecourt.uk/decided-cases/2016.html
https://www.supremecourt.uk/decided-cases/2017.html
https://www.supremecourt.uk/decided-cases/2018.html
https://www.supremecourt.uk/decided-cases/2019.html
https://www.supremecourt.uk/decided-cases/2020.html


In [None]:
#to obtain the judgments in each case:
#obtaining the relative url to each case in each year

case_links4 = [] #empty list to store the (relative) links to each case in each year

for i in urls:
    page = requests.get(i)
    soup = BeautifulSoup(page.content,"html.parser")

    test_links = soup.find_all(class_="fourthColumn")
    #print(test_links)

    for link in test_links:
        case = link.find("a", class_="more") #get relative link
        if (case != None):
            #print(case["href"])
        case_links4.append(case["href"])
        time.sleep(2)

#all webpages with specific years (2022-2009) have identical html.

In [4]:
len(case_links4)

911

In [5]:
#preparing the global url for each case

globalurls4 = [] #empty list to store the global urls
globurl = "https://www.supremecourt.uk"

for i in case_links4:
    caseLink = globurl + i #appending the relative links to the global url
    globalurls4.append(caseLink) #appending all global urls to one list

In [6]:
len(globalurls4)

911

In [None]:
#having prepared the global urls to each case, the link to the press summary of each judgment in each case is extracted: 

press_pdf = []

for i in globalurls4:
    page = requests.get(i)
    soup = BeautifulSoup(page.content,"html.parser")
       
    text = soup.find("a", title="Press summary (PDF)")
    if (text != None):
        #print(text["href"]) 
        press_pdf.append(text["href"])
    else:
        print("The Press Summary (PDF) in", text, "is not available")
    time.sleep(3) #sleep is used to provide sufficient time between requests to not overload the server

In [9]:
#preparing the global urls for the press summaries

global_pressUrl = "https://www.supremecourt.uk/cases/"

pressUrls = [] #list to store the global urls to all the judgement PDFs

for i in press_pdf:
    pressLink = global_pressUrl + i #appending the relative links to the global url
    pressUrls.append(pressLink) #appending all global urls to one list

In [10]:
len(pressUrls)

903

In [None]:
#for loop to save the pdf files 

import urllib.request
import os

ukscP_folder = "./UKSC_PressSum"
if not os.path.exists(ukscP_folder): #check if "UKSC_PressSum folder exists"
    os.makedirs(ukscP_folder) #if it does not, create a new folder called titled Dataset_UKSC

for i in pressUrls:
    filename = i.split("/")[-1] #[-1 indicates that the last portion divided by "/" is to be used to name]
    filepath = os.path.join(ukscP_folder, filename)
    urllib.request.urlretrieve(i, filepath)
    time.sleep(3)

#https://www.tutorialspoint.com/downloading-files-from-web-using-python
#used to identify how to extract filename from link

#https://docs.python.org/3/library/urllib.request.html

### Data Preparation
As files are in pdf format, the files need to be converted to .txt.

In [20]:
#importing required libraries
import os
from PyPDF2 import PdfReader
from pdfminer.high_level import extract_text

In [21]:
#converting the United Kingdom Supreme Court judgments (in pdf) to .txt format

pdf_folder = "./Updated_Dataset_UKSC/" #the pdfs are in a folder titled "Updated_Dataset_UKSC"
txt_folder = "./Updated_Dataset_UKSC_txt/"
if not os.path.exists(txt_folder): #check if "Updated_Dataset_UKSC folder exists"
    os.makedirs(txt_folder) #if it does not, create a new folder called titled Updated_Dataset_UKSC

#iterate over all the PDF files in the directory
for filename in os.listdir(pdf_folder):
    if filename.endswith('.pdf'):
        #open PDF file in read mode
        pdf_file = open(os.path.join(pdf_folder, filename), 'rb')

        #create a PDF reader object
        pdf_reader = PdfReader(pdf_file)

        #get the number of pages in the PDF file
        num_pages = len(pdf_reader.pages)

        #create a text file for each PDF file
        txt_file_path = os.path.join(txt_folder, '{}.txt'.format(filename[:-4])) #filename is the same (.pdf deleted and .txt added)
        txt_file = open(txt_file_path, 'w', encoding="utf-8")
        
        #write the text content of each page to the text file
        for page in range(num_pages):
            page_text = pdf_reader.pages[page].extract_text()
       
            #write the text content to the text file
            txt_file.write(page_text)

        #close the text file and PDF file
        txt_file.close()
        pdf_file.close()

In [None]:
#use PDFMiner to convert texts to .txt files
def text_process(pdf_folder, txt_folder):
    text = extract_text(pdf_folder)
    
    with open(txt_folder, "w", encoding="utf-8") as txt_file:
        txt_file.write(text)

if __name__ == "__main__":
    pdf_folder = "Updated_Dataset_UKSC"
    txt_folder = "new UKSC PDFMiner"
    
    if not os.path.exists(txt_folder):
        os.makedirs(txt_folder)
        
     #iterate through all PDFs in the input folder
    for filename in os.listdir(pdf_folder):
        if filename.endswith('.pdf'):
            input_pdf_path = os.path.join(pdf_folder, filename)
            output_txt_path = os.path.join(txt_folder, os.path.splitext(filename)[0] + '.txt')
            text_process(pdf_folder, output_txt_path)

UKSC Press Summary Dataset: Text Extraction using PDFMiner

In [15]:
if __name__ == "__main__":
    pdf_folder = "./UKSC_PressSum/"
    txt_folder = "./Dataset_UKSC-PressSum_PDFMiner/"
    
    if not os.path.exists(txt_folder):
        os.makedirs(txt_folder)
        
    text_process(pdf_folder, txt_folder)
    
#uksc-2013-0030 was removed - PDF corrupted