In [None]:
import os
import json
from tqdm import tqdm
import pandas as pd
import math
import numpy as np
from collections import defaultdict

import pickle
from collections import Counter

In [None]:
## nodes: opinion, opinion_cluster, court, docket
## opinion features: plain_text
## opinion_cluster features: syllabus, judges, case_name, attorneys
## court: full_name, start_date, end_date, citation_string
## docket: pacer_case_id, case_name_full

## (parenthetical): text

In [None]:
# Input CSVs live under `raw_data/legal`; the processed graph is written under
# `processed_data/legal`. Both sit below the same project data root.
data_root = "/home/ubuntu/quic-efs/user/bowenjin/llm-graph-plugin/data"
raw_data_dir = f"{data_root}/raw_data/legal"
save_dir = f"{data_root}/processed_data/legal"

In [None]:
## read raw data files

def _read_raw_csv(filename):
    """Load one CSV file located in `raw_data_dir` into a DataFrame."""
    return pd.read_csv(os.path.join(raw_data_dir, filename))

opinion_cluster_raw_data = _read_raw_csv('opinion-clusters-2023-08-31.csv')
court_raw_data = _read_raw_csv('courts-2023-08-31.csv')
citation_raw_data = _read_raw_csv('citation-map-2023-08-31.csv')
parentheticals_raw_data = _read_raw_csv('parentheticals-2023-08-31.csv')
dockets_raw_data = _read_raw_csv('dockets-2023-08-31.csv')
opinion_raw_data = _read_raw_csv('opinions-2023-08-31.csv')

# Lift pandas' display limits so previews show every column and every row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)


In [None]:
# Row counts, in order: opinion clusters, dockets, citations, parentheticals, opinions.
raw_tables = (
    opinion_cluster_raw_data,
    dockets_raw_data,
    citation_raw_data,
    parentheticals_raw_data,
    opinion_raw_data,
)
print(*(len(table) for table in raw_tables))

In [None]:
# Preview the first five rows of the opinions table.
opinion_raw_data.head(5)

In [None]:
# Preview the first five rows of the opinion-clusters table.
opinion_cluster_raw_data.head(5)

In [None]:
# Preview the first five rows of the courts table.
court_raw_data.head(5)

In [None]:
# Preview the first five rows of the dockets table.
dockets_raw_data.head(5)

In [None]:
# Preview the first five rows of the parentheticals table.
parentheticals_raw_data.head(5)

In [None]:
# Preview the first five rows of the citation map.
citation_raw_data.head(5)

In [None]:
# Preview the first five rows of the opinions table (same preview as an earlier cell).
opinion_raw_data.head(5)

In [None]:
# Pull the `html` value of the first row whose id is 4539950
# (raises IndexError when no row has that id).
opinion_raw_data.loc[opinion_raw_data['id'] == 4539950, 'html'].iloc[0]

In [None]:
# Collect every distinct pattern of "which text columns hold a string" across opinions.
# Each pattern is a tuple of booleans in this column order:
# plain_text, html, html_lawbox, html_columbia, html_anon_2020, xml_harvard, html_with_citations
text_columns = (
    'plain_text',
    'html',
    'html_lawbox',
    'html_columbia',
    'html_anon_2020',
    'xml_harvard',
    'html_with_citations',
)
all_set = set()
for idd, row in tqdm(opinion_raw_data.iterrows()):
    # A value that is not a str is treated as "this representation is absent".
    values = [isinstance(row[column], str) for column in text_columns]
    all_set.add(tuple(values))
print(all_set)

In [None]:
# Find the first opinion that has both an `html_columbia` and an
# `html_with_citations` string, print which text columns it populates
# (plain_text, html, html_lawbox, html_columbia, html_anon_2020, xml_harvard,
# html_with_citations) and print its `html_columbia` content.
#
# The loop ends with `break` instead of raising an exception, so the notebook
# still runs top to bottom. After the loop, `row` is the matching row (or the
# last row scanned when nothing matches).
for idd, row in tqdm(opinion_raw_data.iterrows()):
    if isinstance(row['html_columbia'], str) and isinstance(row['html_with_citations'], str):
        # Build the availability flags only for the row that is actually reported.
        values = [
            isinstance(row[column], str)
            for column in (
                'plain_text',
                'html',
                'html_lawbox',
                'html_columbia',
                'html_anon_2020',
                'xml_harvard',
                'html_with_citations',
            )
        ]
        print(values)
        print(row['html_columbia'])
        break

In [None]:
# Show only the availability patterns whose first flag (plain_text) is False.
for pattern in filter(lambda flags: not flags[0], all_set):
    print(pattern)

In [None]:
# Check whether `plain_text` is missing on `row`, the variable left over from
# the most recent `iterrows()` loop. `pd.isna` is used because `math.isnan`
# raises TypeError when the value is a string (i.e. when the text is present).
pd.isna(row['plain_text'])

In [None]:
# Preview the first five rows of the opinion-clusters table.
opinion_cluster_raw_data.head(5)

In [None]:
# Preview the first five rows of the dockets table.
dockets_raw_data.head(5)

In [None]:
# Preview the first five rows of the courts table.
court_raw_data.head(5)

In [None]:
# Preview the first five rows of the opinions table.
opinion_raw_data.head(5)

In [None]:
# Show every opinions row whose id equals 438723.
opinion_raw_data[opinion_raw_data['id'].eq(438723)]

In [None]:
# Preview the first five rows of the citation map.
citation_raw_data.head(5)

In [None]:
# Largest `depth` value in the citation map. Series.max() works on the column
# directly (no intermediate Python list) and skips missing values by default.
citation_raw_data['depth'].max()

In [None]:
## construct the node dictionaries of the legal graph
##
## Columns available in the raw tables (only a subset is kept as node features below):
## opinion features: date_created, date_modified, text (plain_text, html, html_lawbox, html_columbia, html_anon_2020, xml_harvard, html_with_citations)
## opinion edges: citing opinions, cited opinions, opinion cluster
##
## opinion_cluster features: date_created, date_modified, judges, date_filed, slug, case_name_short, case_name, case_name_full, scdb_id, scdb_decision_direction, scdb_votes_majority, scdb_votes_minority, attorneys, syllabus, headnotes, summary
## opinion_cluster edges: opinions, docket
##
## docket features: date_created, date_modified, date_last_index, date_filed, date_last_filing, case_name_short, case_name, case_name_full, slug, docket_number, docket_number_core, pacer_case_id, ia_date_first_change, date_blocked
## docket edges: opinion cluster, court
##
## court: date_modified, position, citation_string, short_name, full_name, url, start_date, end_date, jurisdiction, notes
## court edges: docket
##
## Every node has the shape {'features': {...}, 'neighbors': {...}} and is keyed
## by str(id). Feature values are copied from the tables unchanged, so a value
## that is missing in the table stays whatever pandas loaded for it; only the
## opinion text falls back to ''.

# Text columns tried in order; the first one holding a string becomes the
# opinion's 'plain_text' feature. `xml_harvard` is not part of the fallback.
OPINION_TEXT_PRIORITY = (
    'plain_text',
    'html',
    'html_lawbox',
    'html_columbia',
    'html_anon_2020',
    'html_with_citations',
)
# Columns copied verbatim into each node's 'features' dict (order = key order).
OPINION_CLUSTER_FEATURES = ('judges', 'case_name', 'case_name_full', 'attorneys', 'syllabus')
DOCKET_FEATURES = ('case_name_full', 'pacer_case_id')
COURT_FEATURES = ('citation_string', 'full_name', 'start_date', 'end_date')


def _iter_rows(frame, columns, desc):
    """Iterate over `frame`, yielding one plain tuple of the `columns` values per row.

    `itertuples` is used instead of `iterrows`: it keeps each column's own dtype
    (iterrows builds a Series per row, which can upcast integer ids to floats
    and change their string form) and it is considerably faster. The row count
    is passed to tqdm so the progress bar has a total.
    """
    rows = frame[list(columns)].itertuples(index=False, name=None)
    return tqdm(rows, total=len(frame), desc=desc)


opinion_nodes = {}
opinion_cluster_nodes = {}
docket_nodes = {}
court_nodes = {}

# Reverse indexes filled while walking the child table, then read when the
# parent nodes are created.
opinion_cluster2opinions = defaultdict(list)
docket2opinion_clusters = defaultdict(list)
court2docket = defaultdict(list)

# opinion nodes
for raw_opinion_id, raw_cluster_id, *text_candidates in _iter_rows(
    opinion_raw_data, ('id', 'cluster_id') + OPINION_TEXT_PRIORITY, 'opinions'
):
    opinion_id = str(raw_opinion_id)
    cluster_id = str(raw_cluster_id)
    # First available text representation, or '' when none of them is a string.
    text = next((candidate for candidate in text_candidates if isinstance(candidate, str)), '')
    opinion_nodes[opinion_id] = {
        'features': {'plain_text': text},
        'neighbors': {
            'opinion_cluster': [cluster_id],
            # Both citation maps are keyed by str(depth) and hold lists of opinion ids.
            'reference': defaultdict(list),
            'cited_by': defaultdict(list),
        },
    }
    opinion_cluster2opinions[cluster_id].append(opinion_id)

# citation edges between opinions
miss_opinion_in_citation = 0
for raw_cited_id, raw_citing_id, raw_depth in _iter_rows(
    citation_raw_data, ('cited_opinion_id', 'citing_opinion_id', 'depth'), 'citations'
):
    cited_id = str(raw_cited_id)
    citing_id = str(raw_citing_id)
    # Skip (and count) citations where either endpoint is not a known opinion.
    if cited_id not in opinion_nodes or citing_id not in opinion_nodes:
        miss_opinion_in_citation += 1
        continue
    depth = str(raw_depth)
    opinion_nodes[cited_id]['neighbors']['cited_by'][depth].append(citing_id)
    opinion_nodes[citing_id]['neighbors']['reference'][depth].append(cited_id)
print(f'Miss citation cnt:{miss_opinion_in_citation}')

# opinion cluster nodes
for raw_cluster_id, raw_docket_id, *feature_values in _iter_rows(
    opinion_cluster_raw_data, ('id', 'docket_id') + OPINION_CLUSTER_FEATURES, 'opinion clusters'
):
    cluster_id = str(raw_cluster_id)
    docket_id = str(raw_docket_id)
    opinion_cluster_nodes[cluster_id] = {
        'features': dict(zip(OPINION_CLUSTER_FEATURES, feature_values)),
        'neighbors': {
            # .get() avoids inserting empty entries into the defaultdict.
            'opinion': opinion_cluster2opinions.get(cluster_id, []),
            'docket': [docket_id],
        },
    }
    docket2opinion_clusters[docket_id].append(cluster_id)

# docket nodes
for raw_docket_id, raw_court_id, *feature_values in _iter_rows(
    dockets_raw_data, ('id', 'court_id') + DOCKET_FEATURES, 'dockets'
):
    docket_id = str(raw_docket_id)
    court_id = str(raw_court_id)
    docket_nodes[docket_id] = {
        'features': dict(zip(DOCKET_FEATURES, feature_values)),
        'neighbors': {
            'opinion_cluster': docket2opinion_clusters.get(docket_id, []),
            'court': [court_id],
        },
    }
    court2docket[court_id].append(docket_id)

# court nodes
for raw_court_id, *feature_values in _iter_rows(
    court_raw_data, ('id',) + COURT_FEATURES, 'courts'
):
    court_id = str(raw_court_id)
    court_nodes[court_id] = {
        'features': dict(zip(COURT_FEATURES, feature_values)),
        'neighbors': {'docket': court2docket.get(court_id, [])},
    }

In [None]:
## save graph
#pickle.dump({
#    'opinion_nodes': opinion_nodes,
#    'opinion_cluster_nodes': opinion_cluster_nodes,
#    'docket_nodes': docket_nodes,
#    'court_nodes': court_nodes
#}, open(os.path.join(save_dir, 'graph.pkl'),"wb"))

# Write the four node dictionaries to `graph_raw.json` inside `save_dir`.
# The context manager guarantees the file is flushed and closed, even if
# serialisation fails part-way through.
# NOTE(review): json.dump writes float NaN values as the bare token `NaN`
# (allow_nan defaults to True), which strict JSON parsers reject. Confirm
# whether any feature values can be NaN and whether downstream readers accept it.
with open(os.path.join(save_dir, 'graph_raw.json'), "w") as graph_file:
    json.dump({
        'opinion_nodes': opinion_nodes,
        'opinion_cluster_nodes': opinion_cluster_nodes,
        'docket_nodes': docket_nodes,
        'court_nodes': court_nodes
    }, graph_file, indent = 4)

In [None]:
# First five opinion node ids, in insertion order.
list(opinion_nodes)[:5]

In [None]:
# Spot-check one opinion node by its string id (raises KeyError if the id is absent).
opinion_nodes['7344188']