# Load unannotated data from the JSON Lines file

In [1]:
# Import required libraries
import pandas as pd
import datetime
from pandas import json_normalize 
import json
import ast

# Load the case dump (one JSON object per line) and key every row by its 'id'
df = pd.read_json('cal_data.jsonl', lines=True)
df = df.set_index('id')

# Let pandas choose the best NA-aware dtype for each column
df = df.convert_dtypes()

# Parse each 'decision_date' value on its own, then keep only the calendar date
parsed_dates = df['decision_date'].apply(pd.to_datetime)
df['decision_date'] = parsed_dates.dt.date

# Expand the nested 'casebody' values into columns, then do the same for the
# 'data' entry found inside them
xx = df["casebody"].apply(pd.Series)
xx1 = xx['data'].apply(pd.Series)

# Attach both sets of expanded columns to the main frame, matching rows on 'id'
df = df.merge(xx, on=['id']).merge(xx1, on=['id'])

# 'citation1' is the 'cite' field of the first entry in each row's 'citations'
first_citation = df['citations'].apply(pd.Series)[0]
df['citation1'] = first_citation.apply(pd.Series)['cite']

# Expose the index values as a regular column named 'main_id'
df['main_id'] = df.index


In [4]:
# Write the id, first citation and opinions of every case to an Excel workbook
# (the frame's index is written as well, since to_excel keeps it by default)
export_columns = ["main_id", 'citation1', 'opinions']
df[export_columns].to_excel("full-unlabelled.xlsx")

# Load annotated data from JSON files

In [None]:

# Import required libraries
import pandas as pd
import datetime
from pandas import json_normalize 
import json
import ast
# Suppress output from code cell


# Specify names of JSON files to load data from
files=['ca1.json','ca2.json','ca3.json','ca4.json','ca5.json']

# Flatten every file into its own frame and concatenate them once at the end;
# growing a DataFrame inside the loop re-copies all earlier rows on each pass.
file_frames = []
for file in files:
    with open(file) as data_file:
        data = json.load(data_file)
    print(file, len(data))
    file_frames.append(json_normalize(data))
x = pd.concat(file_frames)

# Keep one row per (current name, current level, citation) combination, then
# discard the 'index' column that came with the source data
x = x.drop_duplicates(subset=['CaseSummary.currentName','CaseSummary.currentLevel','citation'])
x = x.drop(['index'], axis=1)

# Renumber the rows 0..n-1 and expose that number as the 'index' column.
# reset_index() first stores the old per-file row labels in a new leading
# 'index' column; the next line overwrites it with the sequential position.
x = x.reset_index()
x['index'] = x.index

# Select only rows where 'History.priorHistoryCount' column is >= 1
df_anno_parent = x[x['History.priorHistoryCount'] >= 1]

# Build one frame of prior-history cases per parent row. Each
# 'History.priorHistoryCases' value is parsed with ast.literal_eval, so it is
# expected to hold the text of a Python literal (presumably a list of dicts --
# TODO confirm against the source files).
history_frames = []
for parent_index, prior_cases in zip(df_anno_parent['index'],
                                     df_anno_parent['History.priorHistoryCases']):
    new = pd.DataFrame(ast.literal_eval(prior_cases))
    new = new.add_prefix('History.priorHistoryCases.')
    # Remember which parent row these history entries belong to
    new['index'] = parent_index
    history_frames.append(new)

# Single concatenation with a fresh 0..n-1 row index (the 'index' column keeps
# pointing at the parent row)
if history_frames:
    df_anno_history = pd.concat(history_frames, ignore_index=True)
else:
    df_anno_history = pd.DataFrame()

# Expand each 'docSummary' value into its own prefixed columns and append them
doc_summary = (
    df_anno_history['History.priorHistoryCases.docSummary']
    .apply(pd.Series)
    .add_prefix('History.priorHistoryCases.docSummary.')
)
df_anno_history = pd.concat([df_anno_history, doc_summary], axis=1)


In [313]:
# Define a function to remove values containing the substring 'LEXIS' from a comma-separated string
def remove_lexis(x):
    """Split a comma-separated string into its individual values.

    Despite the name, nothing is filtered out at the moment: the check that
    skipped values containing 'LEXIS' is disabled, so every piece is returned
    unchanged (surrounding whitespace included).

    Args:
        x: Comma-separated string.

    Returns:
        A new list holding each piece of ``x`` in its original order.
    """
    # Disabled filter, kept for reference:
    #     if piece.find('LEXIS') == -1
    return [piece for piece in x.split(",")]


parallels_col = 'History.priorHistoryCases.docSummary.parallels'

# Keep only history rows that have a 'parallels' value. The explicit copy makes
# the column assignment below operate on an independent frame instead of a
# filtered slice (which would raise SettingWithCopyWarning).
df_anno_history = df_anno_history[df_anno_history[parallels_col].notna()].copy()

# Split each comma-separated 'parallels' string into a list of citations
df_anno_history['parallels_cite'] = df_anno_history[parallels_col].apply(remove_lexis)

# Let pandas choose the best NA-aware dtype for each column
df_anno_history = df_anno_history.convert_dtypes()

# Drop rows whose 'parallels' value is the literal text "[]"
df_anno_history = df_anno_history[df_anno_history[parallels_col] != "[]"]

# Add a prefix to the column names in 'df_anno_parent'.
# NOTE: this rebinds the same name, so running the cell twice prefixes twice.
df_anno_parent = df_anno_parent.add_prefix("Anno_parent_")


In [314]:
def extract_para(x):
    """Collect the 'parallelLabel' value of every entry in ``x``.

    No entry is skipped: the check that excluded labels containing 'LEXIS'
    is disabled.

    Args:
        x: Iterable of mappings that each provide a 'parallelLabel' key.

    Returns:
        List of the 'parallelLabel' values, in input order.
    """
    return [entry['parallelLabel'] for entry in x]
# Gather the parallel citation labels listed for each parent case
parent_parallel_ids = df_anno_parent['Anno_parent_CaseSummary.parallelIDs']
df_anno_parent['Anno_parent_parellels_cite'] = parent_parallel_ids.apply(extract_para)

# Count how many parallel citation labels each parent case has
df_anno_parent['num_par'] = df_anno_parent['Anno_parent_parellels_cite'].apply(len)
# df_anno_parent.drop(['Anno_parent_shepID','Anno_parent_itemID','Anno_parent_CaseSummary.shepardsIdentifiers','Anno_parent_CaseSummary.ruriLink',
#'Anno_parent_CaseSummary.docFullPath','Anno_parent_CaseSummary.componentID','Anno_parent_CaseSummary.parallelIDs','Anno_parent_CaseSummary.parallelIDsDef',
#'Anno_parent_CaseSummary.toplineCategoryCode','Anno_parent_CaseSummary.currentCourt','Anno_parent_CaseSummary.currentJurisdiction','Anno_parent_CaseSummary.courtPath.6',
#'Anno_parent_CaseSummary.currentLevel','Anno_parent_History.historyTotal','Anno_parent_History.historyTotalDef','Anno_parent_History.citingDecisions',
#'Anno_parent_History.citationDecisionsDef','Anno_parent_History.otherCitingSources','Anno_parent_History.citingDecisionsDef','Anno_parent_History.toaCount',
#'Anno_parent_History.toaCountDef','Anno_parent_History.historySummary','Anno_parent_History.historySummaryDef','Anno_parent_History.overrulingRiskCount',
#'Anno_parent_History.subsequentAppeals','Anno_parent_History.priorHistory','Anno_parent_History.priorHistoryCases','Anno_parent_CaseSummary.courtPath.5',
#'Anno_parent_CaseSummary.courtPath.1','Anno_parent_CaseSummary.courtPath.2','Anno_parent_CaseSummary.courtPath.3','Anno_parent_CaseSummary.courtPath.4',
#'Anno_parent_CaseSummary.courtPath.7','Anno_parent_CaseSummary.courtPath.8'],axis=1,inplace=True)

# For rows whose citation contains 'LEXIS', take the first parallel label (or
# None when the row has no labels). Rows outside that selection receive no
# value, because the assignment aligns on the row index.
lexis_rows = df_anno_parent['Anno_parent_citation'].str.find('LEXIS') != -1
first_parallel = df_anno_parent[lexis_rows]['Anno_parent_parellels_cite'].apply(
    lambda labels: labels[0] if len(labels) > 0 else None
)
df_anno_parent['Anno_parent_citation1'] = first_parallel

In [315]:
# Placeholder; 'df_uni' is rebuilt from 'uni' once the loop below has run
df_uni = pd.DataFrame()
uni = []      # one [parent index, citation] pair per unique citation
uni_set = []  # one [parent index, list of unique citations] entry per parent

# Citations present in the unannotated data; the set gives constant-time
# membership tests instead of comparing against the whole array on every lookup
cites = df['citation1'].unique()
cite_lookup = set(cites)

for ind in list(df_anno_history['index'].unique()):
    df_ind = df_anno_history[df_anno_history['index'] == ind]
    df_ind_parent = df_anno_parent[df_anno_parent['Anno_parent_index'] == ind]
    newset = set()

    # History citations count only when they also occur in the unannotated data
    for history_cites in df_ind['parallels_cite']:
        for cite in history_cites:
            cite = cite.lstrip()
            if cite in cite_lookup:
                newset.add(cite)

    # Add the parent's own parallel citations. The previous code added a stale
    # loop variable left over from the history loop here instead of the
    # parent's labels.
    # NOTE(review): the labels are added unfiltered -- confirm whether they
    # should also be restricted to citations found in the unannotated data.
    for parent_cites in df_ind_parent['Anno_parent_parellels_cite']:
        for cite in parent_cites:
            newset.add(cite)

    uni_set.append([ind, list(newset)])
    for see in newset:
        uni.append([ind, see])

In [317]:
# One row per (parent index, unique history citation) pair
df_uni = pd.DataFrame(data=uni, columns=["index", "uniq_history"])

# One row per parent index, holding the list of its unique history citations
df_uni_list = pd.DataFrame(data=uni_set, columns=["index", "uniq_history_list"])

# Attach each citation list to its parent row (inner join: parent rows with no
# entry in 'df_uni_list' are dropped)
df_anno_parent = df_anno_parent.merge(
    df_uni_list, left_on="Anno_parent_index", right_on="index"
)

# History citations that match a first citation in the unannotated data
unanno_existing_cases = df_uni.merge(df, left_on="uniq_history", right_on="citation1")

# df_anno_parent_main_id = pd.merge(df_anno_parent,df,left_on=['Anno_parent_parellels_cite'],right_on=["citation1"])

# df_anno_parent_main_id.to_excel("df_anno_parent_with_main.xlsx")

# df_anno_parent.to_excel("df_anno_parent.xlsx")

# df_anno_parent.columns

# df_anno_parent

In [335]:
# Match every unique history citation against the annotated parents' own
# citations; the matched parent's columns carry a 'child_' prefix
child_cases = df_anno_parent.add_prefix('child_')
anno_existing_cases = df_uni.merge(
    child_cases, left_on="uniq_history", right_on='child_Anno_parent_citation'
)

In [336]:
# Join each matched pair back to the parent row its history entry came from
both_side = anno_existing_cases.merge(
    df_anno_parent, left_on="index", right_on='Anno_parent_index'
)

In [337]:
# Write the joined frame to an Excel workbook in the working directory
both_side.to_excel("both_side.xlsx")

In [293]:
# Write the history, parent and matched-unannotated frames to Excel workbooks
excel_exports = (
    (df_anno_history, 'history.xlsx'),
    (df_anno_parent, 'parent.xlsx'),
    (unanno_existing_cases, 'unanno.xlsx'),
)
for frame, workbook in excel_exports:
    frame.to_excel(workbook)