<a href="https://colab.research.google.com/github/Paymanshus/Amicus/blob/main/Notebooks/Amicus_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

drive.mount('/content/MyDrive')

Mounted at /content/MyDrive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

import re
from pprint import pprint
import string

In [3]:
df = pd.read_csv("/content/MyDrive/MyDrive/Amicus/BulkDataCleaned.csv")
# cdf = pd.read_csv("/content/MyDrive/MyDrive/Amicus/dummy.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,PetitionerCounsel,RespondentCounsel,Judge,FinalJudgement,DateFiled,Plaintiff,Defendant,CaseFile
0,0,,No appearance No appearance,"GRIMES, STRATTON, BIGELOW",dismissed,2020-09-21,people,medina,Filed 9/21/20 P. v. Medina CA2/8\n NOT TO BE...
1,1,,,"Finlayson and Craig concurred, Shaw Lawlor Wi...",dismissed,2016-07-05,harper reynolds co,hammond l,This is an appeal from a judgment for the fore...
2,2,,,Lennon and Murphey pro tem concurred,dismissed,2016-07-05,people,,"Defendant, a woman, was convicted under an inf..."
3,3,,,Conrey and Curtis concurred,dismissed,2016-07-05,findley,industrial acc,It appears that in a certain proceeding brough...
4,4,Counsel s,Counsel Law Offices of Stephen M Fuerch...,"REARDON, RIVERA, RUVOLO, Hon Laurel S Brady",dismissed,2015-03-16,serafin,balco properties ltd,Filed 3/16/15\n CER...


In [5]:
df[df['PetitionerCounsel'].notnull()]

Unnamed: 0.1,Unnamed: 0,PetitionerCounsel,RespondentCounsel,Judge,FinalJudgement,DateFiled,Plaintiff,Defendant,CaseFile
0,0,,No appearance No appearance,"GRIMES, STRATTON, BIGELOW",dismissed,2020-09-21,people,medina,Filed 9/21/20 P. v. Medina CA2/8\n NOT TO BE...
4,4,Counsel s,Counsel Law Offices of Stephen M Fuerch...,"REARDON, RIVERA, RUVOLO, Hon Laurel S Brady",dismissed,2015-03-16,serafin,balco properties ltd,Filed 3/16/15\n CER...
9,9,Counsel Myers Nave Richard Delmend...,Counsel Rogers Joseph O’Donnell Robe...,"Rivera, Streeter, Honorable Evelio M Grillo",dismissed,2018-01-12,heron bay homeowners assn,city of san,Filed 1/12/18\n\n C...
10,10,Counsel O’Melveny Myers LLP Michael Fre...,Counsel The Mogin Law Firm Daniel J Mogi...,"Margulies, Banke, Honorable Curtis E A Karnow",dismissed,2015-11-11,roos,honeywell,Filed 11/10/15\n CER...
12,12,Tennant,Escalante,"HOCH, MURRAY, ROBIE",,2020-08-17,felisilda,fca us,Filed 7/24/20; Certified for Publication 8/14/...
...,...,...,...,...,...,...,...,...,...
22750,22750,Attorneys General,,"STRATTON, GRIMES, BIGELOW",dismissed,2020-10-14,people,hunter,Filed 10/14/20 P. v. Hunter CA2/8\n NOT TO B...
22752,22752,appointment by the Court of,Deputy County Counsel Deputy County Counsel,"COLLINS, MICON, MANELLA COLL...",allowed,2018-12-14,,,Filed 11/21/18; Certified for Publication 12/1...
22757,22757,General s s,Foundation Thomas A Myers Samantha Azulay s s,"SEGAL, BECKLOFF, ZELON",dismissed,2015-10-13,aids healthcare foundation,cal dept of,Filed 10/13/15 AIDS Healthcare Foundation v. C...
22759,22759,Kathryn M Casey Imperial County Air Pollut...,,"BUTZ, MURRAY, ROBIE",dismissed,2015-05-26,,,Filed 5/26/15\n CERT...


### Bulk Data Processing

In [None]:
# df.dropna() Cannot run first since Judgement has NaN value

In [None]:
df['CaseFile'] = df['CaseFile'].apply(lambda x: re.sub('\n', '', x))

In [None]:
df['DateFiled'] = pd.to_datetime(df['DateFiled'], format=None)
df.dtypes

Unnamed: 0                    int64
PetitionerCounsel            object
RespondentCounsel            object
Judge                        object
FinalJudgement               object
DateFiled            datetime64[ns]
Plaintiff                    object
Defendant                    object
CaseFile                     object
dtype: object

In [None]:
def date_time_extractor(df, date_col, date_format=None, year=1, quarter=0, month=1, weekofyear=0, dayofweek=0, dayofyear=0, daysinmonth=0, timestamp=0):

    df['TimeStamp'] = pd.to_datetime(df[date_col], format=date_format)
    print(df.head())

    if year:
        df['Year'] = df['TimeStamp'].dt.year
    if quarter:
        df['Quarter'] = df['TimeStamp'].dt.quarter
    if month:
        df['Month'] = df['TimeStamp'].dt.month
    if weekofyear:
        df['WeekOfYear'] = df['TimeStamp'].dt.weekofyear
    if dayofweek:
        df['DayOfWeek'] = df['TimeStamp'].dt.dayofweek
    if dayofyear:
        df['DayOfYear'] = df['TimeStamp'].dt.dayofyear
    if daysinmonth:
        df['DaysInMonth'] = df['TimeStamp'].dt.daysinmonth

    # if (timestamp == 0):
    # df.drop(['TimeStamp'], inplace=True)

    return df

In [None]:
df = date_time_extractor(df, 'DateFiled')
df.columns

   Unnamed: 0  ...  TimeStamp
0           0  ... 2020-09-21
1           1  ... 2016-07-05
2           3  ... 2016-07-05
3           4  ... 2015-03-16
4           6  ... 2016-07-05

[5 rows x 10 columns]


Index(['Unnamed: 0', 'PetitionerCounsel', 'RespondentCounsel', 'Judge',
       'FinalJudgement', 'DateFiled', 'Plaintiff', 'Defendant', 'CaseFile',
       'TimeStamp', 'Year', 'Month'],
      dtype='object')

### Judgement Processing (Donut Chart)

In [None]:
jd_count = pd.DataFrame(df['FinalJudgement'].value_counts())
jd_unique = df['FinalJudgement'].unique()
jd_unique 

array(['dismissed', 'tied / unclear', 'allowed'], dtype=object)

In [None]:
jd_count.reset_index(inplace=True)
jd_count.columns = ['Judgement', 'No. of Cases']

In [None]:
appellant_df = pd.DataFrame(df.loc[:, ['Plaintiff']])
respondent_df = pd.DataFrame(df.loc[:, ['Defendant']])

app_counsel_df = pd.DataFrame(df.loc[:, ['PetitionerCounsel']])
resp_counsel_df = pd.DataFrame(df.loc[:, ['RespondentCounsel']])

In [None]:
# appellant_df

### Chart Processing and Recreations

In [None]:
[{'label': i, 'value': i}
                     for i in jd_unique]

[{'label': 'dismissed', 'value': 'dismissed'},
 {'label': 'tied / unclear', 'value': 'tied / unclear'},
 {'label': 'allowed', 'value': 'allowed'}]

In [None]:
jd_unique.tolist() # Issue of judgement dropdown not appearing due to presence of NaN value

['dismissed', 'tied / unclear', 'allowed']

In [None]:
jd_count.loc[jd_count['Judgement'].isin(dropdown_value)]

Unnamed: 0,Judgement,No. of Cases
0,dismissed,9655
1,allowed,780
2,tied / unclear,480


In [None]:
judgement_df = df[(df['Year'] >= range_slider_value[0])
                      & (df['Year'] <= range_slider_value[1])]

In [None]:
judgement_df = judgement_df.loc[judgement_df['FinalJudgement'].isin(dropdown_value)]
jd_count = pd.DataFrame(judgement_df['FinalJudgement'].value_counts())
# jd_count = jd_count.loc[jd_count['Judgement'].isin(dropdown_value)]
jd_count

Unnamed: 0,FinalJudgement
dismissed,1135
allowed,204
tied / unclear,183


In [None]:
jd_count.reset_index(inplace=True)
# jd_count.columns = ['Judgement', 'No. of Cases']

In [None]:
jd_count = jd_count.set_index(jd_count['index'])
jd_count

Unnamed: 0_level_0,index,FinalJudgement
index,Unnamed: 1_level_1,Unnamed: 2_level_1
dismissed,dismissed,1135
allowed,allowed,204
tied / unclear,tied / unclear,183


In [None]:
fig_donut = px.pie(data_frame=jd_count, values='FinalJudgement',
                       hover_name='index', hole=0.6, color='index',
                  #  color_discrete_sequence=['#f6511d','#00a6ed','#ffb400']
                   )

fig_donut.update_layout(margin=dict(t=60, b=60, l=60, r=60))
fig_donut.update_traces(showlegend=True, selector=dict(type='pie'))
fig_donut

In [None]:
fig_donut.data

(Pie({
     'customdata': array([['dismissed'],
                          ['allowed'],
                          ['tied / unclear']], dtype=object),
     'domain': {'x': [0.0, 1.0], 'y': [0.0, 1.0]},
     'hole': 0.6,
     'hoverlabel': {'namelength': 0},
     'hovertemplate': '<b>%{hovertext}</b><br><br>index=%{customdata[0]}<br>FinalJudgement=%{value}',
     'hovertext': array(['dismissed', 'allowed', 'tied / unclear'], dtype=object),
     'legendgroup': '',
     'marker': {'colors': ['#636efa', '#EF553B', '#00cc96']},
     'name': '',
     'showlegend': True,
     'values': array([1135,  204,  183])
 }),)

In [None]:
jd_unique

array(['dismissed', 'tied / unclear', 'allowed'], dtype=object)

In [None]:
def customLegend(fig, nameSwap):
    for i, dat in enumerate(fig.data):
        for elem in dat:
            if elem == 'name':
                print(fig.data[i].name)
                fig.data[i].name = nameSwap[fig.data[i].name]
    return(fig)

In [None]:
colors = {'red': '#f6511d', 'blue': '#00a6ed', 'yellow': '#ffb400'}

In [None]:
judgement_df = df[(df['Year'] >= range_slider_value[0])
                      & (df['Year'] <= range_slider_value[1])]

jd_count = pd.DataFrame(judgement_df['FinalJudgement'].value_counts())
jd_count.reset_index(inplace=True)
jd_count.columns = ['Judgement', 'No. of Cases']


jd_count = jd_count.loc[jd_count['Judgement'].isin(dropdown_value)]
jd_unique = judgement_df['FinalJudgement'].unique()

fig_donut = px.pie(data_frame=jd_count, values='No. of Cases',
                    hover_name='Judgement', hole=0.6, color='Judgement',
                                   color_discrete_sequence=[
                    colors['yellow'], colors['blue'], colors['yellow']],
                    color_discrete_map={
                                       jd_unique[0]: colors['red'], jd_unique[1]: colors['blue'], jd_unique[2]: colors['yellow']},
                    )
# Dismissed  Allowed  Tied / Unclear

# fig_donut = customLegend(fig_donut, jd_unique)

fig_donut.update_layout(transition_duration=1000)
fig_donut.update_layout(margin=dict(t=30, b=30, l=10, r=10))

fig_donut.update_traces(showlegend=True, selector=dict(type='pie'))

fig_donut

In [None]:
jd_unique = jd_count.loc[jd_count['Judgement'].isin(dropdown_value)]['Judgement']
jd_unique

0         dismissed
1           allowed
2    tied / unclear
Name: Judgement, dtype: object

In [None]:
judgement_df = df[(df['Year'] >= range_slider_value[0])
                      & (df['Year'] <= range_slider_value[1])]

jd_count = pd.DataFrame(judgement_df['FinalJudgement'].value_counts())

jd_count.reset_index(inplace=True)
jd_count.columns = ['Judgement', 'No. of Cases']

jd_unique = jd_count.loc[jd_count['Judgement'].isin(
    dropdown_value)]['Judgement'].values.tolist()

# TODO: Add Legend
fig_donut = px.pie(data_frame=jd_count.loc[jd_count['Judgement'].isin(dropdown_value)], values='No. of Cases',
                    hover_name='Judgement', hole=0.6, color='Judgement',
                    #                color_discrete_sequence=[
                    # colors['blue'], colors['blue'], colors['yellow']],
                    color_discrete_map={
    jd_unique[0]: colors['red'], jd_unique[1]: colors['blue'], jd_unique[2]: colors['yellow']},

)

# fig_donut = customLegend(fig_donut, jd_unique)
# Dismissed  Allowed  Tied / Unclear
fig_donut.update_layout(transition_duration=1000)
fig_donut.update_layout(margin=dict(t=30, b=30, l=10, r=10))

fig_donut.update_traces(showlegend=True, selector=dict(type='pie'), )
fig_donut

In [None]:
judgement_df = df[(df['Year'] >= range_slider_value[0])
                      & (df['Year'] <= range_slider_value[1])]

judgement_df = judgement_df.loc[judgement_df['FinalJudgement'].isin(
    dropdown_value)]
jd_count = pd.DataFrame(judgement_df['FinalJudgement'].value_counts())

jd_count.reset_index(inplace=True)
jd_count.columns = ['Judgement', 'No. of Cases']

jd_unique = judgement_df.FinalJudgement.unique()

# TODO: Add Legend
fig_donut = px.pie(data_frame=jd_count.loc[jd_count['Judgement'].isin(dropdown_value)], values='No. of Cases',
                    hover_name='Judgement', hole=0.6, color='Judgement',
                    #                color_discrete_sequence=[
                    # colors['blue'], colors['blue'], colors['yellow']],
                    color_discrete_map={
    'dismissed': colors['red'], 'allowed': colors['blue'], 'tied / unclear': colors['yellow']},

)

# fig_donut = customLegend(fig_donut, jd_unique)
# Dismissed  Allowed  Tied / Unclear
for i in range(len(jd_unique)):
    print(fig_donut.data[i].name, jd_unique[i])
    # fig_donut.data[i].name = jd_unique[i]

fig_donut.update_layout(transition_duration=1000)
fig_donut.update_layout(margin=dict(t=30, b=30, l=10, r=10))

fig_donut.update_traces(showlegend=True, selector=dict(type='pie'), )

fig_donut.update_traces(textposition='outside',
                        textinfo='percent+label',
                        marker=dict(line=dict(color='#000000',
                                                    width=2)),
                        pull=[0.05, 0, 0.03],
                        opacity=0.9,
                        # rotation=180
                        )

fig_donut

 tied / unclear


IndexError: ignored

In [None]:
fig_donut.update_layout(transition_duration=1000)
fig_donut.update_layout(margin=dict(t=30, b=30, l=10, r=10))

fig_donut.update_traces(showlegend=True, selector=dict(type='pie'))

fig_donut.update_traces(textposition='outside',
                        textinfo='percent+label',
                        marker=dict(line=dict(color='#000000',
                                                    width=2)),
                        pull=[0.05, 0, 0.03],
                        opacity=0.9,
                        # rotation=180
                        )
fig_donut.data[0].name = jd_unique[0]
fig_donut

TypeError: ignored

In [None]:
px.colors.qualitative.Plotly

['#636EFA',
 '#EF553B',
 '#00CC96',
 '#AB63FA',
 '#FFA15A',
 '#19D3F3',
 '#FF6692',
 '#B6E880',
 '#FF97FF',
 '#FECB52']

### Line Graph

In [None]:
area_df = df.loc[df['FinalJudgement'].isin(jd_unique)]

px.area(area_df, x='DateFiled', color="FinalJudgement")

In [None]:
df['DateFiled'] = pd.to_datetime(df['DateFiled'])
df = df.sort_values('DateFiled')
df.DateFiled = df.DateFiled.astype('string')
df['month_year'] = df.DateFiled.apply(lambda x: x.split('-')[1] + '/' + x.split('-')[0])

# fig = px.area(df, x = 'month_year', color = "FinalJudgement", hover_name = 'DateFiled')
# fig.show()

In [None]:
gb_df = df.groupby(['month_year', 'FinalJudgement']).count()
gb_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,PetitionerCounsel,RespondentCounsel,Judge,DateFiled,Plaintiff,Defendant,CaseFile,TimeStamp,Year,Month
month_year,FinalJudgement,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
01/2015,allowed,19,13,13,19,19,19,19,19,19,19,19
01/2015,dismissed,269,159,154,269,269,269,269,269,269,269,269
01/2015,tied / unclear,9,7,9,9,9,9,9,9,9,9,9
01/2016,allowed,19,13,13,19,19,19,19,19,19,19,19
01/2016,dismissed,270,154,162,270,270,270,270,270,270,270,270
...,...,...,...,...,...,...,...,...,...,...,...,...
12/2018,dismissed,9,9,5,9,9,9,9,9,9,9,9
12/2018,tied / unclear,10,8,10,10,10,10,10,10,10,10,10
12/2019,allowed,4,4,3,4,4,4,4,4,4,4,4
12/2019,dismissed,15,11,14,15,15,15,15,15,15,15,15


In [None]:
gb_df.index

MultiIndex([('01/2015',        'allowed'),
            ('01/2015',      'dismissed'),
            ('01/2015', 'tied / unclear'),
            ('01/2016',        'allowed'),
            ('01/2016',      'dismissed'),
            ('01/2016', 'tied / unclear'),
            ('01/2017',        'allowed'),
            ('01/2017',      'dismissed'),
            ('01/2017', 'tied / unclear'),
            ('01/2018',        'allowed'),
            ...
            ('12/2016', 'tied / unclear'),
            ('12/2017',        'allowed'),
            ('12/2017',      'dismissed'),
            ('12/2017', 'tied / unclear'),
            ('12/2018',        'allowed'),
            ('12/2018',      'dismissed'),
            ('12/2018', 'tied / unclear'),
            ('12/2019',        'allowed'),
            ('12/2019',      'dismissed'),
            ('12/2019', 'tied / unclear')],
           names=['month_year', 'FinalJudgement'], length=209)

In [None]:
years = gb_df.unstack().index
years

Index(['01/2015', '01/2016', '01/2017', '01/2018', '01/2019', '01/2020',
       '02/2015', '02/2016', '02/2017', '02/2018', '02/2019', '02/2020',
       '03/2015', '03/2016', '03/2017', '03/2018', '03/2019', '03/2020',
       '04/2015', '04/2016', '04/2017', '04/2018', '04/2019', '04/2020',
       '05/2015', '05/2016', '05/2017', '05/2018', '05/2019', '05/2020',
       '06/2015', '06/2016', '06/2017', '06/2018', '06/2019', '06/2020',
       '07/2015', '07/2016', '07/2017', '07/2018', '07/2019', '07/2020',
       '08/2015', '08/2016', '08/2017', '08/2018', '08/2019', '08/2020',
       '09/2015', '09/2016', '09/2017', '09/2018', '09/2019', '09/2020',
       '10/2015', '10/2016', '10/2017', '10/2018', '10/2019', '10/2020',
       '11/2015', '11/2016', '11/2017', '11/2018', '11/2019', '12/2015',
       '12/2016', '12/2017', '12/2018', '12/2019'],
      dtype='object', name='month_year')

In [None]:
judges = gb_df.unstack().columns
judges

MultiIndex([(       'Unnamed: 0',        'allowed'),
            (       'Unnamed: 0',      'dismissed'),
            (       'Unnamed: 0', 'tied / unclear'),
            ('PetitionerCounsel',        'allowed'),
            ('PetitionerCounsel',      'dismissed'),
            ('PetitionerCounsel', 'tied / unclear'),
            ('RespondentCounsel',        'allowed'),
            ('RespondentCounsel',      'dismissed'),
            ('RespondentCounsel', 'tied / unclear'),
            (            'Judge',        'allowed'),
            (            'Judge',      'dismissed'),
            (            'Judge', 'tied / unclear'),
            (        'DateFiled',        'allowed'),
            (        'DateFiled',      'dismissed'),
            (        'DateFiled', 'tied / unclear'),
            (        'Plaintiff',        'allowed'),
            (        'Plaintiff',      'dismissed'),
            (        'Plaintiff', 'tied / unclear'),
            (        'Defendant',        'allo

In [None]:
gb_df.shape

(209, 11)

In [None]:
years_index = gb_df.index.get_level_values(0)
years_index

Index(['01/2015', '01/2015', '01/2015', '01/2016', '01/2016', '01/2016',
       '01/2017', '01/2017', '01/2017', '01/2018',
       ...
       '12/2016', '12/2017', '12/2017', '12/2017', '12/2018', '12/2018',
       '12/2018', '12/2019', '12/2019', '12/2019'],
      dtype='object', name='month_year', length=209)

In [None]:
judge_index = gb_df.index.get_level_values(1)
judge_index

Index(['allowed', 'dismissed', 'tied / unclear', 'allowed', 'dismissed',
       'tied / unclear', 'allowed', 'dismissed', 'tied / unclear', 'allowed',
       ...
       'tied / unclear', 'allowed', 'dismissed', 'tied / unclear', 'allowed',
       'dismissed', 'tied / unclear', 'allowed', 'dismissed',
       'tied / unclear'],
      dtype='object', name='FinalJudgement', length=209)

### GroupBy starts here


In [None]:
temp = df.groupby(['month_year', 'FinalJudgement']).count()['Judge']
temp

month_year  FinalJudgement
01/2015     allowed            19
            dismissed         269
            tied / unclear      9
01/2016     allowed            19
            dismissed         270
                             ... 
12/2018     dismissed           9
            tied / unclear     10
12/2019     allowed             4
            dismissed          15
            tied / unclear     10
Name: Judge, Length: 209, dtype: int64

In [None]:
temp.shape

(209,)

In [None]:
temp.index

MultiIndex([('01/2015',        'allowed'),
            ('01/2015',      'dismissed'),
            ('01/2015', 'tied / unclear'),
            ('01/2016',        'allowed'),
            ('01/2016',      'dismissed'),
            ('01/2016', 'tied / unclear'),
            ('01/2017',        'allowed'),
            ('01/2017',      'dismissed'),
            ('01/2017', 'tied / unclear'),
            ('01/2018',        'allowed'),
            ...
            ('12/2016', 'tied / unclear'),
            ('12/2017',        'allowed'),
            ('12/2017',      'dismissed'),
            ('12/2017', 'tied / unclear'),
            ('12/2018',        'allowed'),
            ('12/2018',      'dismissed'),
            ('12/2018', 'tied / unclear'),
            ('12/2019',        'allowed'),
            ('12/2019',      'dismissed'),
            ('12/2019', 'tied / unclear')],
           names=['month_year', 'FinalJudgement'], length=209)

In [None]:
final_df = temp.reset_index()
final_df

Unnamed: 0,month_year,FinalJudgement,Judge
0,01/2015,allowed,19
1,01/2015,dismissed,269
2,01/2015,tied / unclear,9
3,01/2016,allowed,19
4,01/2016,dismissed,270
...,...,...,...
204,12/2018,dismissed,9
205,12/2018,tied / unclear,10
206,12/2019,allowed,4
207,12/2019,dismissed,15


In [None]:
final_df.month_year = (pd.to_datetime(final_df.month_year))
final_df.month_year

0     2015-01-01
1     2015-01-01
2     2015-01-01
3     2016-01-01
4     2016-01-01
         ...    
204   2018-12-01
205   2018-12-01
206   2019-12-01
207   2019-12-01
208   2019-12-01
Name: month_year, Length: 209, dtype: datetime64[ns]

In [None]:
final_df = final_df.sort_values(by='month_year')
final_df

Unnamed: 0,month_year,FinalJudgement,Judge
0,2015-01-01,allowed,19
1,2015-01-01,dismissed,269
2,2015-01-01,tied / unclear,9
20,2015-02-01,tied / unclear,9
19,2015-02-01,dismissed,260
...,...,...,...
159,2020-09-01,dismissed,256
160,2020-09-01,tied / unclear,9
176,2020-10-01,allowed,27
177,2020-10-01,dismissed,288


In [None]:
fig = px.area(final_df, x='month_year', y='Judge', color='FinalJudgement')
fig.update_traces(patch=)

In [None]:
years = (temp.index.levels[1].tolist())
# years
# Convert to datetime?

In [None]:
# sorted_years = sorted(pd.to_datetime(years))
# len(sorted_years)

In [None]:
count_df = pd.DataFrame([temp[x] for x in temp.index.levels[0]])
count_df

Unnamed: 0,allowed,dismissed,tied / unclear
Judge,19.0,269.0,9.0
Judge,19.0,270.0,7.0
Judge,1.0,10.0,6.0
Judge,7.0,18.0,4.0
Judge,4.0,14.0,5.0
...,...,...,...
Judge,17.0,291.0,7.0
Judge,5.0,13.0,4.0
Judge,3.0,11.0,3.0
Judge,4.0,9.0,10.0


In [None]:
new_index = temp.index.levels[0]
new_index

Index(['01/2015', '01/2016', '01/2017', '01/2018', '01/2019', '01/2020',
       '02/2015', '02/2016', '02/2017', '02/2018', '02/2019', '02/2020',
       '03/2015', '03/2016', '03/2017', '03/2018', '03/2019', '03/2020',
       '04/2015', '04/2016', '04/2017', '04/2018', '04/2019', '04/2020',
       '05/2015', '05/2016', '05/2017', '05/2018', '05/2019', '05/2020',
       '06/2015', '06/2016', '06/2017', '06/2018', '06/2019', '06/2020',
       '07/2015', '07/2016', '07/2017', '07/2018', '07/2019', '07/2020',
       '08/2015', '08/2016', '08/2017', '08/2018', '08/2019', '08/2020',
       '09/2015', '09/2016', '09/2017', '09/2018', '09/2019', '09/2020',
       '10/2015', '10/2016', '10/2017', '10/2018', '10/2019', '10/2020',
       '11/2015', '11/2016', '11/2017', '11/2018', '11/2019', '12/2015',
       '12/2016', '12/2017', '12/2018', '12/2019'],
      dtype='object', name='month_year')

In [None]:
count_df.set_index(new_index, inplace=True)

In [None]:
count_df

Unnamed: 0_level_0,allowed,dismissed,tied / unclear
month_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01/2015,19.0,269.0,9.0
01/2016,19.0,270.0,7.0
01/2017,1.0,10.0,6.0
01/2018,7.0,18.0,4.0
01/2019,4.0,14.0,5.0
...,...,...,...
12/2015,17.0,291.0,7.0
12/2016,5.0,13.0,4.0
12/2017,3.0,11.0,3.0
12/2018,4.0,9.0,10.0


In [None]:
area_df = count_df.T
area_df

month_year,01/2015,01/2016,01/2017,01/2018,01/2019,01/2020,02/2015,02/2016,02/2017,02/2018,02/2019,02/2020,03/2015,03/2016,03/2017,03/2018,03/2019,03/2020,04/2015,04/2016,04/2017,04/2018,04/2019,04/2020,05/2015,05/2016,05/2017,05/2018,05/2019,05/2020,06/2015,06/2016,06/2017,06/2018,06/2019,06/2020,07/2015,07/2016,07/2017,07/2018,07/2019,07/2020,08/2015,08/2016,08/2017,08/2018,08/2019,08/2020,09/2015,09/2016,09/2017,09/2018,09/2019,09/2020,10/2015,10/2016,10/2017,10/2018,10/2019,10/2020,11/2015,11/2016,11/2017,11/2018,11/2019,12/2015,12/2016,12/2017,12/2018,12/2019
allowed,19.0,19.0,1.0,7.0,4.0,3.0,13.0,19.0,2.0,1.0,4.0,3.0,20.0,18.0,3.0,9.0,7.0,1.0,17.0,11.0,5.0,,1.0,4.0,18.0,18.0,3.0,8.0,2.0,7.0,20.0,16.0,4.0,2.0,7.0,8.0,20.0,171.0,2.0,1.0,3.0,8.0,14.0,21.0,5.0,5.0,5.0,19.0,22.0,11.0,7.0,6.0,3.0,26.0,13.0,6.0,4.0,4.0,5.0,27.0,20.0,6.0,3.0,4.0,2.0,17.0,5.0,3.0,4.0,4.0
dismissed,269.0,270.0,10.0,18.0,14.0,10.0,260.0,272.0,17.0,12.0,9.0,14.0,252.0,329.0,16.0,15.0,15.0,6.0,238.0,282.0,7.0,18.0,9.0,18.0,275.0,300.0,9.0,19.0,14.0,16.0,298.0,306.0,9.0,11.0,10.0,19.0,254.0,2861.0,11.0,11.0,12.0,14.0,212.0,267.0,19.0,21.0,11.0,153.0,258.0,283.0,10.0,16.0,18.0,256.0,266.0,66.0,10.0,12.0,21.0,288.0,237.0,15.0,17.0,17.0,14.0,291.0,13.0,11.0,9.0,15.0
tied / unclear,9.0,7.0,6.0,4.0,5.0,6.0,9.0,4.0,3.0,5.0,2.0,3.0,9.0,6.0,2.0,8.0,5.0,5.0,7.0,13.0,4.0,4.0,4.0,1.0,8.0,5.0,5.0,7.0,2.0,2.0,4.0,10.0,5.0,2.0,5.0,6.0,8.0,88.0,4.0,3.0,4.0,6.0,7.0,5.0,3.0,2.0,6.0,11.0,6.0,7.0,5.0,2.0,5.0,9.0,6.0,3.0,4.0,3.0,7.0,16.0,10.0,1.0,10.0,8.0,5.0,7.0,4.0,3.0,10.0,10.0


In [None]:
# area_df.index = (pd.to_datetime(area_df.index))
# area_df.index

In [None]:
area_df.sort_index(inplace=True)
area_df

month_year,01/2015,01/2016,01/2017,01/2018,01/2019,01/2020,02/2015,02/2016,02/2017,02/2018,02/2019,02/2020,03/2015,03/2016,03/2017,03/2018,03/2019,03/2020,04/2015,04/2016,04/2017,04/2018,04/2019,04/2020,05/2015,05/2016,05/2017,05/2018,05/2019,05/2020,06/2015,06/2016,06/2017,06/2018,06/2019,06/2020,07/2015,07/2016,07/2017,07/2018,07/2019,07/2020,08/2015,08/2016,08/2017,08/2018,08/2019,08/2020,09/2015,09/2016,09/2017,09/2018,09/2019,09/2020,10/2015,10/2016,10/2017,10/2018,10/2019,10/2020,11/2015,11/2016,11/2017,11/2018,11/2019,12/2015,12/2016,12/2017,12/2018,12/2019
allowed,19.0,19.0,1.0,7.0,4.0,3.0,13.0,19.0,2.0,1.0,4.0,3.0,20.0,18.0,3.0,9.0,7.0,1.0,17.0,11.0,5.0,,1.0,4.0,18.0,18.0,3.0,8.0,2.0,7.0,20.0,16.0,4.0,2.0,7.0,8.0,20.0,171.0,2.0,1.0,3.0,8.0,14.0,21.0,5.0,5.0,5.0,19.0,22.0,11.0,7.0,6.0,3.0,26.0,13.0,6.0,4.0,4.0,5.0,27.0,20.0,6.0,3.0,4.0,2.0,17.0,5.0,3.0,4.0,4.0
dismissed,269.0,270.0,10.0,18.0,14.0,10.0,260.0,272.0,17.0,12.0,9.0,14.0,252.0,329.0,16.0,15.0,15.0,6.0,238.0,282.0,7.0,18.0,9.0,18.0,275.0,300.0,9.0,19.0,14.0,16.0,298.0,306.0,9.0,11.0,10.0,19.0,254.0,2861.0,11.0,11.0,12.0,14.0,212.0,267.0,19.0,21.0,11.0,153.0,258.0,283.0,10.0,16.0,18.0,256.0,266.0,66.0,10.0,12.0,21.0,288.0,237.0,15.0,17.0,17.0,14.0,291.0,13.0,11.0,9.0,15.0
tied / unclear,9.0,7.0,6.0,4.0,5.0,6.0,9.0,4.0,3.0,5.0,2.0,3.0,9.0,6.0,2.0,8.0,5.0,5.0,7.0,13.0,4.0,4.0,4.0,1.0,8.0,5.0,5.0,7.0,2.0,2.0,4.0,10.0,5.0,2.0,5.0,6.0,8.0,88.0,4.0,3.0,4.0,6.0,7.0,5.0,3.0,2.0,6.0,11.0,6.0,7.0,5.0,2.0,5.0,9.0,6.0,3.0,4.0,3.0,7.0,16.0,10.0,1.0,10.0,8.0,5.0,7.0,4.0,3.0,10.0,10.0


In [None]:
# fig = px.area(area_df, x=area_df.index, y=area_df.dismissed)
# fig

In [None]:
area_df.iloc[:,0]

allowed            19.0
dismissed         269.0
tied / unclear      9.0
Name: 01/2015, dtype: float64

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=area_df.index, y=area_df.iloc[:,0],
    hoverinfo='x+y',
    mode='lines',
    # line=dict(width=0.5, color='rgb(131, 90, 241)'),
    stackgroup='one' # define stack group
))
fig.add_trace(go.Scatter(
    x=area_df.index, y=area_df.iloc[:,1],
    hoverinfo='x+y',
    mode='lines',
    # line=dict(width=0.5, color='rgb(111, 231, 219)'),
    stackgroup='one'
))
fig.add_trace(go.Scatter(
    x=area_df.index, y=area_df.iloc[:,2],
    hoverinfo='x+y',
    mode='lines',
    # line=dict(width=0.5, color='rgb(184, 247, 212)'),
    stackgroup='one'
))

fig.update_layout(yaxis_range=(0, 100))
fig.show()

### Stacked Bar Graph

In [None]:
dropdown_value = jd_unique
bar_df = df.loc[df['FinalJudgement'].isin(dropdown_value)]

In [None]:
bar_df1 = bar_df.groupby(['RespondentCounsel', 'FinalJudgement']).count()['Judge'].reset_index()
bar_df2 = bar_df1.groupby(['RespondentCounsel']).sum().reset_index()
final = pd.merge(bar_df1, bar_df2, on = ['RespondentCounsel'])
final['Percent'] = (final['Judge_x'] / final['Judge_y']) * 100
final.head()

Unnamed: 0,RespondentCounsel,FinalJudgement,Judge_x,Judge_y,Percent
0,,allowed,9,86,10.465116
1,,dismissed,74,86,86.046512
2,,tied / unclear,3,86,3.488372
3,Urrabazo Law Donald Urrabazo Arturo Padilla ...,allowed,1,1,100.0
4,care diligence’ ’s safe carriage” as a carr...,dismissed,1,1,100.0


In [None]:
fig = px.bar(final.sample(50), y = 'Percent', x = 'RespondentCounsel', color = 'FinalJudgement')
fig.update_xaxes(showticklabels=False)
fig.update_yaxes()
fig

### Filtering Date

In [None]:
df.Year

7491    2015
5827    2015
3438    2015
167     2015
9852    2015
        ... 
6680    2020
3712    2020
5947    2020
6231    2020
8564    2020
Name: Year, Length: 10915, dtype: int64

In [None]:
range_slider_value = [2018,2021]

In [None]:
df[(df['Year']>=range_slider_value[0]) & (df['Year']<=range_slider_value[1])]

Unnamed: 0.1,Unnamed: 0,PetitionerCounsel,RespondentCounsel,Judge,FinalJudgement,DateFiled,Plaintiff,Defendant,CaseFile,TimeStamp,Year,Month,month_year
8241,17180,Daniels s s,Hollins Law Kathleen Mary Kushi Carter Christ...,Plaintiff and appellant Brent Arave brought se...,tied / unclear,2018-01-02,arave,merrill lynch pierce,Filed 1/2/18\n CERTIFIED F...,2018-01-02,2018,1,01/2018
9250,19223,,,EDMON LAVIN BACHN...,dismissed,2018-01-03,gonzalez,city of,Filed 1/3/18 (unmodified opn. attached)\n ...,2018-01-03,2018,1,01/2018
1186,2537,Bañuelos Bañuelos,McDonell,"GRIMES, FLIER, BIGELOW",dismissed,2018-01-05,simers,la times,Filed 1/5/18\n CERTIFIED FOR PUB...,2018-01-05,2018,1,01/2018
4036,8467,Deputy Attorneys General Deputy Attorneys General,Murphy Buchal James L Buchal s s,"Robie, Nicholson, Blease",allowed,2018-01-05,central coast forest assn,fish game,Filed 1/5/18\n CE...,2018-01-05,2018,1,01/2018
10522,21945,,,"HOCH, NICHOLSON ∗, MAURO",tied / unclear,2018-01-08,labor workforce development agency,superior,Filed 1/8/18\n\n ...,2018-01-08,2018,1,01/2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6680,13939,,,"MARGULIES, SANCHEZ, HUMES",dismissed,2020-10-30,people,lafrance,Filed 10/29/20 P. v. LaFrance CA1/1\n ...,2020-10-30,2020,10,10/2020
3712,7811,,,"NEEDHAM, BURNS",dismissed,2020-10-30,people,smith,Filed 10/29/20 P. v. Smith CA1/5\n ...,2020-10-30,2020,10,10/2020
5947,12421,Counsel Jason Heath,Counsel James Patrick Ballantine,"Grover, Danner, Greenwood, Hon John M Gallagher",dismissed,2020-10-30,pinto lake mhp llc,county of santa,Filed 10/30/20\n CER...,2020-10-30,2020,10,10/2020
6231,13035,,,"Miller, Stewart",dismissed,2020-10-30,people,rogers,Filed 10/29/20 P. v. Rogers CA1/2\n ...,2020-10-30,2020,10,10/2020


### Heatmap Graph

In [None]:
dat = df.dropna(subset=['PetitionerCounsel', 'RespondentCounsel'])
dat

Unnamed: 0.1,Unnamed: 0,PetitionerCounsel,RespondentCounsel,Judge,FinalJudgement,DateFiled,Plaintiff,Defendant,CaseFile,TimeStamp,Year,Month,month_year
7491,15635,Counsel Morgan Lewis Bockius LLP,Counsel s s Hoffman Libenson Saunders...,"Dondero, Margulies, Hon Wynne S Carvill",dismissed,2015-01-01,koval,pacific,Filed 12/31/14\n CER...,2015-01-01,2015,1,01/2015
5827,12172,William I Parks,Raymond L Brosterhous II Deputy Attorneys Gene..., Before Levy Poochigian and Detjen,dismissed,2015-01-02,people,lynch,Filed 1/2/15 P. v. Lynch CA5\n\n\n\n\n ...,2015-01-02,2015,1,01/2015
3438,7194,LeClairRyan Peter M Hart,Romano Stancroff Mark Romano,"RUBIN, FLIER, BIGELOW",dismissed,2015-01-02,macquiddy,mercedes benz usa,Filed 1/2/15 MacQuiddy v. Mercedes-Benz USA CA...,2015-01-02,2015,1,01/2015
9852,20539,Susan L Jordan,C Chang Deputy Attorney General C Chang Deputy...,"PERREN, YEGAN, GILBERT",dismissed,2015-01-05,people,hall,Filed 1/5/15 P. v. Hall CA2/6\n ...,2015-01-05,2015,1,01/2015
727,1544,Davood Rahnama in pro per Debbie Renna in pro ...,Madjid Rahnama in pro per Madjid Rahnama in pr...,Before Levy Cornell and Kane,dismissed,2015-01-05,rahnama,rahnama,Filed 1/5/15 Rahnama v. Rahnama CA5\n\n\n\n\n ...,2015-01-05,2015,1,01/2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10853,22636,Michelman Robinson Reuben A Ginsburg s,Dale E Phillips in pro per fact “It is reasona...,"MOORE, ARONSON, O’LEARY",dismissed,2020-10-30,phillips,sugarman,Filed 10/30/20 Phillips v. Sugarman CA4/3\n\n\...,2020-10-30,2020,10,10/2020
4182,8750,Shedlock Edwin Richards Crosscomplainant,Goldhamer s Crossdefendants s equitable cont...,"MOORE, IKOLA, BEDSWORTH",dismissed,2020-10-30,walters,moore,Filed 10/30/20 Walters v. Moore CA4/3\n\n\n\n ...,2020-10-30,2020,10,10/2020
6974,14553,Douglas G Benedon Wendy S Albers Cross,Plattner Kristy A Murphy Crossdefendant,"SINANIAN, CHANEY, BENDIX",dismissed,2020-10-30,us security associates inc,andrews,"Filed 10/30/20 U.S. Security Associates, Inc. ...",2020-10-30,2020,10,10/2020
4181,8744,FSG Lawyers Richard W Millar Jr,Sharon J Clapham in pro per,"CHANEY, SINANIAN",dismissed,2020-10-30,clapham,barker,Filed 10/29/20 Clapham v. Barker CA2/1\nRepost...,2020-10-30,2020,10,10/2020


In [None]:
len(df), len(dat)

(10915, 4131)

In [None]:
dat = dat.groupby(['RespondentCounsel', 'Judge']).count()['DateFiled'].reset_index()

In [None]:
data1 = dat.pivot(index = 'RespondentCounsel', columns = 'Judge')

In [None]:
data1.columns = data1.columns.droplevel(0)

In [None]:
fig = px.imshow(data1, labels = dict(x = "Respondent Counsel", y = "Judge", color = "Productivity"), 
                 x = data1.index.tolist(), y = data1.columns.tolist())
fig.show()

TypeError: ignored

In [None]:
px.colors.qualitative.swatches()

In [None]:
px.colors.qualitative.Set3[3]

### Heatmap

In [None]:
!pip install plotly --upgrade

In [None]:
dat = cdf.groupby(['RespondentCounsel', 'Judge']).count()['FinalJudgement'].reset_index()
data1 = dat.pivot(index = 'RespondentCounsel', columns = 'Judge')
data1.columns = data1.columns.droplevel(0)

fig = px.imshow(data1, labels = dict(x = "Respondent Counsel", y = "Judge", color = "Productivity"), 
                x = data1.index.tolist(), y = data1.columns.tolist())

fig.update_xaxes(showticklabels = False)
fig.update_yaxes(showticklabels = False)

fig.show()

In [None]:
px.colors.named_colorscales()

In [None]:
### Updated
# Petitioner Counsel against Benches
# Respondent Counsel against Benches

dat = cdf.groupby(['RespondentCounsel', 'Judge']).count()['FinalJudgement'].reset_index()
data1 = dat.pivot(index = 'RespondentCounsel', columns = 'Judge')
data1.columns = data1.columns.droplevel(0)

fig = px.imshow(data1
                ,labels = dict(x = "Respondent Counsel", y = "Judge", color = "Efficacy")
                #,x = data1.index.tolist(), y = data1.columns.tolist()
                ,width=1000, height=1000
                )

fig.update_xaxes(showticklabels = False)
fig.update_yaxes(showticklabels = False)

fig.show()

# NER

In [6]:
# df['CaseFile'] = df['CaseFile'].apply(lambda x: re.sub('\n', '', x))
preds={}

In [7]:
test_str1 = df['CaseFile'][9]
test_str2 = df['CaseFile'][10]
test_str2

'Filed 11/10/15\n                           CERTIFIED FOR PUBLICATION\n\n             IN THE COURT OF APPEAL OF THE STATE OF CALIFORNIA\n\n                             FIRST APPELLATE DISTRICT\n\n                                      DIVISION ONE\n\n\nJOEL I. ROOS et al.,\n        Plaintiffs and Respondents,\n                                                   A142156\nv.\nHONEYWELL INTERNATIONAL, INC.,                     (San Francisco\n                                                   Super. Ct. No. CGC-04-436205)\n        Defendant and Respondent, and\nART ROGERS et al.,\n        Objectors and Appellants.\n\n\n        Four objectors—Art Rogers, Chuck Congdon, Richard Moser, and Amanda\nWaldenville—appeal from the trial court’s order approving an $8.15 million settlement of\na class action against Honeywell International Inc. and awarding a portion of the\nsettlement as fees to class counsel. The trial court found that the objectors failed to\nestablish they had standing, but it the

### Transformers

In [8]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b0/9e/5b80becd952d5f7250eaf8fc64b957077b12ccfe73e9c03d37146ab29712/transformers-4.6.0-py3-none-any.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 7.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 43.7MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 42.2MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Installing c

In [9]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

models_list = ['TFConvBertForTokenClassification', 'TFDistilBertForTokenClassification', 'TFAlbertForTokenClassification', 'TFCamembertForTokenClassification', 'TFFlaubertForTokenClassification', 'TFXLMForTokenClassification', 'TFXLMRobertaForTokenClassification', 'TFLongformerForTokenClassification', 'TFRobertaForTokenClassification', 'TFLayoutLMForTokenClassification', 'TFBertForTokenClassification', 'TFMobileBertForTokenClassification', 'TFXLNetForTokenClassification', 'TFElectraForTokenClassification', 'TFFunnelForTokenClassification', 'TFMPNetForTokenClassification']

In [10]:
def ner_predict(text, model, tokenizer):
  nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
  ner_results = nlp(text)

  entity_list = [(i['word'], i['entity_group']) for i in ner_results]

  return ner_results, entity_list

In [11]:
def ner_extract(pred):

  entity_list = [(i['word'], i['entity_group']) for i in pred]

  # for i in pred:
  #   entity_list = [(i['word'], i['entity_group'], i['score'])]

  return entity_list

In [12]:
tokenizer_base = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

model_base = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=829.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=59.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433316646.0, style=ProgressStyle(descri…




In [13]:
preds['base-base-NER'], entity_list1 = ner_predict(test_str1, model_base, tokenizer_base)
preds['base-base-NER']

[{'end': 127,
  'entity_group': 'LOC',
  'score': 0.8377326726913452,
  'start': 125,
  'word': 'CA'},
 {'end': 134,
  'entity_group': 'ORG',
  'score': 0.6162852868437767,
  'start': 127,
  'word': '##LIFORNI'},
 {'end': 135,
  'entity_group': 'LOC',
  'score': 0.5811084508895874,
  'start': 134,
  'word': '##A'},
 {'end': 314,
  'entity_group': 'ORG',
  'score': 0.8829874992370605,
  'start': 290,
  'word': 'Plaintiff and Respondent'},
 {'end': 410,
  'entity_group': 'ORG',
  'score': 0.8993166208267211,
  'start': 394,
  'word': 'Alameda County C'},
 {'end': 419,
  'entity_group': 'LOC',
  'score': 0.5051717758178711,
  'start': 417,
  'word': 'SA'},
 {'end': 428,
  'entity_group': 'ORG',
  'score': 0.7513080358505249,
  'start': 419,
  'word': '##N LEANDRO'},
 {'end': 468,
  'entity_group': 'ORG',
  'score': 0.65428626537323,
  'start': 467,
  'word': 'C'},
 {'end': 607,
  'entity_group': 'ORG',
  'score': 0.6117914915084839,
  'start': 606,
  'word': 'A'},
 {'end': 656,
  'entity_

In [14]:
entity_list1

[('CA', 'LOC'),
 ('##LIFORNI', 'ORG'),
 ('##A', 'LOC'),
 ('Plaintiff and Respondent', 'ORG'),
 ('Alameda County C', 'ORG'),
 ('SA', 'LOC'),
 ('##N LEANDRO', 'ORG'),
 ('C', 'ORG'),
 ('A', 'ORG'),
 ('Hal', 'ORG'),
 ('##us Power Systems', 'ORG'),
 ('Louis A. Rigaud', 'ORG'),
 ('Hal', 'ORG'),
 ('##us Power Systems', 'ORG'),
 ('Hal', 'ORG'),
 ('##us Power', 'ORG'),
 ('Heron Bay Homeowners Association', 'ORG'),
 ('Heron Bay HOA', 'ORG'),
 ('Code of Civil Procedure', 'MISC'),
 ('City of San Leandro', 'ORG'),
 ('San Leandro', 'ORG'),
 ('Hal', 'ORG'),
 ('##us Power', 'ORG'),
 ('Code of Civil Procedure', 'MISC'),
 ('##C', 'ORG'),
 ('San Lea', 'LOC'),
 ('##nd', 'ORG'),
 ('##ro', 'LOC'),
 ('San Francisco Bay', 'LOC'),
 ('Halus Power', 'ORG'),
 ('San Leandro', 'LOC'),
 ('California', 'LOC'),
 ('Cal', 'ORG')]

In [15]:
tokenizer_distil_con = AutoTokenizer.from_pretrained("elastic/distilbert-base-cased-finetuned-conll03-english")

model_distil_con = AutoModelForTokenClassification.from_pretrained("elastic/distilbert-base-cased-finetuned-conll03-english")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=954.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=257.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=260832555.0, style=ProgressStyle(descri…




In [16]:
preds['distilbert-cased'], entity_list2 = ner_predict(test_str1, model_distil_con, tokenizer_distil_con)
preds['distilbert-cased'], entity_list2

([{'end': 127,
   'entity_group': 'ORG',
   'score': 0.46831971406936646,
   'start': 125,
   'word': 'CA'},
  {'end': 128,
   'entity_group': 'ORG',
   'score': 0.524165689945221,
   'start': 127,
   'word': '##L'},
  {'end': 130,
   'entity_group': 'ORG',
   'score': 0.8408035635948181,
   'start': 128,
   'word': '##IF'},
  {'end': 132,
   'entity_group': 'ORG',
   'score': 0.4889625310897827,
   'start': 130,
   'word': '##OR'},
  {'end': 134,
   'entity_group': 'ORG',
   'score': 0.45630520582199097,
   'start': 132,
   'word': '##NI'},
  {'end': 135,
   'entity_group': 'ORG',
   'score': 0.671537458896637,
   'start': 134,
   'word': '##A'},
  {'end': 177,
   'entity_group': 'ORG',
   'score': 0.536799430847168,
   'start': 176,
   'word': '##P'},
  {'end': 179,
   'entity_group': 'ORG',
   'score': 0.5410783290863037,
   'start': 177,
   'word': '##EL'},
  {'end': 181,
   'entity_group': 'ORG',
   'score': 0.3587299585342407,
   'start': 179,
   'word': '##LA'},
  {'end': 183,
 

In [17]:
entity_list2

[('CA', 'ORG'),
 ('##L', 'ORG'),
 ('##IF', 'ORG'),
 ('##OR', 'ORG'),
 ('##NI', 'ORG'),
 ('##A', 'ORG'),
 ('##P', 'ORG'),
 ('##EL', 'ORG'),
 ('##LA', 'ORG'),
 ('##TE', 'ORG'),
 ('AS', 'ORG'),
 ('##SO', 'ORG'),
 ('##CI', 'ORG'),
 ('##AT', 'ORG'),
 ('##ION', 'ORG'),
 ('Plain', 'ORG'),
 ('##ti', 'ORG'),
 ('##ff and Respondent', 'ORG'),
 ('Al', 'LOC'),
 ('##amed', 'LOC'),
 ('##a County', 'LOC'),
 ('SA', 'MISC'),
 ('##N', 'MISC'),
 ('##EA', 'LOC'),
 ('##RO', 'LOC'),
 ('Super', 'LOC'),
 ('Re', 'ORG'),
 ('##sp', 'ORG'),
 ('##ond', 'ORG'),
 ('##ent', 'ORG'),
 ('A', 'ORG'),
 ('##lant', 'ORG'),
 ('al', 'ORG'),
 ('Real Parties in Interest', 'ORG'),
 ('Appellants', 'ORG'),
 ('Hal', 'ORG'),
 ('##us Power Systems', 'ORG'),
 ('Louis A. Rigaud', 'ORG'),
 ('d', 'ORG'),
 ('##ba', 'ORG'),
 ('Hal', 'ORG'),
 ('##us Power Systems', 'ORG'),
 ('Hal', 'ORG'),
 ('##us Power', 'ORG'),
 ('Hero', 'ORG'),
 ('##n Bay Homeowners Association', 'ORG'),
 ('Hero', 'ORG'),
 ('##n Bay HOA', 'ORG'),
 ('Code of Civil Procedur

### SpaCy

In [18]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [19]:
test1 = nlp(test_str1)
test2 = nlp(test_str2)

In [20]:
test1_ents = [(X.text, X.label_) for X in test1.ents]
test2_ents = [(X.text, X.label_) for X in test2.ents]
test1_ents

[('1/12/18', 'CARDINAL'),
 ('CALIFORNIA', 'GPE'),
 ('FIRST', 'ORDINAL'),
 ('FOUR', 'CARDINAL'),
 ('HERON BAY HOMEOWNERS', 'LOC'),
 ('Plaintiff', 'ORG'),
 ('Respondent', 'GPE'),
 ('A143985', 'PERSON'),
 ('Alameda County', 'GPE'),
 ('SAN LEANDRO', 'GPE'),
 ('Super', 'ORG'),
 ('Respondent, Defendant', 'ORG'),
 ('Appellant', 'ORG'),
 ('HALUS POWER SYSTEMS et al.', 'ORG'),
 ('Appellants', 'ORG'),
 ('Halus Power Systems', 'ORG'),
 ('Louis A. Rigaud', 'PERSON'),
 ('dba Halus Power Systems', 'ORG'),
 ('Halus Power', 'ORG'),
 ('Heron Bay Homeowners Association', 'ORG'),
 ('Heron Bay HOA', 'PERSON'),
 ('1021.5', 'CARDINAL'),
 ('Defendant City of San Leandro', 'GPE'),
 ('San Leandro', 'GPE'),
 ('Halus Power', 'ORG'),
 ('1', 'CARDINAL'),
 ('the Code of Civil Procedure', 'WORK_OF_ART'),
 ('1', 'CARDINAL'),
 ('San Leandro', 'GPE'),
 ('the San Francisco Bay', 'LOC'),
 ('Halus Power', 'FAC'),
 ('approximately five-acre', 'QUANTITY'),
 ('San Leandro', 'GPE'),
 ('The California Environmental Quality Act

In [21]:
[(X, X.ent_iob_, X.ent_type_) for X in test1]

[(Filed, 'O', ''), (1/12/18, 'B', 'CARDINAL'), (
  
                              , 'O', ''), (CERTIFIED, 'O', ''), (FOR,
  'O',
  ''), (PUBLICATION, 'O', ''), (
  
                  , 'O', ''), (IN, 'O', ''), (THE, 'O', ''), (COURT,
  'O',
  ''), (OF, 'O', ''), (APPEAL, 'O', ''), (OF, 'O', ''), (THE, 'O', ''), (STATE,
  'O',
  ''), (OF, 'O', ''), (CALIFORNIA, 'B', 'GPE'), (
  
                                 , 'O', ''), (FIRST,
  'B',
  'ORDINAL'), (APPELLATE, 'O', ''), (DISTRICT, 'O', ''), (
  
                                        , 'O', ''), (DIVISION, 'O', ''), (FOUR,
  'B',
  'CARDINAL'), (
  
  , 'O', ''), (HERON, 'B', 'LOC'), (BAY, 'I', 'LOC'), (HOMEOWNERS,
  'I',
  'LOC'), (, 'O', ''), (ASSOCIATION, 'O', ''), (,, 'O', ''), (
          , 'O', ''), (Plaintiff, 'B', 'ORG'), (and, 'O', ''), (Respondent,
  'B',
  'GPE'), (,, 'O', ''), (                 , 'O', ''), (A143985,
  'B',
  'PERSON'), (
  , 'O', ''), (v., 'O', ''), (                                                ,
  'O

In [22]:
labels1 = [x.label_ for x in test1.ents]
labels2 = [x.label_ for x in test2.ents]
Counter(labels1), Counter(labels2)

(Counter({'CARDINAL': 232,
          'DATE': 51,
          'FAC': 4,
          'GPE': 59,
          'LAW': 7,
          'LOC': 17,
          'MONEY': 59,
          'ORDINAL': 18,
          'ORG': 158,
          'PERCENT': 9,
          'PERSON': 127,
          'PRODUCT': 9,
          'QUANTITY': 3,
          'TIME': 7,
          'WORK_OF_ART': 11}),
 Counter({'CARDINAL': 259,
          'DATE': 129,
          'GPE': 39,
          'LANGUAGE': 1,
          'LAW': 5,
          'MONEY': 35,
          'NORP': 8,
          'ORDINAL': 36,
          'ORG': 124,
          'PERCENT': 40,
          'PERSON': 111,
          'PRODUCT': 1,
          'TIME': 14,
          'WORK_OF_ART': 10}))

In [23]:
items1 = [x.text for x in test1.ents]
items2 = [x.text for x in test2.ents]
Counter(items1).most_common(3), Counter(items2).most_common(3)

([('Heron Bay HOA', 71), ('San Leandro', 30), ('supra', 21)],
 [('4th', 25), ('California', 19), ('Rogers', 15)])

In [24]:
displacy.render(test1, jupyter=True, style='ent')

In [25]:
displacy.render(test2, jupyter=True, style='ent')

In [26]:
def spacy_ner(text):
  res = nlp(text)
  displacy.render(res, jupyter=True, style='ent')

  return res

### NLTK

In [27]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import conlltags2tree, tree2conlltags

from nltk.draw import TreeWidget
from nltk.draw.util import CanvasFrame
from IPython.display import Image

In [28]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [29]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    tagged = nltk.pos_tag(sent)
    return tagged

In [30]:
proc_str1 = preprocess(test_str1)
proc_str1

[('Filed', 'VBD'),
 ('1/12/18', 'CD'),
 ('CERTIFIED', 'NNP'),
 ('FOR', 'NNP'),
 ('PUBLICATION', 'NNP'),
 ('IN', 'NNP'),
 ('THE', 'NNP'),
 ('COURT', 'NNP'),
 ('OF', 'NNP'),
 ('APPEAL', 'NNP'),
 ('OF', 'IN'),
 ('THE', 'NNP'),
 ('STATE', 'NNP'),
 ('OF', 'NNP'),
 ('CALIFORNIA', 'NNP'),
 ('FIRST', 'NNP'),
 ('APPELLATE', 'NNP'),
 ('DISTRICT', 'NNP'),
 ('DIVISION', 'NNP'),
 ('FOUR', 'NNP'),
 ('HERON', 'NNP'),
 ('BAY', 'NNP'),
 ('HOMEOWNERS', 'NNP'),
 ('ASSOCIATION', 'NNP'),
 (',', ','),
 ('Plaintiff', 'NNP'),
 ('and', 'CC'),
 ('Respondent', 'NNP'),
 (',', ','),
 ('A143985', 'NNP'),
 ('v.', 'NN'),
 ('(', '('),
 ('Alameda', 'NNP'),
 ('County', 'NNP'),
 ('CITY', 'NNP'),
 ('OF', 'NNP'),
 ('SAN', 'NNP'),
 ('LEANDRO', 'NNP'),
 (',', ','),
 ('Super', 'NNP'),
 ('.', '.'),
 ('Ct.', 'NNP'),
 ('No', 'NNP'),
 ('.', '.'),
 ('RG13677840', 'NNP'),
 (')', ')'),
 ('Respondent', 'NNP'),
 (',', ','),
 ('Defendant', 'NNP'),
 (',', ','),
 ('and', 'CC'),
 ('Appellant', 'NNP'),
 (';', ':'),
 ('HALUS', 'NNP'),
 ('PO

In [31]:
proc_str2 = preprocess(test_str2)
proc_str2

[('Filed', 'VBD'),
 ('11/10/15', 'CD'),
 ('CERTIFIED', 'NNP'),
 ('FOR', 'NNP'),
 ('PUBLICATION', 'NNP'),
 ('IN', 'NNP'),
 ('THE', 'NNP'),
 ('COURT', 'NNP'),
 ('OF', 'NNP'),
 ('APPEAL', 'NNP'),
 ('OF', 'IN'),
 ('THE', 'NNP'),
 ('STATE', 'NNP'),
 ('OF', 'NNP'),
 ('CALIFORNIA', 'NNP'),
 ('FIRST', 'NNP'),
 ('APPELLATE', 'NNP'),
 ('DISTRICT', 'NNP'),
 ('DIVISION', 'NNP'),
 ('ONE', 'NNP'),
 ('JOEL', 'NNP'),
 ('I.', 'NNP'),
 ('ROOS', 'NNP'),
 ('et', 'FW'),
 ('al.', 'NN'),
 (',', ','),
 ('Plaintiffs', 'NNP'),
 ('and', 'CC'),
 ('Respondents', 'NNP'),
 (',', ','),
 ('A142156', 'NNP'),
 ('v.', 'NNP'),
 ('HONEYWELL', 'NNP'),
 ('INTERNATIONAL', 'NNP'),
 (',', ','),
 ('INC.', 'NNP'),
 (',', ','),
 ('(', '('),
 ('San', 'NNP'),
 ('Francisco', 'NNP'),
 ('Super', 'NNP'),
 ('.', '.'),
 ('Ct.', 'NNP'),
 ('No', 'NNP'),
 ('.', '.'),
 ('CGC-04-436205', 'NNP'),
 (')', ')'),
 ('Defendant', 'NNP'),
 ('and', 'CC'),
 ('Respondent', 'NNP'),
 (',', ','),
 ('and', 'CC'),
 ('ART', 'NNP'),
 ('ROGERS', 'NNP'),
 ('et', 

In [32]:
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [33]:
chunk_parser = nltk.RegexpParser(pattern)
chunked = chunk_parser.parse(proc_str1)
type(chunked)
# chunked.draw()
# tree = nltk.Tree(chunked)

nltk.tree.Tree

In [34]:
# !pip install svgling

In [35]:
def disp_tree(chunked):
  nltk.draw.tree.TreeView(chunked)._cframe.print_to_file('output.ps')
  os.system('convert output.ps output.png')

  out_tree = Image(filename='output.png')
  
  return out_tree

In [36]:
def jupyter_draw_nltk_tree(chunked):
    cf = CanvasFrame()
    tc = TreeWidget(cf.canvas(), tree)
    tc['node_font'] = 'arial 13 bold'
    tc['leaf_font'] = 'arial 14'
    tc['node_color'] = '#005990'
    tc['leaf_color'] = '#3F8F57'
    tc['line_color'] = '#175252'
    cf.add_widget(tc, 10, 10)
    cf.print_to_file('tmp_tree_output.ps')
    cf.destroy()
    os.system('convert tmp_tree_output.ps tmp_tree_output.png')
    display(Image(filename='tmp_tree_output.png'))
    os.system('rm tmp_tree_output.ps tmp_tree_output.png')

In [37]:
# print(jupyter_draw_nltk_tree(chunked))

In [38]:
iob_tagged = tree2conlltags(chunked)
iob_tagged

[('Filed', 'VBD', 'O'),
 ('1/12/18', 'CD', 'O'),
 ('CERTIFIED', 'NNP', 'O'),
 ('FOR', 'NNP', 'O'),
 ('PUBLICATION', 'NNP', 'O'),
 ('IN', 'NNP', 'O'),
 ('THE', 'NNP', 'O'),
 ('COURT', 'NNP', 'O'),
 ('OF', 'NNP', 'O'),
 ('APPEAL', 'NNP', 'O'),
 ('OF', 'IN', 'O'),
 ('THE', 'NNP', 'O'),
 ('STATE', 'NNP', 'O'),
 ('OF', 'NNP', 'O'),
 ('CALIFORNIA', 'NNP', 'O'),
 ('FIRST', 'NNP', 'O'),
 ('APPELLATE', 'NNP', 'O'),
 ('DISTRICT', 'NNP', 'O'),
 ('DIVISION', 'NNP', 'O'),
 ('FOUR', 'NNP', 'O'),
 ('HERON', 'NNP', 'O'),
 ('BAY', 'NNP', 'O'),
 ('HOMEOWNERS', 'NNP', 'O'),
 ('ASSOCIATION', 'NNP', 'O'),
 (',', ',', 'O'),
 ('Plaintiff', 'NNP', 'O'),
 ('and', 'CC', 'O'),
 ('Respondent', 'NNP', 'O'),
 (',', ',', 'O'),
 ('A143985', 'NNP', 'O'),
 ('v.', 'NN', 'B-NP'),
 ('(', '(', 'O'),
 ('Alameda', 'NNP', 'O'),
 ('County', 'NNP', 'O'),
 ('CITY', 'NNP', 'O'),
 ('OF', 'NNP', 'O'),
 ('SAN', 'NNP', 'O'),
 ('LEANDRO', 'NNP', 'O'),
 (',', ',', 'O'),
 ('Super', 'NNP', 'O'),
 ('.', '.', 'O'),
 ('Ct.', 'NNP', 'O'),
 (

In [39]:
ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(test_str1)))
# ne_tree

In [40]:
def nltk_ner(text, pattern = 'NP: {<DT>?<JJ>*<NN>}', ):
  proc = preprocess(text)

  cp = nltk.RegexpParser(pattern)
  cs = cp.parse(proc_str1)

  iob_tagged = tree2conlltags(cs)

  ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(test_str1)))

  

## Counsel Extractor 

In [41]:
def petCounsel(sentences):
  appellant_sub = 'for Defendant and Appellant'
  appellant_sentence = [x for x in sentences if re.search(appellant_sub,x)] 

  appellant_sub = 'for Defendant/Appellant'
  appellant_sentence += [x for x in sentences if re.search(appellant_sub,x)]

  appellant_sub = 'for\nDefendant'
  appellant_sentence += [x for x in sentences if re.search(appellant_sub,x)]

  appellant_sub = 'for Petitioner'
  appellant_sentence += [x for x in sentences if re.search(appellant_sub,x)]

  appellant_sub = 'for Defendant'
  appellant_sentence += [x for x in sentences if re.search(appellant_sub,x)]

  things_to_remove = ['for', 'counsel', 'Appellant', 'and', 'Petitioner', 'Defendant', 'Respondent', 'Appeal', 'under appointment by the Court of']

  for i in things_to_remove:
    appellant_sentence = [sub.replace(i, '') for sub in appellant_sentence]

  petitioner_counsel = [sub.translate(str.maketrans('', '', string.punctuation)).strip() for sub in appellant_sentence]
  # petitioner_counsel = list(dict.fromkeys(petitioner_counsel))
  return ' '.join(petitioner_counsel)

In [42]:
def pet_counsel_check(case_file):

  sentences1 = case_file.split('\n')
  
  appellant_sub = 'for Defendant and Appellant'
  appellant_sentence1 = [x for x in sentences1 if re.search(appellant_sub,x)] 
  # print(appellant_sentence1)

  appellant_sub = 'for Defendant/Appellant'
  appellant_sentence2 = [x for x in sentences1 if re.search(appellant_sub,x)]
  # print(appellant_sentence2)

  appellant_sub = 'for\nDefendant'
  appellant_sentence3 = [x for x in sentences1 if re.search(appellant_sub,x)]
  # print(appellant_sentence3)

  appellant_sub = 'for Petitioner'
  appellant_sentence4 = [x for x in sentences1 if re.search(appellant_sub,x)]
  # print(appellant_sentence4)

  appellant_sub = 'for Defendant'
  appellant_sentence5 = [x for x in sentences1 if re.search(appellant_sub,x)]
  print(appellant_sentence5)

  things_to_remove = ['for', 'counsel', 'Counsel', 'Appellant', 'and', 'Petitioner', 'Defendant', 'Respondent', 'Appeal', 'under appointment by the Court of']

  # for i in things_to_remove:
  #   appellant_sentence = [sub.replace(i, '') for sub in appellant_sentence1]
  # print(appellant_sentence)

  # for i in things_to_remove:
  #   appellant_sentence = [sub.replace(i, '') for sub in appellant_sentence2]
  # print(appellant_sentence)

  # for i in things_to_remove:
  #   appellant_sentence = [sub.replace(i, '') for sub in appellant_sentence3]
  # print(appellant_sentence)
  
  # for i in things_to_remove:
  #   appellant_sentence = [sub.replace(i, '') for sub in appellant_sentence4]
  # print(appellant_sentence)

  for i in things_to_remove:
    appellant_sentence5 = [(sub.replace(i, '')) for sub in appellant_sentence5]
  print(appellant_sentence5)

  petitioner_counsel = [sub.translate(str.maketrans('', '', string.punctuation)).strip() for sub in appellant_sentence5]

  return ' '.join(petitioner_counsel)

In [43]:
# Function Testing
# pet_counsel_check(df['CaseFile'][10])

In [44]:
# Component Testing
sentences1 = df['CaseFile'][10].split('\n')
sentences1

['Filed 11/10/15',
 '                           CERTIFIED FOR PUBLICATION',
 '',
 '             IN THE COURT OF APPEAL OF THE STATE OF CALIFORNIA',
 '',
 '                             FIRST APPELLATE DISTRICT',
 '',
 '                                      DIVISION ONE',
 '',
 '',
 'JOEL I. ROOS et al.,',
 '        Plaintiffs and Respondents,',
 '                                                   A142156',
 'v.',
 'HONEYWELL INTERNATIONAL, INC.,                     (San Francisco',
 '                                                   Super. Ct. No. CGC-04-436205)',
 '        Defendant and Respondent, and',
 'ART ROGERS et al.,',
 '        Objectors and Appellants.',
 '',
 '',
 '        Four objectors—Art Rogers, Chuck Congdon, Richard Moser, and Amanda',
 'Waldenville—appeal from the trial court’s order approving an $8.15 million settlement of',
 'a class action against Honeywell International Inc. and awarding a portion of the',
 'settlement as fees to class counsel. The trial court fo

In [45]:
appellant_sub = 'for Defendant'
# appellant_sentence5 = [x for x in sentences1 if re.search(appellant_sub,x)]
flag = 0
for idx, sentence in enumerate(sentences1):

  if idx == 878:
    print(idx, ':', sentence)
  if re.search(appellant_sub, sentence):
    flag = 4

  if flag>0:
    print(idx, ':', sentence)
    flag -= 1

# print(appellant_sentence5)

878 :                              Alexander M. Schack
880 : Counsel for Defendant and    O’Melveny & Myers LLP, Michael Frederick Tubach,
881 : Respondent Honeywell         Christina J. Brown
882 : International, Inc.
883 : 


In [46]:
appellant_sub = 'for Defendant'
appellant_sentence5 = [x for x in sentences1 if re.search(appellant_sub,x)]
appellant_sentence5

['Counsel for Defendant and    O’Melveny & Myers LLP, Michael Frederick Tubach,']

In [47]:
things_to_remove = ['for', 'counsel', 'Counsel', 'Appellant', 'and', 'Petitioner', 'Defendant', 'Respondent', 'Appeal', 'under appointment by the Court of']

for i in things_to_remove:
    appellant_sentence5 = [(sub.replace(i, '')) for sub in appellant_sentence5]
    
appellant_sentence5

['       O’Melveny & Myers LLP, Michael Frederick Tubach,']

In [48]:
petitioner_counsel = [sub.translate(str.maketrans('', '', string.punctuation)).strip() for sub in appellant_sentence5]
petitioner_counsel = ' '.join(petitioner_counsel)
petitioner_counsel

'O’Melveny  Myers LLP Michael Frederick Tubach'

In [49]:
spacy_ner(petitioner_counsel)

O’Melveny  Myers LLP Michael Frederick Tubach

In [241]:
def ner_tester(row, fl=2):

  sentences = row['CaseFile'].split('\n')

  # Pre Post sentence check
  appellant_sub = 'for Defendant'
  flag = 0
  appellant_sentence = []
  
  for idx, sentence in enumerate(sentences):

    if re.search(appellant_sub, sentence):
      flag = fl

      # Printing previous (flag) number of lines
      # for i in range(flag):
      #   # print(f"i: {i}, idx: {idx}, idx-i: {idx-i}")
      #   print(idx-flag+i, ':', sentences[idx-flag+i])

      # Printing next (flag) number of lines and appending
      for i in range(flag):
        print(idx+i, ':', sentences[idx+i])
        appellant_sentence.append(sentences[idx+i])
        # print(appellant_sentence)
      

    # if flag>0:
    #   print(flag, ':', sentence)
    #   # appellant_sentence.append(sentence)

    #   flag -= 1

  print('\n')
  print(f"Extracted Appellant Sentence: {appellant_sentence}")

  # Keyword Removal
  things_to_remove = ['for', 'counsel', 'Counsel', 'Appellant', 'and Appellant' 'Petitioner', 'Defendant', 'Defendant and', 'Respondent', 'Appeal', 'under appointment by the Court of',
                      # 'and',

                      ]

  for i in things_to_remove:
      appellant_sentence = [(sub.replace(i, '')) for sub in appellant_sentence]

  # appellant_sentence = [sub.translate(str.maketrans('', '', string.punctuation)).strip() for sub in appellant_sentence]
  # appellant_sentence = [re.sub('/^,|,$|(,)+/g', '$1', sub) for sub in appellant_sentence]
  appellant_sentence = ' '.join(appellant_sentence)
  print(f"Removed Sentence: {appellant_sentence}")

  # Running NER function
  print("\nRunning NER on sentence")
  spacy_ner(appellant_sentence)
  preds, entity_list = ner_predict(appellant_sentence, model_base, tokenizer_base)
  print(f"\nResult of NER using Transformer {entity_list}")

  # Counsel list creation and processing
  counsel_list = appellant_sentence.split(',')
  counsel_list = [x.strip().strip('.') for x in counsel_list if x.split()]
  # counsel_list = [x for x in counsel_list if x]
  # counsel_list = [x.strip('and').strip(' ') for x in counsel_list if x.strip('and')]
  counsel_list = [x.replace('and', '').strip() for x in counsel_list if x.replace('and', '').strip()]
  # counsel_list = [x for x in counsel_list if x]
  # print(counsel_list)

  print("\nRunning NER on list items")
  ner_list = [spacy_ner(x) for x in counsel_list]
  print(f"Punctuation Removed: {counsel_list}") # vs NER Extracted: {ner_list}")

  
  print(f"Counsel present in table: {row['PetitionerCounsel']}")

  return counsel_list, appellant_sentence, entity_list

In [242]:
petitioner_counsel, appellant_sentence, entity_list = ner_tester(df.iloc[9])

858 : Counsel for Defendant,           Myers Nave, Richard Delmendo PioRoda and
859 : Respondent, and Appellant,       Edward A. Grutzmacher.


Extracted Appellant Sentence: ['Counsel for Defendant,           Myers Nave, Richard Delmendo PioRoda and', 'Respondent, and Appellant,       Edward A. Grutzmacher.']
Removed Sentence:   ,           Myers Nave, Richard Delmendo PioRoda and , and ,       Edward A. Grutzmacher.

Running NER on sentence



Result of NER using Transformer [('Myers Nave', 'PER'), ('Richard Delmendo PioRoda', 'PER'), ('Edward A. Grutzmacher', 'PER')]
['Myers Nave', 'Richard Delmendo PioRoda', 'Edward A. Grutzmacher']

Running NER on list items



[W006] No entities to visualize found in Doc object. If this is surprising to you, make sure the Doc was processed using a model that supports named entity recognition, and check the `doc.ents` property manually if necessary.



Punctuation Removed: ['Myers Nave', 'Richard Delmendo PioRoda', 'Edward A. Grutzmacher']
Counsel present in table: Counsel             Myers Nave Richard Delmendo PioRoda


In [243]:
# print([x.strip('and').strip() for x in petitioner_counsel if x.strip('and')])

In [244]:
petitioner_counsel, appellant_sentence, entity_list = ner_tester(df.iloc[10]) # Org on left side causes an issue

880 : Counsel for Defendant and    O’Melveny & Myers LLP, Michael Frederick Tubach,
881 : Respondent Honeywell         Christina J. Brown


Extracted Appellant Sentence: ['Counsel for Defendant and    O’Melveny & Myers LLP, Michael Frederick Tubach,', 'Respondent Honeywell         Christina J. Brown']
Removed Sentence:    and    O’Melveny & Myers LLP, Michael Frederick Tubach,  Honeywell         Christina J. Brown

Running NER on sentence



Result of NER using Transformer [('O ’ Melveny & Myers LLP', 'ORG'), ('Michael Frederick Tu', 'PER'), ('##bach', 'ORG'), ('Honey', 'ORG'), ('Christina', 'PER'), ('J', 'ORG'), ('Brown', 'PER')]
['O’Melveny & Myers LLP', 'Michael Frederick Tubach', 'Honeywell         Christina J. Brown']

Running NER on list items


Punctuation Removed: ['O’Melveny & Myers LLP', 'Michael Frederick Tubach', 'Honeywell         Christina J. Brown']
Counsel present in table: Counsel       O’Melveny  Myers LLP Michael Frederick Tubach


In [245]:
petitioner_counsel, appellant_sentence, entity_list = ner_tester(df.iloc[14]) # split on and as well, dont remove

32 :                    Ferrentino & Associates and Correen Ferrentino, for Defendant and
33 : Appellant.


Extracted Appellant Sentence: ['                   Ferrentino & Associates and Correen Ferrentino, for Defendant and', 'Appellant.']
Removed Sentence:                    Ferrentino & Associates and Correen Ferrentino,   and .

Running NER on sentence



Result of NER using Transformer [('Ferrentino & Associates', 'ORG'), ('Correen Ferrentino', 'ORG')]
['Ferrentino & Associates  Correen Ferrentino']

Running NER on list items


Punctuation Removed: ['Ferrentino & Associates  Correen Ferrentino']
Counsel present in table: Ferrentino  Associates  Correen Ferrentino


In [237]:
print(petitioner_counsel)
# print([x.strip('and').split('and') for x in petitioner_counsel if x.strip('and').split('and')])
print([x.split('and') for x in petitioner_counsel if x])

for x in petitioner_counsel:
  if x:
    print(x.split('and'))
    # print(x.split('and'))

['Ferrentino & Associates  Correen Ferrentino']
[['Ferrentino & Associates  Correen Ferrentino']]
['Ferrentino & Associates  Correen Ferrentino']


In [238]:
petitioner_counsel, appellant_sentence, entity_list = ner_tester(df.iloc[df[df['PetitionerCounsel'].notnull()].index[7]])

37 :          Robert McLaughlin, under appointment by the Court of Appeal, for Defendant and
38 : 


Extracted Appellant Sentence: ['         Robert McLaughlin, under appointment by the Court of Appeal, for Defendant and', '']
Removed Sentence:          Robert McLaughlin,  ,   and 

Running NER on sentence



Result of NER using Transformer [('Robert McLaughlin', 'PER')]
['Robert McLaughlin']

Running NER on list items


Punctuation Removed: ['Robert McLaughlin']
Counsel present in table: Robert McLaughlin


In [221]:
print([x.replace('and', '').strip() for x in petitioner_counsel if x.strip('and')])


# for idx, x in enumerate(petitioner_counsel):
#   if x:
#     print(idx, x.replace('and', ''))

['Myers Nave', 'Richard Delmendo PioRoda', 'Edward A. Grutzmacher']


In [239]:
petitioner_counsel, appellant_sentence, entity_list = ner_tester(df.iloc[df[df['PetitionerCounsel'].notnull()].index[8]])
# petitioner_counsel

30 :          Allen G. Weinberg, under appointment by the Court of Appeal, for Defendant and
31 : 
34 :          Patricia L. Brisbois, under appointment by the Court of Appeal, for Defendant and
35 : 
38 :          Susan S. Bauguess, under appointment by the Court of Appeal, for Defendant and
39 : 


Extracted Appellant Sentence: ['         Allen G. Weinberg, under appointment by the Court of Appeal, for Defendant and', '', '         Patricia L. Brisbois, under appointment by the Court of Appeal, for Defendant and', '', '         Susan S. Bauguess, under appointment by the Court of Appeal, for Defendant and', '']
Removed Sentence:          Allen G. Weinberg,  ,   and           Patricia L. Brisbois,  ,   and           Susan S. Bauguess,  ,   and 

Running NER on sentence



Result of NER using Transformer [('Allen G. Weinberg', 'PER'), ('Patricia L. Brisbois', 'PER'), ('Susan S. Bauguess', 'PER')]
['Allen G. Weinberg', 'Patricia L. Brisbois', 'Susan S. Bauguess']

Running NER on list items


Punctuation Removed: ['Allen G. Weinberg', 'Patricia L. Brisbois', 'Susan S. Bauguess']
Counsel present in table: Allen G Weinberg Patricia L Brisbois Susan S Bauguess


In [240]:
petitioner_counsel = ner_tester(df.iloc[df[df['PetitionerCounsel'].notnull()].index[9]])

28 :         Pauline E. Villanueva, under appointment by the Court of Appeal, for Defendant
29 : 


Extracted Appellant Sentence: ['        Pauline E. Villanueva, under appointment by the Court of Appeal, for Defendant', '']
Removed Sentence:         Pauline E. Villanueva,  ,   

Running NER on sentence



Result of NER using Transformer [('Pauline E. Villanueva', 'PER')]
['Pauline E. Villanueva']

Running NER on list items


Punctuation Removed: ['Pauline E. Villanueva']
Counsel present in table: Pauline E Villanueva


In [67]:
petitioner_counsel, appellant_sentence, entity_list = ner_tester(df.iloc[df[df['PetitionerCounsel'].notnull()].index[11]])

33 :                    Erica Gambale, under appointment by the Court of Appeal, for Defendant
34 : and Appellant.


Extracted Appellant Sentence: ['                   Erica Gambale, under appointment by the Court of Appeal, for Defendant', 'and Appellant.']
Removed Sentence:                    Erica Gambale,  ,    .

Running NER on sentence



Result of NER using Transformer [('Erica Gambale', 'PER')]

Running NER on list items



[W006] No entities to visualize found in Doc object. If this is surprising to you, make sure the Doc was processed using a model that supports named entity recognition, and check the `doc.ents` property manually if necessary.



Punctuation Removed: ['Erica Gambale', '']
Counsel present in table: Erica Gambale


- Print internal function prints
- Print appellant_sentence, final version print and return
- Run spacy_ner on appellant_sentence
- Run spacy_ner on counsel_list
- Return counsel_list

- ~~Next line, third counsel name missing~~
  - Use flag=1 OR check until next empty string on next line (flag=3or4, append until empty) 
- ~~Punctuations should not get removed since they separate counsel names~~
  - ~~Split on , and remove~~

In [251]:
def extract_petitioners(row, fl=2, print_st=True):

  sentences = row['CaseFile'].split('\n')

  # Pre Post sentence check
  appellant_sub = 'for Defendant'
  flag = 0
  appellant_sentence = []
  

  for idx, sentence in enumerate(sentences):

    if re.search(appellant_sub, sentence):
      # flag = fl

      # Printing next (flag) number of lines and appending
      for i in range(fl):
        if print_st:
          print(idx+i, ':', sentences[idx+i])
        appellant_sentence.append(sentences[idx+i])
    

  if print_st:
    print('\n')
    print(f"Extracted Appellant Sentence: {appellant_sentence}")

  # Keyword Removal
  things_to_remove = ['for', 'counsel', 'Counsel', 'Appellant', 'and Appellant' 'Petitioner', 'Defendant', 'Defendant and', 'Respondent', 'Appeal', 'under appointment by the Court of',
                      # 'and',
                      ]

  for i in things_to_remove:
      appellant_sentence = [(sub.replace(i, '')) for sub in appellant_sentence]

  appellant_sentence = ' '.join(appellant_sentence)

  if print_st:
    print(f"Removed Sentence: {appellant_sentence}")


  # Running NER function
  if print_st:
    print("\nRunning NER on sentence")

    # DisplaCy
    spacy_ner(appellant_sentence)

  preds, entity_list = ner_predict(appellant_sentence, model_base, tokenizer_base)

  if print_st:
    print(f"\nResult of NER using Transformer {entity_list}")


  # Counsel list creation and processing
  counsel_list = appellant_sentence.split(',')
  counsel_list = [x.strip().strip('.') for x in counsel_list if x.split()]
  counsel_list = [x.replace('and', '').strip() for x in counsel_list if x.replace('and', '').strip()]

  if print_st:
    print("\nRunning NER on list items")
    ner_list = [spacy_ner(x) for x in counsel_list]
    print(ner_list)
  if print_st:
    print(f"Punctuation Removed: {counsel_list}") # vs NER Extracted: {ner_list}")

  if print_st:
   print(f"Counsel present in table: {row['PetitionerCounsel']}")

  return counsel_list, appellant_sentence

In [255]:
petitioner_counsel, appellant_sentence = extract_petitioners(df.iloc[df[df['PetitionerCounsel'].notnull()].index[8]], print_st=True)
# petitioner_counsel

30 :          Allen G. Weinberg, under appointment by the Court of Appeal, for Defendant and
31 : 
34 :          Patricia L. Brisbois, under appointment by the Court of Appeal, for Defendant and
35 : 
38 :          Susan S. Bauguess, under appointment by the Court of Appeal, for Defendant and
39 : 


Extracted Appellant Sentence: ['         Allen G. Weinberg, under appointment by the Court of Appeal, for Defendant and', '', '         Patricia L. Brisbois, under appointment by the Court of Appeal, for Defendant and', '', '         Susan S. Bauguess, under appointment by the Court of Appeal, for Defendant and', '']
Removed Sentence:          Allen G. Weinberg,  ,   and           Patricia L. Brisbois,  ,   and           Susan S. Bauguess,  ,   and 

Running NER on sentence



Result of NER using Transformer [('Allen G. Weinberg', 'PER'), ('Patricia L. Brisbois', 'PER'), ('Susan S. Bauguess', 'PER')]

Running NER on list items


[Allen G. Weinberg, Patricia L. Brisbois, Susan S. Bauguess]
Punctuation Removed: ['Allen G. Weinberg', 'Patricia L. Brisbois', 'Susan S. Bauguess']
Counsel present in table: Allen G Weinberg Patricia L Brisbois Susan S Bauguess


Similarly for Respondent Counsel

In [None]:
def resCounsel(sentences):
  respondent_sub = 'for Plaintiff and Respondent'
  respondent_sentence = [x for x in sentences if re.search(respondent_sub,x)] 

  respondent_sub = 'for Respondent'
  respondent_sentence += [x for x in sentences if re.search(respondent_sub,x)]

  respondent_sub = 'for Plaintiff'
  respondent_sentence += [x for x in sentences if re.search(respondent_sub,x)]

  respondent_sub = 'for\nPlaintiff'
  respondent_sentence += [x for x in sentences if re.search(respondent_sub,x)]

  respondent_sub = 'for Plaintiff/Respondent'
  respondent_sentence += [x for x in sentences if re.search(respondent_sub,x)]

  things_to_remove = ['for', 'counsel', 'Respondent', 'and', 'Plaintiff', 'Defendant', 'petitioner', 'Appellant', 'under appointment by the Court of']

  for i in things_to_remove:
    respondent_sentence = [sub.replace(i, '') for sub in respondent_sentence]

  respondent_counsel = [sub.translate(str.maketrans('', '', string.punctuation)).strip() for sub in respondent_sentence]
  # petitioner_counsel = list(dict.fromkeys(petitioner_counsel))
  print(respondent_counsel)

  return ' '.join(respondent_counsel)

### New Counsel Extractor

In [None]:
# pet = pd.DataFrame(df['CaseFile'][5:500].apply(lambda x: petCounsel(x.split('\n'))))
# pet[pet['CaseFile'].notnull()]

In [None]:
# res = df['CaseFile'].apply(lambda x: resCounsel(x))
# res