In [3]:
import os
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow import keras


In [5]:
# down load nltk data
nltk.download('stopwords')
nltk.download('punkt')

# setting stopword
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/robert/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/robert/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 1.Load Data

In [7]:
justice_path = './dataset/'
justice_data = pd.read_csv(os.path.join(justice_path,'justice.csv'))

### 2.Preprocessing for Data

#### 2.1 subset of interested columns

In [9]:
columns_intersted = ['ID', 
                     'first_party',
                     'second_party',
                     'facts',
                     'first_party_winner']

In [10]:
justice_data_new = justice_data[columns_intersted].copy(deep=True)

#### 2.2A quick investigation of the new data

In [18]:
justice_data_new[justice_data_new['first_party'].isna()]

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
2762,61978,,In Re Winship,"<p>At age twelve, Samuel Winship was arrested ...",True


In [19]:
justice_data_new[justice_data_new['second_party'].isna()]

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
1257,54848,In re Bauer,,<p>Pro se petitioner Frederick W. Bauer sought...,False


In [20]:
justice_data_new[justice_data_new['first_party_winner'].isna()]

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
223,51803,United States,California,<p>Channel Islands National Monument is a nati...,
1322,54908,New Hampshire,Maine,"<p>In 1977, a dispute between New Hampshire an...",
1674,55282,Alaska,United States,<p>Alaska and the United States disputed owner...,
1721,55334,"Bank of China, New York Branch","NBM L.L.C., et al.",<p>Bank of China alleged that John Chou and Sh...,
1890,55514,State of New Jersey,State of Delaware,<p>When British Petroleum (BP) wanted to build...,
2023,55652,Alabama,North Carolina,<p>Several states belonging to the Southeast I...,
2137,55781,Montana,Wyoming and North Dakota,"<p>1950, Montana, Wyoming and North Dakota sig...",
2528,60033,Dusky,United States,<p>Dusky was charged with kidnapping and rape....,
2631,61030,South Carolina,Katzenbach,<p>The Voting Rights Act of 1965 prevented sta...,
2787,62121,Johnson,Louisiana,<p>The Louisiana State Constitution and Code o...,


As a results, there are NaN values in our data. We should remove/drop these noise rows.

#### 2.3 drop the rows with NaN or missing values 

In [26]:
justice_data_new.dropna(inplace=True)

In [34]:
justice_data_new#['facts'][1]#[justice_data_new['second_party'].isna()]

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,50606,Jane Roe,Henry Wade,"<p>In 1970, Jane Roe (a fictional name used in...",True
1,50613,"Peter Stanley, Sr.",Illinois,<p>Joan Stanley had three children with Peter ...,True
2,50623,John Giglio,United States,<p>John Giglio was convicted of passing forged...,True
3,50632,Sally Reed,Cecil Reed,"<p>The Idaho Probate Code specified that ""male...",True
4,50643,Marvin Miller,California,"<p>Miller, after conducting a mass mailing cam...",True
...,...,...,...,...,...
3298,63324,United States,Refugio Palomar-Santiago,"<p>Refugio Palomar-Santiago, a Mexican nationa...",True
3299,63323,Tarahrick Terry,United States,<p>Tarahrick Terry pleaded guilty to one count...,False
3300,63331,United States,Joshua James Cooley,<p>Joshua James Cooley was parked in his picku...,True
3301,63332,Florida,Georgia,<p>This is an ongoing case of original jurisdi...,False


### 3.Clean the data

In [None]:
def data_clean(text):
  text = text.lower()
  text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?|p", "", text)
  text_tokens = word_tokenize(text)
  text_stop = [word for word in text_tokens if word not in (stop)]
  text_new = ' '.join(text_stop)
  return text_new

In [92]:
def data_clean_1(first_party, second_party, fact):
    """This is to clean the facts. Will remove the first party and second party name
    
    Args:
        first_party:    first party name
        second_party:   second party name
        fact:           fact text
        
    return:
        cleaned fact"""
    # 1. lower-case for the text, also remove some punctuations 
    first_party_text = first_party.lower().replace(',', '')
    second_party_text = second_party.lower().replace(',', '')
    fact_text = fact.lower().replace(',', '')
    fact_text = fact_text.replace('<p>', '')
    fact_text = fact_text.replace('</p>\n', '')
    # print(fact_text)

    # 2. replace/remove the name of the first party and second party
    first_party_split = [x for x in first_party_text.split(' ') if x.strip()]
    second_party_split = [x for x in second_party_text.split(' ') if x.strip()]
    first_second_party = first_party_split + second_party_split
    # print(first_second_party)
    fact_split = [x for x in fact_text.split(' ') if x not in first_second_party]

    return fact_split

    



In [96]:
justice_data_new#['Cleaned_Facts'][0]

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner,Cleaned_Facts
0,50606,Jane Roe,Henry Wade,"<p>In 1970, Jane Roe (a fictional name used in...",True,"[in, 1970, (a, fictional, name, used, in, cour..."
1,50613,"Peter Stanley, Sr.",Illinois,<p>Joan Stanley had three children with Peter ...,True,"[joan, had, three, children, with, stanley., ,..."
2,50623,John Giglio,United States,<p>John Giglio was convicted of passing forged...,True,"[was, convicted, of, passing, forged, money, o..."
3,50632,Sally Reed,Cecil Reed,"<p>The Idaho Probate Code specified that ""male...",True,"[the, idaho, probate, code, specified, that, ""..."
4,50643,Marvin Miller,California,"<p>Miller, after conducting a mass mailing cam...",True,"[after, conducting, a, mass, mailing, campaign..."
...,...,...,...,...,...,...
3298,63324,United States,Refugio Palomar-Santiago,"<p>Refugio Palomar-Santiago, a Mexican nationa...",True,"[a, mexican, national, was, granted, permanent..."
3299,63323,Tarahrick Terry,United States,<p>Tarahrick Terry pleaded guilty to one count...,False,"[pleaded, guilty, to, one, count, of, possessi..."
3300,63331,United States,Joshua James Cooley,<p>Joshua James Cooley was parked in his picku...,True,"[was, parked, in, his, pickup, truck, on, the,..."
3301,63332,Florida,Georgia,<p>This is an ongoing case of original jurisdi...,False,"[this, is, an, ongoing, case, of, original, ju..."
