In [145]:
%load_ext autoreload
%autoreload 2

In [165]:
import os
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras


In [119]:
# down load nltk data
nltk.download('stopwords')
nltk.download('punkt')

# setting stopword
stop = stopwords.words('english')
stop.remove('no')
stop.remove('not')

# setting lemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/robert/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/robert/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 1.Load Data

In [7]:
justice_path = './dataset/'
justice_data = pd.read_csv(os.path.join(justice_path,'justice.csv'))

### 2.Preprocessing for Data

#### 2.1 subset of interested columns

In [9]:
columns_intersted = ['ID', 
                     'first_party',
                     'second_party',
                     'facts',
                     'first_party_winner']

In [10]:
justice_data_new = justice_data[columns_intersted].copy(deep=True)

#### 2.2A quick investigation of the new data

In [18]:
justice_data_new[justice_data_new['first_party'].isna()]

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
2762,61978,,In Re Winship,"<p>At age twelve, Samuel Winship was arrested ...",True


In [19]:
justice_data_new[justice_data_new['second_party'].isna()]

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
1257,54848,In re Bauer,,<p>Pro se petitioner Frederick W. Bauer sought...,False


In [20]:
justice_data_new[justice_data_new['first_party_winner'].isna()]

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
223,51803,United States,California,<p>Channel Islands National Monument is a nati...,
1322,54908,New Hampshire,Maine,"<p>In 1977, a dispute between New Hampshire an...",
1674,55282,Alaska,United States,<p>Alaska and the United States disputed owner...,
1721,55334,"Bank of China, New York Branch","NBM L.L.C., et al.",<p>Bank of China alleged that John Chou and Sh...,
1890,55514,State of New Jersey,State of Delaware,<p>When British Petroleum (BP) wanted to build...,
2023,55652,Alabama,North Carolina,<p>Several states belonging to the Southeast I...,
2137,55781,Montana,Wyoming and North Dakota,"<p>1950, Montana, Wyoming and North Dakota sig...",
2528,60033,Dusky,United States,<p>Dusky was charged with kidnapping and rape....,
2631,61030,South Carolina,Katzenbach,<p>The Voting Rights Act of 1965 prevented sta...,
2787,62121,Johnson,Louisiana,<p>The Louisiana State Constitution and Code o...,


As a results, there are NaN values in our data. We should remove/drop these noise rows.

#### 2.3 drop the rows with NaN or missing values 

In [26]:
justice_data_new.dropna(inplace=True)

In [34]:
justice_data_new#['facts'][1]#[justice_data_new['second_party'].isna()]

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,50606,Jane Roe,Henry Wade,"<p>In 1970, Jane Roe (a fictional name used in...",True
1,50613,"Peter Stanley, Sr.",Illinois,<p>Joan Stanley had three children with Peter ...,True
2,50623,John Giglio,United States,<p>John Giglio was convicted of passing forged...,True
3,50632,Sally Reed,Cecil Reed,"<p>The Idaho Probate Code specified that ""male...",True
4,50643,Marvin Miller,California,"<p>Miller, after conducting a mass mailing cam...",True
...,...,...,...,...,...
3298,63324,United States,Refugio Palomar-Santiago,"<p>Refugio Palomar-Santiago, a Mexican nationa...",True
3299,63323,Tarahrick Terry,United States,<p>Tarahrick Terry pleaded guilty to one count...,False
3300,63331,United States,Joshua James Cooley,<p>Joshua James Cooley was parked in his picku...,True
3301,63332,Florida,Georgia,<p>This is an ongoing case of original jurisdi...,False


### 3.Clean the data

In [None]:
def data_clean(text):
  text = text.lower()
  text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?|p", "", text)
  text_tokens = word_tokenize(text)
  text_stop = [word for word in text_tokens if word not in (stop)]
  text_new = ' '.join(text_stop)
  return text_new

In [141]:
def data_clean_1(first_party, second_party, fact):
    """This is to clean the facts. Will remove the first party and second party name
    
    Args:
        first_party:    first party name
        second_party:   second party name
        fact:           fact text
        
    return:
        cleaned fact"""
    # 1. lower-case for the text, also remove some punctuations 
    first_party_text = first_party.lower().replace(',', '')
    second_party_text = second_party.lower().replace(',', '')
    fact_text = fact.lower().replace(',', '')
    fact_text = fact_text.replace('<p>', '')
    fact_text = fact_text.replace('</p>\n', '')
    # print(fact_text)

    # 2. replace/remove the name of the first party and second party
    first_party_split = [x for x in first_party_text.split(' ') if x.strip()]
    second_party_split = [x for x in second_party_text.split(' ') if x.strip()]
    first_second_party = first_party_split + second_party_split
    # print(first_second_party)
    fact_split = ' '.join([x for x in fact_text.split(' ') if x not in first_second_party])

    # 3. remove punctuations and numbers
    fact_clean = re.sub('[^a-zA-Z]', ' ', fact_split)

    # 4. remove single character
    fact_clean = re.sub(r"\s+[a-zA-Z]\s+", ' ', fact_clean)

    # 5. remove multiple spaces
    fact_clean = re.sub(r'\s+', ' ', fact_clean)

    # 6. token 
    fact_clean = word_tokenize(fact_clean)

    # 7. remove stop words
    fact_clean = [word for word in fact_clean if word not in (stop)]

    # 8. lemmatizer 
    # fact_clean = [lemmatizer.lemmatize(word) for word in fact_clean]

    # join token to text 
    fact_clean = ' '.join(fact_clean)


    return fact_clean

    



In [142]:
justice_data_new['Cleaned_Facts'] = justice_data_new.apply(lambda x: data_clean_1(x.first_party, x.second_party, x.facts), axis=1)

In [143]:
justice_data_new['Cleaned_Facts'][0]

346

### 4.Feature engineering

#### 4.1 training data X

In [149]:
vectorizer_facts = TfidfVectorizer()
vectorizer_facts = vectorizer_facts.fit(justice_data_new['Cleaned_Facts'])
facts_nlp_feature=vectorizer_facts.transform(justice_data_new['Cleaned_Facts'])

In [152]:
facts_nlp_feature_array = facts_nlp_feature.toarray()
print(facts_nlp_feature_array.shape)

(3286, 17556)


#### 4.2 target labels--labelling the target

In [153]:
label_encoder = preprocessing.LabelEncoder()
data_label = label_encoder.fit_transform(justice_data_new['first_party_winner'])

In [171]:
print(data_label)
print(data_label[data_label==1].shape, data_label[data_label==0].shape)

[1 1 1 ... 1 0 1]
(2139,) (1147,)


In [191]:

X_train, X_test, y_train, y_test = train_test_split(facts_nlp_feature_array, data_label, test_size=0.3, random_state=42, shuffle=True)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2300, 17556) (986, 17556) (2300,) (986,)


In [192]:
y_train[0:100]#[y_test==1].shape

array([1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0])

In [193]:
y_train[y_train==0].shape

(779,)

In [194]:
input_length = 200
word_nums = X_train.shape[1]//input_length
print(X_train[:, 0:input_length*word_nums].shape)
X_train_reshape = X_train[:,0:input_length*word_nums].reshape((-1,input_length,word_nums))
X_test_reshape = X_test[:,0:input_length*word_nums].reshape((-1,input_length,word_nums))
print(X_train_reshape.shape, X_test_reshape.shape)

(2300, 17400)
(2300, 200, 87) (986, 200, 87)


### 5.Build model

In [225]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Conv1D(64,5, activation='relu', input_shape=(input_length, word_nums)))
model.add(tf.keras.layers.MaxPooling1D(pool_size=2,strides=1,
                                       padding='same'))
model.add(tf.keras.layers.Conv1D(128,3,activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(pool_size=2,strides=1,
                                       padding='same'))
model.add(tf.keras.layers.Conv1D(256,5,activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(pool_size=2,strides=1,
                                       padding='same'))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dropout(rate=0.2))
model.add(tf.keras.layers.Dense(2, activation='softmax'))
# model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [226]:
model.fit(X_train_reshape, y_train, epochs=50, validation_data=(X_test_reshape, y_test), batch_size=256)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50

KeyboardInterrupt: 

In [227]:
classifications= model.predict(X_test_reshape)



In [228]:
pred_test = classifications.argmax(axis=1)#[0:100]

In [229]:
y_test[0:100]

array([0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1])

In [231]:
accuracy_score(y_true=y_test, y_pred=pred_test)

0.6125760649087221