In [17]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [18]:
data = pd.read_csv("ocr_opinions.csv")

In [19]:
data = data.drop(['Unnamed: 0'], axis = 1)

In [20]:
data

Unnamed: 0,Author,Type,Text
0,KAGAN,Opinion,When a party who has agreed to arbitrate a dis...
1,BREYER,Opinion,When the government encourages diverse express...
2,KAVANAUGH,Concurrence,This dispute arose only because of a governmen...
3,ALITO,Concurrence,I agree with the Court’s conclusion that Bosto...
4,GORSUCH,Concurrence,The real problem in this case doesn’t stem fro...
...,...,...,...
347,KAVANAUGH,Concurrence,I join the Court’s opinion in full. In Part II...
348,SOTOMAYOR,Opinion,"In Kokesh v. SEC, 581 U.S. ___ (2017), this Co..."
349,THOMAS,Dissent,The Court correctly declines to affirm the Nin...
350,KAVANAUGH,Opinion,"Under the immigration laws, a noncitizen who i..."


In [21]:
le = preprocessing.LabelEncoder()
data["Type"] = le.fit_transform(data["Type"])
data

Unnamed: 0,Author,Type,Text
0,KAGAN,2,When a party who has agreed to arbitrate a dis...
1,BREYER,2,When the government encourages diverse express...
2,KAVANAUGH,0,This dispute arose only because of a governmen...
3,ALITO,0,I agree with the Court’s conclusion that Bosto...
4,GORSUCH,0,The real problem in this case doesn’t stem fro...
...,...,...,...
347,KAVANAUGH,0,I join the Court’s opinion in full. In Part II...
348,SOTOMAYOR,2,"In Kokesh v. SEC, 581 U.S. ___ (2017), this Co..."
349,THOMAS,1,The Court correctly declines to affirm the Nin...
350,KAVANAUGH,2,"Under the immigration laws, a noncitizen who i..."


In [22]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(max_df = 0.2, min_df = 30, stop_words = 'english')
counts = vec.fit_transform(data['Text']) 

counts_df = pd.DataFrame(counts.toarray(), columns = vec.get_feature_names_out())
counts_df.head(3)

df = pd.concat((data,counts_df),axis = 1)
df.head()

Unnamed: 0,Author,Type,Text,101,102,103,104,105,106,107,...,wright,writ,write,writing,written,wrote,www,yes,yield,young
0,KAGAN,2,When a party who has agreed to arbitrate a dis...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,BREYER,2,When the government encourages diverse express...,0,0,0,0,0,0,0,...,0,0,0,1,2,0,4,0,0,0
2,KAVANAUGH,0,This dispute arose only because of a governmen...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ALITO,0,I agree with the Court’s conclusion that Bosto...,0,0,0,0,0,0,0,...,0,0,0,0,4,0,0,0,0,0
4,GORSUCH,0,The real problem in this case doesn’t stem fro...,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,6,0,1


In [23]:
# Create training and testing data
from sklearn.model_selection import train_test_split

train,test = train_test_split(df, test_size = 0.3)

X_train = train.drop(['Author','Type','Text'],axis = 1)
y_train = train['Type']

X_test = test.drop(['Author','Type','Text'],axis = 1)
y_test = test['Type']

In [26]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(max_iter = 500)
LR.fit(X_train,y_train)

LogisticRegression(max_iter=500)

In [32]:
?LogisticRegression

In [30]:
# Cross-validation score
from sklearn.model_selection import cross_val_score
cross_val_score(LR,X_train,y_train,cv = 5).mean()

TypeError: cross_val_score() got an unexpected keyword argument 'n_splits'

In [29]:
C_pool = np.linspace(0.005,0.05,10)
best_score = -np.inf
for c in C_pool:
    LR = LogisticRegression(C = c, max_iter = 500)
    score = cross_val_score(LR,X_train,y_train,cv = 5).mean()
    if score > best_score:
        best_score = score
        best_c = c
    print("C = ", np.round(c,3), "CrossValScore = ", score)



C =  0.005 CrossValScore =  0.5324897959183674




C =  0.01 CrossValScore =  0.5244081632653061




C =  0.015 CrossValScore =  0.5204081632653061




C =  0.02 CrossValScore =  0.5244897959183673




C =  0.025 CrossValScore =  0.5244897959183673




C =  0.03 CrossValScore =  0.5244897959183673




C =  0.035 CrossValScore =  0.53665306122449




C =  0.04 CrossValScore =  0.5284897959183674




C =  0.045 CrossValScore =  0.5325714285714286




C =  0.05 CrossValScore =  0.5366530612244897


In [36]:
LR = LogisticRegression(C = best_c, max_iter = 500)
LR.fit(X_train,y_train)
LR.score(X_test,y_test)

0.5471698113207547

In [37]:
sentiment_df = pd.DataFrame({'word':X_train.columns, 'coef':LR.coef_[0]})
sentiment_df

Unnamed: 0,word,coef
0,101,0.007461
1,102,0.001147
2,103,0.007293
3,104,-0.000455
4,105,0.000434
...,...,...
1730,wrote,-0.010713
1731,www,-0.009544
1732,yes,-0.004383
1733,yield,-0.005905


In [38]:
# Likely related to opinions
sentiment_df.sort_values(['coef'],ascending = True).head(10)

Unnamed: 0,word,coef
796,documents,-0.105209
1564,stay,-0.104875
726,death,-0.078603
1021,immunity,-0.073437
654,concurrence,-0.073257
570,capacity,-0.070447
1329,principal,-0.067851
967,governor,-0.066112
1681,vehicle,-0.06457
629,comment,-0.059421


In [39]:
# Likely related to concurrence
sentiment_df.sort_values(['coef'],ascending = False).head(10)

Unnamed: 0,word,coef
1727,write,0.085848
1331,privilege,0.056261
1160,mandate,0.054445
851,entry,0.053162
1083,johnson,0.047166
666,consent,0.047103
586,catholic,0.046863
1129,license,0.045122
588,causes,0.044424
1624,timely,0.044251


## Text Classification for Type using tensorflow

In [40]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import string

from tensorflow.keras import layers
from tensorflow.keras import losses

# requires update to tensorflow 2.4
# >>> conda activate PIC16B
# >>> pip install tensorflow==2.4
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

2022-06-02 14:23:50.545567: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [41]:
data

Unnamed: 0,Author,Type,Text
0,KAGAN,2,When a party who has agreed to arbitrate a dis...
1,BREYER,2,When the government encourages diverse express...
2,KAVANAUGH,0,This dispute arose only because of a governmen...
3,ALITO,0,I agree with the Court’s conclusion that Bosto...
4,GORSUCH,0,The real problem in this case doesn’t stem fro...
...,...,...,...
347,KAVANAUGH,0,I join the Court’s opinion in full. In Part II...
348,SOTOMAYOR,2,"In Kokesh v. SEC, 581 U.S. ___ (2017), this Co..."
349,THOMAS,1,The Court correctly declines to affirm the Nin...
350,KAVANAUGH,2,"Under the immigration laws, a noncitizen who i..."


In [44]:
type_df = data.drop(["Author"], axis = 1)
type_df

Unnamed: 0,Type,Text
0,2,When a party who has agreed to arbitrate a dis...
1,2,When the government encourages diverse express...
2,0,This dispute arose only because of a governmen...
3,0,I agree with the Court’s conclusion that Bosto...
4,0,The real problem in this case doesn’t stem fro...
...,...,...
347,0,I join the Court’s opinion in full. In Part II...
348,2,"In Kokesh v. SEC, 581 U.S. ___ (2017), this Co..."
349,1,The Court correctly declines to affirm the Nin...
350,2,"Under the immigration laws, a noncitizen who i..."


In [45]:
# 0: concurrence, 1: dissent, 2: opinion
type_df.groupby("Type").size()

Type
0    106
1    114
2    128
3      4
dtype: int64

In [46]:
type_data = tf.data.Dataset.from_tensor_slices((type_df["Text"], type_df["Type"]))

2022-06-02 14:27:41.420607: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [47]:
type_data = type_data.shuffle(buffer_size = len(type_data))

In [55]:
# Split data into 70% train, 10% validation, 20% test

train_size = int(0.7*len(type_data)) 
val_size = int(0.1*len(type_data))

type_train = type_data.take(train_size) 
type_val = type_data.skip(train_size).take(val_size)
type_test = type_data.skip(train_size+val_size)

In [56]:
# standardize the text: remove punctuation, make everything lowercase
def standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    no_punctuation = tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation),'')
    return no_punctuation 

In [58]:
max_tokens = 2000
sequence_length = 25 

vectorize_layer = TextVectorization(
    standardize =  standardization, 
    output_mode = 'int', 
    max_tokens = max_tokens, 
    output_sequence_length =  sequence_length
)

opinion_type = type_train.map(lambda x, y: x)
vectorize_layer.adapt(opinion_type)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


In [59]:
def vectorize_headline(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), [label]

train_vec = type_train.map(vectorize_headline)
val_vec = type_val.map(vectorize_headline)
test_vec = type_test.map(vectorize_headline)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


In [61]:
model = tf.keras.Sequential([
    layers.Embedding(max_tokens, output_dim = 3, name = "embedding"),
    layers.Dropout(0.2), # randomly "drop" or delete 20% of connections between previous layer and the next layer
    layers.GlobalAveragePooling1D(), # take the average along an axis
    layers.Dropout(0.2), # forces it to forget info randomly at times, forces model to do better job overall (learning global patterns rather than noise)
    layers.Dense(3)
])

In [62]:
model.compile(loss = losses.SparseCategoricalCrossentropy(from_logits = True),
              optimizer = "adam",
              metrics = ["accuracy"])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 3)           6000      
                                                                 
 dropout_2 (Dropout)         (None, None, 3)           0         
                                                                 
 global_average_pooling1d_1   (None, 3)                0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_3 (Dropout)         (None, 3)                 0         
                                                                 
 dense (Dense)               (None, 3)                 12        
                                                                 
Total params: 6,012
Trainable params: 6,012
Non-trainable params: 0
______________________________________________________

In [63]:
history = model.fit(train_vec, epochs = 20, validation_data = val_vec)

Epoch 1/20
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
 52/246 [=====>........................] - ETA: 0s - loss: 1.1011 - accuracy: 0.3462

2022-06-02 14:52:31.092100: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at sparse_xent_op.cc:103 : INVALID_ARGUMENT: Received a label value of 3 which is outside the valid range of [0, 3).  Label values: 3


InvalidArgumentError: Graph execution error:

Detected at node 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits' defined at (most recent call last):
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/runpy.py", line 193, in _run_module_as_main
      "__main__", mod_spec)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/runpy.py", line 85, in _run_code
      exec(code, run_globals)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/traitlets/config/application.py", line 846, in launch_instance
      app.start()
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 677, in start
      self.io_loop.start()
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/asyncio/base_events.py", line 541, in run_forever
      self._run_once()
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/asyncio/base_events.py", line 1786, in _run_once
      handle._run()
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/asyncio/events.py", line 88, in _run
      self._context.run(self._callback, *self._args)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 471, in dispatch_queue
      await self.process_one()
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 460, in process_one
      await dispatch(*args)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 367, in dispatch_shell
      await result
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 662, in execute_request
      reply_content = await reply_content
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 360, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 532, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2915, in run_cell
      raw_cell, store_history, silent, shell_futures)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2960, in _run_cell
      return runner(coro)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner
      coro.send(None)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3186, in run_cell_async
      interactivity=interactivity, compiler=compiler, result=result)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3377, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3457, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/zw/j8lwp6_56fn4hgrcqc6rcjwm0000gn/T/ipykernel_80213/3661932423.py", line 1, in <module>
      history = model.fit(train_vec, epochs = 20, validation_data = val_vec)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/keras/engine/training.py", line 1419, in fit
      tmp_logs = self.train_function(iterator)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/keras/engine/training.py", line 1054, in train_function
      return step_function(self, iterator)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/keras/engine/training.py", line 1043, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/keras/engine/training.py", line 1033, in run_step
      outputs = model.train_step(data)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/keras/engine/training.py", line 893, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/keras/engine/training.py", line 952, in compute_loss
      y, y_pred, sample_weight, regularization_losses=self.losses)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/keras/engine/compile_utils.py", line 201, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/keras/losses.py", line 139, in __call__
      losses = call_fn(y_true, y_pred)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/keras/losses.py", line 243, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/keras/losses.py", line 1865, in sparse_categorical_crossentropy
      y_true, y_pred, from_logits=from_logits, axis=axis)
    File "/Users/ashwinvasan/opt/anaconda3/envs/PIC16B/lib/python3.7/site-packages/keras/backend.py", line 5357, in sparse_categorical_crossentropy
      labels=target, logits=output)
Node: 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits'
Received a label value of 3 which is outside the valid range of [0, 3).  Label values: 3
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_train_function_1491]