In [17]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [18]:
data = pd.read_csv("ocr_opinions.csv")

In [19]:
data = data.drop(['Unnamed: 0'], axis = 1)

In [20]:
data

Unnamed: 0,Author,Type,Text
0,KAGAN,Opinion,When a party who has agreed to arbitrate a dis...
1,BREYER,Opinion,When the government encourages diverse express...
2,KAVANAUGH,Concurrence,This dispute arose only because of a governmen...
3,ALITO,Concurrence,I agree with the Court’s conclusion that Bosto...
4,GORSUCH,Concurrence,The real problem in this case doesn’t stem fro...
...,...,...,...
347,KAVANAUGH,Concurrence,I join the Court’s opinion in full. In Part II...
348,SOTOMAYOR,Opinion,"In Kokesh v. SEC, 581 U.S. ___ (2017), this Co..."
349,THOMAS,Dissent,The Court correctly declines to affirm the Nin...
350,KAVANAUGH,Opinion,"Under the immigration laws, a noncitizen who i..."


In [21]:
le = preprocessing.LabelEncoder()
data["Type"] = le.fit_transform(data["Type"])
data

Unnamed: 0,Author,Type,Text
0,KAGAN,2,When a party who has agreed to arbitrate a dis...
1,BREYER,2,When the government encourages diverse express...
2,KAVANAUGH,0,This dispute arose only because of a governmen...
3,ALITO,0,I agree with the Court’s conclusion that Bosto...
4,GORSUCH,0,The real problem in this case doesn’t stem fro...
...,...,...,...
347,KAVANAUGH,0,I join the Court’s opinion in full. In Part II...
348,SOTOMAYOR,2,"In Kokesh v. SEC, 581 U.S. ___ (2017), this Co..."
349,THOMAS,1,The Court correctly declines to affirm the Nin...
350,KAVANAUGH,2,"Under the immigration laws, a noncitizen who i..."


In [22]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(max_df = 0.2, min_df = 30, stop_words = 'english')
counts = vec.fit_transform(data['Text']) 

counts_df = pd.DataFrame(counts.toarray(), columns = vec.get_feature_names_out())
counts_df.head(3)

df = pd.concat((data,counts_df),axis = 1)
df.head()

Unnamed: 0,Author,Type,Text,101,102,103,104,105,106,107,...,wright,writ,write,writing,written,wrote,www,yes,yield,young
0,KAGAN,2,When a party who has agreed to arbitrate a dis...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,BREYER,2,When the government encourages diverse express...,0,0,0,0,0,0,0,...,0,0,0,1,2,0,4,0,0,0
2,KAVANAUGH,0,This dispute arose only because of a governmen...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ALITO,0,I agree with the Court’s conclusion that Bosto...,0,0,0,0,0,0,0,...,0,0,0,0,4,0,0,0,0,0
4,GORSUCH,0,The real problem in this case doesn’t stem fro...,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,6,0,1


In [23]:
# Create training and testing data
from sklearn.model_selection import train_test_split

train,test = train_test_split(df, test_size = 0.3)

X_train = train.drop(['Author','Type','Text'],axis = 1)
y_train = train['Type']

X_test = test.drop(['Author','Type','Text'],axis = 1)
y_test = test['Type']

In [26]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(max_iter = 500)
LR.fit(X_train,y_train)

LogisticRegression(max_iter=500)

In [32]:
?LogisticRegression

In [30]:
# Cross-validation score
from sklearn.model_selection import cross_val_score
cross_val_score(LR,X_train,y_train,cv = 5).mean()

TypeError: cross_val_score() got an unexpected keyword argument 'n_splits'

In [29]:
C_pool = np.linspace(0.005,0.05,10)
best_score = -np.inf
for c in C_pool:
    LR = LogisticRegression(C = c, max_iter = 500)
    score = cross_val_score(LR,X_train,y_train,cv = 5).mean()
    if score > best_score:
        best_score = score
        best_c = c
    print("C = ", np.round(c,3), "CrossValScore = ", score)



C =  0.005 CrossValScore =  0.5324897959183674




C =  0.01 CrossValScore =  0.5244081632653061




C =  0.015 CrossValScore =  0.5204081632653061




C =  0.02 CrossValScore =  0.5244897959183673




C =  0.025 CrossValScore =  0.5244897959183673




C =  0.03 CrossValScore =  0.5244897959183673




C =  0.035 CrossValScore =  0.53665306122449




C =  0.04 CrossValScore =  0.5284897959183674




C =  0.045 CrossValScore =  0.5325714285714286




C =  0.05 CrossValScore =  0.5366530612244897


In [36]:
LR = LogisticRegression(C = best_c, max_iter = 500)
LR.fit(X_train,y_train)
LR.score(X_test,y_test)

0.5471698113207547

In [37]:
sentiment_df = pd.DataFrame({'word':X_train.columns, 'coef':LR.coef_[0]})
sentiment_df

Unnamed: 0,word,coef
0,101,0.007461
1,102,0.001147
2,103,0.007293
3,104,-0.000455
4,105,0.000434
...,...,...
1730,wrote,-0.010713
1731,www,-0.009544
1732,yes,-0.004383
1733,yield,-0.005905


In [38]:
# Likely related to opinions
sentiment_df.sort_values(['coef'],ascending = True).head(10)

Unnamed: 0,word,coef
796,documents,-0.105209
1564,stay,-0.104875
726,death,-0.078603
1021,immunity,-0.073437
654,concurrence,-0.073257
570,capacity,-0.070447
1329,principal,-0.067851
967,governor,-0.066112
1681,vehicle,-0.06457
629,comment,-0.059421


In [39]:
# Likely related to concurrence
sentiment_df.sort_values(['coef'],ascending = False).head(10)

Unnamed: 0,word,coef
1727,write,0.085848
1331,privilege,0.056261
1160,mandate,0.054445
851,entry,0.053162
1083,johnson,0.047166
666,consent,0.047103
586,catholic,0.046863
1129,license,0.045122
588,causes,0.044424
1624,timely,0.044251
