# Predict who wrote the Supreme Court opinions that don't have authors.

These are called [per curiam](https://en.wikipedia.org/wiki/Per_curiam_decision) decisions. Using the language in opinions that *do* have authors, we can probably predict who wrote them.

You should **probably use the Classification Template we used in class**; it isn't like this has anything useful in it. You might also want to take a look at the template from class, especially the part about **custom features** - I hear sentence length is an important one!

Also, the `ny-doctors` assignment is a little more of a walkthrough on the same topic. Might be helpful!

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

df = pd.read_csv("supreme-court-opinions.csv")
df.head()


Unnamed: 0,content,case,author
0,NOTICE: This opinion is subject to formal revi...,09-958,ROBERTS
1,NOTICE: This opinion is subject to formal revi...,09-958,BREYER
2,"No.\n10–1001 LUIS MARIANO MARTINEZ, PETITIONER...",10-1001,SCALIA
3,NOTICE: This opinion is subject to formal revi...,10-1001,KENNEDY
4,"No.\n10–1016 DANIEL COLEMAN, PETITIONER v. COU...",10-1016,GINSBURG


In [2]:
df.shape


(971, 3)

In [3]:
df["author"].value_counts(dropna=False)


THOMAS       160
ALITO        126
SCALIA       101
SOTOMAYOR    100
GINSBURG      99
BREYER        98
ROBERTS       77
NaN           72
KENNEDY       70
KAGAN         64
GORSUCH        4
Name: author, dtype: int64

In [5]:
df_has_author = df[~df["author"].isnull()]
df_per_curiam = df[df["author"].isnull()]


In [6]:
# vectorize "content"

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vec = CountVectorizer(stop_words = 'english', max_features=3000)

matrix = vec.fit_transform(df_has_author['content'].fillna('').str.replace("\d",""))
df_has_author_input_features = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())
df_has_author_input_features.head()


Unnamed: 0,___,aa,ab,abandoned,ability,able,abood,abortion,abortions,abramski,...,wrongful,wrote,www,year,years,yes,york,young,zivotofsky,zone
0,6,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
1,2,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,2,0,0,0,2,0,0,0,0,0,...,0,0,0,0,6,0,0,0,0,0
3,10,0,0,0,2,0,0,0,0,0,...,0,0,0,2,2,0,0,0,0,0
4,0,2,0,0,0,0,0,0,0,0,...,0,0,0,7,3,0,0,1,0,0


In [7]:
# vectorize "author"

df_has_author_output_features = pd.get_dummies(df_has_author['author'], prefix="CUSTOM", drop_first=True)
df_has_author_output_features.head()


Unnamed: 0,CUSTOM_BREYER,CUSTOM_GINSBURG,CUSTOM_GORSUCH,CUSTOM_KAGAN,CUSTOM_KENNEDY,CUSTOM_ROBERTS,CUSTOM_SCALIA,CUSTOM_SOTOMAYOR,CUSTOM_THOMAS
0,0,0,0,0,0,1,0,0,0
1,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,0,0
4,0,1,0,0,0,0,0,0,0


In [8]:
from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(
#     features_df_author.values,
#     df_has_author['author'],
#     test_size = 0.2)
# print(X_train_category.shape, X_test_category.shape, y_train_category.shape, y_test_category.shape)

train_inputs, test_inputs, train_outputs, test_outputs = train_test_split(
    df_has_author_input_features.values,
    df_has_author_output_features.values,  # df_has_author_output_features["CUSTOM_BREYER"], #
    test_size = 0.2)
print(train_inputs.shape, test_inputs.shape, train_outputs.shape, test_outputs.shape)


(719, 3000) (180, 3000) (719, 9) (180, 9)


In [13]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

clf = DecisionTreeClassifier()

# clf.fit(X_train, y_train)
clf.fit(train_inputs, train_outputs)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [17]:
print(clf.score(train_inputs, train_outputs))


1.0
[[ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  1.]
 ..., 
 [ 0.  0.  0. ...,  1.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
[[0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 1 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [38]:
test_predictions = clf.predict(test_inputs)

def inverse_transform_label_encoder(row):
    if row['CUSTOM_BREYER']:
        return 'BREYER'
    elif row['CUSTOM_GINSBURG']:
        return 'GINSBURG'
    elif row['CUSTOM_GORSUCH']:
        return 'GORSUCH'
    elif row['CUSTOM_KAGAN']:
        return 'KAGAN'
    elif row['CUSTOM_KENNEDY']:
        return 'KENNEDY'
    elif row['CUSTOM_ROBERTS']:
        return 'ROBERTS'
    elif row['CUSTOM_SCALIA']:
        return 'SCALIA'
    elif row['CUSTOM_SOTOMAYOR']:
        return 'SOTOMAYOR'
    elif row['CUSTOM_THOMAS']:
        return 'THOMAS'

def name_names(array):
    return pd.DataFrame(array, columns=df_has_author_output_features.columns) \
        .apply(inverse_transform_label_encoder, axis=1)
    
test_predictions_series = name_names(test_predictions)
test_outputs_series = name_names(test_outputs)
    

In [34]:
(test_predictions_series == test_outputs_series).value_counts()


False    97
True     83
dtype: int64

In [36]:
matrix_per_curiam = vec.transform(df_per_curiam['content'].fillna('').str.replace("\d",""))
df_per_curiam_input_features = pd.DataFrame(matrix_per_curiam.toarray(), columns=vec.get_feature_names())
df_per_curiam_input_features.head()


Unnamed: 0,___,aa,ab,abandoned,ability,able,abood,abortion,abortions,abramski,...,wrongful,wrote,www,year,years,yes,york,young,zivotofsky,zone
0,7,0,1,0,0,1,0,0,0,0,...,0,0,0,2,6,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
df_per_curiam_prediction_features = clf.predict(df_per_curiam_input_features)
name_names(df_per_curiam_prediction_features)


0      GINSBURG
1          None
2       KENNEDY
3          None
4          None
5     SOTOMAYOR
6          None
7          None
8     SOTOMAYOR
9          None
10         None
11     GINSBURG
12       THOMAS
13         None
14      ROBERTS
15         None
16      KENNEDY
17         None
18       SCALIA
19         None
20      KENNEDY
21         None
22         None
23         None
24         None
25       BREYER
26         None
27         None
28         None
29      KENNEDY
        ...    
42         None
43      ROBERTS
44         None
45         None
46      KENNEDY
47      ROBERTS
48         None
49         None
50         None
51         None
52       BREYER
53         None
54       THOMAS
55         None
56         None
57         None
58         None
59         None
60         None
61    SOTOMAYOR
62       THOMAS
63       THOMAS
64         None
65         None
66     GINSBURG
67     GINSBURG
68         None
69     GINSBURG
70      GORSUCH
71         None
Length: 72, dtype: objec