This code takes data from all US Supreme Court cases from the 1946 term to the 2015 term and attempts to predict whether the petitioner won.

In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

!pip install category_encoders==2.*

from category_encoders import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from category_encoders import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from category_encoders import TargetEncoder

#load data from Google Drive
from google.colab import drive
drive.mount("/content/drive")
df=pd.read_csv('/content/drive/My Drive/scotus_database.csv', parse_dates=['date_decision', 'date_argument', 'date_reargument'], index_col='case_id')

#drop rows for which there is no identifiable winner
df=df.drop(df[(df['party_winning']==2) | (df['party_winning'].isnull())].index)
#drop other outcome columns that might leak info
df=df.drop(columns=['decision_type', 'declaration_unconstitutionality', 'case_disposition', 'disposition_unusual', 'precedent_alteration'])
#while the creators of the database do not classify these as outcome variables, they are variables that can only be coded after reading the opinion, 
#and thus have the potential to leak info
df=df.drop(columns=['decision_direction', 'decision_direction_dissent', 'authority_decision_one', 'authority_decision_two', 'date_reargument', 
                    'date_decision', 'majority_opinion_writer', 'majority_opinion_assigner', 'split_vote', 'majority_votes', 'minority_votes', 
                    'vote_unclear'])
#various identifiers of a particular case that are unique and have no predictive value
df=df.drop(columns=['docket_id', 'issues_id', 'vote_id', 'us_citation', 'court_citation', 'led_citation', 'lexis_citation', 'docket', 'case_name'])
#variables which, after playing around with a number of models, don't seem to contribute any useful information
df=df.drop(columns=['court', 'chief_justice', 'petitioner_state', 'respondent_state', 'jurisdiction', 'administrative_action_state', 
                    'district_court', 'case_origin_state', 'case_source_state', 'lower_court_disposition', 'issue_area', 'law_type'])
#split into training, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='party_winning'), df['party_winning'], test_size=.2, random_state=63)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=63)
#compute a base rate
print(f"base rate: {y_train.sum()/len(y_train)}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
base rate: 0.6371630663353087


In [30]:
#change all dates to day of the term.
#compute and store the day on which each term starts (the first monday in October) so it can be used by day_of_term
term_start_dates=[]
for year in range(1946, 2016):
  term_start_dates.append(datetime(year=year, month=10, day=(7-datetime(year=year, month=10, day=1).weekday())%7+1))

#computes the day within a term that corresponds to a particular date, and returns it as an int (or NaN if given NaT)
def day_of_term(row, col):
  return (row[col]-term_start_dates[int(row['term'])-1946]).days

def wrangle(X):
  #change all the dates to days of the term
  X['argument_day']=X.apply(day_of_term, axis=1, args=['date_argument'])
  #drop the date columns, since they mess up the machine learning algorithms
  X=X.drop(columns=['date_argument'])
  ''' These didn't actually help prediction, but I will keep them in comments 
  in case I can build on them in the future.

  #map natural courts to names of sitting justices
  X['Justice_Black']=np.where(X['court']<=1502, 1, 0)
  X['Justice_Reed']=np.where(X['court']<=1404, 1, 0)
  X['Justice_Frankfurter']=np.where(X['court']<=1407, 1, 0)
  X['Justice_Douglas']=np.where(X['court']<=1504, 1, 0)
  X['Justice_Murphy']=np.where(X['court']<=1301, 1, 0)
  X['Justice_Jackson']=np.where(X['court']<=1401, 1, 0)
  #Justice Rutledge is ommitted because he died at almost the same time as Justice Murphy
  X['Justice_Burton']=np.where(X['court']<=1405, 1, 0)
  #Justice Vinson is ommitted because he was the chief, and so this will come from one hot encoding of that column
  X['Justice_Clark']=np.where((X['court']>=1302) & (X['court']<=1409), 1, 0)
  X['Justice_Minton']=np.where((X['court']>=1303) & (X['court']<=1403), 1, 0)
  #Justice Warren is ommitted because he was the chief, and so this will come from one hot encoding of that column
  X['Justice_Harlan']=np.where((X['court']>=1403) & (X['court']<=1502), 1, 0)
  X['Justice_Brennan']=np.where((X['court']>=1404) & (X['court']<=1603), 1, 0)
  X['Justice_Whittaker']=np.where((X['court']>=1405) & (X['court']<=1406), 1, 0)
  X['Justice_Stewart']=np.where((X['court']>=1406) & (X['court']<=1506), 1, 0)
  X['Justice_White']=np.where((X['court']>=1407) & (X['court']<=1605), 1, 0)
  X['Justice_Goldberg']=np.where((X['court']>=1408) & (X['court']<=1408), 1, 0)
  X['Justice_Fortas']=np.where((X['court']>=1409) & (X['court']<=1410), 1, 0)
  X['Justice_Marshall']=np.where((X['court']>=1410) & (X['court']<=1604), 1, 0)
  #Justice Burger is ommitted because he was the chief, and so this will come from one hot encoding of that column
  X['Justice_Blackmun']=np.where((X['court']>=1502) & (X['court']<=1606), 1, 0)
  X['Justice_Powell']=np.where((X['court']>=1503) & (X['court']<=1601), 1, 0)
  X['Justice_Rehnquist_associate']=np.where((X['court']>=1504) & (X['court']<=1507), 1, 0)
  X['Justice_Rehnquist_all']=np.where((X['court']>=1504) & (X['court']<=1607), 1, 0)
  X['Justice_Stevens']=np.where((X['court']>=1506) & (X['court']<=1703), 1, 0)
  X['Justice_OConnor']=np.where((X['court']>=1507) & (X['court']<=1701), 1, 0)
  X['Justice_Scalia']=np.where((X['court']>=1601) & (X['court']<=1704), 1, 0)
  X['Justice_Kennedy']=np.where((X['court']>=1603) & (X['court']<=1706), 1, 0)
  X['Justice_Souter']=np.where((X['court']>=1604) & (X['court']<=1702), 1, 0)
  X['Justice_Thomas']=np.where(X['court']>=1605, 1, 0)
  X['Justice_Ginsburg']=np.where((X['court']>=1606) & (X['court']<=1707), 1, 0)
  X['Justice_Breyer']=np.where(X['court']>=1607, 1, 0)
  #Justice Roberts is ommitted because he was the chief, and so this will come from one hot encoding of that column
  X['Justice_Alito']=np.where(X['court']>=1702, 1, 0)
  X['Justice_Sotomayor']=np.where(X['court']>=1703, 1, 0)
  X['Justice_Kagan']=np.where(X['court']>=1704, 1, 0)
  #X['Justice_Gorsuch']=np.where(X['court']>=1706, 1, 0)    Justices Gorsuch, Kavanaugh, and Barrett are omitted since
  #X['Justice_Kavanaugh']=np.where(X['court']>=1707, 1, 0)  they were appointed after our dataset ends
  #X['Justice_Barrett']=np.where(X['court']>=1709, 1, 0)
  X['republican_appointed_chief']=np.where(X['chief_justice']!='Vinson',1,0)
  X['republican_appointed_justices']=X['Justice_Harlan']+X['Justice_Brennan']+X['Justice_Whittaker']+X['Justice_Stewart']+X['Justice_Blackmun']+X['Justice_Powell']+X['Justice_Rehnquist_associate']+X['Justice_Stevens']+X['Justice_OConnor']+X['Justice_Scalia']+X['Justice_Kennedy']+X['Justice_Souter']+X['Justice_Thomas']+X['Justice_Alito']+X['republican_appointed_chief']
  '''
  return X

X_train=wrangle(X_train)
X_val=wrangle(X_val)
X_test=wrangle(X_test)


In [31]:
#A number of columns had values that only popped up one or a few times, so I thought it might help to get rid of those values, so I wrote a
#transformer to do that. It didn't seem to help, so I didn't end up using it.
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

class DropUniqueTransformer(BaseEstimator, TransformerMixin):
  '''In the specified column, takes values that occure less than threshold times in the training set, and replaces them with fill_value'''
  def __init__(self, column, threshold=10, fill_value=np.NaN):
    self.column=column
    self.threshold=threshold
    self.fill_value=fill_value

  def fit(self, X, y):
    value_counts=X[self.column].value_counts()
    self.to_keep=[]
    for value in value_counts.index:
      if value_counts[value]>=self.threshold:
        self.to_keep.append(value)
      else:
        break
    return self

  def transform(self, X):
    for i in X.index:
      if not X[self.column][i] in self.to_keep:
        X[self.column][i]=self.fill_value
    return X

In [92]:
#Random Forest Model
best_score=0
best_md=0
best_ne=0
for ne in range (160, 250, 10):
  for md in range(10, 31):
    model=make_pipeline(OneHotEncoder(), SimpleImputer(), RandomForestClassifier(n_estimators=ne, max_depth=md, n_jobs=-1, random_state=63))
    model.fit(X_train, y_train)
    score=model.score(X_val, y_val)
    if score>best_score:
      best_score=score
      best_md=md
      best_ne=ne
print(f'best number of estimators: {best_ne}\nbest max depth:{best_md}\nbest score: {best_score}')

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical

best number of estimators: 190
best max depth:15
best score: 0.6863532110091743


In [96]:
#Logistic Regression Model

#a number of columns like issue contain numbers that actually represent categorical information, so this is to tell the target incoder to encode 
#those. Argument day and term are the only truely numerical features in this dataset, and the relationship with term seems nonlinear.
cols=X_train.columns.to_list()
cols.remove("argument_day")


best_k=1
best_score=0
best_min_samples=1
for k in range(1, 14):
  for min_samples in [1, 30, 40, 50, 60, 70]:
    model=make_pipeline(TargetEncoder(cols=cols, min_samples_leaf=min_samples), SimpleImputer(), SelectKBest(k=k), LogisticRegression(n_jobs=-1))
    model.fit(X_train, y_train)
    score=accuracy_score(model.predict(X_val), y_val)
    if score>best_score:
      best_score=score
      best_k=k
      best_min_samples=min_samples

print(f'best k: {best_k}\nbest min samples leaf:{best_min_samples}\nbest score: {best_score}')

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical

best k: 12
best min samples leaf:50
best score: 0.6662844036697247


In [64]:
#this is code that I used to get permutation scores to assess the importance of different features
score=model.score(X_val, y_val)
scores=[]
for col in X_val.columns:
  X_val_perm=X_val.copy()
  perm_score=0
  for i in range(10):
    X_val_perm[col]=np.random.permutation(X_val_perm[col])
    perm_score+=model.score(X_val_perm, y_val)
  scores.append(score-perm_score/10)


sc=pd.Series(scores, index=X_val.columns).sort_values()
print(sc)

law_minor_supplement           1.110223e-16
issue                          8.600917e-04
administrative_action          1.204128e-03
argument_day                   1.662844e-03
case_origin                    2.465596e-03
respondent                     2.924312e-03
lower_disposition_direction    3.268349e-03
petitioner                     3.956422e-03
lower_court_disagreement       4.185780e-03
term                           7.282110e-03
case_source                    8.084862e-03
cert_reason                    8.772936e-03
law_supplement                 1.892202e-02
dtype: float64
