# Introduction
Capstone Project : "Law as Data"

Team :  
Matt Dakolios (mrd7f)
Evan Dickson (ejd8zh)
Sud Luthra (sl3zs)

Reference : This notebook is based on code and direction provided by Prof. R. C. Alvarado

# Import Libraries

In [1]:
import pandas as pd
import sqlalchemy

import os
import gensim

import numpy as np

import sqlite3

import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist

import plotly_express as px

In [2]:
#Import packages
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/sl3zs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
dbconn = sqlalchemy.create_engine('sqlite:///db/ussc_capstone.db')

In [4]:
dbconn.table_names()

['AUTHOR',
 'BOW',
 'CASE',
 'CORPUS',
 'CORPUS_COMPRESSED',
 'THETA',
 'THETA_NMF',
 'TOPICS',
 'TOPICS_NMF',
 'VOCAB',
 'YEAR']

In [5]:
OHCO = ['vol_num','case_num','position']
class db(): pass

In [6]:
db.CASE = pd.read_sql("CASE", dbconn).set_index(OHCO[:2])

In [7]:
db.CASE

Unnamed: 0_level_0,Unnamed: 1_level_0,year,full_date,opinion_count,concur,dissent,opinion,doc_len_sum,case_title
vol_num,case_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3,1,1794,"February 7, 1794",1,0,0,1,5712,"Georgia v. Brailsford, Powell & Hopton"
3,6,1794,"February 18, 1794",1,0,0,1,2028,Glass v. The Betsey
3,17,1795,"February 20, 1795",1,0,0,1,2035,United States v. Hamilton
3,121,1795,AUGUST 1795,1,0,0,1,8452,United States v. Peters
3,171,1796,"March 8, 1796",5,3,1,1,30900,Hylton v. United States
...,...,...,...,...,...,...,...,...,...
554,407,2008,"June 25, 2008",2,0,1,1,110777,Kennedy v. Louisiana
554,471,2008,"June 26, 2008",5,3,1,1,81688,Exxon Shipping Co. v. Baker
554,527,2008,"June 26, 2008",3,1,1,1,72060,Morgan Stanley Capital Group Inc. v. Public Ut...
554,570,2008,"June 26, 2008",3,0,2,1,258289,District of Columbia v. Heller


In [8]:
db.CORPUS = pd.read_sql("CORPUS_COMPRESSED", dbconn).set_index(OHCO)

In [9]:
db.CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,doc_content,doc_len
vol_num,case_num,position,Unnamed: 3_level_1,Unnamed: 4_level_1
3,171,opinion,THE COURT delivered their opinions seriatim in...,7826
3,171,dissent,"DISSENT BY: CUSHING\nCUSHING, Justice. As I ha...",252
3,321,opinion,"ELSWORTH, Chief Justice. The question, how far...",824
3,321,dissent,"DISSENT BY: WILSON\nWILSON, Justice. I conside...",16365
3,386,opinion,"CHASE, Justice. The decision of one question d...",23714
...,...,...,...,...
554,471,dissent,DISSENT BY: Stevens (In Part); Ginsburg (In Pa...,10243
554,527,opinion,Justice Scalia delivered the opinion of the Co...,45723
554,527,dissent,DISSENT BY: Stevens \nDISSENT \nJustice Steven...,25244
554,570,opinion,Justice Scalia delivered the opinion of the Co...,107923


In [10]:
docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(db.CORPUS.doc_content)]

# Generate Model

In [11]:
# https://radimrehurek.com/gensim/models/doc2vec.html
model = Doc2Vec(docs,
                vector_size=39,
                window=2, 
                min_count=4,
                workers=4)



## Save Model

In [13]:
model.save("doc2vec.model")
model= Doc2Vec.load("doc2vec.model")

In [14]:
model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x7f8694ea0dc0>

In [15]:
len(model.dv)

12348

In [16]:
type(model.dv)

gensim.models.keyedvectors.KeyedVectors

# Convert to Data Frame

In [17]:
d2vdf = pd.DataFrame(model.dv.get_normed_vectors(), index = db.CORPUS.index)

In [18]:
d2vdf.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
vol_num,case_num,position,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3,171,opinion,-0.131987,0.031495,-0.15751,-0.036648,0.266395,0.176351,-0.346898,-0.22334,-0.296971,0.104907,...,-0.231346,-0.126728,-0.118947,-0.119324,-0.052163,0.097735,0.12381,-0.011759,0.216493,-0.021252
3,171,dissent,-0.330225,-0.079053,-0.11703,0.063863,0.024338,0.093181,-0.22729,-0.102744,-0.059345,0.073833,...,0.19209,-0.150043,0.054172,0.062614,-0.238479,-0.039419,0.19252,-0.208336,-0.109831,0.152755


In [19]:
d2vdf.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
count,12348.0,12348.0,12348.0,12348.0,12348.0,12348.0,12348.0,12348.0,12348.0,12348.0,...,12348.0,12348.0,12348.0,12348.0,12348.0,12348.0,12348.0,12348.0,12348.0,12348.0
mean,-0.142899,-0.051302,0.038668,0.054191,0.002775,-0.012276,0.016223,-0.123786,-0.108564,0.021886,...,-0.054431,-0.191349,0.100779,0.055523,-0.086979,-0.060509,0.07135,-0.019323,0.013771,-0.039515
std,0.154667,0.132519,0.151254,0.144827,0.143231,0.143979,0.165751,0.158396,0.145992,0.134765,...,0.144877,0.128531,0.141862,0.146236,0.134983,0.155145,0.137229,0.138361,0.171506,0.138019
min,-0.625977,-0.515924,-0.456144,-0.537815,-0.527048,-0.467567,-0.569492,-0.712941,-0.593216,-0.528089,...,-0.644388,-0.633802,-0.475299,-0.45435,-0.548035,-0.574387,-0.456008,-0.5507,-0.657389,-0.515535
25%,-0.247751,-0.142827,-0.071034,-0.047592,-0.099939,-0.114734,-0.100662,-0.237829,-0.210508,-0.070252,...,-0.147676,-0.28293,0.00795,-0.042377,-0.181382,-0.164579,-0.017809,-0.115839,-0.104117,-0.131072
50%,-0.144513,-0.057042,0.039501,0.054892,0.003295,-0.01482,0.016833,-0.12661,-0.114325,0.023108,...,-0.060596,-0.197871,0.103823,0.059142,-0.093891,-0.058113,0.069382,-0.023831,0.015954,-0.039597
75%,-0.039476,0.039356,0.15069,0.15686,0.103861,0.087883,0.133959,-0.013937,-0.009957,0.11396,...,0.042573,-0.102795,0.197202,0.149501,0.006949,0.040691,0.160656,0.071513,0.138507,0.049803
max,0.440838,0.458822,0.524996,0.531215,0.502298,0.535953,0.55294,0.437706,0.548306,0.502404,...,0.505153,0.274109,0.614976,0.579241,0.479353,0.556777,0.519528,0.521853,0.647434,0.516111


# Save to DB

In [20]:
# save the df to database
d2vdf.to_sql('DOC2VEC', dbconn, index=True, if_exists='replace')

# Read and Test from DB

In [21]:
db.DOC2VEC   = pd.read_sql("DOC2VEC", dbconn).set_index(OHCO).sort_index()

In [22]:
db.DOC2VEC.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
vol_num,case_num,position,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3,171,dissent,-0.330225,-0.079053,-0.11703,0.063863,0.024338,0.093181,-0.22729,-0.102744,-0.059345,0.073833,...,0.19209,-0.150043,0.054172,0.062614,-0.238479,-0.039419,0.19252,-0.208336,-0.109831,0.152755
3,171,opinion,-0.131987,0.031495,-0.15751,-0.036648,0.266395,0.176351,-0.346898,-0.22334,-0.296971,0.104907,...,-0.231346,-0.126728,-0.118947,-0.119324,-0.052163,0.097735,0.12381,-0.011759,0.216493,-0.021252
