In [1]:
# import necessary packages 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

# packages for feature extraction 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 

# Clustering to see where the algorithm groups text together 
from sklearn.cluster import KMeans , AgglomerativeClustering 

### Here we are going to be merging the datasets so that we can text analysis 


In [12]:
a = pd.read_csv('opinions_data_csv/new_appeal_data.csv')
a.head()

Unnamed: 0.1,Unnamed: 0,cases,headnote,text,type,lower court judge,case status
0,1622,ann skiffington vs. liberty mutual insurance...,"motor vehicle, insurance. insurance, motor veh...","following a motor vehicle accident, the plaint...",civil,Constance M. Sweeney,Affirm
1,1623,commonwealth vs. radhames gonzalez.,"corrected june 1, 2018controlled substances. ...","after a jury trial in superior court, the defe...",criminal,Thomas P. Billings,Affirm
2,1624,commonwealth vs. stanley fredericq.,"further appellate review granted, 480 mass. 1...",as a result of information gathered in connect...,criminal,0.0,0.0
3,1625,"cedar-fieldstone marketplace, lp vs. t.s. fi...","guaranty. contract, lease of real estate, rel...","in this case, we consider whether the release ...",civil,Renee P. Dupuis,Affirm
4,1626,"citadel realty, llc vs. endeavor capital nor...","practice, civil, interlocutory appeal, motion ...",this appeal presents occasion to clarify the s...,civil,William F. Sullivan,reverse


In [14]:
b = pd.read_csv('opinions_data_csv/new_sjc_data.csv')
b.head()

Unnamed: 0.1,Unnamed: 0,cases,headnote,text,type,caseid,appealed,judge
0,317,in the matter of e.c.,"sjc-12230incompetent person, commitment. prac...","in this appeal, we consider whether the dismis...",criminal,sjc-12230,Yes,Not Mentioned
1,318,commonwealth vs. joseph wright.,"sjc-11950homicide. constitutional law, admiss...","the defendant, joseph wright, appeals from two...",criminal,sjc-11950,No,"Howard J. Whitehead, J."
2,319,"brian rafferty vs. merck & co., inc., & anot...","sjc-12347negligence, pharmaceutical manufactur...","under federal law, a manufacturer of a generic...",civil,sjc-12347,No,"Kenneth J. Fishman, J."
3,320,commonwealth vs. johnelle m. brown.,sjc-12313assault and battery. intimidation of...,a jury in the district court convicted the def...,criminal,sjc-12313,No,"Michele B. Hogan, J."
4,321,commonwealth vs. g.f.,"amended april 13, 2018. amended may 11, 2018.s...","this case concerns g. l. c. 123a, the statute ...",civil,,Partially,Not Mentioned


In [2]:
appeal_ = pd.read_csv('opinions_data_csv/new_appeal_data.csv')
sjc_ = pd.read_csv('opinions_data_csv/new_sjc_data.csv')

In [9]:
data_

Unnamed: 0.1,Unnamed: 0,case status,headnote,lower court judge,text
122,1744,affirm,"contract, promissory estoppel. damages, quant...","Bruce R. Henry, J.",plaintiff ronald nardone brought suit against ...
104,421,affirm,"sjc-12495sex offender. due process of law, se...","Mark C. Gildea, J.",we are called upon once again to determine the...
291,608,affirm,"sjc-12590practice, civil, action in nature of ...","Christine M. Roach, J.",christian miranda appeals from a judgment of t...
258,575,reverse,sjc-12622public records. department of public...,"Karen F. Green, J.","in this public records case, boston globe medi..."
167,1789,reverse,"adoption, dispensing with parent's consent. m...","Katherine A. Field, J.","from his home in guatemala, the father sought ..."
118,1740,reverse,"insurance, motor vehicle insurance, uninsured ...","Renee Paula Dupuis, J.","the plaintiff, derrick martins oliveira, filed..."
134,451,affirm,"sjc-12513supreme judicial court, superintenden...",Not Mentioned,jack saade appeals from a judgment of the coun...
241,558,reverse,"sjc-12606way, public: defect. municipal corp...","Peter M. Lauriat, J.",while riding his bicycle on sudbury street in ...
162,479,affirm,"sjc-12484motor vehicle, operating under the in...","Tracy L. Lyons, J.","on an afternoon in july 2015, a state police o..."
247,564,affirm,"sjc-12632sex offender. evidence, sex offender...","Christopher Barry-Smith, J.","the issue presented in this appeal is whether,..."


In [3]:
for idx , _  in sjc_.iterrows():
    if sjc_.loc[idx,'appealed'] == "Yes":
        sjc_.loc[idx,'appealed'] = 'reverse'
        
    elif sjc_.loc[idx,'appealed'] == "No":
        sjc_.loc[idx,'appealed'] = 'affirm'
    
    else:
        sjc_.loc[idx,'appealed'] = 'partially reversed'
        
# rename so that our dataframes have the same columns before concatenating 
sjc_.rename(columns={'judge':'lower court judge','appealed':'case status'},inplace=True)

In [4]:
# going to load in the data, and combine into a df 

# remove one column that we don't need 
sjc_.drop(['caseid'],axis=1,inplace=True)


# load the old csv file as well 
# old_data = pd.read_csv('cases.csv')

In [5]:
# concat the dataframes to have all the opinions in one dataframe for analysis 
data_ = pd.concat([appeal_,sjc_],axis=0)
data_ = data_.sample(frac=1) # this will randomly shuffle the data for us 

data_affirm = data_[data_['case status'] == 'affirm']
data_reverse = data_[data_['case status'] == 'reverse']
# filter out the case title from what we are going to analyze 
# and drop the columns
info_abt_data = data_.filter(['cases','type','lower court judge'])

data_.drop(['type','cases'],axis=1,inplace=True)

# fill the nan values with empty string 
data_.fillna('',inplace=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


# In this section, we are going to run the CountVectorizer on the affirm and reverse cases: see if there are unique phrases between the headnotes

In [None]:
#Count Vectorizer for text extraction of the headnote phrases 
cv1 = CountVectorizer(lowercase=True,stop_words='english',max_features=50,
                      ngram_range=(2,3),max_df=.55,min_df = 5)


v1 = cv1.fit_transform(data_affirm['headnote'])

In [None]:
# These can be potential key phrases that 
cv1.get_feature_names()

In [None]:
# Count Vectorizer for text extraction of the headnote phrases 
cv2 = CountVectorizer(lowercase=True,stop_words='english',max_features=50,
                      ngram_range=(2,3),max_df=.55,min_df = 5)

v2 = cv2.fit_transform(data_reverse['headnote'])

In [None]:
cv2.get_feature_names()

In [None]:
# Let'see how many words are in common 
count_ngram = 0 

for phrase in cv1.get_feature_names():
    if phrase in cv2.get_feature_names():
        count_ngram+= 1 
    continue 
    
print(f"the number of ngrams both vectorizers have in common is: {count_ngram}")
print(f"There is a {count_ngram} percent overlap between the frequent words in both types of cases")

## Here we are going to analyze = 'word'

In [None]:
#Count Vectorizer for text extraction of the headnote words
cv3 = CountVectorizer(lowercase=True,stop_words='english',max_features=50,
                      analyzer='word',max_df=.55,min_df = 5)


v3 = cv3.fit_transform(data_affirm['headnote'])

In [None]:
#Count Vectorizer for text extraction of the headnote words 
cv4 = CountVectorizer(lowercase=True,stop_words='english',max_features=50,
                      analyzer='word',max_df=.55,min_df=5)


v4 = cv4.fit_transform(data_reverse['headnote'])

In [None]:
cv3.get_feature_names()

In [None]:
cv4.get_feature_names()

In [None]:
word_count = 0
for word in cv3.get_feature_names():
    if word in cv4.get_feature_names():
        word_count+= 1 
    continue 
    
print(f"the number of word vectorizers have in common is: {word_count}")
print(f"There is a {word_count} percent overlap between the frequent words in both types of cases")

In [None]:
#Count Vectorizer for text extraction of the headnote phrases for the opinion text
cv5 = CountVectorizer(lowercase=True,stop_words='english',max_features=50,
           ngram_range=(2,3),max_df=.55,min_df = 5)


v5 = cv5.fit_transform(data_affirm['text'])

In [None]:
cv5.get_feature_names()

In [None]:
cv6 = CountVectorizer(lowercase=True,stop_words='english',max_features=50,
           ngram_range=(2,3),max_df=.55,min_df = 5)


v6 = cv6.fit_transform(data_reverse['text'])

In [None]:
cv6.get_feature_names()

In [None]:
# Let'see how many words are in common 
count_ngram_2 = 0 

for phrase in cv5.get_feature_names():
    if phrase in cv6.get_feature_names():
        count_ngram_2+= 1 
    continue 
    
print(f"the number of ngrams both vectorizers have in common is: {count_ngram_2}")
print(f"There is a {count_ngram_2} percent overlap between the frequent words in both types of cases")

In [None]:
cv7 = CountVectorizer(lowercase=True,stop_words='english',max_features=50,
           analyzer='word',max_df=.55,min_df = 5)

v7 = cv7.fit_transform(data_affirm['text'])

In [None]:
cv8 = CountVectorizer(lowercase=True,stop_words='english',max_features=50,
           analyzer='word',max_df=.55,min_df = 5)

v8 = cv8.fit_transform(data_reverse['text'])

In [None]:
# Let'see how many words are in common 
count_word_2 = 0 

for phrase in cv7.get_feature_names():
    if phrase in cv8.get_feature_names():
        count_word_2+= 1 
    continue 
    
print(f"the number of ngrams both vectorizers have in common is: {count_word_2}")
print(f"There is a {count_word_2} percent overlap between the frequent words in both types of cases")

# this section, we are going to run the TfidfVectorizer on the affirm and reverse cases: see if there are unique phrases between the headnotes

In [None]:
tf1 = TfidfVectorizer(stop_words='english',max_features=50,
                      ngram_range=(2,3),max_df=.55,min_df = 5)

v_ = tf1.fit_transform(data_affirm['headnote'])

In [None]:
tf2 = TfidfVectorizer(stop_words='english',max_features=50,
                      ngram_range=(2,3),max_df=.55,min_df = 5)

v2_ = tf2.fit_transform(data_reverse['headnote'])

In [None]:
# Let'see how many words are in common 
tf_ngram = 0 

for phrase in tf1.get_feature_names():
    if phrase in tf2.get_feature_names():
        tf_ngram+= 1 
    continue 
    
print(f"the number of ngrams both vectorizers have in common is: {tf_ngram}")
print(f"There is a {tf_ngram} percent overlap between the frequent words in both types of cases")

In [None]:
tf3 = TfidfVectorizer(stop_words='english',max_features=50,
                      ngram_range=(2,3),max_df=.55,min_df = 5)

v3_ = tf3.fit_transform(data_affirm['text'])

In [None]:
tf4 = TfidfVectorizer(stop_words='english',max_features=50,
                      ngram_range=(2,3),max_df=.55,min_df = 5)

v4_ = tf4.fit_transform(data_reverse['text'])

In [None]:
# Let'see how many words are in common 
tf_ngram_text= 0 

for phrase in tf3.get_feature_names():
    if phrase in tf4.get_feature_names():
        tf_ngram_text+= 1 
    continue 
    
print(f"the number of ngrams both vectorizers have in common is: {tf_ngram_text}")
print(f"There is a {tf_ngram_text} percent overlap between the frequent words in both types of cases")

In [None]:
tf5 = TfidfVectorizer(stop_words='english',max_features=50,
                      analyzer='word',max_df=.55,min_df = 5)

v5_ = tf5.fit_transform(data_affirm['headnote'])

In [None]:
tf6 = TfidfVectorizer(stop_words='english',max_features=50,
                      analyzer='word',max_df=.55,min_df = 5)

v6_ = tf6.fit_transform(data_reverse['headnote'])

In [None]:
# Let'see how many words are in common 
tf_word= 0 

for phrase in tf5.get_feature_names():
    if phrase in tf6.get_feature_names():
        tf_word+= 1 
    continue 
    
print(f"the number of ngrams both vectorizers have in common is: {tf_word}")
print(f"There is a {tf_word} percent overlap between the frequent words in both types of cases")

In [None]:
tf7 = TfidfVectorizer(stop_words='english',max_features=50,
                      analyzer='word',max_df=.55,min_df = 5)

v7 = tf7.fit_transform(data_affirm['text'])

In [None]:
tf8 = TfidfVectorizer(stop_words='english',max_features=50,
                      analyzer='word',max_df=.55,min_df = 5)

v8_ = tf8.fit_transform(data_reverse['text'])

In [None]:
# Let'see how many words are in common 
tf_word_text= 0 

for phrase in tf7.get_feature_names():
    if phrase in tf8.get_feature_names():
        tf_word_text+= 1 
    continue 
    
print(f"the number of ngrams both vectorizers have in common is: {tf_word_text}")
print(f"There is a {tf_word_text} percent overlap between the frequent words in both types of cases")

## Here we are going to perform Clustering based on one of the methods that gave us the more unique features

- First we are going to do an elbow plot for KMeans ++ 
- then with that K, we are also going to use AgglomerativeClustering to see how the different algs group the cases.

In [10]:
data_.to_csv('checking.csv',index=False)

In [11]:
data_['case status'].value_counts()

affirm                313
reverse               238
partially reversed     52
partially reverse       5
reversed                5
0.0                     4
Affirm                  4
Name: case status, dtype: int64

In [None]:
word_feats = pd.DataFrame(v2.toarray(),columns=cv2.get_feature_names())