<a href="https://colab.research.google.com/github/Satyaram-k/satyaram_INFO5731_Spring2020/blob/main/Group_4-Caselaw_Access_Project/Project_Data_Collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# imports
import os
import requests
import zipfile
import certifi
import urllib3
from urllib3.exceptions import MaxRetryError
from tqdm import tqdm
import lzma
import json
import pandas as pd

In [None]:
# Base address and version of the remote API; the bulk endpoint is derived from them.
API_URL = "https://api.case.law"
API_VERSION = "v1"
API_BULK_URL = "/".join((API_URL, API_VERSION, "bulk"))

# Directory that downloaded archives are written to and extracted in.
DATA_DIR = '/content'

# Token sent in the AUTHORIZATION header of download requests.
# Prefer supplying it through the CAP_API_KEY environment variable so the
# secret does not have to live in the notebook; the literal below is kept only
# as a fallback so existing runs keep working.
# NOTE(review): a token committed to source control should be rotated.
API_KEY = os.environ.get("CAP_API_KEY") or "30983208107a7033ff5df9e18faf9e2559afb528"

# ANSI escape sequences used to decorate console output.
CVIOLET = '\33[35m'
CEND = '\33[0m'
CURL = '\33[4m'

def print_info(instruction):
    """
    Print an instruction wrapped in the violet / reset escape sequences.
    """
    decorated = "".join((CVIOLET, instruction, CEND))
    print(decorated)

In [None]:
def get_cases_from_bulk(jurisdiction, data_format="json"):
    """
    Download and unpack the bulk export for one jurisdiction.

    Asks the bulk endpoint for the list of available exports, picks the first
    one whose file name contains ``jurisdiction``, downloads that zip into
    DATA_DIR using the API token, and extracts it there.

    Parameters:
        jurisdiction: substring looked for in the export file names.
        data_format: "xml" requests the XML body format; any other value
            requests the "text" body format.

    Returns:
        Path of ``data/data.jsonl.xz`` inside the extracted directory, or
        None when the download request fails with MaxRetryError.

    Raises:
        requests.HTTPError: the listing request returned an error status.
        Exception: no export matches ``jurisdiction``, or the download
            responded with a status other than 200.
    """
    body_format = "xml" if data_format == "xml" else "text"
    bulk_url = API_BULK_URL + "/?body_format=%s&filter_type=jurisdiction" % body_format
    bulk_api_results = requests.get(bulk_url)
    # Fail with the HTTP error itself instead of a confusing KeyError/JSON
    # error further down when the listing request did not succeed.
    bulk_api_results.raise_for_status()

    # First listed export whose file name mentions the jurisdiction.
    jur = next(
        (entry for entry in bulk_api_results.json()['results']
         if jurisdiction in entry['file_name']),
        None,
    )
    if jur is None:
        raise Exception("Jurisdiction not found. Please check spelling.")

    filename = os.path.join(DATA_DIR, jur['file_name'])

    http = urllib3.PoolManager(
        cert_reqs='CERT_REQUIRED',
        ca_certs=certifi.where())

    headers = {'AUTHORIZATION': 'Token {}'.format(API_KEY)}
    try:
        # preload_content=False keeps the body on the socket so it can be
        # streamed to disk in chunks instead of being held in memory.
        resp = http.request("GET", jur["download_url"],
                            preload_content=False,
                            headers=headers)
    except MaxRetryError as err:
        print("Download request failed.\n\n%s" % err)
        return

    try:
        if resp.status != 200:
            raise Exception("Something went wrong.\n\n%s" % resp.data)

        print_info("downloading %s into %s" % (jur['file_name'], DATA_DIR))
        with open(filename, 'wb') as f:
            for chunk in tqdm(resp.stream(1024)):
                f.write(chunk)
    finally:
        # A streamed response holds its pooled connection until released.
        resp.release_conn()

    print_info("extracting %s into %s" % (jur['file_name'], DATA_DIR))
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall(DATA_DIR)

    print_info("Done.")

    # The archive is expected to unpack into a directory named like the zip
    # without its extension.
    decompressed_dir = filename.split('.zip')[0]
    return os.path.join(decompressed_dir, 'data', 'data.jsonl.xz')

In [None]:
def get_and_extract_from_bulk(jurisdiction, data_format="json"):
    """
    Return the path of the compressed case file for a jurisdiction.

    Looks in DATA_DIR for an already-extracted export whose directory name
    contains both ``jurisdiction`` and ``"-<format>"`` and which holds
    ``data/data.jsonl.xz``. If one is found its data file is returned;
    otherwise the export is fetched with ``get_cases_from_bulk``.

    Parameters:
        jurisdiction: substring identifying the jurisdiction in file names.
        data_format: "xml" selects the XML export; any other value selects
            the "text" export.

    Returns:
        Path to the ``data.jsonl.xz`` file (not the directory containing it),
        whether it was found locally or freshly downloaded.

    Raises:
        RuntimeError: the download helper returned no path.
    """
    data_format = "xml" if data_format == "xml" else "text"  # xml or text

    for entry in os.listdir(DATA_DIR):
        if jurisdiction in entry and "-" + data_format in entry:
            candidate = os.path.join(DATA_DIR, entry, 'data', 'data.jsonl.xz')
            if os.path.exists(candidate):
                # Return the data file itself so both code paths hand the
                # caller something that can be opened directly.
                return candidate

    print_info("Getting compressed file for %s from /bulk endpoint.\nThis might take a while." % jurisdiction)
    downloaded_path = get_cases_from_bulk(jurisdiction=jurisdiction, data_format=data_format)
    if downloaded_path is None:
        # The helper returns None when its download request fails.
        raise RuntimeError("Download of bulk data for %s failed." % jurisdiction)

    return os.path.join(DATA_DIR, downloaded_path)


In [None]:
def df_to_text(dataframe):
    """
    Build one text string per row by concatenating that row's opinion texts.

    Each value of the ``casebody`` column is read as
    ``value['data']['opinions']``, a sequence of mappings with a ``'text'``
    entry. The texts are joined without a separator, in order.

    Rows are visited by position, so the frame does not need a 0..n-1 index
    (a filtered frame works without ``reset_index``).

    Parameters:
        dataframe: DataFrame with a ``casebody`` column.

    Returns:
        List of strings, one per row, in row order. A row with no opinions
        yields an empty string.
    """
    return [
        ''.join(opinion['text'] for opinion in casebody['data']['opinions'])
        for casebody in dataframe['casebody']
    ]

In [None]:
### Collecting Texas Cases ###
# Locate the compressed export for Texas (fetched if not already present).
compressed_file_tex = get_and_extract_from_bulk(jurisdiction="Texas", data_format="json")

# Every line of the archive is decoded as UTF-8 and parsed as one JSON record.
cases_tex = []
print("File path:", compressed_file_tex)
with lzma.open(compressed_file_tex) as infile:
    cases_tex.extend(json.loads(raw_line.decode('utf-8')) for raw_line in infile)

print("Texas case count: %s" % len(cases_tex))

# One dataframe row per parsed record.
tex_df = pd.DataFrame(cases_tex)

# Summary of columns, dtypes and memory use.
tex_df.info()

In [None]:
### Collecting California Cases ###
# Locate the compressed export for California (fetched if not already present).
compressed_file_cal = get_and_extract_from_bulk(jurisdiction="California", data_format="json")

# Every line of the archive is decoded as UTF-8 and parsed as one JSON record.
cases_cal = []
print("File path:", compressed_file_cal)
with lzma.open(compressed_file_cal) as infile:
    cases_cal.extend(json.loads(raw_line.decode('utf-8')) for raw_line in infile)

print("California case count: %s" % len(cases_cal))

# One dataframe row per parsed record.
cal_df = pd.DataFrame(cases_cal)

# Summary of columns, dtypes and memory use.
cal_df.info()

In [None]:
##### Texas Groups Text #####
# Each group keeps the rows whose decision_date falls inside an inclusive
# two-year window, renumbers them from zero, and extracts the opinion texts.

# Group one [2015-2016]
tex_1 = tex_df[tex_df['decision_date'].between('2015-01-01', '2016-12-31')].reset_index(drop=True)
tex_group_1 = df_to_text(tex_1)
print('Number of cases in Texas in 2015 & 2016: ', len(tex_group_1))

# Group two [2013-2014]
tex_2 = tex_df[tex_df['decision_date'].between('2013-01-01', '2014-12-31')].reset_index(drop=True)
tex_group_2 = df_to_text(tex_2)
print('Number of cases in Texas in 2013 & 2014: ', len(tex_group_2))

# Group three [2011-2012]
tex_3 = tex_df[tex_df['decision_date'].between('2011-01-01', '2012-12-31')].reset_index(drop=True)
tex_group_3 = df_to_text(tex_3)
print('Number of cases in Texas in 2011 & 2012: ', len(tex_group_3))

# Group four [2009-2010]
tex_4 = tex_df[tex_df['decision_date'].between('2009-01-01', '2010-12-31')].reset_index(drop=True)
tex_group_4 = df_to_text(tex_4)
print('Number of cases in Texas in 2009 & 2010: ', len(tex_group_4))

# Group five [2007-2008]
tex_5 = tex_df[tex_df['decision_date'].between('2007-01-01', '2008-12-31')].reset_index(drop=True)
tex_group_5 = df_to_text(tex_5)
print('Number of cases in Texas in 2007 & 2008: ', len(tex_group_5))

In [None]:
##### California Groups Text #####
# Each group keeps the rows whose decision_date is inside an inclusive
# two-year window, renumbers them from zero (the text extraction indexes rows
# by 0..n-1), and collects the opinion texts. The printed labels now spell
# "California" correctly.

# Group one [2015-2016]
cal_1 = cal_df[(cal_df['decision_date'] >= '2015-01-01') & (cal_df['decision_date'] <= '2016-12-31')]
cal_1 = cal_1.reset_index(drop=True)
cal_group_1 = df_to_text(cal_1)
print('Number of cases in California in 2015 & 2016: ', len(cal_group_1))

# Group two [2013-2014]
cal_2 = cal_df[(cal_df['decision_date'] >= '2013-01-01') & (cal_df['decision_date'] <= '2014-12-31')]
cal_2 = cal_2.reset_index(drop=True)
cal_group_2 = df_to_text(cal_2)
print('Number of cases in California in 2013 & 2014: ', len(cal_group_2))

# Group three [2011-2012]
cal_3 = cal_df[(cal_df['decision_date'] >= '2011-01-01') & (cal_df['decision_date'] <= '2012-12-31')]
cal_3 = cal_3.reset_index(drop=True)
cal_group_3 = df_to_text(cal_3)
print('Number of cases in California in 2011 & 2012: ', len(cal_group_3))

# Group four [2009-2010]
cal_4 = cal_df[(cal_df['decision_date'] >= '2009-01-01') & (cal_df['decision_date'] <= '2010-12-31')]
cal_4 = cal_4.reset_index(drop=True)
cal_group_4 = df_to_text(cal_4)
print('Number of cases in California in 2009 & 2010: ', len(cal_group_4))

# Group five [2007-2008]
cal_5 = cal_df[(cal_df['decision_date'] >= '2007-01-01') & (cal_df['decision_date'] <= '2008-12-31')]
cal_5 = cal_5.reset_index(drop=True)
cal_group_5 = df_to_text(cal_5)
print('Number of cases in California in 2007 & 2008: ', len(cal_group_5))

In [None]:
# saving the data locally

import pickle

# Map each output file stem to the list of texts it should contain.
groups_to_save = {
    # Texas
    'tex_group_1': tex_group_1,
    'tex_group_2': tex_group_2,
    'tex_group_3': tex_group_3,
    'tex_group_4': tex_group_4,
    'tex_group_5': tex_group_5,
    # California
    'cal_group_1': cal_group_1,
    'cal_group_2': cal_group_2,
    'cal_group_3': cal_group_3,
    'cal_group_4': cal_group_4,
    'cal_group_5': cal_group_5,
}

for group_name, group_texts in groups_to_save.items():
    # `with` closes the file even if pickling raises part-way through.
    with open('/content/%s.pkl' % group_name, "wb") as open_file:
        pickle.dump(group_texts, open_file)

In [None]:
# # downloading data to local machine
# from google.colab import files
# files.download('/content/tex_group_1.pkl')
# files.download('/content/tex_group_2.pkl')
# files.download('/content/tex_group_3.pkl')
# files.download('/content/tex_group_4.pkl')
# files.download('/content/tex_group_5.pkl')
# files.download('/content/cal_group_1.pkl')
# files.download('/content/cal_group_2.pkl')
# files.download('/content/cal_group_3.pkl')
# files.download('/content/cal_group_4.pkl')
# files.download('/content/cal_group_5.pkl')