In [None]:
# Install this library to use the functions offered by the LexNLP framework
#pip install --user lexnlp

In [None]:
import os
import re
import sys
import nltk
import glob
import random
import numpy as np
import pandas as pd
import en_core_web_sm
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from itertools import chain 
nlp = en_core_web_sm.load()
from nltk.corpus import wordnet
from operator import itemgetter
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.datasets import load_files
from nltk.stem.porter import PorterStemmer
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

In [None]:
# Import the specific LexNLP extraction modules used by the functions below
import lexnlp.extract.en.acts
import lexnlp.extract.en.dates
import lexnlp.extract.en.courts
import lexnlp.extract.en.trademarks
import lexnlp.extract.en.regulations
import lexnlp.extract.en.entities.nltk_re
from lexnlp.extract.en.addresses import address_features

In [None]:
def cleaning_text(raw_text):
    '''
    Function: Basic text cleaning, eliminating salutations and date words.

    Steps, in order:
      1. Every newline becomes a period and each run of periods collapses to one.
      2. Every character that is not an ASCII letter or a period becomes a space,
         so digits (and with them numeric dates and times) disappear here.
      3. The salutations "mr" / "ms" are dropped, in any casing and with or
         without surrounding periods ("Mr", "MR", "Mr.").
      4. English month and weekday names are dropped. Abbreviations such as
         "sat", "sun" or "mar" are deliberately left alone because they are also
         ordinary words. Note that "may" and "march" are removed even when they
         are not used as month names.
      5. The result is lower-cased.

    Matching in steps 3 and 4 is per whitespace-separated token, so a word glued
    to its neighbour by a period (e.g. "court.January") is not recognised.

    Args:
        raw_text (str): the raw document text.

    Returns:
        str: the cleaned, lower-cased text with single spaces between tokens.
    '''
    salutations = {"mr", "ms"}
    # After step 2 only letters and periods are left, so the only date-like
    # tokens that can still occur are spelled-out month and weekday names.
    # (The previous check called `parser.parse`, but no `parser` is imported in
    # this notebook; the NameError was swallowed by a bare `except`, so nothing
    # was ever filtered.)
    date_words = {
        "january", "february", "march", "april", "may", "june", "july",
        "august", "september", "october", "november", "december",
        "monday", "tuesday", "wednesday", "thursday", "friday", "saturday",
        "sunday",
    }

    text_string = re.sub(r'\.{2,}', '.', raw_text.replace('\n', '.'))
    text_string = re.sub(r'[^a-zA-Z\.]', ' ', text_string)

    kept_words = []
    for word in text_string.split():
        bare_word = word.strip('.').lower()
        if bare_word in salutations:
            # The period after a salutation is an abbreviation mark, not a
            # sentence boundary, so it is dropped together with the word.
            continue
        if bare_word in date_words:
            if word.endswith('.'):
                # Keep the sentence boundary that followed the date word.
                kept_words.append('.')
            continue
        kept_words.append(word)

    return ' '.join(kept_words).lower()

In [None]:
def stop_lemma(raw_text):
    '''
    Function: Stop word removal followed by lemmatization, both done with the SpaCy pipeline `nlp`.
    Afterwards every single-character word is deleted (this is the step that strips initials in
    front of names, eg: J. Carry Kriston --> Carry Kriston) and the spacing around periods is normalised.
    '''
    cleaned = cleaning_text(raw_text)

    # Pass 1: keep only the tokens SpaCy does not flag as stop words.
    kept_tokens = []
    for token in nlp(cleaned):
        if token.is_stop:
            continue
        kept_tokens.append(str(token))
    without_stopwords = ' '.join(kept_tokens)

    # Pass 2: re-parse the reduced text and replace each token by its lemma.
    lemmas = [str(token.lemma_) for token in nlp(without_stopwords)]
    lemmatized = ' '.join(lemmas)

    # Drop one-character words, squeeze runs of spaces, make sure every period
    # is followed by a space, then pull periods back against the preceding word.
    no_single_chars = re.sub(r'\b\w{1}\b', '', lemmatized)
    single_spaced = re.sub(r' +', ' ', no_single_chars)
    period_spaced = re.sub(r'\.(?! )', '. ', single_spaced)
    return re.sub(r'(\s+\.+)+', ".", period_spaced)

In [None]:
def _createsentences(raw_text, min_words=8):
    '''
    Function: Align sentence periods, split the text into sentences, discard short sentences and
    remove chosen Named Entities (Name, Organization, Geo-political, Location and Ordinal entities).

    The text is first run through stop_lemma, then split with a Punkt sentence tokenizer that is
    given a fixed abbreviation list. Sentences with fewer than `min_words` space-separated words
    are dropped, the rest are joined back with single spaces, and every token that SpaCy tags as
    part of a PERSON / ORG / GPE / LOC / ORDINAL entity is removed.

    Args:
        raw_text (str): the raw document text.
        min_words (int): minimum number of space-separated words a sentence needs to be kept.

    Returns:
        str: the remaining text with a space after every period, single-character words removed
        and runs of spaces collapsed.
    '''
    text = stop_lemma(raw_text)

    punkt_params = PunktParameters()
    # Punkt compares each period-final token lower-cased and without its last
    # period against this set, so entries must be stored in that form; the
    # earlier 'U.S.C.', 'U.S.', 'Id.' and 'Act' spellings could never match.
    punkt_params.abbrev_types = {'dr', 'vs', 'mr', 'mrs', 'ms', 'prof', 'mt', 'inc', 'i.e', 'e.g',
                                 'u.s.c', 'u.s', 'viz', 'id', 'act'}
    sentence_splitter = PunktSentenceTokenizer(punkt_params)

    text = re.sub(r'\.(?! )', '. ', re.sub(r' +', ' ', text))
    # Detach closing quotes from sentence-final punctuation so the tokenizer
    # sees the punctuation mark; they are re-attached per sentence below.
    text = text.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    text = text.replace('\n', ' . ')

    kept_sentences = []
    for sentence in sentence_splitter.tokenize(text):
        sentence = sentence.replace('? " ', '?" ').replace('! " ', '!" ').replace('. " ', '." ')
        if sentence.endswith(' .') or sentence.endswith(' . '):
            sentence = sentence[:-2]
        if len(sentence.replace('. ', '').split(' ')) >= min_words:
            kept_sentences.append(sentence)

    # Join with a space: joining with '' fused the last word of one sentence
    # with the first word of the next before entity recognition.
    processed_sentences = ' '.join(kept_sentences)

    # Filter on the per-token entity type. Comparing token text with whole
    # entity strings only ever matched single-token entities, so names such as
    # "john smith" were never removed.
    entity_labels = {'PERSON', 'ORG', 'GPE', 'LOC', 'ORDINAL'}
    tokens_without_entities = [token.text for token in nlp(processed_sentences)
                               if token.ent_type_ not in entity_labels]
    text_without_entities = ' '.join(tokens_without_entities)

    ready_text = re.sub(r'\.', '. ', text_without_entities)
    ready_text = re.sub(r'\b\w{1}\b', '', ready_text)
    # Collapse spaces last, because both substitutions above can create runs of spaces.
    return re.sub(r' +', ' ', ready_text)

In [None]:
def _remove_spans(text, spans):
    '''Return text with every [start, end) character span cut out; spans may be unsorted or overlapping.'''
    kept_pieces, cursor = [], 0
    for start, end in sorted(spans):
        if end <= cursor:
            # Span lies entirely inside a region that was already removed.
            continue
        kept_pieces.append(text[cursor:max(start, cursor)])
        cursor = end
    kept_pieces.append(text[cursor:])
    return ''.join(kept_pieces)


def court_data_removal(Content, court_names=None):
    '''
    Function: Find out presence of any legal entities (Companies, Acts, Trademarks, Regulations and
    Court Names) in the document and eliminate them.

    The document is first passed through _createsentences. Companies are cut out by character span;
    acts, trademarks and regulations are removed by deleting every occurrence of the extracted
    string; court names are removed only where they appear as whole words.

    Args:
        Content (str): the raw document text.
        court_names (iterable of str, optional): lower-cased court names and aliases to strip.
            When omitted, the module-level `court_name_alias` list is used.

    Returns:
        str: the document text with the detected legal entities removed.

    Raises:
        NameError: if `court_names` is omitted and `court_name_alias` has not been defined.
    '''
    if court_names is None:
        # Resolve the global up front so a missing list fails before any work is done.
        court_names = court_name_alias

    clean_content = _createsentences(Content)
    text = clean_content

    # --- Companies ---------------------------------------------------------
    # assumes str(company) embeds the character span as "(start, end)" — TODO
    # confirm against the installed LexNLP version. Searching for that pattern,
    # instead of splitting on every comma, keeps names that contain commas from
    # breaking the index parsing.
    company_spans = []
    for company in lexnlp.extract.en.entities.nltk_re.get_companies(text):
        span = re.search(r'\(\s*(\d+)\s*,\s*(\d+)\s*\)', str(company))
        if span:
            company_spans.append((int(span.group(1)), int(span.group(2))))
    if company_spans:
        text = re.sub(' +', ' ', _remove_spans(text, company_spans))

    # --- Acts --------------------------------------------------------------
    for act in lexnlp.extract.en.acts.get_act_list(text):
        act_name = (act.get('value') or '').lower()
        if act_name:
            text = text.replace(act_name, '')

    # --- Trademarks --------------------------------------------------------
    for trademark in list(lexnlp.extract.en.trademarks.get_trademarks(text)):
        trademark_name = trademark.lower()
        if trademark_name:
            text = text.replace(trademark_name, '')

    # --- Regulations -------------------------------------------------------
    # Regulations are looked up in the text as it was before the removals above.
    regulations = [found[1] for found in lexnlp.extract.en.regulations.get_regulations(clean_content)]
    for regulation in regulations:
        regulation_name = regulation.lower()
        if regulation_name:
            text = text.replace(regulation_name, '')

    # --- Court names -------------------------------------------------------
    for court in court_names:
        # Skip empty entries (an empty pattern would match everywhere) and use
        # the cheap substring test before building a regular expression.
        if not court or court not in clean_content:
            continue
        # Whole-word match only, so a short alias is not cut out of the middle
        # of a longer, unrelated word.
        text = re.sub(r'(?<!\w)' + re.escape(court) + r'(?!\w)', '', text)

    return text

In [None]:
def get_court_name(source="https://raw.githubusercontent.com/LexPredict/lexpredict-legal-dictionary/1.0.5/en/legal/us_courts.csv"):
    '''
    Function: Fetching court names from the United States court database maintained by LexNLP.

    Reads a CSV with a "Court Name" column and an "Alias" column whose cells hold
    ';'-separated aliases, and returns all names followed by all aliases.

    Args:
        source: anything pandas.read_csv accepts (URL, path or file-like object).
            Defaults to the LexPredict legal dictionary on GitHub.

    Returns:
        list of str: lower-cased, whitespace-stripped court names followed by aliases.
        Empty cells and empty alias fragments (e.g. from a trailing ';') are skipped.
    '''
    court_df = pd.read_csv(source)

    # dropna(): an empty cell is read as NaN, which has neither .split nor .lower.
    court_name = [str(name) for name in court_df["Court Name"].dropna()]
    court_alias = [alias
                   for cell in court_df["Alias"].dropna()
                   for alias in str(cell).split(";")]

    normalised = (entry.strip().lower() for entry in chain(court_name, court_alias))
    court_name_alias = [entry for entry in normalised if entry]
    return court_name_alias

In [None]:
def pre_process_pipe(input_data_dir,
                     output_csv='Thesis - Dataset and Transformations/transform - post preprocessing/fully_preprocessed_dataset.csv'):
    '''
    Function: Final function that takes the actual dataset and applies all the functions to produce
    the cleanest data possible.

    Loads the documents with sklearn's load_files, runs every document through court_data_removal
    and writes one row per document (cleaned text, class label, source file name) to a CSV file.

    Args:
        input_data_dir (str): location of the dataset, passed unchanged to load_files.
        output_csv (str): path of the CSV file to write.

    Returns:
        pandas.DataFrame: the table that was written, with columns
        'Case_document', 'Case_label' and 'Case_filename'.
    '''
    data = load_files(input_data_dir, encoding="utf-8", decode_error="replace")

    # Create the destination folder before the long-running loop, so a missing
    # directory cannot throw the results away at the very end.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    cleaned_text, case_labels, file_name = [], [], []
    documents = zip(data.data, data.target, data.filenames)
    for files_done, (document, target, filename) in enumerate(documents, start=1):
        cleaned_text.append(court_data_removal(document))
        case_labels.append(data.target_names[target])
        file_name.append(filename)
        # Report after the work is done, with a 1-based count.
        print('Completed for {} files'.format(files_done))

    text_Cleaned = {'Case_document' : cleaned_text, 'Case_label' : case_labels, 'Case_filename' : file_name}
    my_df = pd.DataFrame(text_Cleaned)
    my_df.to_csv(output_csv, index=False, header=True)
    return my_df

In [None]:
if __name__=='__main__':
    # Entry point: build the court-name list, then run the full preprocessing pipeline.
    # court_data_removal reads `court_name_alias` as a module-level global, so it
    # must be bound here before pre_process_pipe is called.
    court_name_alias = get_court_name()
    # NOTE(review): this path ends in ".zip" but pre_process_pipe hands it
    # straight to sklearn's load_files — confirm it names a folder of per-class
    # sub-directories rather than an archive.
    input_data_dir = 'Thesis - Dataset and Transformations/actual dataset/semi_preprocessed_cases.zip'
    pre_process_pipe(input_data_dir)