In [None]:
import os
import re
import sys
import ast
import nltk
import glob
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn.datasets import load_files

import lexnlp.extract.en.acts
import lexnlp.extract.en.dates
import lexnlp.extract.en.courts
import lexnlp.extract.en.trademarks
import lexnlp.extract.en.regulations
import lexnlp.extract.en.entities.nltk_re
from lexnlp.extract.en.addresses import address_features

In [None]:
def actsData_Cleaning(x):
    '''
    Function - Minor cleaning of Acts data to remove incomplete tokens.

    Every entry is lower-cased and its runs of whitespace are collapsed to a
    single space. Entries that are nothing more than the bare words listed in
    ``remove_list`` are discarded, and duplicates are removed.

    Parameters:
        x - iterable of str, the act names extracted from one document.

    Returns:
        list of str - the unique cleaned act names, in first-seen order.
    '''
    remove_list = {'act', 'the act'}
    normalised = (" ".join(each_x.lower().split()) for each_x in x)
    kept = (name for name in normalised if name not in remove_list)
    # dict.fromkeys drops duplicates while keeping first-seen order. The
    # previous list(set(...)) produced an order that varies between
    # interpreter runs (str hashing is randomised), which made the exported
    # CSV differ from run to run.
    return list(dict.fromkeys(kept))

def regData_Cleaning(x):
    '''
    Function - Minor cleaning of Regulations data to remove incomplete tokens.

    Every entry is lower-cased and duplicates are removed.

    Parameters:
        x - iterable of str, the regulation codes extracted from one document.

    Returns:
        list of str - the unique lower-cased codes, in first-seen order.
    '''
    low_case_x = (i.lower() for i in x)
    # dict.fromkeys drops duplicates while keeping first-seen order; the
    # previous list(set(...)) returned an order that changes between
    # interpreter runs because str hashing is randomised.
    return list(dict.fromkeys(low_case_x))

def legal_data_extraction(path, output_path='Thesis - Dataset and Transformations/transform - post legal data extraction/raw_dataset_legal_entities.csv'):
    '''
    Function: Captures the Acts and Regulations from the legal documents and creates a new column for each of them named
    RegulationsData and ActsData respectively.

    Parameters:
        path        - location of the input CSV; it must contain a 'Case_document' column holding the document text.
        output_path - location the augmented CSV is written to. Defaults to the path this function has always
                      written to, so existing single-argument calls behave as before.

    Returns:
        None. The result is written to ``output_path`` (without the index, with a header row).
    '''
    raw_dataset = pd.read_csv(path)

    # Collect one cleaned list per row and attach them as columns at the end.
    # DataFrame.append (used previously) was removed in pandas 2.0 and copied
    # the whole frame on every iteration.
    acts_column = []
    regulations_column = []
    for clean_content in raw_dataset['Case_document']:
        if not isinstance(clean_content, str):
            # Non-text cell (e.g. NaN from an empty CSV field): nothing to
            # extract, so record empty lists instead of handing a non-string
            # to the extractors.
            acts_column.append([])
            regulations_column.append([])
            continue

        # Keep only letters, digits and spaces of each extracted act value.
        acts = [
            re.sub(r'[^A-Za-z0-9 ]+', '', act.get('value'))
            for act in lexnlp.extract.en.acts.get_act_list(clean_content)
        ]
        # The second element of each item yielded by get_regulations is used
        # as the regulation text. The pattern strips every non-word character
        # and underscore, which also removes the spaces inside it.
        regulations = [
            re.sub(r'[\W_]+', '', reg[1])
            for reg in lexnlp.extract.en.regulations.get_regulations(clean_content)
        ]

        acts_column.append(actsData_Cleaning(acts))
        regulations_column.append(regData_Cleaning(regulations))

    # Build the Series on the frame's own index so rows cannot be misaligned.
    raw_dataset['ActsData'] = pd.Series(acts_column, index=raw_dataset.index, dtype=object)
    raw_dataset['RegulationsData'] = pd.Series(regulations_column, index=raw_dataset.index, dtype=object)

    # Create the destination folder if needed so a long extraction run is not
    # lost to a missing directory at the final write.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    raw_dataset.to_csv(output_path, index=False, header=True)

In [None]:
if __name__ == "__main__":
    # Raw case-document CSV, given relative to the notebook's working directory.
    path = 'Thesis - Dataset and Transformations/actual dataset/raw_data.csv'
    # Run the extraction; the augmented dataset is written to disk by the call.
    legal_data_extraction(path)