In [None]:
import re
import ast
import itertools
import pandas as pd

In [None]:
def docID_Generator(df):
    '''
    Function: Add a "DocID" column so that the same document can be traced across the raw dataset and the
    pre-processed dataset.

    The ID is built from the 'Case_filename' column: the path is split on backslashes and the last two
    components are glued together with no separator, giving 'CASE_LABEL' + 'CASE_NO.txt', e.g. 9case112.txt.
    A path that contains no backslash is used unchanged as its own ID.

    The column is written onto the frame that was passed in, and that same frame is returned.
    '''
    # Keep only the trailing two path components (label folder + file name) of every entry.
    df["DocID"] = [
        ''.join(file_path.split('\\')[-2:])
        for file_path in df['Case_filename']
    ]
    return df


def legal_data_concatenation(preprocessed_data, raw_data):
    '''
    Function: Attach the legal details extracted from the raw documents to the matching pre-processed documents.

    Every row of `preprocessed_data` is matched to `raw_data` through the "DocID" column. When a match exists,
    the 'ActsData' and 'RegulationsData' cells of the first matching raw row (each a string holding a Python
    list literal) are parsed, their items joined with ', ', and every non-empty result is appended to the
    legal-details text preceded by a single space. Documents without a match get an empty legal-details string.

    Parameters:
        preprocessed_data: DataFrame with the columns 'DocID', 'Case_document', 'Case_label', 'Case_filename'.
        raw_data: DataFrame with the columns 'DocID', 'ActsData', 'RegulationsData'.

    Returns:
        A new DataFrame with one row per pre-processed document, a fresh 0..n-1 index and the columns
        "DocID", "Preprocessed_Text", "Case_Label", "Legal_Details", "File_Name" (in that order).

    Raises:
        ValueError / SyntaxError: propagated from ast.literal_eval when a matched 'ActsData' or
        'RegulationsData' cell is not a valid Python literal.
    '''
    output_columns = ["DocID", "Preprocessed_Text", "Case_Label", "Legal_Details", "File_Name"]

    # Index the raw rows by DocID once instead of re-filtering raw_data for every document.
    # setdefault keeps the first occurrence, which is the row `.iloc[0]` would have selected.
    # The cells stay unparsed here so that only matched documents are ever evaluated.
    legal_lookup = {}
    for raw_id, acts_cell, regulations_cell in zip(
        raw_data['DocID'], raw_data['ActsData'], raw_data['RegulationsData']
    ):
        legal_lookup.setdefault(raw_id, (acts_cell, regulations_cell))

    # Collect plain dicts and build the frame once at the end: DataFrame.append was removed in
    # pandas 2.0, and appending row by row recopies the whole accumulated frame each time.
    records = []
    row_fields = zip(
        preprocessed_data.index,
        preprocessed_data['DocID'],
        preprocessed_data['Case_document'],
        preprocessed_data['Case_label'],
        preprocessed_data['Case_filename'],
    )
    for idx, doc_id, document_text, case_label, file_name in row_fields:
        print('Completed {} number of rows'.format(idx))

        legal_details = ''
        if doc_id in legal_lookup:
            # Acts first, then regulations; each non-empty part is prefixed with one space.
            for serialized_list in legal_lookup[doc_id]:
                joined_items = ', '.join(ast.literal_eval(serialized_list))
                if joined_items:
                    legal_details = legal_details + ' ' + joined_items

        records.append({
            "DocID": doc_id,
            "Preprocessed_Text": document_text,
            "Case_Label": case_label,
            "Legal_Details": legal_details,
            "File_Name": file_name,
        })

    # Passing `columns` guarantees the expected header even when there are no rows at all.
    return pd.DataFrame(records, columns=output_columns)

In [None]:
if __name__ == '__main__':
    # Load each dataset and immediately tag its rows with the shared document ID.
    preprocessed_data = docID_Generator(
        pd.read_csv('Thesis - Dataset and Transformations/transform - post preprocessing/fully_preprocessed_dataset.csv')
    )
    raw_data = docID_Generator(
        pd.read_csv('Thesis - Dataset and Transformations/transform - post legal data extraction/raw_dataset_legal_entities.csv')
    )

    # Pair every pre-processed document with the legal details of its raw counterpart.
    Documents_With_LegalDetails = legal_data_concatenation(preprocessed_data, raw_data)

    # Persist the merged table, with a header row and without the index column.
    Documents_With_LegalDetails.to_csv(
        'Thesis - Dataset and Transformations/transform - post legal data extraction/fully_preprocessed_with_legal_entites.csv',
        index=False,
        header=True,
    )