In [89]:
# Importing necessary libraries

import glob
import os
import pandas as pd
import numpy as np
from docx import Document
import re
import PyPDF2
import openpyxl
import spacy

In [90]:
# All the datasets i.e., the raw contract files are present in Final Contracts Directory
# The First step is to convert all these pdf, word files into text format.
# Path for output files i.e., files after conversion are saved in Text_Files Directory

file_path = 'Final Contracts//'
out_file_path='Text_Files//'

# Create the output directory up front so the conversion cells do not fail with
# FileNotFoundError on a fresh checkout; this is a no-op when it already exists.
os.makedirs(os.path.normpath(out_file_path), exist_ok=True)

In [91]:
# List the names of every file (pdf, word, ...) in the contracts directory.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the processing order reproducible across platforms and runs.

file_list = sorted(os.listdir(file_path))

In [92]:
# Split the directory listing by extension:
#   docx_list -> names ending with .docx
#   pdf_list  -> names ending with .pdf
# The comparison is case-sensitive, so e.g. '.DOCX' is not picked up.

pdf_list = [name for name in file_list if name.endswith('.pdf')]
docx_list = [name for name in file_list if name.endswith('.docx')]
docx_list

['Contract_FM_1.docx',
 'Contract_FM_10.docx',
 'Contract_FM_11.docx',
 'Contract_FM_13.docx',
 'Contract_FM_18.docx',
 'Contract_FM_19.docx',
 'Contract_FM_20.docx',
 'Contract_FM_21.docx',
 'Contract_FM_26.docx',
 'Contract_FM_3.docx',
 'Contract_FM_30.docx',
 'Contract_FM_32.docx',
 'Contract_FM_34.docx',
 'Contract_FM_35.docx',
 'Contract_FM_37.docx',
 'Contract_FM_38.docx',
 'Contract_FM_4.docx',
 'Contract_FM_5.docx',
 'Contract_FM_6.docx',
 'Contract_FM_7.docx',
 'Contract_FM_9.docx',
 'Contract_NO_1.docx',
 'Contract_NO_11.docx',
 'Contract_NO_12.docx',
 'Contract_NO_13.docx',
 'Contract_NO_14.docx',
 'Contract_NO_3.docx',
 'Contract_NO_5.docx',
 'Contract_NO_7.docx',
 'Contract_NO_9.docx',
 'Contract_PAN_1.docx',
 'Contract_PAN_10.docx',
 'Contract_PAN_11.docx',
 'Contract_PAN_2.docx',
 'Contract_PAN_20.docx',
 'Contract_PAN_21.docx',
 'Contract_PAN_22.docx',
 'Contract_PAN_23.docx',
 'Contract_PAN_3.docx',
 'Contract_PAN_4.docx',
 'Contract_PAN_5.docx',
 'Contract_PAN_6.docx'

In [93]:
# Converting all docx files to text
# For each file: open it as a Document, swap the .docx extension for .txt,
# and write the cleaned paragraph text into the output directory.
# Cleaning keeps only letters, digits, '.', '(', ')' and spaces; every other
# character (tabs, newlines, other punctuation, non-ASCII) is dropped.
# NOTE: paragraphs are written back-to-back with no separator, so the last word
# of one paragraph runs straight into the first word of the next.

for doc in docx_list:
    document = Document(file_path+doc)
    # splitext only touches the real extension, unlike str.replace which would
    # also rewrite a '.docx' appearing in the middle of the name.
    text_filename = os.path.splitext(doc)[0] + '.txt'
    # The context manager closes the output file even if a write raises.
    with open(out_file_path+text_filename , mode='w') as file:
        for para in document.paragraphs:
            file.write(re.sub('[^A-Za-z0-9.() ]+', '', para.text))

In [94]:
# Converting all pdf files to text
# For each file: swap the .pdf extension for _pdf.txt, read the document with
# PyPDF2.PdfFileReader, and write the cleaned text of every page (page count
# comes from pdf.getNumPages()) into the output directory.
# Cleaning keeps only letters, digits, '.', '(', ')' and spaces; every other
# character (tabs, newlines, other punctuation, non-ASCII) is dropped.
# NOTE(review): PdfFileReader / getNumPages / getPage / extractText look like
# the legacy PyPDF2 API names - confirm they exist in the installed version.

for doc in pdf_list:
    # splitext only touches the real extension, unlike str.replace which would
    # also rewrite a '.pdf' appearing in the middle of the name.
    text_filename = os.path.splitext(doc)[0] + '_pdf.txt'
    pdf = PyPDF2.PdfFileReader(file_path+doc)
    # The context manager closes the output file even if extraction raises.
    with open(out_file_path+text_filename , mode='w') as file:
        for i in range(pdf.getNumPages()):
            file.write(re.sub('[^A-Za-z0-9.() ]+', '', pdf.getPage(i).extractText()))

In [97]:
# Combining text files to an excel
# Column A ('txt') holds the contract text, column B ('file_name') the name of
# the text file it came from. The workbook is then read back into a DataFrame.

# Only the .txt outputs are picked up, in sorted order, so stray files in the
# directory do not become rows and the row order is reproducible.
txt_list = sorted(name for name in os.listdir(out_file_path) if name.endswith('.txt'))

wb = openpyxl.Workbook()
ws = wb.active
ws.append(['txt', 'file_name'])

for file in txt_list:
    # Read as text (the converted files only contain ASCII characters) and let
    # the context manager close the handle even if reading fails.
    with open(out_file_path+file, 'r', encoding='utf-8') as f:
        txt = f.read()
    # NOTE(review): Excel limits a cell to 32,767 characters, so longer
    # contracts are presumably truncated when stored here - confirm.
    ws.append([txt, file])

wb.save('contract_data.xlsx')
text_data = pd.read_excel("contract_data.xlsx")

In [98]:
# Searching for Force Majeure clause using regex

# Case-insensitive; \s* tolerates the spacing the text conversion can leave
# between the two words: none (a hyphen or line break was stripped, giving
# "forcemajeure") or several spaces (as seen in the PDF-derived text).
_FORCE_MAJEURE_RE = re.compile(r'force\s*majeure', re.IGNORECASE)


def extract_forcemajeure(text):
    """Report whether ``text`` mentions "force majeure".

    Parameters
    ----------
    text : str
        Contract text to search. Non-string values (e.g. a missing cell)
        are treated as containing no mention.

    Returns
    -------
    str
        'Yes' if the phrase occurs anywhere in ``text`` (any letter case,
        any amount of whitespace between the two words), otherwise 'No'.
    """
    if not isinstance(text, str):
        return 'No'
    return 'Yes' if _FORCE_MAJEURE_RE.search(text) else 'No'
        

In [99]:
# Searching for keywords like pandemic/epidemic/disease inside a force majeure clause

# Case-insensitive; \s* tolerates missing or repeated spaces between the words,
# which the text conversion can produce.
_FM_MENTION_RE = re.compile(r'force\s*majeure', re.IGNORECASE)
# The plural forms need no separate alternatives: "pandemic" / "epidemic"
# already match as a prefix of "pandemics" / "epidemics".
_PANDEMIC_KEYWORD_RE = re.compile(r'pandemic|disease|epidemic', re.IGNORECASE)


def extract_pandemic(text):
    """Report whether a pandemic-type keyword appears inside the force majeure text.

    The force majeure region is approximated as the span between the first and
    the last "force majeure" mention. A keyword (pandemic / disease / epidemic)
    counts when it ends strictly after the start of the first mention and
    strictly before the start of the last mention.

    Every keyword occurrence is checked, so an early, unrelated occurrence
    (e.g. in a health clause before the force majeure section) does not hide a
    later one that lies inside the region.

    Parameters
    ----------
    text : str
        Contract text to search. Non-string values are treated as empty.

    Returns
    -------
    str
        'Yes' if at least one keyword lies inside the region, otherwise 'No'.
        With fewer than two "force majeure" mentions there is no region, so
        the result is 'No'.
    """
    if not isinstance(text, str):
        return 'No'

    fm_starts = [match.start() for match in _FM_MENTION_RE.finditer(text)]
    if len(fm_starts) < 2:
        return 'No'
    fm_idx_first, fm_idx_last = fm_starts[0], fm_starts[-1]

    inside_region = any(
        fm_idx_first < match.end() < fm_idx_last
        for match in _PANDEMIC_KEYWORD_RE.finditer(text)
    )
    return 'Yes' if inside_region else 'No'

In [104]:
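# Extracting liability, notification type and termination period entities with the custom spaCy NER model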
def ner_extract(df, model_path=None, nlp=None):
    """Apply the custom NER model to every contract text in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'txt', 'file_name', 'force_majeure' and
        'pandemic_or_disease'.
    model_path : str, optional
        Directory of the trained spaCy model. Defaults to the original
        hard-coded location; pass another path to run on a different machine.
    nlp : callable, optional
        An already loaded pipeline: called with a text, it must return an
        object with an ``ents`` iterable whose items expose ``label_`` and
        ``text``. When given, no model is loaded from ``model_path``.

    Returns
    -------
    pandas.DataFrame
        Columns 'file_name', 'Force_Majeure', 'Pandemic_or_disease',
        'Liability', 'Notification_type' and 'Termination_period'. The last
        three are None for rows where nothing was extracted. When a row has
        several entities of the same kind, the last one wins.
    """
    if nlp is None:
        if model_path is None:
            # Original default location of the custom trained spaCy model.
            model_path = "C:\\Users\\Harika\\Documents\\Masters\\ML\\Project\\contract_ner_model\\"
        # Loading the custom trained spacy model
        nlp = spacy.load(model_path)

    # Only these exact phrasings are accepted as a liability statement.
    liability_pattern = {'Neither Party', 'Either Party', 'either party', 'neither party',
                         'No Party', 'no party', 'Neither the Authority nor the Operator'}
    # Termination periods given in months are converted to days with this factor.
    days_per_month = 30

    fm_name = df['file_name']
    size = len(df)
    liability_lst = [None]*size
    notice_lst = [None]*size
    termination_lst = [None]*size

    # Iterate by position so a non-default index cannot break the lookups.
    for row, raw_text in enumerate(df['txt'].tolist()):
        # Missing cells are skipped instead of feeding the string 'nan' to the model.
        if pd.isna(raw_text):
            continue
        doc = nlp(str(raw_text))

        for ent in doc.ents:
            if ent.label_ == "Liability":
                if ent.text in liability_pattern:
                    liability_lst[row] = ent.text
            elif ent.label_ == "Notification_type":
                notice_lst[row] = ent.text
            elif ent.label_ == "Termination_period" and 'months' in ent.text:
                # Periods written without digits (e.g. "six months") are left
                # unset rather than raising IndexError. Periods not expressed
                # in months are ignored, as in the original code.
                digits = re.findall(r'\d+', ent.text)
                if digits:
                    termination_lst[row] = int(digits[0])*days_per_month

    dic ={'file_name':fm_name,'Force_Majeure':df['force_majeure'],'Pandemic_or_disease':df['pandemic_or_disease'],'Liability':liability_lst, 'Notification_type':notice_lst, 'Termination_period':termination_lst}
    df_new = pd.DataFrame(dic)
    return df_new

In [105]:
from pandas import ExcelWriter
from pandas import ExcelFile

In [106]:
# Load the combined workbook and flag each contract: one column for a force
# majeure mention, one for a pandemic-type keyword inside that clause.
contract_df = (
    pd.read_excel("contract_data.xlsx")
    .assign(
        force_majeure=lambda frame: frame['txt'].astype(str).apply(extract_forcemajeure),
        pandemic_or_disease=lambda frame: frame['txt'].astype(str).apply(extract_pandemic),
    )
)

In [107]:
# Run the trained NER model over the contract texts, show the input frame,
# and save the extracted fields to an Excel file.
final_extract = ner_extract(df=contract_df)
print(contract_df)

final_extract.to_excel(excel_writer='Final_Extract.xlsx')

                                                  txt               file_name  \
0   SCHEDULE D FORM OF STATEMENT OF WORKStatement ...       Contract_FM_1.txt   
1   SCHEDULE D FORM OF STATEMENT OF WORKStatement ...      Contract_FM_10.txt   
2   SCHEDULE D FORM OF STATEMENT OF WORKStatement ...      Contract_FM_11.txt   
3   SCHEDULE D  FORM OF STATEMENT OF WORK  Stateme...  Contract_FM_12_pdf.txt   
4   SCHEDULE D FORM OF STATEMENT OF WORKStatement ...      Contract_FM_13.txt   
..                                                ...                     ...   
84  SCHEDULE D FORM OF STATEMENT OF WORKStatement ...      Contract_PAN_5.txt   
85  SCHEDULE D FORM OF STATEMENT OF WORKStatement ...      Contract_PAN_6.txt   
86  SCHEDULE D FORM OF STATEMENT OF WORKStatement ...      Contract_PAN_7.txt   
87  SCHEDULE D  FORM OF STATEMENT OF WORK  Stateme...  Contract_PAN_8_pdf.txt   
88  SCHEDULE D FORM OF STATEMENT OF WORKStatement ...      Contract_PAN_9.txt   

   force_majeure pandemic_o