In [1]:
import io
import pickle
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PyPDF2
import seaborn as sns

#for text pre-processing
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

#BagOfWords
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
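# One-time setup: on a fresh environment, uncomment and run these once to download
# the NLTK data used below (tokenizer, POS tagger, WordNet, stop words).
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('omw-1.4')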

In [3]:
def preprocess(text):
    """Normalise raw text: lowercase, with markup, punctuation and digits removed.

    Steps, in order: lowercase and trim; drop HTML-style tags; drop bracketed
    numeric references such as ``[12]``; replace ASCII punctuation with
    spaces; delete any remaining non-word symbols (e.g. curly quotes);
    replace digits with spaces; collapse whitespace runs and trim.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, with words separated by single spaces and no
        leading or trailing whitespace.
    """
    text = text.lower().strip()
    text = re.sub(r'<.*?>', '', text)  # remove HTML tags/markup
    # Must run before punctuation removal, otherwise the brackets are already gone.
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    # Punctuation becomes a space so that "a,b" does not fuse into "ab".
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'[^\w\s]', '', text)  # symbols that string.punctuation does not list
    text = re.sub(r'\d', ' ', text)
    # Collapse last: the substitutions above can leave space runs and padding at the ends.
    return re.sub(r'\s+', ' ', text).strip()
 

def stopword(string):
    """Remove English stop words from whitespace-separated text.

    Args:
        string: Text whose tokens are separated by whitespace.

    Returns:
        The remaining tokens, in their original order, joined by single spaces.
    """
    # Fetch the stop-word list once per call, not once per token; a set also
    # makes each membership test constant-time.
    english_stopwords = set(stopwords.words('english'))
    kept = [token for token in string.split() if token not in english_stopwords]
    return ' '.join(kept)

def stemming(string):
    """Stem every token of ``string`` and return them space-joined.

    Tokenises with ``word_tokenize`` and stems with the module-level ``snow``
    stemmer, which must be initialised before this function is called.
    """
    stems = []
    for token in word_tokenize(string):
        stems.append(snow.stem(token))
    return " ".join(stems)

def get_wordnet_pos(tag):
    """Map a part-of-speech tag to a WordNet POS constant by its leading letter.

    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.
    Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return wordnet.NOUN


def lemmatizer(string):
    """Lemmatize each token of ``string`` according to its part-of-speech tag.

    Tokens are tagged with ``nltk.pos_tag``, each tag is converted with
    ``get_wordnet_pos``, and the module-level ``wl`` lemmatizer (which must be
    initialised beforehand) produces the lemma. Lemmas are returned space-joined.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))
    lemmas = []
    for word, pos_tag in tagged_tokens:
        lemmas.append(wl.lemmatize(word, get_wordnet_pos(pos_tag)))
    return " ".join(lemmas)

def finalpreprocess(string):
    """Run the full cleaning pipeline: normalise, drop stop words, then lemmatize."""
    cleaned = preprocess(string)
    without_stopwords = stopword(cleaned)
    return lemmatizer(without_stopwords)

def clause_extractor(file, start_symbol="“", end_symbol=".”", footer_marker="Downloaded on"):
    """Extract the first quoted clause from a PDF.

    The text of every page is concatenated, with each page cut at the first
    occurrence of ``footer_marker`` (when present). The clause is the span from
    the first ``start_symbol`` up to and including the first ``end_symbol``
    that follows it.

    Args:
        file: A binary file object (or anything else ``PyPDF2.PdfReader`` accepts).
        start_symbol: Text that opens the clause.
        end_symbol: Text that closes the clause.
        footer_marker: Text that begins the per-page footer to discard.

    Returns:
        The clause including both delimiters, or ``None`` when no complete
        clause is found.
    """
    # PdfReader / .pages replace PdfFileReader / getNumPages / getPage, which
    # PyPDF2 3.x no longer provides.
    pdf_reader = PyPDF2.PdfReader(file)

    page_texts = []
    for page in pdf_reader.pages:
        content = page.extract_text() or ""
        footer = content.find(footer_marker)
        # find() returns -1 when absent; slicing with -1 would drop the page's last character.
        page_texts.append(content if footer == -1 else content[:footer])
    doc_content = "".join(page_texts)

    clause_start = doc_content.find(start_symbol)
    if clause_start == -1:
        return None
    # Search only after the opening symbol so an earlier closing quote cannot be picked up.
    clause_end = doc_content.find(end_symbol, clause_start + len(start_symbol))
    if clause_end == -1:
        return None

    print("Clause Found")
    return doc_content[clause_start:clause_end + len(end_symbol)]

def verify_clause(clause):
    """Check that an extracted clause reads like a dispute-resolution clause.

    The clause passes when it contains, ignoring case, at least one of
    "any dispute", "dispute resolution" or "arbitration clause".

    Args:
        clause: The extracted clause text, or ``None`` if extraction found nothing.

    Returns:
        ``clause`` unchanged when it passes, otherwise ``None``.
    """
    keywords = ("any dispute", "dispute resolution", "arbitration clause")
    # Each keyword needs its own membership test: `"a" or "b" in clause`
    # evaluates to the truthy literal "a" and so accepts every input.
    clause_text = clause.lower() if clause else ""
    if any(keyword in clause_text for keyword in keywords):
        print("Clause Verified")
        return clause
    print("Verification Failed")
    return None

In [4]:
#Model Loading
# NOTE(review): pickle.load can execute arbitrary code; only load artefacts from a trusted source.
# `with` closes each file as soon as its contents have been unpickled.
with open("LR_model.sav", 'rb') as model_file:
    loaded_model = pickle.load(model_file)

#Vector Loading
transformer = TfidfTransformer()
# feature.pkl supplies the fixed vocabulary for the vectorizer.
with open("feature.pkl", 'rb') as vocabulary_file:
    loaded_vec = TfidfVectorizer(decode_error="replace", vocabulary=pickle.load(vocabulary_file))

#Initializing
# Module-level helpers read by stemming() and lemmatizer().
snow = SnowballStemmer('english')
wl = WordNetLemmatizer()

In [5]:
#Input the PDF File:
# Read the PDF fully into memory so the OS file handle is released immediately;
# `file` remains a seekable binary stream for the extraction step.
with open("S.B. Arbitration Application No. 1 2019.pdf", 'rb') as pdf_handle:
    file = io.BytesIO(pdf_handle.read())

In [6]:
#Extracting the Dispute Resolution Clause
# drclause holds the extracted clause text, or None when no clause is found.
drclause= clause_extractor(file)

Clause Found


In [7]:
#Verifying the Dispute Resolution Clause
# verify_clause prints the outcome and returns the clause on success, or None on failure.
verified_clause = verify_clause(drclause)

Clause Verified


In [8]:
# Fail fast with a clear message rather than an AttributeError inside preprocess().
if verified_clause is None:
    raise ValueError("No verified dispute resolution clause to classify")
# The vectorizer takes an iterable of documents, so wrap the single clause in a list.
preprocess_text = [finalpreprocess(verified_clause)]

In [9]:
#Using the vec to transform
# NOTE(review): both steps call fit_transform on this single document, so any
# document-frequency weighting is learned from the input itself — confirm this
# matches the features LR_model.sav was trained on.
vectorized_input = loaded_vec.fit_transform(preprocess_text)
transformed_input = transformer.fit_transform(vectorized_input)

In [10]:
# Predict the label for the single transformed clause and print it.
y_predict = loaded_model.predict(transformed_input)
print(y_predict)

['Valid']
