The following is for classifying PDF documents by extracting text from them using Tesseract OCR and then training a Naive Bayes classifier on the extracted text. The classifier uses the TF-IDF (Term Frequency-Inverse Document Frequency) representation of the text data to make predictions. The categories are Mortage, Trust Deed and Deed

In [1]:
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import os
import pandas as pd

In [2]:
def pdf_to_text(pdf_path):
    img = convert_from_path(pdf_path)
    text = ""
    for i in img:
        text = text + pytesseract.image_to_string(i)
    return text      

In [3]:
pdf_dir = r"C:\Users\Sanchana\Desktop\MLP\USAPropertyDocuments"
pdf_files = [os.path.join(pdf_dir, file) for file in os.listdir(pdf_dir) if file.endswith('.pdf')]

In [4]:
data = {'filename':[], 'text':[], 'label':[]}

In [5]:
for p in pdf_files:
    fname = os.path.basename(p)
    extext= pdf_to_text(p)
    
    if fname.startswith('M'):
        label='Mortage'
    elif fname.startswith('T'):
        label = 'TrustDeed'
    else:
        label = 'Deed'
        
    data['filename'].append(fname)
    data['text'].append(extext)
    data['label'].append(label)
    

In [6]:
df = pd.DataFrame(data)

In [7]:
df.drop(['filename'],axis=1)

Unnamed: 0,text,label
0,Springside Mortgage oeyr Mortgage Statement\n\...,Mortage
1,Springside Mortgage Mortgage Statement\nCuston...,Mortage
2,— THIS IS A CERTIFIED COPY\nWationsl City Bank...,Mortage
3,Southfield Bank\n\nMORTGAGE AGREEMENT\n\nParti...,Mortage
4,(A)\n\n(B)\n\n(Cc)\n\n(D)\n\n(E)\n\n(F)\n\n(G)...,Mortage
5,Loan Estimate vale Tome JO year\n\nEa\nBue Le ...,Mortage
6,PEATHERED NEST MORTGAGE COMPANY\nOF CANADA\n\n...,Mortage
7,"Prepared By After Recording, Please Retum To\n...",Deed
8,(Rs. 200 stamp paper)\n\nPARTNERSHIP DEED\n\nT...,Deed
9,DEED OF PARTNERSHIP\n\nThe Deed of Partnership...,Deed


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(df['text'],df['label'],test_size=0.2)

In [10]:
vec = TfidfVectorizer()
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

In [11]:
mnb = MultinomialNB()
mnb.fit(X_train_vec,Y_train)
y_pred_mnb = mnb.predict(X_test_vec)
print(metrics.classification_report(Y_test, y_pred_mnb))

              precision    recall  f1-score   support

        Deed       1.00      0.33      0.50         3
     Mortage       1.00      1.00      1.00         1
   TrustDeed       0.33      1.00      0.50         1

    accuracy                           0.60         5
   macro avg       0.78      0.78      0.67         5
weighted avg       0.87      0.60      0.60         5



In [18]:
svc = SVC(kernel='linear',probability=True)
svc.fit(X_train_vec,Y_train)
y_pred_svc = svc.predict(X_test_vec)
print(metrics.classification_report(Y_test,y_pred_svc))

              precision    recall  f1-score   support

        Deed       1.00      1.00      1.00         3
     Mortage       1.00      1.00      1.00         1
   TrustDeed       1.00      1.00      1.00         1

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5



In [13]:
lr = LogisticRegression()
lr.fit(X_train_vec,Y_train)
y_pred_lr = lr.predict(X_test_vec)
print(metrics.classification_report(Y_test,y_pred_svc))

              precision    recall  f1-score   support

        Deed       1.00      1.00      1.00         3
     Mortage       1.00      1.00      1.00         1
   TrustDeed       1.00      1.00      1.00         1

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5



In [14]:
rf = RandomForestClassifier()
rf.fit(X_train_vec,Y_train)
y_pred_rf=rf.predict(X_test_vec)
print(metrics.classification_report(Y_test,y_pred_rf))

              precision    recall  f1-score   support

        Deed       1.00      0.33      0.50         3
     Mortage       0.00      0.00      0.00         1
   TrustDeed       0.25      1.00      0.40         1

    accuracy                           0.40         5
   macro avg       0.42      0.44      0.30         5
weighted avg       0.65      0.40      0.38         5



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
probmnb = mnb.predict_proba(X_test_vec)
print(probmnb)

[[0.32676295 0.23902577 0.43421129]
 [0.40760945 0.22835887 0.36403169]
 [0.33668971 0.24938478 0.4139255 ]
 [0.15464032 0.21720975 0.62814993]
 [0.24932756 0.37539552 0.37527692]]


In [19]:
probsvc = svc.predict_proba(X_test_vec)
print(probsvc)

[[0.81744916 0.06523673 0.11731411]
 [0.95090362 0.02560219 0.02349419]
 [0.85329864 0.058023   0.08867836]
 [0.03424615 0.15590765 0.80984619]
 [0.18786482 0.50325061 0.30888456]]


In [20]:
problr = lr.predict_proba(X_test_vec)
print(problr)

[[0.34935997 0.27135077 0.37928926]
 [0.41164558 0.26275479 0.32559963]
 [0.35951776 0.27057521 0.36990703]
 [0.18171894 0.27007762 0.54820344]
 [0.23438263 0.42089554 0.34472183]]


In [22]:
probrf = rf.predict_proba(X_test_vec)
print(probrf)

[[0.32 0.18 0.5 ]
 [0.65 0.1  0.25]
 [0.3  0.21 0.49]
 [0.06 0.1  0.84]
 [0.16 0.39 0.45]]
