In [29]:
import pandas as pd
import numpy as np
import os
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2


In [35]:
# Load the labelled text-snippet table; the path is relative to the
# directory the notebook kernel was started in.
DF = pd.read_csv(
    filepath_or_buffer="Datasets/Classification2.csv",
)

In [36]:
# Distinct target labels, in order of first appearance in the "class" column.
classes = pd.unique(DF["class"])
classes

array(['sellerDetails', 'buyerDetails', 'amount', 'invoiceDetails'],
      dtype=object)

In [37]:
# Report how many rows carry each label (a quick class-balance check).
for class_name in classes:
    # Summing the boolean mask counts the rows where it is True.
    n_rows = (DF["class"] == class_name).sum()
    print(class_name, " : ", n_rows)

sellerDetails  :  45
buyerDetails  :  42
amount  :  68
invoiceDetails  :  23


In [38]:
# Show the loaded frame; pandas abbreviates the middle rows in the rendered
# output, so only the first and last few records are visible.
DF

Unnamed: 0.1,Unnamed: 0,conf,text,x,y,imageName,class
0,0,75.381990,"ie /, FIESTA VACATIONS PVT LTD Wf #18 Old No.1...",0.403564,0.155714,/home/aman/Documents/Tally/Git-Document-AI/Doc...,sellerDetails
1,3,79.497434,Buyer (Bill to) i MRS.DHANALAKSHMI,0.198812,0.364571,/home/aman/Documents/Tally/Git-Document-AI/Doc...,buyerDetails
2,4,70.364592,to) MRS.DHANALAKSHMI,0.330297,0.393714,/home/aman/Documents/Tally/Git-Document-AI/Doc...,buyerDetails
3,7,89.473671,‘Amount Chargeable (in words) ‘Indian Ru...,0.408317,1.448286,/home/aman/Documents/Tally/Git-Document-AI/Doc...,amount
4,8,75.805049,"Central Tax Amount Rate 16,235.83] 2.50%|",1.318416,1.441143,/home/aman/Documents/Tally/Git-Document-AI/Doc...,amount
...,...,...,...,...,...,...,...
173,1760,80.891939,SHIP TO NABENDU DAS 4919845222582,0.926667,0.526015,/home/aman/Documents/Tally/Git-Document-AI/Doc...,buyerDetails
174,1774,78.808909,Amount 600.00,1.203583,1.206364,/home/aman/Documents/Tally/Git-Document-AI/Doc...,amount
175,1835,94.625963,"LEVEL 1 LEVEL Amount 699.00 1,999.00",1.168841,1.206364,/home/aman/Documents/Tally/Git-Document-AI/Doc...,amount
176,1860,89.845624,Dr. B.Borooah Email: GST NO: 18ABLFS5299R1ZU G...,0.399608,0.170846,/home/aman/Documents/Tally/Git-Document-AI/Doc...,sellerDetails


In [39]:
# Re-print the per-class row counts right before the labels are encoded.
for class_name in classes:
    rows_for_class = DF.loc[DF["class"] == class_name]
    print(class_name, " : ", rows_for_class.shape[0])

sellerDetails  :  45
buyerDetails  :  42
amount  :  68
invoiceDetails  :  23


In [40]:
# Encode every class label as an integer code, numbered in order of first
# appearance, and store it next to the label.
DF["id"] = pd.factorize(DF["class"])[0]

# One row per distinct (class, id) pair -- the source for both lookup tables.
idDF = DF.loc[:, ["class", "id"]].drop_duplicates()

# Each row of the 2-column array is consumed by dict() as a (key, value) pair.
class_to_id = dict(idDF.to_numpy())
# The reverse lookup is the same pairs with key and value swapped.
id_to_class = {code: name for name, code in class_to_id.items()}

DF.head()

Unnamed: 0.1,Unnamed: 0,conf,text,x,y,imageName,class,id
0,0,75.38199,"ie /, FIESTA VACATIONS PVT LTD Wf #18 Old No.1...",0.403564,0.155714,/home/aman/Documents/Tally/Git-Document-AI/Doc...,sellerDetails,0
1,3,79.497434,Buyer (Bill to) i MRS.DHANALAKSHMI,0.198812,0.364571,/home/aman/Documents/Tally/Git-Document-AI/Doc...,buyerDetails,1
2,4,70.364592,to) MRS.DHANALAKSHMI,0.330297,0.393714,/home/aman/Documents/Tally/Git-Document-AI/Doc...,buyerDetails,1
3,7,89.473671,‘Amount Chargeable (in words) ‘Indian Ru...,0.408317,1.448286,/home/aman/Documents/Tally/Git-Document-AI/Doc...,amount,2
4,8,75.805049,"Central Tax Amount Rate 16,235.83] 2.50%|",1.318416,1.441143,/home/aman/Documents/Tally/Git-Document-AI/Doc...,amount,2


In [41]:
# TF-IDF over unigrams and bigrams; terms seen in fewer than 5 documents are
# dropped and term frequencies are log-scaled (sublinear_tf).
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True,
)

# Dense (n_texts, n_features) matrix plus the matching integer class codes.
features = tfidf.fit_transform(DF["text"]).toarray()
labels = DF["id"]

# features.shape is already a 2-tuple, so it fills both %d slots directly.
print("Each of the %d texts is represented by %d features" % features.shape)


Each of the 178 texts is represented by 96 features


In [42]:
# For every class, list the N unigrams and N bigrams whose TF-IDF values have
# the highest chi-squared score against a one-vs-rest indicator of that class.
N = 3

# The fitted vocabulary is the same for every class, so fetch it once instead
# of rebuilding the array on each loop iteration.
vocabulary = np.array(tfidf.get_feature_names_out())

# The loop variable is `class_id`, not `id`: cell variables are notebook
# globals, so naming it `id` would rebind the builtin id() for every later cell.
for c, class_id in sorted(class_to_id.items()):
    # chi2 returns (scores, p-values); only the scores are used for ranking.
    features_chi2 = chi2(features, labels == class_id)
    # argsort is ascending, so the strongest features end up last.
    indices = np.argsort(features_chi2[0])
    feature_names = vocabulary[indices]
    # Separate the ranked n-grams by their token count.
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("Class----> %s:" % c)
    print("     Most Correlated Unigrams are: %s" % ', '.join(unigrams[-N:]))
    print("     Most Correlated Bigrams are: %s" % ', '.join(bigrams[-N:]))
    print("\n")


Class----> amount:
     Most Correlated Unigrams are: total, amount, 00
     Most Correlated Bigrams are: hsn sac, amount in, in words


Class----> buyerDetails:
     Most Correlated Unigrams are: solutions, buyer, to
     Most Correlated Bigrams are: tally solutions, to tally, bill to


Class----> invoiceDetails:
     Most Correlated Unigrams are: 22, 2023, date
     Most Correlated Bigrams are: pvt ltd, amount in, in words


Class----> sellerDetails:
     Most Correlated Unigrams are: limited, company, india
     Most Correlated Bigrams are: amount in, in words, private limited


