In [15]:
import pandas as pd
import numpy as np
import os
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2


In [16]:
# Load the classification dataset; the path is relative to the notebook's
# working directory.
DATASET_CSV = "Datasets/Classification.csv"
df = pd.read_csv(DATASET_CSV)

In [17]:
# Distinct values of the "class" column, in order of first appearance.
classes = pd.unique(df["class"])
classes

array(['sellerDetails', 'misc', 'amount', 'buyerDetails',
       'invoiceDetails'], dtype=object)

In [18]:
# Print how many rows of the full dataset carry each label.
for label in classes:
    row_count = int((df["class"] == label).sum())
    print(label, " : ", row_count)

sellerDetails  :  41
misc  :  545
amount  :  71
buyerDetails  :  47
invoiceDetails  :  38


In [19]:
# One sub-frame per label. The order of the labels below fixes which
# label ends up in d1 .. d5.
d1, d2, d3, d4, d5 = (
    df[df["class"] == label]
    for label in (
        "invoiceDetails",
        "sellerDetails",
        "buyerDetails",
        "amount",
        "misc",
    )
)

In [20]:
# Shuffle every per-class frame with a fixed seed and renumber its rows
# 0..n-1. `reset_index(drop=True)` discards the old index directly, which
# replaces the former `reset_index(inplace=True)` + `drop(columns=["index"])`
# pair and avoids mutating frames in place.
#
# NOTE: re-running this cell shuffles the already-shuffled frames again;
# re-run the split cell first to get back to the same state.
d1, d2, d3, d4, d5 = (
    shuffle(part, random_state=30).reset_index(drop=True)
    for part in (d1, d2, d3, d4, d5)
)

In [21]:
# Balance the classes by keeping the same number of rows from each
# (already shuffled) per-class frame. `.head()` is positional, so the
# count is exactly SAMPLES_PER_CLASS; on frames indexed 0..n-1 this
# selects the same rows as the end-inclusive label slice `.loc[:35, :]`.
SAMPLES_PER_CLASS = 36

d1, d2, d3, d4, d5 = (
    part.head(SAMPLES_PER_CLASS)
    for part in (d1, d2, d3, d4, d5)
)

In [22]:
# Stack the five per-class samples and shuffle the combined frame with a
# fixed seed. Row labels from the individual frames are carried over
# unchanged (no `ignore_index`), so index values are not unique in DF.
DF = shuffle(
    pd.concat((d1, d2, d3, d4, d5)),
    random_state=30,
)

In [23]:
# Preview the combined, shuffled frame (pandas elides the middle rows
# in the rendered output).
DF

Unnamed: 0.1,Unnamed: 0,conf,text,x,y,imageName,class
1,500,94.866927,invoice for tally xcelerator service,0.338824,0.723333,89.jpeg,buyerDetails
24,405,89.631481,customer monthly summary report hexa developme...,0.449375,0.225200,67.jpeg,invoiceDetails
27,733,89.354141,"name karnataka, of supply karnataka descriptio...",0.338824,0.723333,46.jpeg,misc
4,296,11.005211,6250.00),1.342339,1.438266,80.jpeg,amount
22,245,38.458063,bm =. cee are nee i nd,0.348387,0.680445,14.jpeg,misc
...,...,...,...,...,...,...,...
32,442,93.609360,"cgt 390.00 total 7,280.00 settlement details: ...",0.470347,1.489394,60.jpeg,amount
9,466,94.730911,"tally solutions private limited, kaavya towers...",0.449012,0.225200,40.jpeg,sellerDetails
29,196,96.027150,sms mobile internet,0.276724,1.097429,7.jpeg,misc
21,679,86.452645,29aoupb3384a1zw code: karnataka 29aqupb3384a1z...,0.338824,0.723333,70.jpeg,misc


In [24]:
# Print how many rows of the combined frame DF carry each label.
for label in classes:
    row_count = int((DF["class"] == label).sum())
    print(label, " : ", row_count)

sellerDetails  :  36
misc  :  36
amount  :  36
buyerDetails  :  36
invoiceDetails  :  36


In [25]:
# Encode each label as an integer code, numbered by order of first
# appearance in DF.
DF["id"] = pd.factorize(DF["class"])[0]

# One (class, id) row per distinct label.
idDF = DF.loc[:, ["class", "id"]].drop_duplicates()

# Lookup tables in both directions, built from the same (class, id) pairs.
class_to_id = {name: code for name, code in idDF.values}
id_to_class = {code: name for name, code in idDF.values}

DF.head()

Unnamed: 0.1,Unnamed: 0,conf,text,x,y,imageName,class,id
1,500,94.866927,invoice for tally xcelerator service,0.338824,0.723333,89.jpeg,buyerDetails,0
24,405,89.631481,customer monthly summary report hexa developme...,0.449375,0.2252,67.jpeg,invoiceDetails,1
27,733,89.354141,"name karnataka, of supply karnataka descriptio...",0.338824,0.723333,46.jpeg,misc,2
4,296,11.005211,6250.00),1.342339,1.438266,80.jpeg,amount,3
22,245,38.458063,bm =. cee are nee i nd,0.348387,0.680445,14.jpeg,misc,2


In [26]:
# TF-IDF over unigrams and bigrams; terms must occur in at least 5
# documents, and term frequency is scaled sub-linearly.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True,
)

# Dense (n_texts, n_features) matrix plus the integer label per text.
features = tfidf.fit_transform(DF["text"]).toarray()
labels = DF["id"]
print("Each of the %d texts is represented by %d features" % features.shape)


Each of the 180 texts is represented by 138 features


In [28]:
# For every class, score each TF-IDF term with a chi-squared test against
# a one-vs-rest indicator for that class, then print the N highest-scoring
# unigrams and bigrams.
N = 3  # number of top terms printed per class and per n-gram size

# The vocabulary is the same for every class, so fetch it once.
all_feature_names = np.array(tfidf.get_feature_names_out())

# `class_id` rather than `id`: loop variables stay in the notebook's global
# namespace after the loop, and `id` would shadow the builtin for all
# later cells.
for class_name, class_id in sorted(class_to_id.items()):
    # chi2 returns (statistics, p-values); only the statistics are used.
    features_chi2 = chi2(features, labels == class_id)
    # Ascending sort: the highest-scoring terms end up last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("Class----> %s:" %(class_name))
    print("     Most Correlated Unigrams are: %s" %(', '.join(unigrams[-N:])))
    print("     Most Correlated Bigrams are: %s" %(', '.join(bigrams[-N:])))
    print("\n")


Class----> amount:
     Most Correlated Unigrams are: words, amount, 00
     Most Correlated Bigrams are: hsn sac, amount in, in words


Class----> buyerDetails:
     Most Correlated Unigrams are: name, to, buyer
     Most Correlated Bigrams are: to tally, tally solutions, solutions pvt


Class----> invoiceDetails:
     Most Correlated Unigrams are: 2022, date, 2023
     Most Correlated Bigrams are: invoice no, invoice date, 12 2022


Class----> misc:
     Most Correlated Unigrams are: solutions, cgst, pvt
     Most Correlated Bigrams are: solutions pvt, tally solutions, pvt ltd


Class----> sellerDetails:
     Most Correlated Unigrams are: pvt, mail, com
     Most Correlated Bigrams are: pvt ltd, india pvt, 3rd floor


