In [1]:
import pickle
import pandas as pd

In [2]:
import numpy as np

In [3]:
from word2number import w2n

In [4]:
import inflect

In [5]:
class DataExtractor:
    """Flatten pickled invoices into one row per invoice line item.

    Usage: construct with the two input paths, call ``load()`` to read the
    files, then ``transform()`` to obtain the flattened ``DataFrame``.
    """

    # Column order of the frame returned by transform().
    _COLUMNS = [
        "invoice_id", "created_on", "invoiceitem_id", "invoiceitem_name", "type",
        "unit_price", "total_price", "percentage_in_invoice", "is_expired",
    ]

    def __init__(self,new_invoice_path, expired_invoice_path):
        """Remember the input locations; nothing is read until ``load()``.

        Args:
            new_invoice_path: path to the pickle file holding the invoices.
            expired_invoice_path: path to a text file of comma-separated
                expired invoice ids.
        """
        self.new_invoice_path = new_invoice_path
        self.expired_invoice_path = expired_invoice_path

    def load(self):
        """Read both input files.

        Sets ``self.new_invoice`` (the unpickled object) and
        ``self.expired_invoices`` (list of id strings).
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file -- only use this on pickles that come from a trusted source.
        with open(self.new_invoice_path, "rb") as file:
            self.new_invoice = pickle.load(file)
        with open(self.expired_invoice_path, "r") as file:
            raw_ids = file.read()
        # Strip whitespace/newlines around each id (e.g. "1, 2\n") so that the
        # membership test in transform() is not defeated by formatting, and
        # drop empty tokens produced by an empty file or a trailing comma.
        self.expired_invoices = [
            token.strip() for token in raw_ids.split(",") if token.strip()
        ]

    @staticmethod
    def _parse_quantity(raw_quantity):
        """Convert one raw quantity to an ``int``; return ``None`` if negative.

        Accepts numbers, digit strings ("3", "3.0") and spelled-out numbers
        ("three", handled by ``w2n.word_to_num``).
        """
        text = str(raw_quantity).strip()
        if text.startswith("-"):
            # Negative quantities are treated as invalid (possibly a typo in
            # the scanned text) and the line is skipped by the caller.
            return None
        if text and text[0] in "0123456789":
            try:
                return int(text)
            except ValueError:
                # e.g. "3.0" or a float value; fractional part is truncated.
                return int(float(text))
        return w2n.word_to_num(text)

    def transform(self):
        """Build the flattened line-item table from the loaded invoices.

        Invoices without an ``"items"`` key are ignored. Lines with a negative
        quantity are skipped and do not count towards the invoice total.

        Returns:
            pandas.DataFrame with the columns in ``_COLUMNS``, sorted by
            ``invoice_id`` then ``invoiceitem_id``. The index keeps each row's
            insertion position. ``percentage_in_invoice`` is the line's share
            of its invoice total (0.0 when that total is 0).
        """
        # NOTE(review): the previous version called ``final_df.astype(types)``
        # and discarded the result, so no dtype was ever enforced. That no-op
        # is removed; values keep the types they have in the input. Enforcing
        # int32 on invoice_id would first require cleaning ids -- the rendered
        # output of this notebook shows ids such as "385290O".
        expired_ids = set(self.expired_invoices)
        records = []
        for invoice in self.new_invoice:
            if "items" not in invoice:
                continue

            # Parse every line once so the invoice total is computed a single
            # time per invoice instead of once per line.
            valid_lines = []
            for entry in invoice["items"]:
                quantity = self._parse_quantity(entry["quantity"])
                if quantity is None:
                    continue
                item = entry["item"]
                # quantity is always an int here, so a digit string such as
                # "3" can no longer trigger string repetition.
                valid_lines.append((item, item["unit_price"] * quantity))
            invoice_total = sum(amount for _, amount in valid_lines)

            is_expired = str(invoice["id"]) in expired_ids
            for item, amount in valid_lines:
                total_price = int(amount)
                records.append({
                    "invoice_id": invoice["id"],
                    "created_on": invoice["created_on"],
                    "invoiceitem_id": item["id"],
                    "invoiceitem_name": item["name"],
                    "type": item["type"],
                    "unit_price": item["unit_price"],
                    "total_price": total_price,
                    # Guard against an all-zero invoice instead of raising
                    # ZeroDivisionError.
                    "percentage_in_invoice": (
                        float(total_price / invoice_total) if invoice_total else 0.0
                    ),
                    "is_expired": is_expired,
                })

        # Build the frame once; growing it with pd.concat per row is quadratic.
        final_df = pd.DataFrame(records, columns=self._COLUMNS)
        final_df = final_df.sort_values(by = ["invoice_id","invoiceitem_id"], ascending = [True,True])
        return final_df





In [6]:
# Configure the extractor with the invoice pickle and the expired-id text file
# (both paths are relative to the notebook's working directory).
our_class = DataExtractor("invoices_new.pkl","expired_invoices.txt")

In [7]:
# Read both input files into the extractor; must run before transform().
our_class.load()

In [8]:
# Flatten the loaded invoices into one row per line item, sorted by
# invoice_id and invoiceitem_id.
final_df = our_class.transform()

In [9]:
# Display the resulting table as the cell output.
final_df

Unnamed: 0,invoice_id,created_on,invoiceitem_id,invoiceitem_name,type,unit_price,total_price,percentage_in_invoice,is_expired
291,301695,2019-04-26,103215,ii_103215,3,135,945,0.374851,False
289,301695,2019-04-26,166227,ii_166227,1,118,708,0.280841,False
292,301695,2019-04-26,171394,ii_171394,3,128,128,0.050774,False
290,301695,2019-04-26,195625,ii_195625,3,148,740,0.293534,False
130,304245,2019-03-17,121446,ii_121446,2,158,632,0.128429,False
...,...,...,...,...,...,...,...,...,...
158,385290O,2019-06-29,168488,ii_168488,1,116,232,0.225243,False
87,397723O,2019-05-11,117999,ii_117999,1,196,392,0.212928,False
89,397723O,2019-05-11,121772,ii_121772,0,120,600,0.325910,False
86,397723O,2019-05-11,181613,ii_181613,0,161,483,0.262357,False


In [11]:
# Persist the table to final_data.csv; index=False leaves the row index out of the file.
final_df.to_csv("final_data.csv",index=False)