At first I just want to read the files and understand what I'm working with.

In [1]:
import pandas as pd
import pickle
from datetime import datetime

In [32]:
with open('expired_invoices.txt', 'r') as f:
    expired_ids = f.read().split(', ')
    expired_ids = set(map(int, expired_ids))
    
print("Expired Invoice IDs:", expired_ids)

Expired Invoice IDs: {383235, 371205, 352391, 385290, 343695, 381457, 368913, 389528, 397723, 366751, 323231, 325156, 381476, 379687, 319405, 330931, 326452, 322229, 356532, 349879, 367288, 313012, 352442, 315960, 394428, 326649, 331193, 379961, 383681, 325063, 356552, 340299, 305869, 392657, 343254, 377307, 348894, 369378, 307175, 377960, 391273, 338547, 337140, 347510, 351096, 340601, 378746, 379387, 331902, 363263}


In [33]:
with open('invoices_new.pkl', 'rb') as f:
    invoices = pickle.load(f)

print("Expired Invoice IDs:", expired_ids)

Expired Invoice IDs: {383235, 371205, 352391, 385290, 343695, 381457, 368913, 389528, 397723, 366751, 323231, 325156, 381476, 379687, 319405, 330931, 326452, 322229, 356532, 349879, 367288, 313012, 352442, 315960, 394428, 326649, 331193, 379961, 383681, 325063, 356552, 340299, 305869, 392657, 343254, 377307, 348894, 369378, 307175, 377960, 391273, 338547, 337140, 347510, 351096, 340601, 378746, 379387, 331902, 363263}


In [51]:
class InvoiceProcessor:
    def __init__(self, invoices_file, expired_invoices_file):
        self.invoices_file = invoices_file
        self.expired_invoices_file = expired_invoices_file
        self.invoices = None
        self.expired_ids = set()

    def load_data(self):
        # Load expired invoice IDs
        with open(self.expired_invoices_file, 'r') as f:
            expired_ids = f.read().split(', ')
            self.expired_ids = set(map(int, expired_ids))

        # Load invoice data from pickle file
        with open(self.invoices_file, 'rb') as f:
            self.invoices = pickle.load(f)

    def safe_int(self, value, default=0):
        try:
            return int(value)
        except (ValueError, TypeError):
            return default

    def process_data(self):
        records = []
        type_conversion = {0: 'Material', 1: 'Equipment', 2: 'Service', 3: 'Other'}
        
        for invoice in self.invoices:
            invoice_id = self.safe_int(invoice.get('id'))
            created_on = pd.to_datetime(invoice.get('created_on'), errors='coerce')
            invoice_items = invoice.get('items', [])
            invoice_total = sum(self.safe_int(item['item'].get('unit_price')) * self.safe_int(item.get('quantity')) for item in invoice_items)
            is_expired = invoice_id in self.expired_ids

            for item in invoice_items:
                invoiceitem_id = self.safe_int(item['item'].get('id'))
                invoiceitem_name = item['item'].get('name')
                item_type = type_conversion.get(self.safe_int(item['item'].get('type')), 'Unknown')
                unit_price = self.safe_int(item['item'].get('unit_price'))
                quantity = self.safe_int(item.get('quantity'))
                total_price = unit_price * quantity
                percentage_in_invoice = total_price / invoice_total if invoice_total else 0

                records.append({
                    'invoice_id': invoice_id,
                    'created_on': created_on,
                    'invoiceitem_id': invoiceitem_id,
                    'invoiceitem_name': invoiceitem_name,
                    'type': item_type,
                    'unit_price': unit_price,
                    'total_price': total_price,
                    'percentage_in_invoice': percentage_in_invoice,
                    'is_expired': is_expired
                })
                
        return pd.DataFrame(records)

    def transform_and_sort_data(self, df: pd.DataFrame):
        sorted_df = df.sort_values(by=['invoice_id', 'invoiceitem_id'], ascending=True)
        return sorted_df



In [52]:
# Usage
invoice_processor = InvoiceProcessor('invoices_new.pkl', 'expired_invoices.txt')
invoice_processor.load_data()
processed_df = invoice_processor.process_data()
sorted_df = invoice_processor.transform_and_sort_data(processed_df)
sorted_df

Unnamed: 0,invoice_id,created_on,invoiceitem_id,invoiceitem_name,type,unit_price,total_price,percentage_in_invoice,is_expired
256,0,2019-05-24,100942,ii_100942,Material,117,468,0.147355,False
389,0,2019-09-22,104400,ii_104400,Material,188,1504,0.363285,False
477,0,2019-08-24,105522,ii_105522,Equipment,154,1232,0.174060,False
390,0,2019-09-22,106905,ii_106905,Equipment,108,540,0.130435,False
106,0,2019-06-25,109190,ii_109190,Service,189,1134,0.210273,False
...,...,...,...,...,...,...,...,...,...
392,395841,2019-01-13,127224,ii_127224,Other,167,1336,0.220099,False
399,395841,2019-01-13,147134,ii_147134,Material,170,170,0.028007,False
395,395841,2019-01-13,151324,ii_151324,Other,119,952,0.156837,False
398,395841,2019-01-13,176476,ii_176476,Equipment,195,390,0.064250,False


In [53]:
sorted_df.to_csv('processed_invoices.csv', index=False)