In [1]:
!pip install transformers datasets tqdm

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K 

In [None]:
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, GPT2Config
from datasets import Dataset
from sklearn.model_selection import train_test_split
import os
import logging
import numpy as np
from tqdm import tqdm
import random

# Set up logging: configure the root logger once for the whole notebook so that
# records at INFO level and above are emitted, each prefixed with a timestamp
# and its severity level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Part 1: Data Generation and Preparation

def generate_sme_expense_data(file_name, num_samples=500, seed=None):
    """Generate synthetic SME expense records and save them to an Excel file.

    Every field is drawn independently and uniformly at random, so
    ``Tax_Category`` has no relationship to the other columns. The data is
    therefore only suitable for exercising the pipeline end to end, not for
    learning a meaningful categorisation.

    Args:
        file_name: Path of the Excel workbook to write (handed to
            ``DataFrame.to_excel``).
        num_samples: Number of rows to generate.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used, so the same seed always yields the same rows regardless of
            the global RNG state. When ``None`` (the default) the module-level
            ``random`` generator is used.

    Returns:
        pandas.DataFrame with the columns ``Business_Type``, ``Description``,
        ``Amount`` (integer amount in NPR) and ``Tax_Category``.
    """
    categories = [
        'Retail', 'Manufacturing', 'IT Services', 'Consultancy',
        'Hospitality', 'Construction', 'Agriculture', 'Transport',
        'Healthcare', 'Education'
    ]

    descriptions = [
        "Monthly rent payment", "Purchase of raw materials",
        "Software license renewal", "Employee salaries",
        "Utility bills", "Advertising costs",
        "Equipment maintenance", "Staff training expenses",
        "Insurance premium", "Office supplies purchase",
        "Travel expenses", "Professional fees",
        "Inventory restocking", "Vehicle fuel costs",
        "Loan interest payment", "Depreciation of assets"
    ]

    tax_categories = [
        'Deductible Expense', 'Depreciable Asset', 'Non-Deductible Expense',
        'Partially Deductible', 'VAT Applicable', 'Capital Expenditure', 'Other'
    ]

    columns = ['Business_Type', 'Description', 'Amount', 'Tax_Category']

    # Fall back to the global generator when no seed is supplied so callers
    # that seed `random` themselves keep getting the same sequence.
    rng = random if seed is None else random.Random(seed)

    # Tuple elements are evaluated left to right, so the per-row draw order is
    # business type, description, amount, tax category.
    rows = [
        (
            rng.choice(categories),
            rng.choice(descriptions),
            rng.randint(1000, 500000),  # Amount in NPR
            rng.choice(tax_categories),
        )
        for _ in range(num_samples)
    ]

    # Passing `columns` keeps the header even when no rows were requested.
    df = pd.DataFrame(rows, columns=columns)
    df.to_excel(file_name, index=False)
    logging.info(f"Generated {num_samples} SME expense records and saved to {file_name}")

    return df

# Generate training and test data.
# Seed the module-level RNG first: generate_sme_expense_data draws from it by
# default, so without a fixed seed every kernel restart would train and
# evaluate on a different dataset.
random.seed(42)
train_df = generate_sme_expense_data('sme_expenses_train.xlsx', num_samples=2000)
test_df = generate_sme_expense_data('sme_expenses_test.xlsx', num_samples=500)

# Part 2: Model Training

model_name = "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
# NOTE: this notebook fine-tunes a causal language model, not a regressor.
# GPT2LMHeadModel predicts over the vocabulary and does not use num_labels;
# the assignment is kept only so the saved config stays as it was.
config.num_labels = 1
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)

# GPT-2 ships without a padding token; reuse EOS so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token
# GPT-2 uses absolute position embeddings, so training batches are padded on
# the right: real tokens then occupy the same positions (0, 1, 2, ...) they
# will have for an unpadded prompt at inference time.
tokenizer.padding_side = "right"
# Record the padding id on the model so generation knows it after reload.
model.config.pad_token_id = tokenizer.pad_token_id

def preprocess_function(examples):
    """Turn a batch of expense records into fixed-length causal-LM features.

    Each record is rendered as one training string (business type, expense
    description, amount and tax category), tokenized with the module-level
    ``tokenizer`` and padded/truncated to 128 tokens.

    Args:
        examples: Batch mapping with the keys ``Business_Type``,
            ``Description``, ``Amount`` and ``Tax_Category``, each holding a
            list of equal length.

    Returns:
        dict of plain Python lists containing everything the tokenizer
        returned plus ``labels``. ``labels`` equals ``input_ids`` except at
        padded positions (attention mask 0), which are set to -100 so the
        language-modeling loss ignores them.
    """
    ignore_index = -100  # label value skipped by the language-modeling loss

    full_text = [
        f"<|startoftext|>Business Type: {b}\nExpense: {e}\nAmount: {a} NPR\nTax Category: {t}<|endoftext|>"
        for b, e, a, t in zip(examples['Business_Type'], examples['Description'],
                              examples['Amount'], examples['Tax_Category'])
    ]

    # Without return_tensors the tokenizer already yields nested Python lists,
    # which is the form the dataset stores.
    encoded = tokenizer(full_text, padding="max_length", truncation=True, max_length=128)
    features = {key: encoded[key] for key in encoded.keys()}

    # The pad token is the EOS token, so padding is identified through the
    # attention mask rather than the token id; the real end-of-text token keeps
    # its label and the model still learns where a record ends.
    features['labels'] = [
        [token if attended else ignore_index for token, attended in zip(ids, mask)]
        for ids, mask in zip(features['input_ids'], features['attention_mask'])
    ]

    return features

# Wrap the generated pandas frames as Hugging Face datasets.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize in batches. All original columns are removed, so each tokenized
# dataset holds only the fields returned by preprocess_function.
tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_test = test_dataset.map(preprocess_function, batched=True, remove_columns=test_dataset.column_names)

# Training configuration.
# NOTE(review): with 2000 training rows, batch size 32 and 3 epochs there are
# only about 189 optimizer steps on a single device, so the 100-step
# evaluation/save interval fires once. load_best_model_at_end then restores
# that step-100 checkpoint; consider a smaller interval. Confirm against the
# actual run.
training_args = TrainingArguments(
    output_dir="./finlytic_categorize_results",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    logging_dir="./finlytic_categorize_logs",
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    # A fixed 500-step warmup is longer than the whole run (~189 steps), so the
    # learning rate would never reach its configured value. Warm up over the
    # first 10% of the steps instead.
    warmup_ratio=0.1,
    learning_rate=5e-5,
    weight_decay=0.01,
    # fp16 mixed precision needs a CUDA device; fall back to full precision
    # elsewhere so the cell still runs on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
)

# Build the trainer. The generated test split is used as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

# Run fine-tuning, logging when it starts and when it finishes.
logging.info("Starting model training...")
trainer.train()
logging.info("Model training completed.")

# Save the model and the tokenizer into the same directory; Part 3 below
# reloads both from this path.
model_save_path = "./finlytic_categorize_model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
logging.info(f"Model saved to {model_save_path}")

# Part 3: Expense Categorization and Advice Generation

def categorize_expense(business_type, description, amount, model, tokenizer, max_new_tokens=20):
    """Ask the fine-tuned language model for the tax category of one expense.

    The prompt has the same layout as the training text up to and including
    ``Tax Category:``; the model's continuation is taken as the category.

    Args:
        business_type: Business type inserted into the prompt.
        description: Expense description inserted into the prompt.
        amount: Expense amount (NPR) inserted into the prompt.
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        max_new_tokens: Upper bound on generated tokens. Only the first line
            of the continuation is used, so a small budget is enough and keeps
            generation fast regardless of prompt length.

    Returns:
        str: The first line of the generated continuation with surrounding
        whitespace removed, or ``""`` if nothing was generated. Sampling is
        enabled, so repeated calls may return different categories.
    """
    input_text = f"<|startoftext|>Business Type: {business_type}\nExpense: {description}\nAmount: {amount} NPR\nTax Category:"

    # Calling the tokenizer (rather than encode) also yields the attention
    # mask. Both tensors are moved to wherever the model lives.
    encoded = tokenizer(input_text, return_tensors="pt")
    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)

    # Tell generate which id is padding; fall back to EOS when the tokenizer
    # has no dedicated pad token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            do_sample=True,
            pad_token_id=pad_token_id,
        )

    # Decode only the newly generated tokens so text inside the prompt can
    # never be mistaken for the answer.
    prompt_length = input_ids.shape[-1]
    continuation = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
    if not continuation:
        return ""

    # If the model runs on into another record, keep just the category line.
    return continuation.splitlines()[0].strip()

def generate_tax_advice(business_type, description, amount, tax_category):
    """Compose a short advice message for an expense and its tax category.

    Args:
        business_type: Business type mentioned in the message header.
        description: Expense description mentioned in the message header.
        amount: Expense amount (NPR) mentioned in the message header.
        tax_category: Category label; compared for exact equality against the
            known labels. Any other value yields a generic recommendation.

    Returns:
        str: A header line describing the expense, a blank line, then the
        guidance for the category.
    """
    header = f"For the expense '{description}' of NPR {amount} in your {business_type} business:\n\n"

    # Known labels and their guidance, checked in this order.
    guidance_by_category = (
        ("Deductible Expense",
         "This expense is fully deductible. Keep proper documentation for your tax records."),
        ("Depreciable Asset",
         "This is a depreciable asset. You can claim depreciation over its useful life."),
        ("Non-Deductible Expense",
         "This expense is not deductible for tax purposes. Consider if it's necessary for your business."),
        ("Partially Deductible",
         "This expense is partially deductible. Consult with a tax professional to determine the deductible portion."),
        ("VAT Applicable",
         "VAT is applicable on this expense. Ensure you have a valid VAT invoice to claim input tax credit."),
        ("Capital Expenditure",
         "This is a capital expenditure. It should be capitalized and depreciated over time rather than expensed immediately."),
    )
    fallback = "The tax treatment of this expense is not straightforward. Consult with a tax professional for specific advice."

    # First matching label wins; unmatched categories get the fallback text.
    guidance = next(
        (text for label, text in guidance_by_category if tax_category == label),
        fallback,
    )
    return header + guidance

# Load the fine-tuned model and tokenizer from the directory written after
# training. This rebinds the module-level names `model` and `tokenizer`, which
# get_expense_advice below reads as globals.
logging.info("Loading fine-tuned model and tokenizer...")
model_path = "./finlytic_categorize_model"
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
logging.info("Model and tokenizer loaded successfully.")

def get_expense_advice(business_type, description, amount):
    """Categorize one expense and return the matching advice text.

    Uses the module-level ``model`` and ``tokenizer`` for categorization.

    Args:
        business_type: Business type of the expense.
        description: Expense description.
        amount: Expense amount in NPR.

    Returns:
        str: The message built by ``generate_tax_advice`` for the category
        predicted by ``categorize_expense``.
    """
    predicted_category = categorize_expense(business_type, description, amount, model, tokenizer)
    return generate_tax_advice(business_type, description, amount, predicted_category)

# Function to process an input Excel file
def process_expense_file(input_file, output_file):
    """Read expenses from an Excel file and write them back with tax advice.

    Args:
        input_file: Path of the Excel workbook to read. It must provide the
            columns ``Business_Type``, ``Description`` and ``Amount``.
        output_file: Path of the Excel workbook to write. It receives the
            three input columns plus ``Tax_Advice``.
    """
    logging.info(f"Processing input file: {input_file}")
    expenses = pd.read_excel(input_file)

    def advise(expense):
        # Build one output record: the input fields plus the generated advice.
        advice_text = get_expense_advice(expense['Business_Type'], expense['Description'], expense['Amount'])
        return {
            'Business_Type': expense['Business_Type'],
            'Description': expense['Description'],
            'Amount': expense['Amount'],
            'Tax_Advice': advice_text
        }

    advised = [advise(expense) for _, expense in expenses.iterrows()]

    pd.DataFrame(advised).to_excel(output_file, index=False)
    logging.info(f"Results saved to {output_file}")

# Main execution
if __name__ == "__main__":
    # Replace the first value with your input file name.
    input_file, output_file = "input_expenses.xlsx", "expense_advice_output.xlsx"

    # Process the workbook only when it exists; otherwise report the problem.
    if os.path.exists(input_file):
        process_expense_file(input_file, output_file)
        logging.info("Expense categorization and advice generation completed.")
    else:
        logging.error(f"Input file {input_file} not found.")