In [26]:
# Cell 1: Import libraries
from dotenv import load_dotenv
import pandas as pd
from langchain_openai import ChatOpenAI

In [27]:
# Cell 2: Read the transactions CSV and take a first look at it
df = pd.read_csv(filepath_or_buffer='./data/transactions_2023_2024.csv')

# Preview the first five rows, then report how many rows were loaded
print(df.head(5))
print(f"Total transactions: {len(df)}")

         Date    Name / Description   Amount Expense/Income
0  2025-01-05                Salary  75000.0         Income
1  2025-01-07            House Rent  25000.0        Expense
2  2025-01-09                Swiggy    920.0        Expense
3  2025-01-12  Electricity Provider   3300.0        Expense
4  2025-01-15      Freelance Client  18000.0         Income
Total transactions: 34


In [28]:
# Cell 3: Initialize AI model
# Pull settings from a local .env file into the process environment
# (presumably this is where the OpenAI API key lives -- confirm).
load_dotenv()

# temperature=0 keeps the categorization as repeatable as the API allows,
# so re-running the notebook does not reshuffle categories between runs.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

In [29]:
# Cell 4: Get unique merchant names

# Drop missing descriptions before de-duplicating: a NaN entry would otherwise
# reach the ", ".join(...) in categorize_batch and raise a TypeError.
unique_transactions = df["Name / Description"].dropna().unique()
print(f"Unique Merchant Names: {len(unique_transactions)}")
print(unique_transactions[:10]) # Display first 10 unique merchant names


Unique Merchant Names: 17
['Salary' 'House Rent' 'Swiggy' 'Electricity Provider' 'Freelance Client'
 'Petrol Station' 'Amazon' 'Mobile Recharge' 'Stock Dividend' 'Gym']


In [30]:
# Cell 5: Create categorization function
def _parse_categorization_response(response_text):
    """Turn 'Transaction Name - Category' lines into a list of row dicts."""
    rows = []

    for raw_line in response_text.splitlines():
        line = raw_line.strip()
        if ' - ' not in line:
            continue

        # Strip list numbering like "1. " -- but only when everything before
        # the first ". " is digits, so names such as "7-Eleven" or
        # "24 Seven Co. Ltd" are left intact instead of crashing or being cut.
        prefix, separator, remainder = line.partition('. ')
        if separator and prefix.isdigit():
            line = remainder

        # Remove a leading bullet dash if present
        if line.startswith('- '):
            line = line[2:]

        # Split on the LAST " - " so a name that itself contains " - "
        # is kept whole rather than silently dropped.
        name, separator, category = line.rpartition(' - ')
        name = name.strip()
        category = category.strip()
        if separator and name and category:
            rows.append({'Transaction': name, 'Category': category})

    return rows


def categorize_batch(transaction_list):
    """Send a batch of transaction names to the AI model for categorization.

    Parameters
    ----------
    transaction_list : iterable
        Transaction / merchant names. Missing values (None / NaN) are skipped
        and the remaining entries are converted to ``str``.

    Returns
    -------
    pandas.DataFrame
        One row per parsed response line, with columns ``Transaction`` and
        ``Category``. The columns are always present, even when nothing could
        be parsed, so the result can be concatenated and merged safely.

    Notes
    -----
    Uses the notebook-level ``llm`` object. No request is made when the batch
    contains no usable names.
    """
    result_columns = ['Transaction', 'Category']

    # Keep only usable names; a NaN from pandas would break str.join
    names = [str(name) for name in transaction_list if pd.notna(name)]
    if not names:
        return pd.DataFrame(columns=result_columns)

    # Join transactions into one string
    transactions_text = ", ".join(names)

    # Create prompt
    prompt = f"""Categorize these expenses. Use categories like:
    - Groceries
    - Transportation
    - Entertainment
    - Dining
    - Shopping
    - Healthcare
    - Utilities
    - Rent
    - Travel
    
    Format response as: Transaction Name - Category
    
    Transactions: {transactions_text}
    """

    # Get AI response and extract its text content
    response = llm.invoke(prompt)
    rows = _parse_categorization_response(response.content)

    return pd.DataFrame(rows, columns=result_columns)

In [None]:
# Cell 6: Categorize every unique merchant name, one batch at a time

# Process 10 transactions at a time
batch_size = 10

# Collect the per-batch frames and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows on
# every iteration and starts from an empty frame, which newer pandas warns about.
batch_frames = []
categorized_count = 0

for i in range(0, len(unique_transactions), batch_size):
    batch = unique_transactions[i:i+batch_size]
    print(f"Processing batch {i//batch_size + 1}...")

    categories_df = categorize_batch(batch)
    batch_frames.append(categories_df)

    # Show progress
    categorized_count += len(categories_df)
    print(f"Categorized {categorized_count} so far")

if batch_frames:
    all_categories = pd.concat(batch_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep the expected columns instead
    all_categories = pd.DataFrame(columns=['Transaction', 'Category'])

print("Done!")
print(all_categories.head())

Processing batch 1...
Categorized 10 so far
Processing batch 2...
Categorized 10 so far
Processing batch 2...
Categorized 17 so far
Done!
            Transaction   Category
0                Salary     Income
1            House Rent       Rent
2                Swiggy     Dining
3  Electricity Provider  Utilities
4      Freelance Client     Income
Categorized 17 so far
Done!
            Transaction   Category
0                Salary     Income
1            House Rent       Rent
2                Swiggy     Dining
3  Electricity Provider  Utilities
4      Freelance Client     Income


In [32]:
# Show the full merchant -> category table to review the model's assignments
print(all_categories)

             Transaction        Category
0                 Salary          Income
1             House Rent            Rent
2                 Swiggy          Dining
3   Electricity Provider       Utilities
4       Freelance Client          Income
5         Petrol Station  Transportation
6                 Amazon        Shopping
7        Mobile Recharge       Utilities
8         Stock Dividend          Income
9                    Gym      Healthcare
10     Internet Provider       Utilities
11                  Uber  Transportation
12            Big Bazaar       Groceries
13            Restaurant          Dining
14                 DMart       Groceries
15           Mutual Fund  Not applicable
16                Clinic      Healthcare


In [33]:
# Cell 7: Clean and merge with original data

# Remove incomplete rows and keep a single category per merchant name.
# If the model returned the same name twice, a left merge would otherwise
# duplicate every matching transaction row and inflate the data.
all_categories = (
    all_categories
    .dropna()
    .drop_duplicates(subset='Transaction', keep='first')
)

# Merge categories back to original dataframe; validate guards the
# "one category row per merchant" assumption so the row count cannot grow.
df_merged = df.merge(
    all_categories,
    left_on='Name / Description',
    right_on='Transaction',
    how='left',
    validate='many_to_one'
)

# Drop duplicate 'Transaction' column
df_merged = df_merged.drop(columns=['Transaction'])

# Surface transactions the model's answer did not cover instead of leaving
# them as silent NaN categories in the saved file.
uncategorized = df_merged['Category'].isna().sum()
if uncategorized:
    print(f"Warning: {uncategorized} transactions have no category")

# save the results
df_merged.to_csv('./data/combined_transactions_categorized.csv', index=False)
print("Saved categorized transactions to 'combined_transactions_categorized.csv'")
print(df_merged.head())



Saved categorized transactions to 'combined_transactions_categorized.csv
         Date    Name / Description   Amount Expense/Income   Category
0  2025-01-05                Salary  75000.0         Income     Income
1  2025-01-07            House Rent  25000.0        Expense       Rent
2  2025-01-09                Swiggy    920.0        Expense     Dining
3  2025-01-12  Electricity Provider   3300.0        Expense  Utilities
4  2025-01-15      Freelance Client  18000.0         Income     Income
