In [1]:
pip install langchain_community

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
from langchain_community.llms import Ollama



In [2]:
llm = Ollama(model="llama2")
llm.invoke("Can you add an appropriate category next to each of the following expenses. Respond with a list of categories separated by commas. For example, Spotify AB by Adyen - \
Entertainment, Beta Boulders Ams Amsterdam Nld - Sports, etc.: \
Taxi Utrecht, Ministerie van Justitie en Veiligheid, Etos AMSTERDAM NLD, Bistro Bar Amsterdam")

'\nCertainly! Here are the expenses you provided, along with an appropriate category for each one:\n\nTaxi Utrecht - Transportation\nMinisterie van Justitie en Veiligheid - Government\nEtos AMSTERDAM NLD - Food and Beverage\nBistro Bar Amsterdam - Food and Beverage\n\nSo, the categories are:\nTransportation, Government, Food and Beverage, and Food and Beverage.'

Read the transaction data

In [4]:
pip install pandas

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
data = pd.read_csv("Financial_Transactions_2023_2024.csv")
data.head()

Unnamed: 0,Date,Name / Description,Expense / Income,Amount (EUR)
0,2023-01-01,Home Repair,Expense,504.99
1,2023-01-02,Pharmacy Medication,Expense,1356.44
2,2023-01-03,Pizza Delivery,Expense,1621.8
3,2023-01-04,Laptop Repair,Expense,867.14
4,2023-01-05,Dividend Income,Income,1505.51


In [4]:
#Get unique transaction in the Name/Description column
unique_transactions = data["Name / Description"].unique()
len(unique_transactions)

94

Categorize bank transactions with Llama2

In [5]:
# Get index list
def hop(start, stop, step):
    for i in range(start, stop, step):
        yield i
    yield stop

index_list = list(hop(0, len(unique_transactions), 30))
index_list

[0, 30, 60, 90, 94]

In [6]:
def categorize_transactions(transaction_names,llm):
    response = llm.invoke("Can you add an appropriate category to the following expenses. For example: Monthly Apartment Rent - Housing and Utilities, Electricity Bill - Housing and Utilities, Uber Ride - Transportation etc.. Categories should be less than 4 words. " + transaction_names)
    response = response.split('\n')
    print(response)

    #Put in dataframe

    categories_data = pd.DataFrame({'Transaction vs Category' : response})
    categories_data[['Transaction', 'Category']] = categories_data['Transaction vs Category'].str.split(' - ', expand=True)
    
    return categories_data

In [7]:
# Test out the function
categorize_transactions('Restaurant Dining, Gym Membership, Netflix Subscription, Home Repair',
                        llm)

['', 'Of course! Here are the expenses you provided with appropriate categories:', '', '1. Restaurant Dining - Food and Beverage', '2. Gym Membership - Fitness', '3. Netflix Subscription - Entertainment', '4. Home Repair - Home Maintenance']


Unnamed: 0,Transaction vs Category,Transaction,Category
0,,,
1,Of course! Here are the expenses you provided ...,Of course! Here are the expenses you provided ...,
2,,,
3,1. Restaurant Dining - Food and Beverage,1. Restaurant Dining,Food and Beverage
4,2. Gym Membership - Fitness,2. Gym Membership,Fitness
5,3. Netflix Subscription - Entertainment,3. Netflix Subscription,Entertainment
6,4. Home Repair - Home Maintenance,4. Home Repair,Home Maintenance


In [8]:
#Intialise the categories_data_all dataframe
categories_data_all = pd.DataFrame()

#Loop through the index_list
for i in range(0, len(index_list)-1):
    transaction_names = unique_transactions[index_list[i]:index_list[i+1]]
    transaction_names = '.'.join(transaction_names)

    categories_data = categorize_transactions(transaction_names, llm)
    categories_data_all = pd.concat([categories_data_all, categories_data], ignore_index=True)

['Sure, here are the expenses you provided with appropriate categories:', '', '1. Home Repair - Home Improvement', '2. Pharmacy Medication - Healthcare', '3. Pizza Delivery - Food', '4. Laptop Repair - Electronics', '5. Dividend Income - Investment', '6. Rental Income - Real Estate', '7. Yoga Retreat - Travel', '8. Book Purchase - Entertainment', '9. Electronics Store Purchase - Electronics', '10. Bicycle Repair - Transportation', '11. Art Exhibition Ticket - Entertainment', '12. Uber Ride - Transportation', '13. City Tour - Travel', '14. Massage Session - Wellness', '15. Parking Fee - Transportation', '16. Festival Ticket - Entertainment', '17. Electricity Bill - Utilities', '18. Consulting Fees - Business', '19. Vitamins and Supplements - Healthcare', '20. Software License - Technology', '21. Museum Entry - Entertainment', '22. Hair Salon Visit - Beauty', '23. Financial Advisor Consultation - Finance', '24. Amazon Purchase - Shopping', '25. Plumbing Services - Home Improvement', '26.

In [9]:
categories_data_all

Unnamed: 0,Transaction vs Category,Transaction,Category
0,"Sure, here are the expenses you provided with ...","Sure, here are the expenses you provided with ...",
1,,,
2,1. Home Repair - Home Improvement,1. Home Repair,Home Improvement
3,2. Pharmacy Medication - Healthcare,2. Pharmacy Medication,Healthcare
4,3. Pizza Delivery - Food,3. Pizza Delivery,Food
...,...,...,...
103,,,
104,1. Fitness App Subscription - Entertainment,1. Fitness App Subscription,Entertainment
105,2. Ski Pass - Recreation,2. Ski Pass,Recreation
106,3. Skin Care Products - Personal Care,3. Skin Care Products,Personal Care


In [10]:
#get unique categories in categories_data_all
unique_categories = categories_data_all['Category'].unique()
unique_categories

array([None, 'Home Improvement', 'Healthcare', 'Food', 'Electronics',
       'Investment', 'Real Estate', 'Travel', 'Entertainment',
       'Transportation', 'Wellness', 'Utilities', 'Business',
       'Technology', 'Beauty', 'Finance', 'Shopping',
       'Health and Wellness', 'Home and Garden', 'Professional Services',
       'Finances', 'Recreation', 'Personal Expenses', 'Pets',
       'Food and Dining', 'Housing and Utilities', 'Career', 'Household',
       'Philanthropy', 'Clothing', 'Art', 'Grooming', 'Fitness',
       'Beverages', 'Personal Care', 'Food and Beverage'], dtype=object)

In [11]:
#Drop NA Values
categories_data_all = categories_data_all.dropna()
# If category contains "Food|Beverages", then categorise as "Food and Beverages"
categories_data_all.loc[categories_data_all['Category'].str.contains("Food|Beverages"), 'Category'] = "Food and Beverages"
# If category contains "Utilities|Home|House", then categorise as "Housing and Utilities"
categories_data_all.loc[categories_data_all['Category'].str.contains("House|Home|Utilities"), 'Category'] = "Housing and Utilities"
# If category contains "Health|Wellness|Healthcare", then categorise as "Health and Wellness"
categories_data_all.loc[categories_data_all['Category'].str.contains("Health|Wellness|Healthcare"), 'Category'] = "Health and Wellness"
# If category contains "Shopping|Clothing|Beauty|Personal Care|Grooming", then categorise as "Clothing and Grooming"
categories_data_all.loc[categories_data_all['Category'].str.contains("Shopping|Clothing|Beauty|Personal Care|Grooming"), 'Category'] = "Clothing and Grooming"
# If category contains "Sports|Fitness", then categorise as "Sport and Fitness"
categories_data_all.loc[categories_data_all['Category'].str.contains("Sports|Fitness"), 'Category'] = "Sport and Fitness"
# If category contains "Real Estate|Finance|Investment|Finances", then categorise as "Finance and Banking"
categories_data_all.loc[categories_data_all['Category'].str.contains("Real Estate|Finance|Investment|Finances"), 'Category'] = "Finance and Banking"
# If category contains "Technology|Electronics", then categorise as "Electronics"
categories_data_all.loc[categories_data_all['Category'].str.contains("Technology|Electronics"), 'Category'] = "Electronics"

In [12]:
# Remove the numbering from Transaction column
categories_data_all['Transaction'] = categories_data_all['Transaction'].str.replace(r'\d+\.\s+', '', regex=True)
categories_data_all

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categories_data_all['Transaction'] = categories_data_all['Transaction'].str.replace(r'\d+\.\s+', '', regex=True)


Unnamed: 0,Transaction vs Category,Transaction,Category
2,1. Home Repair - Home Improvement,Home Repair,Housing and Utilities
3,2. Pharmacy Medication - Healthcare,Pharmacy Medication,Health and Wellness
4,3. Pizza Delivery - Food,Pizza Delivery,Food and Beverages
5,4. Laptop Repair - Electronics,Laptop Repair,Electronics
6,5. Dividend Income - Investment,Dividend Income,Finance and Banking
...,...,...,...
98,30. Chiropractic Session - Healthcare,Chiropractic Session,Health and Wellness
104,1. Fitness App Subscription - Entertainment,Fitness App Subscription,Entertainment
105,2. Ski Pass - Recreation,Ski Pass,Recreation
106,3. Skin Care Products - Personal Care,Skin Care Products,Clothing and Grooming


In [13]:
# Merge the categories_df_all with the transactions_2022_2023.csv dataframe (df)
data = pd.read_csv("Financial_Transactions_2023_2024.csv")
data.loc[data['Name / Description'].str.contains("Care"), 'Name / Description'] = "Skin Care Products"
data = pd.merge(data, categories_data_all, left_on='Name / Description', right_on='Transaction', how='left')
data

Unnamed: 0,Date,Name / Description,Expense / Income,Amount (EUR),Transaction vs Category,Transaction,Category
0,2023-01-01,Home Repair,Expense,504.99,1. Home Repair - Home Improvement,Home Repair,Housing and Utilities
1,2023-01-02,Pharmacy Medication,Expense,1356.44,2. Pharmacy Medication - Healthcare,Pharmacy Medication,Health and Wellness
2,2023-01-03,Pizza Delivery,Expense,1621.80,3. Pizza Delivery - Food,Pizza Delivery,Food and Beverages
3,2023-01-04,Laptop Repair,Expense,867.14,4. Laptop Repair - Electronics,Laptop Repair,Electronics
4,2023-01-05,Dividend Income,Income,1505.51,5. Dividend Income - Investment,Dividend Income,Finance and Banking
...,...,...,...,...,...,...,...
451,2024-03-27,Water Bill,Expense,1145.83,19. Water Bill - Utilities,Water Bill,Housing and Utilities
452,2024-03-28,Yoga Retreat,Expense,1550.49,7. Yoga Retreat - Travel,Yoga Retreat,Travel
453,2024-03-29,Massage Session,Expense,1800.70,14. Massage Session - Wellness,Massage Session,Health and Wellness
454,2024-03-30,Medical Insurance,Expense,811.59,10. Medical Insurance - Finances,Medical Insurance,Finance and Banking


In [14]:
data.to_csv("Financial_Transactions_2023_2024_categorized.csv", index=False)