# Data Import

### Saving the last updated date

In [1]:
import os

try:
    with open('last_updated.txt', 'r') as f:
        lines = f.readlines()
        last_updated_date = lines[0].strip()

        if len(lines) > 1:
            last_updated_balance = float(lines[1].strip())
        else:
            last_updated_balance = float(input("Balance not found. Please enter the last known balance: "))        
except FileNotFoundError:
    print("File not found. Defaulting to a very old date and asking for balance.")
    last_updated_date = '2025-04-15'
    last_updated_balance = float(input("Please enter the last known balance: "))
except ValueError:
    print("Balance format invalid. Asking for balance.")
    last_updated_balance = float(input("Please enter the last known balance: "))

print(f"Last Updated Date: {last_updated_date}")
print(f"Last Updated Balance: {last_updated_balance}")


File not found. Defaulting to a very old date and asking for balance.
Last Updated Date: 2025-04-15
Last Updated Balance: 10000.0


## API

### Calling Splitwise API to extract groups, friends and basic details of expenses. Also converting the JSON file to dataframe

In [2]:
import requests

API_KEY = "Enter your API Key"
response = requests.get(
    "https://secure.splitwise.com/api/v3.0/get_expenses",
    headers={"Authorization": f"Bearer {API_KEY}"}
)

if response.status_code == 200:
    print("Success! Here are your expenses:")
    print(response.json())
else:
    print("Error:", response.status_code)
    print(response.text)

Success! Here are your expenses:
{'expenses': [{'id': 3833549448, 'group_id': 79916117, 'expense_bundle_id': None, 'description': 'Nawabs', 'repeats': False, 'repeat_interval': None, 'email_reminder': False, 'email_reminder_in_advance': -1, 'next_repeat': None, 'details': None, 'comments_count': 0, 'payment': False, 'creation_method': 'equal', 'transaction_method': 'offline', 'transaction_confirmed': False, 'transaction_id': None, 'transaction_status': None, 'cost': '56.0', 'currency_code': 'CAD', 'repayments': [{'from': 35378642, 'to': 76291542, 'amount': '28.0'}], 'date': '2025-05-30T00:26:32Z', 'created_at': '2025-05-30T00:26:52Z', 'created_by': {'id': 76291542, 'first_name': 'Aswin', 'last_name': 'Narayanan', 'picture': {'medium': 'https://s3.amazonaws.com/splitwise/uploads/user/default_avatars/avatar-orange15-100px.png'}, 'custom_picture': False}, 'updated_at': '2025-05-30T00:26:52Z', 'updated_by': None, 'deleted_at': None, 'deleted_by': None, 'category': {'id': 18, 'name': 'Gener

In [3]:
import pandas as pd
import requests
from datetime import datetime
from collections import defaultdict

# 1. Set up API connection
API_KEY = "Bearer Enter your API Key"
BASE_URL = "https://secure.splitwise.com/api/v3.0"

def get_groups():
    response = requests.get(
        f"{BASE_URL}/get_groups",
        headers={"Authorization": API_KEY}
    )
    return {g['id']: g['name'] for g in response.json().get('groups', [])}

def get_all_users():
    """Get all unique users from friends, current user, and expense repayments"""
    # Get friends
    friends_response = requests.get(
        f"{BASE_URL}/get_friends",
        headers={"Authorization": API_KEY}
    )
    friends = friends_response.json().get('friends', [])
    
    # Get current user
    current_user_response = requests.get(
        f"{BASE_URL}/get_current_user",
        headers={"Authorization": API_KEY}
    )
    current_user = current_user_response.json().get('user', {})
    
    # Get all expenses to find users in repayments
    expenses_response = requests.get(
        f"{BASE_URL}/get_expenses",
        headers={"Authorization": API_KEY},
        params={"limit": "1000"}
    )
    expenses = expenses_response.json().get('expenses', [])
    
    # Collect all unique users  
    users = defaultdict(dict)
    
    # Add friends
    for user in friends:
        user_id = user.get('id')
        users[user_id] = {
            'name': user.get('name') or f"{user.get('first_name', '')} {user.get('last_name', '')}".strip(),
            'active': True
        }
    
    # Add current user
    if current_user:
        user_id = current_user.get('id')
        users[user_id] = {
            'name': current_user.get('name') or f"{current_user.get('first_name', '')} {current_user.get('last_name', '')}".strip(),
            'active': True
        }
    
    # Add users from repayments (mark as inactive if not already found)
    for expense in expenses:
        for repayment in expense.get('repayments', []):
            for user_type in ['to_user', 'from_user']:
                user_id = repayment.get(user_type)
                if user_id and user_id not in users:
                    users[user_id] = {
                        'name': f"User {user_id} (removed)",
                        'active': False
                    }
    
    return users

def get_expenses_dataframe():
    # Get all users
    all_users = get_all_users()
    groups = get_groups()
    
    # Get expenses
    response = requests.get(
        f"{BASE_URL}/get_expenses",
        headers={"Authorization": API_KEY},
        params={"limit": "2000",
                "updated_after": last_updated_date}
    )
    expenses = response.json().get('expenses', [])
    
    # Create base columns
    base_columns = ['Date', 'Group', 'Description', 'Category', 'Cost', 'Currency']
    
    # Create user columns (active users first, then inactive)
    active_users = [u['name'] for u in all_users.values() if u['active']]
    inactive_users = [u['name'] for u in all_users.values() if not u['active']]
    user_columns = active_users + inactive_users
    
    # Process each expense
    rows = []
    for expense in expenses:
        # Basic info
        row = {
            'Date': expense.get('date'),
            'Group': groups.get(expense.get('group_id'), 'No Group'),
            'Description': expense.get('description'),
            'Category': expense.get('category', {}).get('name'),
            'Cost': expense.get('cost'),
            'Currency': expense.get('currency_code')
        }
        
        # Initialize all user amounts to 0
        row.update({name: 0.0 for name in user_columns})
        
        for user_data in expense.get('users', []):
            user_info = user_data.get('user', {})
            full_name = f"{user_info.get('first_name', '')} {user_info.get('last_name', '')}".strip()
            net_balance = float(user_data.get('net_balance', 0))
        
            # Only update if this user exists in our columns
            if full_name in user_columns:
                row[full_name] = net_balance
        
        rows.append(row)
    
    # Create DataFrame
    split_df = pd.DataFrame(rows, columns=base_columns + user_columns)
    
    # Format date
    if not split_df.empty and 'Date' in split_df:
        split_df['Date'] = pd.to_datetime(split_df['Date']).dt.date
    
    # Format numeric columns
    numeric_cols = ['Cost'] + user_columns
    split_df[numeric_cols] = split_df[numeric_cols].apply(pd.to_numeric, errors='coerce').fillna(0.0).round(2)
    
    return split_df

# Run and save
try:
    split_df = get_expenses_dataframe()
    print(split_df.head())
except Exception as e:
    print(f"Error occurred: {e}")

         Date             Group Description            Category   Cost  \
0  2025-05-30   171 Anne Street      Nawabs             General  56.00   
1  2025-05-30   171 Anne Street    Wallmart  Household supplies  57.00   
2  2025-05-24  358 Madelaine Dr     Payment             General  44.57   
3  2025-05-23  358 Madelaine Dr     Payment             General  44.75   
4  2025-05-20  358 Madelaine Dr     Payment             General  37.66   

  Currency  Aakash Raj  Preethi None  Aditya None  Rushda Najeeb  Beuju A  \
0      CAD         0.0           0.0          0.0           0.00      0.0   
1      CAD         0.0           0.0          0.0           0.00      0.0   
2      CAD         0.0           0.0          0.0           0.00      0.0   
3      CAD         0.0           0.0          0.0          44.75      0.0   
4      CAD         0.0           0.0          0.0           0.00      0.0   

   Deepak None  Aswin Narayanan  sai meghana  Praveen Kumar  Divyaa Saravanan  \
0         0

### Extracting the payment information into readable format. Also calculating the amount contributed by the payer in the transaction.

#### For Example: In an equal expense of CAD 30 which involves Aakash, Sanjay and Aditya, for which Aditya paid, Aditya contributed CAD 10.

In [4]:
split_df["Payment_or_not"] = split_df['Description']

In [5]:
name_columns = split_df.columns.difference(['Date', 'Group', 'Description', 'Category', 'Cost', 'Currency', 'Payment_or_not'])

def update_description(row):
    if row['Description'] == 'Payment':
        payee = row[name_columns][row[name_columns] < 0]
        payer = row[name_columns][row[name_columns] > 0]
        if len(payer) > 0 and len(payee) > 0:
            payee_name = payee.index[0]
            payer_name = payer.index[0]
            return f"{payer_name} paid {payee_name}"
    return row['Description']

split_df['Description'] = split_df.apply(update_description, axis=1)

def adjust_contribution(row):
    if row['Payment_or_not'] == 'Payment':
        payer = row[name_columns][row[name_columns] > 0]
        payee = row[name_columns][row[name_columns] < 0]
        if len(payer) > 0 and len(payee) > 0:
            payer_name = payer.index[0]
            row[payer_name] = -abs(float(row[payer_name]))
            payee_name = payee.index[0]
            row[payee_name] = -(float(row[payee_name]))
        return row

    else:
        payer = row[name_columns][row[name_columns] > 0]
        payee = row[name_columns][row[name_columns] < 0]
        if len(payer) > 0 and len(payee) > 0 and payer.iloc[0] == row['Cost']:
            pass
        elif len(payer) == 1:
            payer_name = payer.index[0]
            total_owed_by_others = -payee.sum()
            actual_paid = row['Cost']
            row[payer_name] = -(actual_paid - total_owed_by_others)

        return row

split_df = split_df.apply(adjust_contribution, axis=1)

In [6]:
#split_df.to_csv("trial.csv", index=False)

## Bank details using Excel

In [7]:
bank_df = pd.read_excel(r"C:\Users\sanja\Documents\Projects\API\Simplii.xlsx")
bank_df

Unnamed: 0,Date,TRANSACTIONS,Funds out,Funds in,Running Balance
0,2025-05-26,POS MERCHANDISE ESSO CIRCLE K,14.97,Not applicable,15781.10
1,2025-05-26,INTERAC E-TRANSFER RECEIVE BEULAH CHR ISRAEL,Not applicable,200,15796.07
2,2025-05-23,INTERAC E-TRANSFER SEND Divyaa Barrie,44.57,Not applicable,15596.07
3,2025-05-23,POS MERCHANDISE ROYAL PAAN BAR,15.81,Not applicable,15640.64
4,2025-05-22,POS MERCHANDISE UPE EXPRESS PEA,20,Not applicable,15656.45
...,...,...,...,...,...
214,2024-11-20,POS MERCHANDISE BEST BUY 953,999.99,Not applicable,2323.75
215,2024-11-12,TRANSFER CREDIT TO CURRENT REFERRAL BONUS23176...,Not applicable,125,3323.74
216,2024-11-08,PAYROLL DEPOSIT THE ROYALE LP,Not applicable,998.74,3198.74
217,2024-11-04,ABM DEPOSIT,Not applicable,2100,2200.00


In [8]:
creditbank_df = pd.read_excel(r"C:\Users\sanja\Documents\Projects\API\CreditCard.xlsx")
creditbank_df

Unnamed: 0,TRANSACTION DATE,DETAILS,AMOUNT
0,2025-05-05,Personal & Household ExpensesBELL MOBILITY VER...,−$55.95
1,2025-05-05,PAYMENT THANK YOU/PAIEMEN T MERCI,51.99
2,2025-04-03,Personal & Household ExpensesBELL MOBILITY VER...,−$51.99
3,2025-04-03,PAYMENT THANK YOU/PAIEMEN T MERCI,70.28
4,2025-03-15,"RestaurantsTGTG vybrjpkwnbyb1 VANCOUVER, BC",−$4.99
...,...,...,...
88,2023-03-03,PAYMENT THANK YOU/PAIEMEN T MERCI,35
89,2023-02-24,"Hotels, Entertainment, and RecreationEVENTBRIT...",27.54
90,2023-02-22,"Hotels, Entertainment, and RecreationEVENTBRIT...",−$27.54
91,2023-02-18,Personal & Household ExpensesROGERS ******7647...,−$35.50


In [9]:
split_df 

Unnamed: 0,Date,Group,Description,Category,Cost,Currency,Aakash Raj,Preethi None,Aditya None,Rushda Najeeb,Beuju A,Deepak None,Aswin Narayanan,sai meghana,Praveen Kumar,Divyaa Saravanan,Vishnu Pratheep,Sanjay Kurian,Payment_or_not
0,2025-05-30,171 Anne Street,Nawabs,General,56.0,CAD,0.0,0.0,0.0,0.0,0.0,0.0,-28.0,0.0,0.0,0.0,0.0,-28.0,Nawabs
1,2025-05-30,171 Anne Street,Wallmart,Household supplies,57.0,CAD,0.0,0.0,0.0,0.0,0.0,0.0,-28.5,0.0,0.0,0.0,0.0,-28.5,Wallmart
2,2025-05-24,358 Madelaine Dr,Sanjay Kurian paid Deepak None,General,44.57,CAD,0.0,0.0,0.0,0.0,0.0,44.57,0.0,0.0,0.0,0.0,0.0,-44.57,Payment
3,2025-05-23,358 Madelaine Dr,Rushda Najeeb paid Deepak None,General,44.75,CAD,0.0,0.0,0.0,-44.75,0.0,44.75,0.0,0.0,0.0,0.0,0.0,0.0,Payment
4,2025-05-20,358 Madelaine Dr,Praveen Kumar paid Deepak None,General,37.66,CAD,0.0,0.0,0.0,0.0,0.0,37.66,0.0,0.0,-37.66,0.0,0.0,0.0,Payment
5,2025-05-20,358 Madelaine Dr,Praveen Kumar paid Deepak None,General,37.66,CAD,0.0,0.0,0.0,0.0,0.0,37.66,0.0,0.0,-37.66,0.0,0.0,0.0,Payment
6,2025-05-20,358 Madelaine Dr,Aakash Raj paid Deepak None,General,44.73,CAD,-44.73,0.0,0.0,0.0,0.0,44.73,0.0,0.0,0.0,0.0,0.0,0.0,Payment
7,2025-05-19,358 Madelaine Dr,Beuju A paid Deepak None,General,114.51,CAD,0.0,0.0,0.0,0.0,-114.51,114.51,0.0,0.0,0.0,0.0,0.0,0.0,Payment
8,2025-05-10,358 Madelaine Dr,Aswin Narayanan paid Deepak None,General,241.05,CAD,0.0,0.0,0.0,0.0,0.0,241.05,-241.05,0.0,0.0,0.0,0.0,0.0,Payment
9,2025-05-10,358 Madelaine Dr,Inpower utilities,General,171.29,CAD,-21.41,-21.41,0.0,-21.42,-21.41,171.29,-21.41,0.0,-21.41,0.0,-21.41,-21.41,Inpower utilities


In [10]:
#split_df.describe()

# Data Preprocessing

In [11]:
columns_to_keep = ["Date", "Description", "Category", "Cost", "Sanjay Kurian"]
split_df = split_df[columns_to_keep]

In [12]:
split_df

Unnamed: 0,Date,Description,Category,Cost,Sanjay Kurian
0,2025-05-30,Nawabs,General,56.0,-28.0
1,2025-05-30,Wallmart,Household supplies,57.0,-28.5
2,2025-05-24,Sanjay Kurian paid Deepak None,General,44.57,-44.57
3,2025-05-23,Rushda Najeeb paid Deepak None,General,44.75,0.0
4,2025-05-20,Praveen Kumar paid Deepak None,General,37.66,0.0
5,2025-05-20,Praveen Kumar paid Deepak None,General,37.66,0.0
6,2025-05-20,Aakash Raj paid Deepak None,General,44.73,0.0
7,2025-05-19,Beuju A paid Deepak None,General,114.51,0.0
8,2025-05-10,Aswin Narayanan paid Deepak None,General,241.05,0.0
9,2025-05-10,Inpower utilities,General,171.29,-21.41


### Extracing all the expenses which involves Me.

In [13]:
split_df = split_df[split_df["Sanjay Kurian"] != 0]
split_df = split_df[split_df["Description"] != "Total balance"]
split_df

Unnamed: 0,Date,Description,Category,Cost,Sanjay Kurian
0,2025-05-30,Nawabs,General,56.0,-28.0
1,2025-05-30,Wallmart,Household supplies,57.0,-28.5
2,2025-05-24,Sanjay Kurian paid Deepak None,General,44.57,-44.57
9,2025-05-10,Inpower utilities,General,171.29,-21.41


In [14]:
split_df.dtypes

Date              object
Description       object
Category          object
Cost             float64
Sanjay Kurian    float64
dtype: object

In [15]:
split_df["Cost"] = pd.to_numeric(split_df["Cost"], errors="coerce")

In [16]:
# mask = (split_df["Sanjay Kurian"] > 0) & (split_df["Sanjay Kurian"] == split_df["Cost"]) & (split_df["Category"] == "Payment")
# split_df.loc[mask, "Sanjay Kurian"] = split_df.loc[mask, "Sanjay Kurian"]

In [17]:
split_df

Unnamed: 0,Date,Description,Category,Cost,Sanjay Kurian
0,2025-05-30,Nawabs,General,56.0,-28.0
1,2025-05-30,Wallmart,Household supplies,57.0,-28.5
2,2025-05-24,Sanjay Kurian paid Deepak None,General,44.57,-44.57
9,2025-05-10,Inpower utilities,General,171.29,-21.41


In [18]:
# unique_descriptions = split_df["Description"].value_counts()
# unique_descriptions

In [19]:
# known_descriptions = ["Walmart", "Costco", "Uber", "Centra", "Dollarama", "Wifi", "LCBO", "Subway"]

# split_df["Description_clean"] = split_df["Description"].str.strip()

# # Get all unique descriptions
# unique_descriptions = split_df["Description_clean"].unique()

# # Create an empty mapping dictionary
# description_map = {}

# for desc in unique_descriptions:
#     if desc in known_descriptions:
#         description_map[desc] = desc  # keep as-is
#     else:
#         corrected = input(f"Enter a standardized name for: '{desc}' → ")
#         description_map[desc] = corrected

# # Apply the mapping to create a normalized column
# split_df["Description_normalized"] = split_df["Description_clean"].map(description_map)

In [20]:
#split_df

In [21]:
# unique_descriptions = split_df["Description_normalized"].value_counts()
# unique_descriptions

In [22]:
# split_df[split_df["Description_normalized"] == ""]

In [23]:
# split_df["Description_normalized"] = split_df["Description_normalized"].fillna("Walmart")

In [24]:
#split_df.to_excel("Splitwise_Cleaned.xlsx", index=False)

In [25]:
bank_df

Unnamed: 0,Date,TRANSACTIONS,Funds out,Funds in,Running Balance
0,2025-05-26,POS MERCHANDISE ESSO CIRCLE K,14.97,Not applicable,15781.10
1,2025-05-26,INTERAC E-TRANSFER RECEIVE BEULAH CHR ISRAEL,Not applicable,200,15796.07
2,2025-05-23,INTERAC E-TRANSFER SEND Divyaa Barrie,44.57,Not applicable,15596.07
3,2025-05-23,POS MERCHANDISE ROYAL PAAN BAR,15.81,Not applicable,15640.64
4,2025-05-22,POS MERCHANDISE UPE EXPRESS PEA,20,Not applicable,15656.45
...,...,...,...,...,...
214,2024-11-20,POS MERCHANDISE BEST BUY 953,999.99,Not applicable,2323.75
215,2024-11-12,TRANSFER CREDIT TO CURRENT REFERRAL BONUS23176...,Not applicable,125,3323.74
216,2024-11-08,PAYROLL DEPOSIT THE ROYALE LP,Not applicable,998.74,3198.74
217,2024-11-04,ABM DEPOSIT,Not applicable,2100,2200.00


* Replacing Not applicable to 0
* Negating funds out to show outward cash flow
* Combining Funds in and Funds out as Funds

In [26]:
bank_df[['Funds out', 'Funds in']] = bank_df[['Funds out', 'Funds in']].replace('Not applicable', 0)
bank_df["Funds out"] = -bank_df["Funds out"]
bank_df["Funds out"] = pd.to_numeric(bank_df["Funds out"])
bank_df["Funds in"] = pd.to_numeric(bank_df["Funds in"])
bank_df["Funds"] = bank_df["Funds in"] + bank_df["Funds out"]

  bank_df[['Funds out', 'Funds in']] = bank_df[['Funds out', 'Funds in']].replace('Not applicable', 0)


In [27]:
bank_df

Unnamed: 0,Date,TRANSACTIONS,Funds out,Funds in,Running Balance,Funds
0,2025-05-26,POS MERCHANDISE ESSO CIRCLE K,-14.97,0.00,15781.10,-14.97
1,2025-05-26,INTERAC E-TRANSFER RECEIVE BEULAH CHR ISRAEL,-0.00,200.00,15796.07,200.00
2,2025-05-23,INTERAC E-TRANSFER SEND Divyaa Barrie,-44.57,0.00,15596.07,-44.57
3,2025-05-23,POS MERCHANDISE ROYAL PAAN BAR,-15.81,0.00,15640.64,-15.81
4,2025-05-22,POS MERCHANDISE UPE EXPRESS PEA,-20.00,0.00,15656.45,-20.00
...,...,...,...,...,...,...
214,2024-11-20,POS MERCHANDISE BEST BUY 953,-999.99,0.00,2323.75,-999.99
215,2024-11-12,TRANSFER CREDIT TO CURRENT REFERRAL BONUS23176...,-0.00,125.00,3323.74,125.00
216,2024-11-08,PAYROLL DEPOSIT THE ROYALE LP,-0.00,998.74,3198.74,998.74
217,2024-11-04,ABM DEPOSIT,-0.00,2100.00,2200.00,2100.00


In [28]:
bank_df.drop(columns=["Funds in", "Funds out"], inplace=True)

In [29]:
bank_df = bank_df.dropna(subset=["Date"])

In [30]:
bank_df

Unnamed: 0,Date,TRANSACTIONS,Running Balance,Funds
0,2025-05-26,POS MERCHANDISE ESSO CIRCLE K,15781.10,-14.97
1,2025-05-26,INTERAC E-TRANSFER RECEIVE BEULAH CHR ISRAEL,15796.07,200.00
2,2025-05-23,INTERAC E-TRANSFER SEND Divyaa Barrie,15596.07,-44.57
3,2025-05-23,POS MERCHANDISE ROYAL PAAN BAR,15640.64,-15.81
4,2025-05-22,POS MERCHANDISE UPE EXPRESS PEA,15656.45,-20.00
...,...,...,...,...
214,2024-11-20,POS MERCHANDISE BEST BUY 953,2323.75,-999.99
215,2024-11-12,TRANSFER CREDIT TO CURRENT REFERRAL BONUS23176...,3323.74,125.00
216,2024-11-08,PAYROLL DEPOSIT THE ROYALE LP,3198.74,998.74
217,2024-11-04,ABM DEPOSIT,2200.00,2100.00


### Replacing all the common transactions with common terms

In [31]:
bank_df["TRANSACTIONS"] = bank_df["TRANSACTIONS"].str.replace("INTERAC E-TRANSFER ", "", regex=False)
bank_df["TRANSACTIONS"] = bank_df["TRANSACTIONS"].str.replace("POS MERCHANDISE ", "", regex=False)
bank_df["TRANSACTIONS"] = bank_df["TRANSACTIONS"].str.replace("POS PURCHASE ", "", regex=False)
bank_df["TRANSACTIONS"] = bank_df["TRANSACTIONS"].str.replace("PAYROLL DEPOSIT THE ROYALE LP", "Salary", regex=False)
bank_df["TRANSACTIONS"] = bank_df["TRANSACTIONS"].str.replace("EFT CREDIT CANADA", "Tax Returns", regex=False)
bank_df["TRANSACTIONS"] = bank_df["TRANSACTIONS"].str.replace("EFT CREDIT TPS/GST", "Tax Returns", regex=False)
bank_df["TRANSACTIONS"] = bank_df["TRANSACTIONS"].str.replace("ABM WITHDRAWAL", "ATM Withdrawal", regex=False)
bank_df["TRANSACTIONS"] = bank_df["TRANSACTIONS"].str.replace("ABM DEPOSIT", "ATM Deposit", regex=False)
bank_df["TRANSACTIONS"] = bank_df["TRANSACTIONS"].str.replace("EFT CREDIT ISBO ReferralCR", "Referral", regex=False)
bank_df["TRANSACTIONS"] = bank_df["TRANSACTIONS"].str.replace("REMISE CARBONE/CARBON REBATE", "Tax Returns", regex=False)
bank_df["TRANSACTIONS"] = bank_df["TRANSACTIONS"].str.replace("EFT CREDIT NO FEE CASH REWARD", "Referral", regex=False)
bank_df["TRANSACTIONS"] = bank_df["TRANSACTIONS"].str.replace("INTERNET BILL PAYMENT VISA, CIBC/BANQUE CIBC", "Credit Card Bill", regex=False)
bank_df["TRANSACTIONS"] = bank_df["TRANSACTIONS"].str.replace("TRANSFER CREDIT TO CURRENT REFERRAL BONUS231768381", "Referral", regex=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bank_df["TRANSACTIONS"] = bank_df["TRANSACTIONS"].str.replace("INTERAC E-TRANSFER ", "", regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bank_df["TRANSACTIONS"] = bank_df["TRANSACTIONS"].str.replace("POS MERCHANDISE ", "", regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bank

In [32]:
bank_df.drop("Running Balance", axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bank_df.drop("Running Balance", axis=1, inplace=True)


In [33]:
bank_df

Unnamed: 0,Date,TRANSACTIONS,Funds
0,2025-05-26,ESSO CIRCLE K,-14.97
1,2025-05-26,RECEIVE BEULAH CHR ISRAEL,200.00
2,2025-05-23,SEND Divyaa Barrie,-44.57
3,2025-05-23,ROYAL PAAN BAR,-15.81
4,2025-05-22,UPE EXPRESS PEA,-20.00
...,...,...,...
214,2024-11-20,BEST BUY 953,-999.99
215,2024-11-12,Referral,125.00
216,2024-11-08,Salary,998.74
217,2024-11-04,ATM Deposit,2100.00


In [34]:
creditbank_df

Unnamed: 0,TRANSACTION DATE,DETAILS,AMOUNT
0,2025-05-05,Personal & Household ExpensesBELL MOBILITY VER...,−$55.95
1,2025-05-05,PAYMENT THANK YOU/PAIEMEN T MERCI,51.99
2,2025-04-03,Personal & Household ExpensesBELL MOBILITY VER...,−$51.99
3,2025-04-03,PAYMENT THANK YOU/PAIEMEN T MERCI,70.28
4,2025-03-15,"RestaurantsTGTG vybrjpkwnbyb1 VANCOUVER, BC",−$4.99
...,...,...,...
88,2023-03-03,PAYMENT THANK YOU/PAIEMEN T MERCI,35
89,2023-02-24,"Hotels, Entertainment, and RecreationEVENTBRIT...",27.54
90,2023-02-22,"Hotels, Entertainment, and RecreationEVENTBRIT...",−$27.54
91,2023-02-18,Personal & Household ExpensesROGERS ******7647...,−$35.50


In [35]:
creditbank_df["AMOUNT"] = (
    creditbank_df["AMOUNT"]
    .astype(str)
    .str.replace("\u2212", "-", regex=False)
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
)
creditbank_df["AMOUNT"] = pd.to_numeric(creditbank_df["AMOUNT"], errors="coerce")

In [36]:
creditbank_df

Unnamed: 0,TRANSACTION DATE,DETAILS,AMOUNT
0,2025-05-05,Personal & Household ExpensesBELL MOBILITY VER...,-55.95
1,2025-05-05,PAYMENT THANK YOU/PAIEMEN T MERCI,51.99
2,2025-04-03,Personal & Household ExpensesBELL MOBILITY VER...,-51.99
3,2025-04-03,PAYMENT THANK YOU/PAIEMEN T MERCI,70.28
4,2025-03-15,"RestaurantsTGTG vybrjpkwnbyb1 VANCOUVER, BC",-4.99
...,...,...,...
88,2023-03-03,PAYMENT THANK YOU/PAIEMEN T MERCI,35.00
89,2023-02-24,"Hotels, Entertainment, and RecreationEVENTBRIT...",27.54
90,2023-02-22,"Hotels, Entertainment, and RecreationEVENTBRIT...",-27.54
91,2023-02-18,Personal & Household ExpensesROGERS ******7647...,-35.50


In [37]:
creditbank_df["DETAILS"] = creditbank_df["DETAILS"].str.replace("Personal & Household ExpensesBELL MOBILITY VERDUN, QC", "Phone data", regex=False)
creditbank_df = creditbank_df.rename(columns={
    "TRANSACTION DATE": "Date",
    "DETAILS": "TRANSACTIONS",
    "AMOUNT": "Funds"
})
creditbank_df["Account"] = "CIBC Credit Card"
creditbank_df

Unnamed: 0,Date,TRANSACTIONS,Funds,Account
0,2025-05-05,Phone data,-55.95,CIBC Credit Card
1,2025-05-05,PAYMENT THANK YOU/PAIEMEN T MERCI,51.99,CIBC Credit Card
2,2025-04-03,Phone data,-51.99,CIBC Credit Card
3,2025-04-03,PAYMENT THANK YOU/PAIEMEN T MERCI,70.28,CIBC Credit Card
4,2025-03-15,"RestaurantsTGTG vybrjpkwnbyb1 VANCOUVER, BC",-4.99,CIBC Credit Card
...,...,...,...,...
88,2023-03-03,PAYMENT THANK YOU/PAIEMEN T MERCI,35.00,CIBC Credit Card
89,2023-02-24,"Hotels, Entertainment, and RecreationEVENTBRIT...",27.54,CIBC Credit Card
90,2023-02-22,"Hotels, Entertainment, and RecreationEVENTBRIT...",-27.54,CIBC Credit Card
91,2023-02-18,Personal & Household ExpensesROGERS ******7647...,-35.50,CIBC Credit Card


In [38]:
#split_df.drop(columns=["Description","Category","Cost","Description_clean"])
split_df.drop(columns=["Category","Cost"])

Unnamed: 0,Date,Description,Sanjay Kurian
0,2025-05-30,Nawabs,-28.0
1,2025-05-30,Wallmart,-28.5
2,2025-05-24,Sanjay Kurian paid Deepak None,-44.57
9,2025-05-10,Inpower utilities,-21.41


In [39]:
split_df = split_df[["Date", "Description", "Sanjay Kurian"]]
split_df = split_df.rename(columns={
    "Description": "TRANSACTIONS",
    "Sanjay Kurian": "Funds"
})

In [40]:
split_df

Unnamed: 0,Date,TRANSACTIONS,Funds
0,2025-05-30,Nawabs,-28.0
1,2025-05-30,Wallmart,-28.5
2,2025-05-24,Sanjay Kurian paid Deepak None,-44.57
9,2025-05-10,Inpower utilities,-21.41


In [41]:
split_df["Date"] = pd.to_datetime(split_df["Date"]).dt.date
bank_df["Date"] = pd.to_datetime(bank_df["Date"]).dt.date
creditbank_df["Date"] = pd.to_datetime(creditbank_df["Date"]).dt.date
split_df["Account"] = "Chequing"
bank_df["Account"] = "Chequing"
combined_df = pd.concat([split_df, bank_df, creditbank_df])
combined_df = combined_df.sort_values(by="Date").reset_index(drop=True)
combined_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bank_df["Date"] = pd.to_datetime(bank_df["Date"]).dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bank_df["Account"] = "Chequing"


Unnamed: 0,Date,TRANSACTIONS,Funds,Account
0,2023-01-22,Personal & Household ExpensesROGERS ******7647...,-35.00,CIBC Credit Card
1,2023-02-18,Personal & Household ExpensesROGERS ******7647...,-35.50,CIBC Credit Card
2,2023-02-22,"Hotels, Entertainment, and RecreationEVENTBRIT...",-27.54,CIBC Credit Card
3,2023-02-24,"Hotels, Entertainment, and RecreationEVENTBRIT...",27.54,CIBC Credit Card
4,2023-03-03,PAYMENT THANK YOU/PAIEMEN T MERCI,35.00,CIBC Credit Card
...,...,...,...,...
304,2025-05-24,Sanjay Kurian paid Deepak None,-44.57,Chequing
305,2025-05-26,ESSO CIRCLE K,-14.97,Chequing
306,2025-05-26,RECEIVE BEULAH CHR ISRAEL,200.00,Chequing
307,2025-05-30,Wallmart,-28.50,Chequing


In [42]:
#combined_df.to_excel("Combined_Cleaned.xlsx", index=False)

### Splitwise has records of payment and the exact amount is recorded in the bank transactions. I deleted both the records or else expenses may get duplicated

In [43]:
combined_df["Name"] = combined_df["TRANSACTIONS"].str.extract(r'(?:SEND|paid|RECEIVE)\s+([A-Za-z]+)', expand=False)
combined_df.loc[combined_df["Name"] == 'Sanjay', "Name"] = combined_df["TRANSACTIONS"].str.extract(r'^(\w+)', expand=False)
combined_df["Name"] = combined_df["Name"].str.capitalize()
combined_df["AbsFunds"] = combined_df["Funds"]

In [44]:
combined_df

Unnamed: 0,Date,TRANSACTIONS,Funds,Account,Name,AbsFunds
0,2023-01-22,Personal & Household ExpensesROGERS ******7647...,-35.00,CIBC Credit Card,,-35.00
1,2023-02-18,Personal & Household ExpensesROGERS ******7647...,-35.50,CIBC Credit Card,,-35.50
2,2023-02-22,"Hotels, Entertainment, and RecreationEVENTBRIT...",-27.54,CIBC Credit Card,,-27.54
3,2023-02-24,"Hotels, Entertainment, and RecreationEVENTBRIT...",27.54,CIBC Credit Card,,27.54
4,2023-03-03,PAYMENT THANK YOU/PAIEMEN T MERCI,35.00,CIBC Credit Card,,35.00
...,...,...,...,...,...,...
304,2025-05-24,Sanjay Kurian paid Deepak None,-44.57,Chequing,Deepak,-44.57
305,2025-05-26,ESSO CIRCLE K,-14.97,Chequing,,-14.97
306,2025-05-26,RECEIVE BEULAH CHR ISRAEL,200.00,Chequing,Beulah,200.00
307,2025-05-30,Wallmart,-28.50,Chequing,,-28.50


In [45]:
mask_paid = combined_df["TRANSACTIONS"].str.contains("paid", case=False)
mask_receive = combined_df["TRANSACTIONS"].str.contains("RECEIVE", case=False)
mask_send = combined_df["TRANSACTIONS"].str.contains("SEND", case=False)

paid_trans = combined_df[mask_paid]
receive_trans = combined_df[mask_receive]
send_trans = combined_df[mask_send]

to_delete_indices = []

for _, paid_row in paid_trans.iterrows():
    matching_receives = receive_trans[
        (receive_trans["Name"] == paid_row["Name"]) & 
        (receive_trans["AbsFunds"] == paid_row["AbsFunds"])
    ]

    if not matching_receives.empty:
        to_delete_indices.extend([paid_row.name, matching_receives.index[0]])
        receive_trans = receive_trans.drop(matching_receives.index[0])

for _, paid_row in paid_trans.iterrows():
    matching_sends = send_trans[
        (send_trans["Name"] == paid_row["Name"]) & 
        (send_trans["AbsFunds"] == paid_row["AbsFunds"])
    ]
    
    if not matching_sends.empty:
        to_delete_indices.extend([paid_row.name, matching_sends.index[0]])
        send_trans = send_trans.drop(matching_sends.index[0])
to_delete_indices = list(set(to_delete_indices))
cleaned_df = combined_df[~combined_df.index.isin(to_delete_indices)]

In [46]:
cleaned_df = cleaned_df.drop(columns=["Name", "AbsFunds"])
cleaned_df = cleaned_df[~cleaned_df["TRANSACTIONS"].str.contains("paid", case=False, na=False)]
cleaned_df

Unnamed: 0,Date,TRANSACTIONS,Funds,Account
0,2023-01-22,Personal & Household ExpensesROGERS ******7647...,-35.00,CIBC Credit Card
1,2023-02-18,Personal & Household ExpensesROGERS ******7647...,-35.50,CIBC Credit Card
2,2023-02-22,"Hotels, Entertainment, and RecreationEVENTBRIT...",-27.54,CIBC Credit Card
3,2023-02-24,"Hotels, Entertainment, and RecreationEVENTBRIT...",27.54,CIBC Credit Card
4,2023-03-03,PAYMENT THANK YOU/PAIEMEN T MERCI,35.00,CIBC Credit Card
...,...,...,...,...
303,2025-05-23,SEND Divyaa Barrie,-44.57,Chequing
305,2025-05-26,ESSO CIRCLE K,-14.97,Chequing
306,2025-05-26,RECEIVE BEULAH CHR ISRAEL,200.00,Chequing
307,2025-05-30,Wallmart,-28.50,Chequing


In [47]:
#cleaned_df.to_excel("Combined_Cleaned_Transactions.xlsx", index=False)

In [48]:
notes_df = cleaned_df[["TRANSACTIONS"]]
notes_df

Unnamed: 0,TRANSACTIONS
0,Personal & Household ExpensesROGERS ******7647...
1,Personal & Household ExpensesROGERS ******7647...
2,"Hotels, Entertainment, and RecreationEVENTBRIT..."
3,"Hotels, Entertainment, and RecreationEVENTBRIT..."
4,PAYMENT THANK YOU/PAIEMEN T MERCI
...,...
303,SEND Divyaa Barrie
305,ESSO CIRCLE K
306,RECEIVE BEULAH CHR ISRAEL
307,Wallmart


### Importing an excel file which contains all the common categories for expenses. Easier to categorize the expenses.

In [49]:
reference_df = pd.read_excel(r"C:\Users\sanja\Documents\Projects\API\Categories.xlsx")

In [50]:
reference_df

Unnamed: 0,Notes,Category
0,Bus card,Transportation
1,Uber,Transportation
2,Cash withdrawal,Other
3,Salary,Salary
4,Pizza pizza,Food
...,...,...
178,Bestbuy,Culture
179,Car Rental,Transportation
180,Bentley,Apparel
181,ATM Deposit,Other


### Using Rapid fuzz to convert all the typos in the splitwise to the correct transaction and mapping to appropriate category.

In [51]:
from rapidfuzz import process

correct_names = reference_df['Notes'].tolist()

# Function to apply fuzzy matching
def match_transaction(txn):
    best_match, score, _ = process.extractOne(txn, correct_names)
    if score >= 90:
        category = reference_df.loc[reference_df['Notes'] == best_match, 'Category'].values[0]
        return pd.Series([best_match, category, score])
    else:
        return pd.Series([None, None, score])

# Apply the matching function to your messy list of notes
cleaned_df[['Matched_Name', 'Category', 'Score']] = cleaned_df['TRANSACTIONS'].apply(match_transaction)

In [52]:
cleaned_df

Unnamed: 0,Date,TRANSACTIONS,Funds,Account,Matched_Name,Category,Score
0,2023-01-22,Personal & Household ExpensesROGERS ******7647...,-35.00,CIBC Credit Card,,,57.272727
1,2023-02-18,Personal & Household ExpensesROGERS ******7647...,-35.50,CIBC Credit Card,,,57.272727
2,2023-02-22,"Hotels, Entertainment, and RecreationEVENTBRIT...",-27.54,CIBC Credit Card,,,85.500000
3,2023-02-24,"Hotels, Entertainment, and RecreationEVENTBRIT...",27.54,CIBC Credit Card,,,85.500000
4,2023-03-03,PAYMENT THANK YOU/PAIEMEN T MERCI,35.00,CIBC Credit Card,,,30.000000
...,...,...,...,...,...,...,...
303,2025-05-23,SEND Divyaa Barrie,-44.57,Chequing,,,51.428571
305,2025-05-26,ESSO CIRCLE K,-14.97,Chequing,,,85.500000
306,2025-05-26,RECEIVE BEULAH CHR ISRAEL,200.00,Chequing,,,40.000000
307,2025-05-30,Wallmart,-28.50,Chequing,Walmart,Household,93.333333


In [53]:
cleaned_df.loc[cleaned_df['Matched_Name'].notna(), 'TRANSACTIONS'] = cleaned_df['Matched_Name']
cleaned_df.drop(columns=['Matched_Name', 'Score'], inplace=True)
cleaned_df = cleaned_df[['Date', 'Account', 'TRANSACTIONS', 'Category', 'Funds']]

In [54]:
cleaned_df

Unnamed: 0,Date,Account,TRANSACTIONS,Category,Funds
0,2023-01-22,CIBC Credit Card,Personal & Household ExpensesROGERS ******7647...,,-35.00
1,2023-02-18,CIBC Credit Card,Personal & Household ExpensesROGERS ******7647...,,-35.50
2,2023-02-22,CIBC Credit Card,"Hotels, Entertainment, and RecreationEVENTBRIT...",,-27.54
3,2023-02-24,CIBC Credit Card,"Hotels, Entertainment, and RecreationEVENTBRIT...",,27.54
4,2023-03-03,CIBC Credit Card,PAYMENT THANK YOU/PAIEMEN T MERCI,,35.00
...,...,...,...,...,...
303,2025-05-23,Chequing,SEND Divyaa Barrie,,-44.57
305,2025-05-26,Chequing,ESSO CIRCLE K,,-14.97
306,2025-05-26,Chequing,RECEIVE BEULAH CHR ISRAEL,,200.00
307,2025-05-30,Chequing,Walmart,Household,-28.50


### Adding Running balance

In [55]:
cleaned_df["Running Balance"] = cleaned_df["Funds"].cumsum() + last_updated_balance

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df["Running Balance"] = cleaned_df["Funds"].cumsum() + last_updated_balance


In [56]:
latest_balance = cleaned_df['Running Balance'].iloc[-1]

In [57]:
cleaned_df["Income/Expense"] = cleaned_df["Funds"].apply(lambda x: "Expense" if x < 0 else "Income")
cleaned_df = cleaned_df.rename(columns= {
    "TRANSACTIONS": "Note",
    "Funds": "Amount"
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df["Income/Expense"] = cleaned_df["Funds"].apply(lambda x: "Expense" if x < 0 else "Income")


In [58]:
cleaned_df = cleaned_df[["Date", "Account", "Category", "Note", "Income/Expense", "Amount"]]
cleaned_df

Unnamed: 0,Date,Account,Category,Note,Income/Expense,Amount
0,2023-01-22,CIBC Credit Card,,Personal & Household ExpensesROGERS ******7647...,Expense,-35.00
1,2023-02-18,CIBC Credit Card,,Personal & Household ExpensesROGERS ******7647...,Expense,-35.50
2,2023-02-22,CIBC Credit Card,,"Hotels, Entertainment, and RecreationEVENTBRIT...",Expense,-27.54
3,2023-02-24,CIBC Credit Card,,"Hotels, Entertainment, and RecreationEVENTBRIT...",Income,27.54
4,2023-03-03,CIBC Credit Card,,PAYMENT THANK YOU/PAIEMEN T MERCI,Income,35.00
...,...,...,...,...,...,...
303,2025-05-23,Chequing,,SEND Divyaa Barrie,Expense,-44.57
305,2025-05-26,Chequing,,ESSO CIRCLE K,Expense,-14.97
306,2025-05-26,Chequing,,RECEIVE BEULAH CHR ISRAEL,Income,200.00
307,2025-05-30,Chequing,Household,Walmart,Expense,-28.50


# Data Export

In [None]:
cleaned_df.to_excel("Final.xlsx", index=False)

In [None]:
# from openpyxl import load_workbook
# from openpyxl.utils.dataframe import dataframe_to_rows

# cleaned_df = pd.read_excel(r"C:\Users\sanja\Documents\Projects\API\Final.xlsx")
# filename = r'C:\Users\sanja\Documents\Projects\API\MM.xlsx'
# sheet_name = 'Sheet1' 
# table_name = 'MyTable'  

# wb = load_workbook(filename)
# ws = wb[sheet_name]


# table = ws.tables[table_name]
# table_range = table.ref 
# start_cell, end_cell = table_range.split(':')
# last_row = ws[end_cell].row

# for row in dataframe_to_rows(cleaned_df, index=False, header=False):
#     ws.append(row)

# new_end_row = last_row + len(cleaned_df)
# table.ref = f"{start_cell.split(':')[0]}:{end_cell[:1]}{new_end_row}"

# wb.save(filename)

### Saving the Latest date and latest balance to text file

In [None]:
latest_date = cleaned_df['Date'].max().strptime('%Y-%m-%d')

with open('last_updated.txt', 'w') as f:
    f.write(latest_date.isoformat() + '\n')
    f.write(str(latest_balance))