In [3]:
# 1. Load the Data

In [13]:
!pip install pandas timedelta

Collecting timedelta
  Using cached timedelta-2020.12.3.tar.gz (1.6 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: timedelta
  Building wheel for timedelta (setup.py) ... done
  Created wheel for timedelta: filename=timedelta-2020.12.3-py3-none-any.whl size=1556 sha256=d5085747c1a719f3cf7f9834d95990a44deb6e68230c5fca804c95c8a348a0ab
  Stored in directory: /Users/raghulrajkumar/Library/Caches/pip/wheels/6a/d4/2e/22908853a465dbeae5d67583a77bacaa0aba24288e7778f840
Successfully built timedelta
Installing collected packages: timedelta
Successfully installed timedelta-2020.12.3

[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: pip install --upgrade pip


In [5]:
import pandas as pd

# Load the Excel file. The context manager closes the underlying file handle
# as soon as every sheet has been parsed.
file_path = "./data/data.xlsx"
with pd.ExcelFile(file_path) as xls:
    # Load each sheet into a dictionary of DataFrames, keyed by sheet name
    sheets = {sheet_name: xls.parse(sheet_name) for sheet_name in xls.sheet_names}

# Check data structure: show the first rows of every sheet
for sheet_name, df in sheets.items():
    print(f"Sheet: {sheet_name}")
    print(df.head(), "\n")


Sheet: Example 1 Ashby
   Include or Exclude?  Start Date  End Date  Notes       date  \
0                  NaN         NaN       NaN    NaN 2022-06-15   
1                  NaN         NaN       NaN    NaN 2022-06-16   
2                  NaN         NaN       NaN    NaN 2022-07-31   
3                  NaN         NaN       NaN    NaN 2022-08-31   
4                  NaN         NaN       NaN    NaN 2022-09-30   

   line_amount_usd    record_type  \
0          7580.00       purchase   
1          7580.00    transaction   
2           631.67  journal_entry   
3           631.67  journal_entry   
4           631.67  journal_entry   

                                        account_name  \
0     Other Current Assets:Prepaids:Prepaid Expenses   
1                                                NaN   
2  General and Administrative Expenses:Profession...   
3  General and Administrative Expenses:Profession...   
4  General and Administrative Expenses:Profession...   

                  ac

In [6]:
# 2. Identify and Exclude Duplicates

In [7]:
def remove_duplicates(df):
    """Drop duplicate transactions, keeping the record from the preferred integration.

    Rows are treated as duplicates of one another when they share the same
    ``date``, ``line_amount_usd`` and ``memo``. Exactly one row of each
    duplicate set survives: the one whose ``integration`` ranks highest
    (Bill.com > QuickBooks > Brex > anything else). Ties keep the row that
    appears first in the input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``date``, ``line_amount_usd``, ``memo``
        and ``integration``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` and the surviving rows in
        their original relative order. ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    key_columns = ['date', 'line_amount_usd', 'memo']

    # Prioritize by integration: Bill.com > QuickBooks (QBO) > Brex.
    # Names are normalised so "Bill.com" / " BREX " rank like their lowercase keys;
    # unknown or missing integrations get the lowest score (0).
    integration_priority = {'bill.com': 3, 'quickbooks': 2, 'brex': 1}
    score = (
        df['integration']
        .astype(str)
        .str.strip()
        .str.lower()
        .map(integration_priority)
        .fillna(0)
    )

    # Work on a copy (assign) with two scratch columns: the score used to pick
    # the survivor, and the row position used to restore the input order.
    ranked = df.assign(
        _integration_score=score.to_numpy(),
        _row_position=range(len(df)),
    )

    kept = (
        ranked
        # Stable sort: equal scores stay in input order, so keep='first' is deterministic.
        .sort_values('_integration_score', ascending=False, kind='stable')
        .drop_duplicates(subset=key_columns, keep='first')
        # Undo the ranking sort so the cleaned sheet reads in its original order.
        .sort_values('_row_position', kind='stable')
    )

    return kept.drop(columns=['_integration_score', '_row_position'])

# De-duplicate every sheet, keeping the sheet names as keys
sheets = dict(
    (sheet_label, remove_duplicates(frame))
    for sheet_label, frame in sheets.items()
)


In [8]:
# 3. Exclude Journal Entries (Amortization)

In [9]:
def exclude_journal_entries(df):
    """Flag journal-entry rows for exclusion and every other row for inclusion.

    A row is marked ``"Exclude"`` when its ``record_type`` is exactly
    ``"journal_entry"``; all other rows, including rows with a missing
    ``record_type``, are marked ``"Include"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``record_type`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with an ``Include/Exclude`` column added (or replaced).
        If the frame also carries the workbook's ``Include or Exclude?``
        column, the same flags are written there too. ``df`` is not modified.

    Raises
    ------
    KeyError
        If ``record_type`` is missing.
    """
    # Vectorised comparison; missing record types compare False and become "Include".
    flags = df['record_type'].eq('journal_entry').map({True: 'Exclude', False: 'Include'})

    # assign() returns a copy, so the caller's frame is left untouched.
    flagged = df.assign(**{'Include/Exclude': flags})

    # The source sheet shown in this notebook has its own "Include or Exclude?"
    # column; mirror the decision there so it is not left blank in the export.
    if 'Include or Exclude?' in flagged.columns:
        flagged['Include or Exclude?'] = flags

    return flagged

# Flag journal entries on every sheet; keys (sheet names) are carried over unchanged
sheets = dict(zip(sheets.keys(), map(exclude_journal_entries, sheets.values())))


In [10]:
# 4. Assign Start and End Dates

In [15]:
from datetime import datetime, timedelta

# Separators tried, in order, when looking for a "start - end" range in a memo.
# Spaced forms come first so that ISO dates ("2022-06-15 - 2023-06-14") are
# split between the two dates rather than at one of their own hyphens.
_MEMO_RANGE_SEPARATORS = (" - ", " – ", "–", "-")


def _parse_memo_period(memo):
    """Return ``(start, end)`` if ``memo`` holds a parseable date range, else ``None``."""
    if pd.isna(memo):
        return None

    text = str(memo)
    for separator in _MEMO_RANGE_SEPARATORS:
        parts = text.split(separator)
        if len(parts) < 2:
            continue
        try:
            start = pd.to_datetime(parts[0].strip(), errors='coerce')
            end = pd.to_datetime(parts[1].strip(), errors='coerce')
            # errors='coerce' yields NaT instead of raising, so a failed parse
            # has to be detected explicitly; a range that ends before it
            # starts is treated as a misparse as well.
            if pd.notna(start) and pd.notna(end) and start <= end:
                return start, end
        except (TypeError, ValueError):
            # e.g. comparing a timezone-aware with a naive timestamp
            continue
    return None


def assign_dates(df):
    """Fill ``Start Date`` / ``End Date`` with the service period of each included row.

    For rows whose ``Include/Exclude`` is ``"Include"``:

    * if ``memo`` contains a date range (``"<start> - <end>"``) whose two sides
      both parse as dates, those dates are used;
    * otherwise the period starts on the row's ``date`` and lasts 30 days.

    Rows that are not included get missing (``NaT``) start and end dates.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date``, ``memo`` and ``Include/Exclude``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Start Date`` and ``End Date`` set for every row.
        ``df`` is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    out = df.copy()

    # Coerce once up front so the 30-day default also works when the date
    # column arrives as text; unparseable dates become NaT.
    txn_dates = pd.to_datetime(out['date'], errors='coerce')
    # Default assumption: start from transaction date, assume 1-month service period
    default_period = pd.Timedelta(days=30)

    starts, ends = [], []
    rows = zip(out['Include/Exclude'].eq('Include'), out['memo'], txn_dates)
    for included, memo, txn_date in rows:
        if not included:
            period = (pd.NaT, pd.NaT)
        else:
            period = _parse_memo_period(memo)
            if period is None:
                # No usable range in the memo (including hyphenated free text)
                period = (txn_date, txn_date + default_period)
        starts.append(period[0])
        ends.append(period[1])

    # Assign whole columns at once so they get a proper datetime dtype instead
    # of writing timestamps cell by cell into whatever dtype the sheet had.
    out['Start Date'] = pd.to_datetime(pd.Series(starts, index=out.index, dtype=object))
    out['End Date'] = pd.to_datetime(pd.Series(ends, index=out.index, dtype=object))
    return out

# Derive start/end dates for every sheet in the workbook
sheets = {
    sheet_label: assign_dates(frame)
    for sheet_label, frame in sheets.items()
}


In [16]:
# 5. Justification for Inclusion/Exclusion

In [17]:
def add_notes(df):
    """Attach a one-line justification to every row based on its include/exclude flag.

    Rows whose ``Include/Exclude`` equals ``"Exclude"`` receive the exclusion
    note; every other row receives the inclusion note.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``Include/Exclude`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``Notes`` column set. ``df`` is not modified.

    Raises
    ------
    KeyError
        If ``Include/Exclude`` is missing.
    """
    # Column-wise mapping instead of a row-wise apply: same text per row, and
    # it always yields a Series, even for a sheet with no rows.
    notes = df['Include/Exclude'].eq('Exclude').map({
        True: "Excluded due to duplicate or amortization.",
        False: "Included based on transaction details and inferred service period.",
    })
    return df.assign(Notes=notes)

# Add the justification note to every sheet
sheets = dict(
    (sheet_label, add_notes(frame))
    for sheet_label, frame in sheets.items()
)


In [18]:
# 6. Save the Processed Data

In [19]:
# Write every processed sheet into a single workbook, one worksheet per entry
# in `sheets`; leaving the `with` block saves and closes the file.
output_file = "./data/Cleaned_Data_QA_Specialist.xlsx"
with pd.ExcelWriter(output_file) as writer:
    for name, df in sheets.items():
        # index=False: the DataFrame index is not written as a column
        df.to_excel(writer, sheet_name=name, index=False)

# Report the path that was actually written, so the cell output always matches
# the file on disk.
print(f"Data Cleaning Completed. Saved as: {output_file}")


Data Cleaning Completed. Saved as: Cleaned_Data_QA_Specialist_Final.xlsx
