# Financial Transactions Data Cleaning

In [2]:
import pandas as pd
import numpy as np

## Load Data

In [6]:
# `data` holds the frame exactly as read from disk; `df` is an independent
# copy that the cleaning cells below modify.
data = pd.read_csv(filepath_or_buffer='../data/raw/dirty_financial_transactions.csv')
df = data.copy(deep=True)

## Remove Duplicates and Missing Values

In [7]:
# Three steps, applied in order:
#   1. drop rows that are exact duplicates across every column,
#   2. turn placeholder tokens for "no value" into real NaN,
#   3. drop every row that still has a NaN in any column.
# NOTE(review): read_csv's default NA parsing probably converts the string
# tokens already, which would make step 2 a no-op — confirm before removing.
# NOTE(review): this exact-row dedup runs before the text columns are
# normalized in later cells, so rows differing only in case or whitespace
# are not treated as duplicates here.
df = (
    df.drop_duplicates()
      .replace(['', 'NA', 'N/A', 'null', None], np.nan)
      .dropna()
)

## Standardize Payment Method

In [8]:
# Canonical spelling for each payment method once case and all spaces have
# been removed. Values not listed here pass through unchanged.
final_standard = {
    'creditcard': 'credit_card',
    'paypal': 'paypal',
    'cash': 'cash',
}

# Trim, lowercase and remove spaces so that variants such as
# " Credit Card " collapse onto one key, then map to the canonical label.
df['Payment_Method'] = (
    df['Payment_Method']
    .str.strip()
    .str.lower()
    .str.replace(' ', '', regex=False)
    .replace(final_standard)
)

## Clean Price and Quantity

In [9]:
# Price: remove the literal "$" sign, parse as float, and take the absolute
# value so negative amounts become positive.
df['Price'] = (
    df['Price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .astype(float)
    .abs()
)

# Quantity: only the sign is normalized; the column is otherwise left as-is.
df['Quantity'] = df['Quantity'].abs()

## Fix Product Names

In [10]:
from rapidfuzz import process

# Canonical product names; misspelled entries are snapped to the closest one.
correct_names = ['Tablet', 'Laptop', 'Coffee Machine', 'Smartphone', 'Headphones']


def fix_name(name, min_score=50):
    """Return the closest canonical product name, or ``name`` unchanged.

    Parameters
    ----------
    name : object
        Raw product name. Non-string values are returned as-is, because the
        fuzzy matcher only accepts text.
    min_score : float, default 50
        The best match is accepted only when its similarity score is strictly
        greater than this value (rapidfuzz scores range from 0 to 100).

    Returns
    -------
    object
        The best-matching entry of ``correct_names`` when it scores above
        ``min_score``; otherwise the original ``name``.
    """
    if not isinstance(name, str):
        return name
    match, score, _ = process.extractOne(name, correct_names)
    return match if score > min_score else name


# Fuzzy-match each distinct spelling once and broadcast the result with
# .map(), instead of re-running the matcher for every row.
name_fixes = {raw: fix_name(raw) for raw in df['Product_Name'].unique()}
df['Product_Name'] = df['Product_Name'].map(name_fixes)

# Title-case afterwards so names left unmatched still get consistent casing.
df['Product_Name'] = df['Product_Name'].str.title()

## Standardize Transaction Status

In [11]:
# Trim surrounding whitespace and title-case each status so that case and
# padding variants of the same status share one label.
df = df.assign(
    Transaction_Status=lambda frame: frame['Transaction_Status'].str.strip().str.title()
)

## Drop Duplicate Transaction IDs

In [12]:
# Keep only the first row seen for each Transaction_ID; later rows that
# repeat an ID are dropped.
df = df[~df.duplicated(subset='Transaction_ID', keep='first')]

## Product Counts

In [13]:
# Row count per product name, most frequent first, as a two-column frame
# with columns ['Product_Name', 'Count'].
product_counts = (
    df['Product_Name']
    .value_counts()
    .rename_axis('Product_Name')
    .reset_index(name='Count')
)
product_counts

Unnamed: 0,Product_Name,Count
0,Tablet,9151
1,Smartphone,8969
2,Coffee Machine,8949
3,Laptop,8938
4,Headphones,8898


## Save Cleaned Data

In [15]:
import os

output_path = '../data/processed/cleaned_financial_transactions.csv'

# Create the target folder if it is missing (e.g. on a fresh checkout);
# to_csv does not create missing directories.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# to_csv opens the file in write mode by default, so an export left over from
# a previous run is replaced without deleting it first.
df.to_csv(output_path, index=False)

## Data Shapes

In [17]:
# (rows, columns) of the raw frame as read from the CSV; `data` is never
# modified by the cleaning cells, which work on the copy `df`.
data.shape

(100000, 8)

In [18]:
# (rows, columns) of the cleaned frame after all steps above.
# NOTE(review): compare with data.shape to see how many rows the cleaning
# removed, and confirm that amount of loss is intended.
df.shape

(44905, 8)