In [3]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Set seed for reproducibility
np.random.seed(42)

# Generate synthetic data
num_rows = 300

# Generate Transaction IDs and Customer IDs
transaction_ids = np.arange(1001, 1001 + num_rows)
customer_ids = np.random.randint(2001, 2500, num_rows)

# Generate Customer Names with more missing values
customer_names = np.random.choice(["Alice", "Bob", "Charlie", "David", "Emma", None], num_rows,
                                  p=[0.15, 0.15, 0.15, 0.15, 0.1, 0.3])  # More missing values

# Generate Purchase Amounts with outliers and text values
purchase_amounts = np.random.uniform(5, 500, num_rows).astype(str)  # Stored as strings initially
purchase_amounts[random.randint(0, num_rows-1)] = None  # Add missing value
purchase_amounts[random.randint(0, num_rows-1)] = "Ten Dollars"  # Add non-numeric value
purchase_amounts[random.randint(0, num_rows-1)] = "10000"  # Add extreme outlier

# Generate Transaction Dates with more inconsistencies
transaction_dates = [(datetime.today() - timedelta(days=random.randint(0, 365))).strftime(
    random.choice(["%Y-%m-%d", "%d/%m/%Y", "%m-%d-%Y", "%B %d, %Y"])) for _ in range(num_rows)]
transaction_dates[random.randint(0, num_rows-1)] = None  # Add missing value
transaction_dates[random.randint(0, num_rows-1)] = "January 32, 2023"  # Add incorrect date
transaction_dates[random.randint(0, num_rows-1)] = "2027-12-25"  # Add future date

# Generate Product Categories with typos
product_categories = np.random.choice(["Electronics", "Clothing", "Books", "Home & Kitchen", "Beauty",
                                       "Elctronics", "Clothng", "Hme & Kitchen"], num_rows)  # Some typos

# Generate Payment Methods with inconsistencies
payment_methods = np.random.choice(["Credit Card", "credit_card", "Paypal", "PayPal", "BANK_TRANSFER",
                                    "bank transfer", "banktransfer", "paypal "], num_rows)  # Some extra spaces and variations

# Introduce more duplicate transactions
duplicates = 10  # Number of duplicate rows to add
duplicate_indices = np.random.choice(num_rows, duplicates, replace=False)

# Create DataFrame
data = {
    "Transaction_ID": transaction_ids.tolist(),
    "Customer_ID": customer_ids.tolist(),
    "Customer Name": customer_names.tolist(),
    "Purchase Amount ($)": purchase_amounts,
    "Transaction Date": transaction_dates,
    "Product Category": product_categories.tolist(),
    "Payment Method": payment_methods.tolist()
}

df_messy = pd.DataFrame(data)

# Add duplicate rows
df_messy = pd.concat([df_messy, df_messy.iloc[duplicate_indices]], ignore_index=True)

# Save dataset
df_messy.to_csv("messy_transaction_data_v2.csv", index=False)

print("Dataset saved as 'messy_transaction_data_v2.csv'")


Dataset saved as 'messy_transaction_data_v2.csv'


In [4]:
df = pd.read_csv("messy_transaction_data_v2.csv")
df.head()
df.shape
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Transaction_ID       310 non-null    int64 
 1   Customer_ID          310 non-null    int64 
 2   Customer Name        218 non-null    object
 3   Purchase Amount ($)  309 non-null    object
 4   Transaction Date     309 non-null    object
 5   Product Category     310 non-null    object
 6   Payment Method       310 non-null    object
dtypes: int64(2), object(5)
memory usage: 17.1+ KB


In [5]:
df.columns

Index(['Transaction_ID', 'Customer_ID', 'Customer Name', 'Purchase Amount ($)',
       'Transaction Date', 'Product Category', 'Payment Method'],
      dtype='object')

# 1️⃣ Handling Missing Values

In [6]:
# Check for missing values in the dataset.
df.isnull().sum()

Unnamed: 0,0
Transaction_ID,0
Customer_ID,0
Customer Name,92
Purchase Amount ($),1
Transaction Date,1
Product Category,0
Payment Method,0


In [7]:
# Fill missing values in Customer Name with "Unknown".
df["Customer Name"].fillna("Unknown", inplace=True)
df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Customer Name"].fillna("Unknown", inplace=True)


Unnamed: 0,0
Transaction_ID,0
Customer_ID,0
Customer Name,0
Purchase Amount ($),1
Transaction Date,1
Product Category,0
Payment Method,0


In [31]:
def to_snake_case(column_name):
    snake_chars = []
    prev_underscore = False  # Track if previous character was an underscore

    for char in column_name:
        lower_char = char.lower()

        # Keep letters, numbers, and existing underscores
        if lower_char.isalnum() or lower_char == '_':
            if lower_char == '_':
                # Add underscore only if previous wasn't an underscore
                if not prev_underscore:
                    snake_chars.append('_')
                    prev_underscore = True
            else:
                snake_chars.append(lower_char)
                prev_underscore = False
        # Replace all other characters with underscores
        else:
            if not prev_underscore:
                snake_chars.append('_')
                prev_underscore = True

    # Join characters and clean up edges
    result = ''.join(snake_chars).strip('_')
    # Handle empty strings from names with only special characters
    return result if result else '_'

# Apply the conversion to all column names
df.columns = [to_snake_case(col) for col in df.columns]
df.columns

Index(['transaction_id', 'customer_id', 'customer_name', 'purchase_amount',
       'transaction_date', 'product_category', 'payment_method'],
      dtype='object')