In [54]:
import pandas as pd
import numpy as np
import re


In [31]:
raw_order_data = {
    'OrderID': [101, 102, 103, 104, 105, 106, 107, 108, 109, 110],
    'CustomerID': ['C001', 'C002', 'C003', 'C001', 'C004',
                   'C005', 'C006', 'C007', 'C008', 'C009'],
    'OrderDate': ['2023-01-15', '2023-02-20', '2023-03-10', '2023-01-10', '2023-04-05',
                  '2023-02-28', '2023-07-01', '2023-06-15', '2023-05-20', '2023-08-01'],
    'ProductCategory': ['Electronics', 'Books', 'Food', 'Electronics', 'Apparel',
                        'Books', 'Electronics', 'Food', 'InvalidCategory', 'Apparel'], 
    'Quantity': [1, 2, 0, 3, 1, 5, -2, 1, 4, 2], 
    'UnitPrice': [150.00, 25.50, 10.00, 150.00, 50.00, 25.50, 200.00, 12.00, 30.00, 45.00],
    'TotalPrice': [150.00, 51.00, 0.00, 450.00, 50.00, 127.50, -400.00, 12.00, 120.00, 45.00], 
    'PaymentStatus': ['Paid', 'Pending', 'Paid', 'Paid', 'Pending',
                      'Completed', 'Paid', 'Pending', 'Paid', 'Refunded'], 
    'DeliveryDate': ['2023-01-20', '2023-02-25', '2023-03-12', '2023-01-12', '2023-04-10',
                     '2023-02-27', '2023-06-25', '2023-06-10', '2023-05-25', '2023-07-25'] 
}

df_orders = pd.DataFrame(raw_order_data)

In [None]:
df_orders['OrderDate'] = pd.to_datetime(df_orders['OrderDate'])
df_orders['DeliveryDate'] = pd.to_datetime(df_orders['DeliveryDate'])

df_clean = df_orders.copy()

print("--- The Initial Case File ---")
print(df_clean.to_markdown(index=False))

--- The Initial Case File ---
|   OrderID | CustomerID   | OrderDate           | ProductCategory   |   Quantity |   UnitPrice |   TotalPrice | PaymentStatus   | DeliveryDate        |
|----------:|:-------------|:--------------------|:------------------|-----------:|------------:|-------------:|:----------------|:--------------------|
|       101 | C001         | 2023-01-15 00:00:00 | Electronics       |          1 |       150   |        150   | Paid            | 2023-01-20 00:00:00 |
|       102 | C002         | 2023-02-20 00:00:00 | Books             |          2 |        25.5 |         51   | Pending         | 2023-02-25 00:00:00 |
|       103 | C003         | 2023-03-10 00:00:00 | Food              |          0 |        10   |          0   | Paid            | 2023-03-12 00:00:00 |
|       104 | C001         | 2023-01-10 00:00:00 | Electronics       |          3 |       150   |        450   | Paid            | 2023-01-12 00:00:00 |
|       105 | C004         | 2023-04-05 00:00:00 | A

In [33]:
non_positive_quantity_mask = df_clean['Quantity'] <= 0
print("\n--- Lies Found: Non-Positive Quantities ---")
print(df_clean[non_positive_quantity_mask].to_markdown(index=False))


--- Lies Found: Non-Positive Quantities ---
|   OrderID | CustomerID   | OrderDate           | ProductCategory   |   Quantity |   UnitPrice |   TotalPrice | PaymentStatus   | DeliveryDate        |
|----------:|:-------------|:--------------------|:------------------|-----------:|------------:|-------------:|:----------------|:--------------------|
|       103 | C003         | 2023-03-10 00:00:00 | Food              |          0 |          10 |            0 | Paid            | 2023-03-12 00:00:00 |
|       107 | C006         | 2023-07-01 00:00:00 | Electronics       |         -2 |         200 |         -400 | Paid            | 2023-06-25 00:00:00 |


In [34]:
non_positive_total_price_mask = df_clean['TotalPrice'] <= 0
print("\n--- Lies Found: Non-Positive Total Prices ---")
print(df_clean[non_positive_total_price_mask].to_markdown(index=False))


--- Lies Found: Non-Positive Total Prices ---
|   OrderID | CustomerID   | OrderDate           | ProductCategory   |   Quantity |   UnitPrice |   TotalPrice | PaymentStatus   | DeliveryDate        |
|----------:|:-------------|:--------------------|:------------------|-----------:|------------:|-------------:|:----------------|:--------------------|
|       103 | C003         | 2023-03-10 00:00:00 | Food              |          0 |          10 |            0 | Paid            | 2023-03-12 00:00:00 |
|       107 | C006         | 2023-07-01 00:00:00 | Electronics       |         -2 |         200 |         -400 | Paid            | 2023-06-25 00:00:00 |


In [37]:
df_clean = df_orders.copy() 
df_clean.loc[df_clean['Quantity'] <= 0, 'Quantity'] = np.nan
df_clean.loc[df_clean['TotalPrice'] <= 0, 'TotalPrice'] = np.nan
print("\n--- Our Data After the Boundary Sentinel's Corrections ---")
print(df_clean.to_markdown(index=False))


--- Our Data After the Boundary Sentinel's Corrections ---
|   OrderID | CustomerID   | OrderDate           | ProductCategory   |   Quantity |   UnitPrice |   TotalPrice | PaymentStatus   | DeliveryDate        |
|----------:|:-------------|:--------------------|:------------------|-----------:|------------:|-------------:|:----------------|:--------------------|
|       101 | C001         | 2023-01-15 00:00:00 | Electronics       |          1 |       150   |        150   | Paid            | 2023-01-20 00:00:00 |
|       102 | C002         | 2023-02-20 00:00:00 | Books             |          2 |        25.5 |         51   | Pending         | 2023-02-25 00:00:00 |
|       103 | C003         | 2023-03-10 00:00:00 | Food              |        nan |        10   |        nan   | Paid            | 2023-03-12 00:00:00 |
|       104 | C001         | 2023-01-10 00:00:00 | Electronics       |          3 |       150   |        450   | Paid            | 2023-01-12 00:00:00 |
|       105 | C004    

In [38]:
valid_product_categories = ['Electronics', 'Books', 'Food', 'Apparel']
valid_payment_statuses = ['Paid', 'Pending', 'Refunded']

print("\n--- Our Official Product Categories Dictionary ---")
print(valid_product_categories)
print("\n--- Our Official Payment Status Dictionary ---")
print(valid_payment_statuses)


--- Our Official Product Categories Dictionary ---
['Electronics', 'Books', 'Food', 'Apparel']

--- Our Official Payment Status Dictionary ---
['Paid', 'Pending', 'Refunded']


In [None]:
invalid_category_mask = ~df_clean['ProductCategory'].\
                         isin(valid_product_categories)

print("\n--- ProductCategory entries not in our official dictionary ---")
print(df_clean[invalid_category_mask].to_markdown(index=False))


--- ProductCategory entries not in our official dictionary ---
|   OrderID | CustomerID   | OrderDate           | ProductCategory   |   Quantity |   UnitPrice |   TotalPrice | PaymentStatus   | DeliveryDate        |
|----------:|:-------------|:--------------------|:------------------|-----------:|------------:|-------------:|:----------------|:--------------------|
|       109 | C008         | 2023-05-20 00:00:00 | InvalidCategory   |          4 |          30 |          120 | Paid            | 2023-05-25 00:00:00 |


In [None]:
invalid_status_mask = ~df_clean['PaymentStatus'].\
                       isin(valid_payment_statuses)

print("\n--- PaymentStatus entries not in our official dictionary ---")
print(df_clean[invalid_status_mask].to_markdown(index=False))


--- PaymentStatus entries not in our official dictionary ---
|   OrderID | CustomerID   | OrderDate           | ProductCategory   |   Quantity |   UnitPrice |   TotalPrice | PaymentStatus   | DeliveryDate        |
|----------:|:-------------|:--------------------|:------------------|-----------:|------------:|-------------:|:----------------|:--------------------|
|       106 | C005         | 2023-02-28 00:00:00 | Books             |          5 |        25.5 |        127.5 | Completed       | 2023-02-27 00:00:00 |


In [42]:
df_clean.loc[df_clean['ProductCategory'] == 
            'InvalidCategory', 'ProductCategory'] = np.nan

In [None]:
df_clean['PaymentStatus'] = df_clean['PaymentStatus'].\
                            replace('Completed', 'Paid')

In [44]:
print("\n--- Our Data After the Interpreter's Corrections ---")
print(df_clean.to_markdown(index=False))
print("\n--- Final Payment Status Counts (Unified) ---")
print(df_clean['PaymentStatus'].value_counts().to_markdown())


--- Our Data After the Interpreter's Corrections ---
|   OrderID | CustomerID   | OrderDate           | ProductCategory   |   Quantity |   UnitPrice |   TotalPrice | PaymentStatus   | DeliveryDate        |
|----------:|:-------------|:--------------------|:------------------|-----------:|------------:|-------------:|:----------------|:--------------------|
|       101 | C001         | 2023-01-15 00:00:00 | Electronics       |          1 |       150   |        150   | Paid            | 2023-01-20 00:00:00 |
|       102 | C002         | 2023-02-20 00:00:00 | Books             |          2 |        25.5 |         51   | Pending         | 2023-02-25 00:00:00 |
|       103 | C003         | 2023-03-10 00:00:00 | Food              |        nan |        10   |        nan   | Paid            | 2023-03-12 00:00:00 |
|       104 | C001         | 2023-01-10 00:00:00 | Electronics       |          3 |       150   |        450   | Paid            | 2023-01-12 00:00:00 |
|       105 | C004         |

In [None]:
calculated_total_price = df_clean['Quantity'] * df_clean['UnitPrice']
is_consistent_total_price = np.isclose(df_clean['TotalPrice'],
                                       calculated_total_price, 
                                       equal_nan=True)

print("\n--- The Master Logician's Report "
      "(True = Consistent, False = Inconsistent) ---")
print(is_consistent_total_price)

inconsistent_total_price_rows = df_clean[~is_consistent_total_price]
print("\n--- The False Alibis: Rows with Inconsistent Totals ---")
print(inconsistent_total_price_rows.to_markdown(index=False))


--- The Master Logician's Report (True = Consistent, False = Inconsistent) ---
[ True  True  True  True  True  True  True  True  True False]

--- The False Alibis: Rows with Inconsistent Totals ---
|   OrderID | CustomerID   | OrderDate           | ProductCategory   |   Quantity |   UnitPrice |   TotalPrice | PaymentStatus   | DeliveryDate        |
|----------:|:-------------|:--------------------|:------------------|-----------:|------------:|-------------:|:----------------|:--------------------|
|       110 | C009         | 2023-08-01 00:00:00 | Apparel           |          2 |          45 |           45 | Refunded        | 2023-07-25 00:00:00 |


In [49]:
is_consistent_total_price

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
       False])

In [None]:
df_clean.loc[df_clean['OrderID'] == 110, 'TotalPrice'] = \
df_clean.loc[df_clean['OrderID'] == 110, 'Quantity'] * \
df_clean.loc[df_clean['OrderID'] == 110, 'UnitPrice']

print("\n--- Our Data After the Master Logician's Corrections ---")
print(df_clean.to_markdown(index=False))


--- Our Data After the Master Logician's Corrections (after 3.6.3) ---
|   OrderID | CustomerID   | OrderDate           | ProductCategory   |   Quantity |   UnitPrice |   TotalPrice | PaymentStatus   | DeliveryDate        |
|----------:|:-------------|:--------------------|:------------------|-----------:|------------:|-------------:|:----------------|:--------------------|
|       101 | C001         | 2023-01-15 00:00:00 | Electronics       |          1 |       150   |        150   | Paid            | 2023-01-20 00:00:00 |
|       102 | C002         | 2023-02-20 00:00:00 | Books             |          2 |        25.5 |         51   | Pending         | 2023-02-25 00:00:00 |
|       103 | C003         | 2023-03-10 00:00:00 | Food              |        nan |        10   |        nan   | Paid            | 2023-03-12 00:00:00 |
|       104 | C001         | 2023-01-10 00:00:00 | Electronics       |          3 |       150   |        450   | Paid            | 2023-01-12 00:00:00 |
|       10

In [51]:
time_paradox_mask = df_clean['DeliveryDate'] < df_clean['OrderDate']

print("\n--- The Timekeeper's Report: Rows with Paradoxical Dates ---")
print(df_clean[time_paradox_mask].to_markdown(index=False))


--- The Timekeeper's Report: Rows with Paradoxical Dates ---
|   OrderID | CustomerID   | OrderDate           | ProductCategory   |   Quantity |   UnitPrice |   TotalPrice | PaymentStatus   | DeliveryDate        |
|----------:|:-------------|:--------------------|:------------------|-----------:|------------:|-------------:|:----------------|:--------------------|
|       106 | C005         | 2023-02-28 00:00:00 | Books             |          5 |        25.5 |        127.5 | Paid            | 2023-02-27 00:00:00 |
|       107 | C006         | 2023-07-01 00:00:00 | Electronics       |        nan |       200   |        nan   | Paid            | 2023-06-25 00:00:00 |
|       108 | C007         | 2023-06-15 00:00:00 | Food              |          1 |        12   |         12   | Pending         | 2023-06-10 00:00:00 |
|       110 | C009         | 2023-08-01 00:00:00 | Apparel           |          2 |        45   |         90   | Refunded        | 2023-07-25 00:00:00 |


In [None]:
df_clean.loc[time_paradox_mask, 'DeliveryDate'] = np.nan
print("\n--- Our Data After the Timekeeper's Corrections ---")
print(df_clean.to_markdown(index=False))


--- Our Data After the Timekeeper's Corrections (after 3.6.4) ---
|   OrderID | CustomerID   | OrderDate           | ProductCategory   |   Quantity |   UnitPrice |   TotalPrice | PaymentStatus   | DeliveryDate        |
|----------:|:-------------|:--------------------|:------------------|-----------:|------------:|-------------:|:----------------|:--------------------|
|       101 | C001         | 2023-01-15 00:00:00 | Electronics       |          1 |       150   |        150   | Paid            | 2023-01-20 00:00:00 |
|       102 | C002         | 2023-02-20 00:00:00 | Books             |          2 |        25.5 |         51   | Pending         | 2023-02-25 00:00:00 |
|       103 | C003         | 2023-03-10 00:00:00 | Food              |        nan |        10   |        nan   | Paid            | 2023-03-12 00:00:00 |
|       104 | C001         | 2023-01-10 00:00:00 | Electronics       |          3 |       150   |        450   | Paid            | 2023-01-12 00:00:00 |
|       105 | C

In [55]:
email_data = {
    'CustomerID': ['C1001', 'C1002', 'C1003', 'C1004', 'C1005'],
    'EmailAddress': ['user1@email.com', 'user2@domain',
                     'invalid-email.com', 'user_three@web.org', 
                     'user4@email@corp.net']
}
df_emails = pd.DataFrame(email_data)

print("\n--- The Unverified Email List ---")
print(df_emails.to_markdown(index=False))

email_pattern = r'^\S+@\S+\.\S+$'
is_valid_email = df_emails['EmailAddress'].str.match(email_pattern)

print("\n--- The Validator's Report (True = Valid, False = Invalid) ---")
print(is_valid_email.to_markdown())

invalid_emails_found = df_emails[~is_valid_email]
print("\n--- The Malformed Entries ---")
print(invalid_emails_found.to_markdown(index=False))


--- The Unverified Email List ---
| CustomerID   | EmailAddress         |
|:-------------|:---------------------|
| C1001        | user1@email.com      |
| C1002        | user2@domain         |
| C1003        | invalid-email.com    |
| C1004        | user_three@web.org   |
| C1005        | user4@email@corp.net |

--- The Validator's Report (True = Valid, False = Invalid) ---
|    |   EmailAddress |
|---:|---------------:|
|  0 |              1 |
|  1 |              0 |
|  2 |              0 |
|  3 |              1 |
|  4 |              1 |

--- The Malformed Entries ---
| CustomerID   | EmailAddress      |
|:-------------|:------------------|
| C1002        | user2@domain      |
| C1003        | invalid-email.com |


In [56]:
email_pattern = r'^\S+@\S+\.\S+$'
is_valid_email = df_emails['EmailAddress'].str.match(email_pattern)

print("\n--- The Validator's Report (True = Valid, False = Invalid) ---")
print(is_valid_email.to_markdown())



--- The Validator's Report (True = Valid, False = Invalid) ---
|    |   EmailAddress |
|---:|---------------:|
|  0 |              1 |
|  1 |              0 |
|  2 |              0 |
|  3 |              1 |
|  4 |              1 |


In [57]:
invalid_emails_found = df_emails[~is_valid_email]
print("\n--- The Malformed Entries ---")
print(invalid_emails_found.to_markdown(index=False))


--- The Malformed Entries ---
| CustomerID   | EmailAddress      |
|:-------------|:------------------|
| C1002        | user2@domain      |
| C1003        | invalid-email.com |


In [58]:
df_emails_clean = df_emails.copy()
invalid_email_mask_to_apply = ~df_emails_clean['EmailAddress'].str.match(email_pattern)
df_emails_clean.loc[invalid_email_mask_to_apply, 'EmailAddress'] = np.nan
print("\n--- Our Data After the Format Enforcer's Corrections (after 3.6.5) ---")
print(df_emails_clean.to_markdown(index=False))


--- Our Data After the Format Enforcer's Corrections (after 3.6.5) ---
| CustomerID   | EmailAddress         |
|:-------------|:---------------------|
| C1001        | user1@email.com      |
| C1002        | nan                  |
| C1003        | nan                  |
| C1004        | user_three@web.org   |
| C1005        | user4@email@corp.net |


In [60]:
df_emails_clean = df_emails.copy()

invalid_email_mask_to_apply = \
~df_emails_clean['EmailAddress'].str.match(email_pattern)

df_emails_clean.loc[invalid_email_mask_to_apply, 'EmailAddress'] = np.nan
print("\n--- Our Data After the Format Enforcer's Corrections ---")
print(df_emails_clean.to_markdown(index=False))


--- Our Data After the Format Enforcer's Corrections ---
| CustomerID   | EmailAddress         |
|:-------------|:---------------------|
| C1001        | user1@email.com      |
| C1002        | nan                  |
| C1003        | nan                  |
| C1004        | user_three@web.org   |
| C1005        | user4@email@corp.net |
