## Imports

In [18]:
import pandas as pd
import numpy as np
import random


## Loading Raw Dataset

In [19]:
# Path of the raw CSV; it is relative, so it is resolved against the
# notebook's current working directory.
RAW_DATA_PATH = "raw_dataset.csv"

# `df` holds the raw data as loaded from disk.
df = pd.read_csv(RAW_DATA_PATH)

# Report (rows, columns) and preview the first rows as the cell output.
print("Loaded dataset:", df.shape)
df.head()

Loaded dataset: (10002, 14)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,3,15619304,Onio,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,4,15701354,Boni,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0


## Defining the Error Types

In [20]:
# Names of the corruptions that inject_errors can apply; one is drawn at
# random for every error injected into a selected row.
ERROR_TYPES = [
    "missing_value",      # a randomly chosen cell of the row is set to None
    "duplicate_row",      # a copy of the row is appended to the frame
    "duplicate_id",       # the row's ID is overwritten with the ID of a randomly chosen row
    "categorical_error",  # a cell is replaced with "INVALID", "UNKNOWN" or "###"
    "value_error",        # a numeric cell is replaced with the string "not_a_number"
    "format_error",       # a randomly chosen cell is replaced with an "ERR_<3 digits>" token
    "out_of_range",       # a numeric cell is replaced with -9999, 99999 or -42
]

# Fraction of the dataset's rows that inject_errors selects for corruption;
# every selected row receives between one and three errors.
ERROR_FRACTION = 0.25
print("Error types defined")


Error types defined


## Error Injection

In [21]:
def inject_errors(df: pd.DataFrame, error_fraction=None, error_types=None, seed=None) -> pd.DataFrame:
    """Return a corrupted copy of ``df`` with randomly injected data-quality errors.

    A share of the rows is selected at random and each selected row receives
    between one and three errors, each drawn at random from ``error_types``.
    The input frame is never modified.

    Parameters
    ----------
    df : pd.DataFrame
        Clean source data. Any index is accepted; rows are addressed by position.
    error_fraction : float, optional
        Fraction of rows to corrupt. Defaults to the module-level
        ``ERROR_FRACTION``. At least one row is selected for a non-empty
        frame, and never more rows than the frame contains.
    error_types : sequence of str, optional
        Error names to draw from. Defaults to the module-level ``ERROR_TYPES``.
        Names that are not recognised are ignored.
    seed : int, optional
        When given, a private ``random.Random(seed)`` drives every random
        choice, so the result is reproducible. When omitted, the module-level
        ``random`` generator is used (so ``random.seed(...)`` still applies).

    Returns
    -------
    pd.DataFrame
        An object-dtype frame with a fresh 0..n-1 index: the original rows
        (some of them corrupted) followed by any appended duplicate rows.
    """
    if error_fraction is None:
        error_fraction = ERROR_FRACTION
    if error_types is None:
        error_types = ERROR_TYPES
    error_types = list(error_types)

    rng = random if seed is None else random.Random(seed)

    # Classify the columns BEFORE casting to object: after the cast every
    # column reports dtype "object", which would make categorical errors land
    # on numeric columns as well.
    all_cols = list(df.columns)
    num_cols = list(df.select_dtypes(include="number").columns)
    cat_cols = list(df.select_dtypes(exclude="number").columns)

    # Work on an object-dtype copy with a positional index, so that sampled
    # row positions are valid labels and any cell can hold None or a string.
    df = df.reset_index(drop=True).astype(object)

    n_rows = len(df)
    if n_rows == 0 or not all_cols or not error_types:
        # Nothing to corrupt (or nothing to corrupt it with).
        return df

    n_modify = min(n_rows, max(1, int(n_rows * error_fraction)))
    rows_to_modify = rng.sample(range(n_rows), n_modify)

    # First column whose name contains "id" (case-insensitive).
    # NOTE(review): this is a loose substring match; names such as "Paid" or
    # "Width" would also qualify - confirm it suits the dataset at hand.
    id_col = next((col for col in all_cols if "id" in str(col).lower()), None)

    # Duplicates are collected and appended once at the end instead of growing
    # the frame row by row. Each snapshot is taken at injection time.
    pending_duplicates = []

    for idx in rows_to_modify:
        for _ in range(rng.randint(1, 3)):
            error_type = rng.choice(error_types)

            if error_type == "missing_value":
                df.at[idx, rng.choice(all_cols)] = None

            elif error_type == "duplicate_row":
                pending_duplicates.append(df.iloc[idx].tolist())

            elif error_type == "duplicate_id":
                if id_col is not None:
                    donor = rng.randint(0, n_rows - 1)
                    df.at[idx, id_col] = df.at[donor, id_col]

            elif error_type == "categorical_error":
                if cat_cols:
                    df.at[idx, rng.choice(cat_cols)] = rng.choice(["INVALID", "UNKNOWN", "###"])

            elif error_type == "value_error":
                if num_cols:
                    df.at[idx, rng.choice(num_cols)] = "not_a_number"

            elif error_type == "format_error":
                df.at[idx, rng.choice(all_cols)] = f"ERR_{rng.randint(100,999)}"

            elif error_type == "out_of_range":
                if num_cols:
                    df.at[idx, rng.choice(num_cols)] = rng.choice([-9999, 99999, -42])

    if pending_duplicates:
        duplicates = pd.DataFrame(pending_duplicates, columns=df.columns, dtype=object)
        df = pd.concat([df, duplicates], ignore_index=True)

    return df


## Applying Error Injection

In [22]:
# Seed Python's random generator before the stochastic injection so that
# Restart Kernel -> Run All reproduces the same corrupted dataset every time.
SEED = 42
random.seed(SEED)

df_with_errors = inject_errors(df)

# The row count can grow, because duplicate-row errors append rows.
print("Original shape:", df.shape)
print("Shape after injecting errors:", df_with_errors.shape)

df_with_errors.head()


Original shape: (10002, 14)
Shape after injecting errors: (10743, 14)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,3,15809838,Onio,502,France,Female,42.0,8,ERR_711,3,1.0,0.0,113931.57,1
3,4,15701354,Boni,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0


## Saving the New Dataset with errors

In [23]:
# Destination CSV for the corrupted dataset (relative to the working directory).
OUTPUT_PATH = "dataset_with_errors.csv"

# index=False leaves the DataFrame index out of the file, so only the data
# columns are written.
df_with_errors.to_csv(OUTPUT_PATH, index=False)
print("Saved dataset with injected errors", OUTPUT_PATH)


Saved dataset with injected errors dataset_with_errors.csv


## Checking the Injected Errors

In [24]:
# Quick sanity report on the corrupted frame: missing cells, fully duplicated
# rows and repeated IDs.
print("Checking for possible errors...\n")

print(" Missing values per column:")
display(df_with_errors.isnull().sum())

print("\n Duplicate rows:")
print(df_with_errors.duplicated().sum())

# Detect the ID column on the frame that is actually being checked rather
# than on the raw frame; str() keeps the check safe for non-string labels.
id_cols = [c for c in df_with_errors.columns if "id" in str(c).lower()]
if id_cols:
    id_col = id_cols[0]
    print(f"\n Duplicate IDs in '{id_col}':")
    # NOTE: duplicated() treats missing values as equal to one another, so
    # repeated missing IDs are included in this count.
    print(df_with_errors[id_col].duplicated().sum())
else:
    print("\n(No ID column detected for duplicate ID check)")


Checking for possible errors...

 Missing values per column:


RowNumber          44
CustomerId         45
Surname            51
CreditScore        65
Geography          58
Gender             53
Age                38
Tenure             50
Balance            53
NumOfProducts      57
HasCrCard          52
IsActiveMember     43
EstimatedSalary    55
Exited             49
dtype: int64


 Duplicate rows:
411

 Duplicate IDs in 'CustomerId':
1451
