In [1]:
import pandas as pd
import numpy as np
from faker import Faker
from sklearn.preprocessing import LabelEncoder

In [4]:
# Seed every random source used by the data-generation cells so that a fresh
# "Restart & Run All" reproduces the same synthetic customers.
fake = Faker()
Faker.seed(42)      # Faker draws from its own generator; seeding NumPy alone leaves IDs/cities random
np.random.seed(42)  # NumPy's global RNG drives all numeric columns

In [5]:
n_customers = 1000

# --- Demographic features ---
# Faker-generated fields come first (IDs, then cities, same relative order as
# before), followed by the NumPy draws (ages, genders, incomes, same order).
customer_ids = [fake.uuid4() for _unused in range(n_customers)]
locations = [fake.city() for _unused in range(n_customers)]

ages = np.random.randint(low=18, high=70, size=n_customers)  # `high` is exclusive
genders = np.random.choice(['Male', 'Female'], size=n_customers)
incomes = np.random.normal(loc=50000, scale=15000, size=n_customers).astype(int)

In [6]:
# --- Behavioural features ---
# Visit and page counts are uniform integers (`high` is exclusive).
website_visits = np.random.randint(low=1, high=100, size=n_customers)
pages_viewed = np.random.randint(low=1, high=20, size=n_customers)

# Total time on site: one exponential draw per customer, scaled by that
# customer's number of visits.
per_visit_duration = np.random.exponential(scale=5, size=n_customers)
time_on_site = per_visit_duration * website_visits

In [7]:
# Binary engagement flags: 1 = event happened, 0 = it did not.
binary_outcomes = [0, 1]
email_opened = np.random.choice(binary_outcomes, size=n_customers, p=[0.7, 0.3])  # 30% open
ad_clicked = np.random.choice(binary_outcomes, size=n_customers, p=[0.8, 0.2])    # 20% click

In [8]:
# Purchase indicator (40% of customers buy) and the amount spent.
purchase_made = np.random.choice([0, 1], size=n_customers, p=[0.6, 0.4])

# Draw a candidate amount for everyone, floor it at zero, round to cents, then
# zero it out for customers who did not purchase.
# NOTE(review): a buyer whose draw is clipped ends up with purchase_made == 1
# and purchase_amount == 0.00 -- confirm that is acceptable downstream.
candidate_amount = np.random.normal(loc=100, scale=50, size=n_customers)
candidate_amount = np.clip(candidate_amount, 0, None).round(2)
purchase_amount = purchase_made * candidate_amount

In [9]:
# Assemble one row per customer; column order follows the keyword order below.
df = pd.DataFrame(
    dict(
        # demographics
        customer_id=customer_ids,
        age=ages,
        gender=genders,
        income=incomes,
        location=locations,
        # on-site behaviour
        website_visits=website_visits,
        pages_viewed=pages_viewed,
        time_on_site=time_on_site.round(2),
        # engagement and outcome
        email_opened=email_opened,
        ad_clicked=ad_clicked,
        purchase_made=purchase_made,
        purchase_amount=purchase_amount,
    )
)

In [10]:
# Show the assembled customer table as this cell's output.
df

Unnamed: 0,customer_id,age,gender,income,location,website_visits,pages_viewed,time_on_site,email_opened,ad_clicked,purchase_made,purchase_amount
0,725a53c1-1f4c-4d87-8fff-251761ce9cbc,56,Male,49753,North Angela,11,6,53.62,1,0,0,0.00
1,1b1bcc8f-c319-4986-b1c3-cd5616430996,69,Male,67825,Port Dianehaven,49,11,61.72,0,1,0,0.00
2,c4f798f7-5218-40e4-bd63-262cbbbc8ec6,46,Male,87903,Lake Samuelbury,38,7,69.74,0,0,1,71.52
3,5cb0e533-b74d-4881-aec1-e3569ce1ede4,32,Female,42036,Port Melissashire,64,15,253.01,0,0,0,0.00
4,3b4333f9-67c2-4120-bca1-34864c8fa259,60,Male,42658,Lake Sydney,69,6,186.27,1,0,0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...
995,414d1511-f5b8-4348-9817-a5056b81437a,60,Female,61694,Port Lindafurt,35,18,45.05,0,0,1,67.84
996,3394d5a2-f625-4d98-9f01-0b0305f7ac39,64,Male,69654,South Stacey,80,17,544.88,0,0,1,134.95
997,18d65eae-1d1a-4854-97b9-4b4c65cd395d,62,Female,70935,Nathantown,2,9,25.16,1,0,0,0.00
998,5cf15372-eef3-40c1-968e-fd538b029ba1,35,Male,41567,Susanshire,24,9,66.15,0,0,1,94.12


In [13]:
import numpy as np
import pandas as pd

# TODO: wrap this generation step in a class parameterised by the number of customers.

# Fix NumPy's global seed so every run draws the same sample.
np.random.seed(42)

# Sample size
n_customers = 2000

# ---- Covariates (X) ----
# The order of the np.random calls determines the generated values, so keep it.
age = np.random.randint(low=18, high=70, size=n_customers)  # `high` is exclusive
income = np.random.normal(loc=60000, scale=20000, size=n_customers)
previous_purchases = np.random.choice(np.arange(6), size=n_customers)  # 0..5
gender = np.random.choice(['Male', 'Female'], size=n_customers)

# Numeric gender code: 'Male' -> 1, everything else -> 0
gender_encoded = (gender == 'Male').astype(int)

# ---- Treatment (T) ----
# Open probability is 1 / (1 + exp(z)).
# NOTE(review): only the age term of z is negated, so as written a higher income
# or more previous purchases LOWERS the open probability -- confirm the signs.
open_exponent = -0.05 * (age - 40) + 0.00002 * income + 0.4 * previous_purchases
email_open_prob = 1 / (1 + np.exp(open_exponent))
email_opened = np.random.binomial(n=1, p=email_open_prob)

# ---- Outcome (Y) ----
# Linear in the covariates, +200 when the email was opened, plus N(0, 100) noise.
deterministic_part = (300 + 2.5 * age + 0.05 * income + 50 * previous_purchases +
                      200 * email_opened)
purchase_amount = deterministic_part + np.random.normal(loc=0, scale=100, size=n_customers)

# ---- Final dataset ----
df = pd.DataFrame(
    dict(
        age=age,
        income=income,
        previous_purchases=previous_purchases,
        gender=gender_encoded,
        email_opened=email_opened,
        purchase_amount=purchase_amount,
    )
)

class NoiseGenerator:
    """Inject synthetic data-quality problems into columns of a DataFrame.

    Every method returns a new ``pandas.Series`` and leaves the input frame
    unmodified, so the result can be assigned to a new column
    (e.g. ``df['income_noisy'] = gen.add_gaussian_noise(df, 'income', 0, 5000)``).

    All methods except ``add_gaussian_noise`` are static, so they can be called
    either on the class (``NoiseGenerator.add_errors(df, ...)``) or on an instance.

    Random choices use NumPy's global RNG (``np.random``) or
    ``DataFrame.sample`` with its default random state.

    TODO: several ``magnitude`` parameters are really fractions of rows; a
    clearer name would help.
    """

    def __init__(self, n_customers):
        """Store the nominal number of customers.

        Args:
            n_customers: kept as ``self.n_customers`` for callers that read it;
                the methods size their output from the frame they are given.
        """
        self.n_customers = n_customers

    def add_gaussian_noise(self, df, col_name, noise_min, noise_max):
        """Return ``df[col_name]`` plus independent Gaussian noise.

        Args:
            df: source DataFrame.
            col_name: numeric column to perturb.
            noise_min: passed to ``np.random.normal`` as the MEAN of the noise
                (despite the name).
            noise_max: passed to ``np.random.normal`` as the STANDARD DEVIATION
                of the noise (despite the name).

        Returns:
            A new Series; one noise value is drawn per row of ``df``.
        """
        # Size the noise from the frame itself so the method also works when the
        # frame length differs from the n_customers given at construction.
        return df[col_name] + np.random.normal(noise_min, noise_max, size=len(df))

    @staticmethod
    def _flip_random_rows(column, fraction):
        """Return a copy of ``column`` with ``int(fraction * len)`` random rows set to ``1 - value``."""
        n_flips = int(fraction * len(column))
        chosen = np.random.choice(column.index, size=n_flips, replace=False)
        # mask() builds a new Series, so the caller's data is never written to.
        return column.mask(column.index.isin(chosen), 1 - column)

    @staticmethod
    def add_errors(df, col_name, magnitude):
        """Flip a random fraction of the values in a binary (0/1) column.

        Args:
            df: source DataFrame.
            col_name: integer column whose values are all 0 or 1.
            magnitude: fraction of rows to flip, between 0 and 1.

        Returns:
            A new Series with ``int(magnitude * len(df))`` values flipped.

        Raises:
            ValueError: if ``magnitude`` is outside [0, 1] or the column is not
                an integer column containing only 0/1.
        """
        if not (0 <= magnitude <= 1):
            raise ValueError("Magnitude must be a fraction between 0 and 1.")

        column = df[col_name]
        # Accept any integer width (the default int differs between platforms)
        # and columns that happen to contain only one of the two values.
        is_binary = pd.api.types.is_integer_dtype(column) and set(column.unique()) <= {0, 1}
        if not is_binary:
            raise ValueError("This function currently only supports binary (0/1) columns.")

        return NoiseGenerator._flip_random_rows(column, magnitude)

    @staticmethod
    def add_missing_values(df, col_name, magnitude):
        """Replace a random fraction of a column's values with NaN.

        Args:
            df: source DataFrame (not modified).
            col_name: column to punch holes into.
            magnitude: fraction of rows to blank out, between 0 and 1.

        Returns:
            A new Series with the sampled rows set to NaN.

        Raises:
            ValueError: if ``magnitude`` is outside [0, 1].
        """
        if not (0 <= magnitude <= 1):
            raise ValueError("Magnitude must be a fraction between 0 and 1.")

        missing_idx = df.sample(frac=magnitude).index
        # Build the result separately instead of assigning NaN into ``df``:
        # writing into the frame would destroy the caller's clean column.
        return df[col_name].mask(df.index.isin(missing_idx))

    @staticmethod
    def introduce_bias(df, pred_col, inf_col, threshold, magnitude):
        """Add a constant offset to ``pred_col`` for rows where ``inf_col > threshold``.

        Args:
            df: source DataFrame.
            pred_col: column that receives the offset.
            inf_col: column compared against ``threshold``.
            threshold: cut-off; must lie within the observed range of ``inf_col``.
            magnitude: amount added to the selected rows.

        Returns:
            A new Series; rows at or below the threshold are unchanged.

        Raises:
            ValueError: if ``threshold`` is outside ``[min, max]`` of ``inf_col``.
        """
        if not (df[inf_col].min() <= threshold <= df[inf_col].max()):
            raise ValueError(f"Threshold {threshold} is out of bounds for column {inf_col}.")

        column = df[pred_col]
        return column.mask(df[inf_col] > threshold, column + magnitude)

    @staticmethod
    def add_outliers(df, col_name, magnitude, size_multiplier):
        """Multiply a random fraction of a column's values by ``magnitude``.

        Args:
            df: source DataFrame.
            col_name: numeric column to distort.
            magnitude: factor applied to the selected rows.
            size_multiplier: fraction of rows to turn into outliers, between 0 and 1.

        Returns:
            A new Series with ``int(size_multiplier * len(df))`` rows scaled.

        Raises:
            ValueError: if ``size_multiplier`` is outside [0, 1].

        TODO: derive a sensible ``magnitude`` automatically?
        """
        if not (0 <= size_multiplier <= 1):
            raise ValueError("Size multiplier must be a fraction between 0 and 1.")

        outlier_indices = np.random.choice(df.index, size=int(size_multiplier * len(df)), replace=False)
        column = df[col_name]
        return column.mask(df.index.isin(outlier_indices), column * magnitude)

    @staticmethod
    def error_treatment_assignment(df, col_name, treatment_error_rate):
        """Flip a random fraction of a treatment indicator (``value -> 1 - value``).

        Unlike ``add_errors`` this performs no validation of the column or rate.

        Args:
            df: source DataFrame.
            col_name: treatment column, expected to hold 0/1 values.
            treatment_error_rate: fraction of rows to flip.

        Returns:
            A new Series with ``int(treatment_error_rate * len(df))`` rows flipped.
        """
        return NoiseGenerator._flip_random_rows(df[col_name], treatment_error_rate)

In [12]:
# Show the current contents of `df`.
# NOTE(review): this cell's execution count (In [12]) is lower than that of the
# cell defining `df` above (In [13]), and the stored output lists columns
# (income_noisy, gender_noisy, income_biased, email_opened_noisy) that no
# visible cell creates -- re-run from a fresh kernel to confirm what it shows.
df

Unnamed: 0,age,income,previous_purchases,gender,email_opened,purchase_amount,income_noisy,gender_noisy,income_biased,email_opened_noisy
0,56,47000.535763,4.0,1,0,3087.317895,60269.160140,1,57000.535763,0
1,69,78744.472431,3.0,0,0,4467.301725,84691.563853,0,88744.472431,0
2,46,56767.485091,2.0,1,1,3486.547355,52627.349195,1,56767.485091,1
3,32,33174.506580,5.0,0,0,2236.244046,36758.034991,0,33174.506580,0
4,60,31477.489347,2.0,1,1,2140.518046,24397.923699,1,41477.489347,1
...,...,...,...,...,...,...,...,...,...,...
1995,63,96443.378988,0.0,0,0,5128.867512,97460.930790,0,106443.378988,0
1996,67,27946.477103,4.0,0,0,2164.111602,30079.630553,0,37946.477103,0
1997,69,35688.040081,0.0,0,0,2302.001741,40015.833909,0,45688.040081,0
1998,24,60861.660160,3.0,0,0,3407.303869,58703.056750,1,60861.660160,0
