B. Model Based Synthesis

In [13]:
pip install Faker

Note: you may need to restart the kernel to use updated packages.


In [14]:
import pandas as pd
from faker import Faker
import random
import numpy as np

fake = Faker()

def generate_country():
    countries = ['USA', 'India', 'China', 'Canada', 'South Korea', 'Philippines', 'Taiwan', 'Mexico']
    country_distribution = [0.55, 0.745, 0.118, 0.01, 0.009, 0.006, 0.006, 0.002]
    return random.choices(countries, weights=country_distribution, k=1)[0]

def generate_department():
    departments = ['Product', 'Human Resource', 'Legal', 'Marketing', 'Administrative', 'Operations', 'Sales', 'Finance', 'I/T']
    department_distribution = [0.20, 0.10, 0.05, 0.10, 0.10, 0.20, 0.10, 0.05, 0.10]
    return random.choices(departments, weights=department_distribution, k=1)[0]

def generate_salary(department):
    salary_ranges = {
        'Product': (7000, 14000),
        'Human Resource': (5000, 10000),
        'Legal': (8000, 15000),
        'Marketing': (6000, 12000),
        'Administrative': (4000, 8000),
        'Operations': (5000, 10000),
        'Sales': (6000, 12000),
        'Finance': (7000, 14000),
        'I/T': (7000, 14000)
    }
    return random.randint(*salary_ranges[department])

num_employees = 10000
data = []

for _ in range(num_employees):
    gender = random.choices(['male', 'female'], k=1)[0]
    first_name = fake.first_name_male() if gender == 'male' else fake.first_name_female()
    last_name = fake.last_name()
    email = fake.email()
    phone = fake.phone_number()
    age = random.randint(22, 65)
    job_title = fake.job()
    years_of_experience = random.randint(0, 15)
    department = generate_department()
    salary = generate_salary(department)
    ssn = fake.ssn()
    languages = random.randint(0, 2)
    country = generate_country()
    data.append([first_name, last_name, email, phone, gender, age, job_title, years_of_experience, salary, department, ssn, languages, country])

columns = ['First Name', 'Last Name', 'Email', 'Phone', 'Gender', 'Age', 'Job Title', 'Years Of Experience', 'Salary', 'Department', 'SSN', 'Languages', 'Country']
df_synthetic = pd.DataFrame(data, columns=columns)
#synthetic data
df_synthetic.to_csv('synthetic_employees.csv', index=False)

C. Analyze the Synthetic Company

In [15]:
import pandas as pd

#Load the data
df = pd.read_csv("synthetic_employees.csv")
gender = df.groupby(['Department', 'Gender']).size().unstack(fill_value=0)

print("Men vs. Women in each department:")
print(gender)

Men vs. Women in each department:
Gender          female  male
Department                  
Administrative     461   468
Finance            255   258
Human Resource     526   547
I/T                508   486
Legal              251   243
Marketing          538   487
Operations         935  1020
Product           1042  1006
Sales              481   488


In [16]:
payroll = df['Salary'].sum()
print("Yearly payroll:")
print(payroll)

Yearly payroll:
89268054


In [17]:
employees = len(df)
space = 20 
totalspace = employees * space

print("Total office space required (square feet):")
print(totalspace)

Total office space required (square feet):
200000


In [18]:
df_original = pd.read_csv("C:/Users/sriva/Downloads/employees.csv")

overlap_columns = ['First Name', 'Last Name', 'Email', 'Phone']
overlaps = pd.merge(df_original, df, on=overlap_columns, how='inner')

print("Number of overlapping records between datasets:")
print(len(overlaps))


Number of overlapping records between datasets:
0


D. Quality of the Synthetic Dataset

In [19]:
pip install ydata-profiling

Note: you may need to restart the kernel to use updated packages.


In [20]:
import pandas as pd
from ydata_profiling import ProfileReport

df_existing = pd.read_csv("C:/Users/sriva/Downloads/employees.csv")
df_synthetic = pd.read_csv("synthetic_employees.csv")

profile_existing = ProfileReport(df_existing, title="Original Employees Dataset Profile", explorative=True, correlations={"auto": {"calculate": False}})
profile_synthetic = ProfileReport(df_synthetic, title="Synthetic Employees Dataset Profile", explorative=True, correlations={"auto": {"calculate": False}})

#Reports
profile_existing.to_file("original_employees_profile.html")
profile_synthetic.to_file("synthetic_employees_profile.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

E. Sampling

In [21]:
df_synthetic = pd.read_csv("synthetic_employees.csv")

def calculate_weights(df, age_column):
    weights = pd.Series([1] * len(df), index=df.index)
    weights[df[age_column].between(40, 49)] = 3
    weights = weights / weights.sum()
    
    return weights
    
weights = calculate_weights(df_synthetic, 'Age')

sampled_df = df_synthetic.sample(n=20, weights=weights, random_state=1)

print(sampled_df)

     First Name  Last Name                        Email  \
4180      Tonya      Brown        dyersusan@example.com   
7219   Kimberly     Taylor       clarence22@example.com   
0        Ronald     Bowers          emiller@example.com   
3046    Matthew      Price         amanda68@example.com   
1486      Jason     Rogers        shannon38@example.org   
947     Matthew  Cervantes  jocelynerickson@example.org   
1862       John    Clayton        cherylliu@example.org   
3490       Todd     Rangel          khoover@example.com   
3983     Brandy    Maxwell         ojohnson@example.org   
5392     Pamela       Pope         eugene35@example.com   
4204   Victoria     Miller           vsmith@example.net   
6864    Melanie    Griffin          daisy59@example.org   
2054    Ricardo     Hudson        suzanne37@example.net   
8774       John       Carr       nicholas06@example.com   
286    Danielle   Williams            joy50@example.net   
6719       John   Gonzalez          james23@example.com 

F. Anonymization

In [22]:

fake = Faker()

#Load the data
df_existing = pd.read_csv('C:/Users/sriva/Downloads/employees.csv')

def anonymize_data(df):
    for i in range(len(df)):
        df.at[i, 'First Name'] = f'Fistname_{i}'
        df.at[i, 'Last Name'] = f'Lastname_{i}'
        df.at[i, 'Email'] = f'FistnameLastname{i}@example.com'
        df.at[i, 'Phone'] = fake.phone_number()
    return df

df_anonymized = anonymize_data(df_existing)
print(df_anonymized.head())

   First Name   Last Name                          Email  \
0  Fistname_0  Lastname_0  FistnameLastname0@example.com   
1  Fistname_1  Lastname_1  FistnameLastname1@example.com   
2  Fistname_2  Lastname_2  FistnameLastname2@example.com   
3  Fistname_3  Lastname_3  FistnameLastname3@example.com   
4  Fistname_4  Lastname_4  FistnameLastname4@example.com   

                   Phone  Gender  Age                  Job Title  \
0           509.909.8807    male   25            Project Manager   
1           622-432-8058  female   26  Machine Learning Engineer   
2     (738)901-0459x7999    male   37            Project Manager   
3          (821)529-0396  female   31              Web Developer   
4  +1-920-512-0556x31639    male   35            Project Manager   

   Years Of Experience  Salary Department  
0                    1    8500    Product  
1                    2    7000    Product  
2                   14   17000    Product  
3                    8   10000    Product  
4         

G. Perturbation

In [23]:
print(df_existing['Age'].max())
print(df_existing['Age'].min())
print(0.25 * df_existing['Salary'].mean())

41
23
2543.75


In [24]:
def perturb_data(df):
    age_std =  0.15 * (df['Age'].max() - df['Age'].min())  
    salary_std = 0.25 * df['Salary'].mean() 
    experience_std = 0.1 * (df['Years Of Experience'].max() - df['Years Of Experience'].min())  
    
    df['Age'] = round(df['Age'] + np.random.normal(0, age_std, df['Age'].shape))
    df['Salary'] = df['Salary'] + np.random.normal(0, salary_std, df['Salary'].shape)
    df['Years Of Experience'] = df['Years Of Experience'] + np.random.normal(0, experience_std, df['Years Of Experience'].shape)
    
    # Ensuring no negative values for age, salary, and years of experience
    df['Age'] = df['Age'].clip(lower=0)
    df['Salary'] = df['Salary'].clip(lower=0)
    df['Years Of Experience'] = df['Years Of Experience'].clip(lower=0)
    
    return df

df_perturbed = perturb_data(df_existing)
print(df_perturbed.head())

   First Name   Last Name                          Email  \
0  Fistname_0  Lastname_0  FistnameLastname0@example.com   
1  Fistname_1  Lastname_1  FistnameLastname1@example.com   
2  Fistname_2  Lastname_2  FistnameLastname2@example.com   
3  Fistname_3  Lastname_3  FistnameLastname3@example.com   
4  Fistname_4  Lastname_4  FistnameLastname4@example.com   

                   Phone  Gender   Age                  Job Title  \
0           509.909.8807    male  27.0            Project Manager   
1           622-432-8058  female  28.0  Machine Learning Engineer   
2     (738)901-0459x7999    male  41.0            Project Manager   
3          (821)529-0396  female  36.0              Web Developer   
4  +1-920-512-0556x31639    male  35.0            Project Manager   

   Years Of Experience        Salary Department  
0             1.634338   6088.941547    Product  
1             0.381747   4574.671011    Product  
2            14.051242  14869.313852    Product  
3            10.357624  