In [1]:
from faker import Faker
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Create list of locales to account for the diversity of names in Berlin in faker (not representative)
# Restriction to Latin alphabet for simplicity (note that this leads to underrepresentation)
# Create an object for the Faker library 
locales = [
    'de_DE',   # German (Germany)
    'tr_TR',   # Turkish (Turkey)
    'en_GB',   # English (United Kingdom)
    'pl_PL',   # Polish (Poland)
    'it_IT',   # Italian (Italy)
    'fr_FR',   # French (France)
    'es_ES',   # Spanish (Spain)
    'pt_PT'    # Portuguese (Portugal)
]
# Representation of different locales for generating representative name set (guess)
probabilities = [0.6, 0.1, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05]

# Initialize Faker with custom probabilities
fake = Faker(random_choices=(locales, probabilities))
fake = Faker(locales)

In [3]:
users = 10000

In [4]:
# Initalize lists to store the data
First_Name = []
EMail = []
Age = []
Gender = []
Location_Preferences = []
Latitude = []
Longitude = []
Country = []
City = []
Languages_Preferred = []
Last_Seen = []
Response_Time = []

In [5]:
# Create realistic e-mail providers including German providers
email_providers = [
    'gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'aol.com',
    'icloud.com', 'zoho.com', 'protonmail.com', 'mail.com', 'gmx.de',
    'web.de', 't-online.de', 'freenet.de', '1und1.de', 'arcor.de', 'unitybox.de'
]


In [6]:
# Generate 10000 fake names and corresponding email addresses
for _ in range(users):
    first_names = fake.first_name()
    last_names = fake.last_name()
    email = f"{first_names.lower()}.{last_names.lower()}@{fake.random_element(email_providers)}"
    First_Name.append(first_names)
    EMail.append(email)

In [7]:
# Generate "Ages" 
# Filter out values below 20
mean = 27
std_dev = 2
min_value = 20
max_value = 65

while len(Age) < users:
    age = np.random.normal(mean, std_dev)
    if min_value <= age <= max_value and not np.isnan(age):
        Age.append(int(age))

Age = [age for age in Age if age >= 20]



In [8]:
# Generate "Genders"
# Randomly choose a gender from "Male", "Female", "Diverse" and "Prefer not to say"

gender_probabilities = {
    "Male": 0.4,
    "Female": 0.4,
    "Diverse": 0.1,
    "Prefer not to say": 0.1
}

count = 0
while count < users:
    genders = random.choices(list(gender_probabilities.keys()), weights=list(gender_probabilities.values()))[0]
    Gender.append(genders)
    count += 1
    if count >= users:
        break

Gender = Gender[:users]

In [9]:
# Generate "Location Preferences"

import random

# Initialize lists
Location_Preferences = []

# Define probabilities for each preference
preference_probabilities = {
    "Local Only": 0.1,
    "Remote Only": 0.3,
    "Local or Remote": 0.6
}

# Generate Location Preferences
count = 0
while count < users:
    preference = random.choices(list(preference_probabilities.keys()), weights=list(preference_probabilities.values()))[0]
    Location_Preferences.append(preference)
    count += 1
    if count >= users:
        break

Location_Preferences = Location_Preferences[:users]


In [10]:
# Generate "Coordinates" within Berlin
# Berlin Borders
# West: 13.0883
# East: 13.7612
# North: 52.6755
# South: 52.3385

# Define function to generate random coordinates within Berlin
# with random latitude and longitude

def generate_random_coordinates():
    min_longitude = 13.0883
    max_longitude = 13.7612
    min_latitude = 52.3385
    max_latitude = 52.6755
    
    latitude = random.uniform(min_latitude, max_latitude)
    longitude = random.uniform(min_longitude, max_longitude)
    
    return latitude, longitude

In [11]:
# Generate 10000 random coordinates in Berlin
for _ in range(users):
    latitude, longitude = generate_random_coordinates()
    Latitude.append(latitude)
    Longitude.append(longitude)
    
print(Latitude)
print(Longitude)

[52.51369916690988, 52.480157889703065, 52.663405481808645, 52.46516204703193, 52.65236113862064, 52.359695023241926, 52.53928478168529, 52.57619986772932, 52.37774554121296, 52.59673960786359, 52.62945992083893, 52.38574905318406, 52.483417416917376, 52.39601132813911, 52.55565940099276, 52.54279750328811, 52.556073498398455, 52.36909840193953, 52.37018969391953, 52.39554384241533, 52.509710256311806, 52.49434413205456, 52.34898707624792, 52.64416284511664, 52.34356659142475, 52.51083910818051, 52.633625857703954, 52.42738349957502, 52.505905048807115, 52.65588274568932, 52.575307357261934, 52.34050675390359, 52.588499703598195, 52.543830086266276, 52.46862731766062, 52.450318439781064, 52.53657553756542, 52.445802524777186, 52.46205103256904, 52.4489430988837, 52.46614847333792, 52.434876745151485, 52.400426788056436, 52.61517562467228, 52.48251269623344, 52.53697758080599, 52.5468081079848, 52.57964181157196, 52.55006920356583, 52.4866199298112, 52.36438623192661, 52.606811176621726

In [12]:

# Generate "Country" and "City" (Only Germany and Berlin)
for _ in range(users):
    Country.append("Germany")
    City.append("Berlin")

In [13]:
# Generate "preferred languages" while you can't select more than 5 languages
# Some of the most spoken languages globally and in Europe
# Generate a list of languages spoken
# Take random number of languages between 1 and 5
# Generate languages spoken for 10000 individuals

most_spoken_languages = [
    "Mandarin Chinese",
    "Spanish",
    "English",
    "Hindi",
    "Arabic",
    "Bengali",
    "Portuguese",
    "Russian",
    "Japanese",
    "Punjabi",
    "German",
    "French",
    "Italian",
    "Polish",
    "Ukrainian",
    "Romanian",
    "Turkish",
]

def generate_languages_spoken():
    num_languages = random.randint(1, 5)  
    languages_spoken = random.sample(most_spoken_languages, min(num_languages, len(most_spoken_languages)))
    return languages_spoken

for _ in range(users):
    Languages_Preferred.append(generate_languages_spoken())


In [14]:
# Create "Last Seen" Data
# Define possible answers
# Define probabilities for each answer
answer_probabilities = {
    "Recently": 0.1,
    "Today": 0.1,
    "Yesterday": 0.1,
    "Within a Week": 0.5,  
    "Within a Month": 0.15,
    "Over a Month Ago": 0.05
}

count = 0
while count < users:
    answer = random.choices(list(answer_probabilities.keys()), weights=list(answer_probabilities.values()))[0]
    Last_Seen.append(answer)
    count += 1
    if count >= users:
        break
    
Last_Seen = Last_Seen[:users]


In [15]:
# Create "Response Time" Data
# Define possible answers
# Generate the list "Response Time"

answer_probabilities = {
    "Within 24 hours": 0.2,
    "Within a week": 0.5,  
    "Within a month": 0.2,
    "More than a month": 0.1
}

count = 0
while count < users:
    answer = random.choices(list(answer_probabilities.keys()), weights=list(answer_probabilities.values()))[0]
    Response_Time.append(answer)
    count += 1
    if count >= users:
        break

Response_Time = Response_Time[:users]



In [18]:
# Create Data frame

table = {
    'First_Name': First_Name,
    'EMail': EMail,
    'Age': Age,
    'Gender': Gender,
    'Location_Preferences' : Location_Preferences,
    'Latitude' : Latitude,
    'Longitude' : Longitude,
    'Country': Country,
    'City': City,
    'Languages_Preferred': Languages_Preferred,
    'Last_Seen': Last_Seen,
    'Response_Time': Response_Time
}

Personal_Info = pd.DataFrame(table)
Personal_Info.head(10)

Unnamed: 0,First_Name,EMail,Age,Gender,Location_Preferences,Latitude,Longitude,Country,City,Languages_Preferred,Last_Seen,Response_Time
0,Maurice,maurice.zengin@hotmail.com,24,Prefer not to say,Local or Remote,52.513699,13.212129,Germany,Berlin,[Ukrainian],Within a Week,Within a week
1,Donatello,donatello.holt@icloud.com,30,Male,Local or Remote,52.480158,13.756927,Germany,Berlin,"[Ukrainian, German, Portuguese, Punjabi, Arabic]",Recently,Within a week
2,Gudula,gudula.lima@protonmail.com,27,Female,Remote Only,52.663405,13.213178,Germany,Berlin,"[English, German, Polish]",Within a Week,Within a week
3,Sunay,sunay.stey@unitybox.de,24,Male,Remote Only,52.465162,13.479279,Germany,Berlin,"[Turkish, Bengali, Mandarin Chinese, Polish]",Within a Week,Within a week
4,Alexandros,alexandros.eberth@protonmail.com,28,Male,Remote Only,52.652361,13.144213,Germany,Berlin,"[Japanese, Romanian]",Over a Month Ago,Within a week
5,Tomás,tomás.begum@aol.com,25,Male,Local or Remote,52.359695,13.505728,Germany,Berlin,"[English, Japanese, Romanian]",Within a Week,Within a week
6,Ellie,ellie.turnbull@yahoo.com,27,Diverse,Local or Remote,52.539285,13.30544,Germany,Berlin,"[German, French]",Within a Week,Within 24 hours
7,Donatella,donatella.rodrigues@protonmail.com,25,Female,Local or Remote,52.5762,13.321686,Germany,Berlin,"[Italian, Bengali, German, Punjabi]",Today,Within a week
8,Justin,justin.arsoy@hotmail.com,22,Female,Local or Remote,52.377746,13.565305,Germany,Berlin,"[Punjabi, Polish, English, Arabic]",Within a Week,More than a month
9,Gülcegün,gülcegün.llopis@unitybox.de,25,Female,Local or Remote,52.59674,13.494636,Germany,Berlin,"[Russian, Japanese]",Within a Month,Within a month


In [19]:
Personal_Info.to_csv('Personal_Info.csv', index=False)