In [1]:
from faker import Faker
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Create list of locales to account for the diversity of names in Berlin in faker (not representative)
# Restriction to Latin alphabet for simplicity (note that this leads to underrepresentation)
# Create an object for the Faker library 
locales = [
    'de_DE',   # German (Germany)
    'tr_TR',   # Turkish (Turkey)
    'en_GB',   # English (United Kingdom)
    'pl_PL',   # Polish (Poland)
    'it_IT',   # Italian (Italy)
    'fr_FR',   # French (France)
    'es_ES',   # Spanish (Spain)
    'pt_PT'    # Portuguese (Portugal)
]
# Representation of different locales for generating representative name set (guess)
probabilities = [0.6, 0.1, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05]

# Initialize Faker with custom probabilities
fake = Faker(random_choices=(locales, probabilities))
fake = Faker(locales)

In [3]:
users = 800

In [4]:
# Initalize lists to store the data
First_Name = []
EMail = []
Age = []
Gender = []
Location_Preferences = []
Latitude = []
Longitude = []
Country = []
City = []
Languages_Preferred = []
Last_Seen = []
Response_Time = []

In [5]:
# Create realistic e-mail providers including German providers
email_providers = [
    'gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'aol.com',
    'icloud.com', 'zoho.com', 'protonmail.com', 'mail.com', 'gmx.de',
    'web.de', 't-online.de', 'freenet.de', '1und1.de', 'arcor.de', 'unitybox.de'
]


In [6]:
# Generate 800 fake names and corresponding email addresses
for _ in range(users):
    first_names = fake.first_name()
    last_names = fake.last_name()
    email = f"{first_names.lower()}.{last_names.lower()}@{fake.random_element(email_providers)}"
    First_Name.append(first_names)
    EMail.append(email)

In [7]:
# Generate "Ages" 
# Filter out values below 20
mean = 27
std_dev = 2
min_value = 20
max_value = 65

while len(Age) < users:
    age = np.random.normal(mean, std_dev)
    if min_value <= age <= max_value and not np.isnan(age):
        Age.append(int(age))

Age = [age for age in Age if age >= 20]



In [8]:
# Generate "Genders"
# Randomly choose a gender from "Male", "Female", "Diverse" and "Prefer not to say"

gender_probabilities = {
    "Male": 0.4,
    "Female": 0.4,
    "Diverse": 0.1,
    "Prefer not to say": 0.1
}

count = 0
while count < users:
    genders = random.choices(list(gender_probabilities.keys()), weights=list(gender_probabilities.values()))[0]
    Gender.append(genders)
    count += 1
    if count >= users:
        break

Gender = Gender[:users]

In [9]:
# Generate "Location Preferences"

import random

# Initialize lists
Location_Preferences = []

# Define probabilities for each preference
preference_probabilities = {
    "Local Only": 0.1,
    "Remote Only": 0.3,
    "Local or Remote": 0.6
}

# Generate Location Preferences
count = 0
while count < users:
    preference = random.choices(list(preference_probabilities.keys()), weights=list(preference_probabilities.values()))[0]
    Location_Preferences.append(preference)
    count += 1
    if count >= users:
        break

Location_Preferences = Location_Preferences[:users]


In [10]:
# Generate "Coordinates" within Berlin
# Berlin Borders
# West: 13.0883
# East: 13.7612
# North: 52.6755
# South: 52.3385

# Define function to generate random coordinates within Berlin
# with random latitude and longitude

def generate_random_coordinates():
    min_longitude = 13.0883
    max_longitude = 13.7612
    min_latitude = 52.3385
    max_latitude = 52.6755
    
    latitude = random.uniform(min_latitude, max_latitude)
    longitude = random.uniform(min_longitude, max_longitude)
    
    return latitude, longitude

In [11]:
# Generate 800 random coordinates in Berlin
for _ in range(users):
    latitude, longitude = generate_random_coordinates()
    Latitude.append(latitude)
    Longitude.append(longitude)
    
print(Latitude)
print(Longitude)

[52.42534309243814, 52.47268472760901, 52.651681397897434, 52.459891367952515, 52.56778257302199, 52.64983689281025, 52.358479198108846, 52.36655024083883, 52.46368367731973, 52.50465239953119, 52.354861958849604, 52.533408007126084, 52.57743888541839, 52.67404364731717, 52.35098085026651, 52.503834617309344, 52.60049614010025, 52.45628835168483, 52.64079483898517, 52.38930630378345, 52.408822945703136, 52.655549998748626, 52.382875426934376, 52.51881725032383, 52.61356973311157, 52.568683390993264, 52.510029894683335, 52.60241338902146, 52.44175405363282, 52.5507349657738, 52.411689520177795, 52.64511160058589, 52.61436145240055, 52.35800976341392, 52.47658934606744, 52.61122666762688, 52.47401774251768, 52.58444825843217, 52.42440432059905, 52.614764696172855, 52.51173245344564, 52.413192341617396, 52.46571864141764, 52.66365306121321, 52.55945110533602, 52.59648129358786, 52.44708675745218, 52.60328458780082, 52.55476243760039, 52.665379140320994, 52.67292186360696, 52.5222734862855

In [12]:

# Generate "Country" and "City" (Only Germany and Berlin)
for _ in range(users):
    Country.append("Germany")
    City.append("Berlin")

In [13]:
# Generate "preferred languages" while you can't select more than 5 languages
# Some of the most spoken languages globally and in Europe
# Generate a list of languages spoken
# Take random number of languages between 1 and 5
# Generate languages spoken for 800 individuals

most_spoken_languages = [
    "Mandarin Chinese",
    "Spanish",
    "English",
    "Hindi",
    "Arabic",
    "Bengali",
    "Portuguese",
    "Russian",
    "Japanese",
    "Punjabi",
    "German",
    "French",
    "Italian",
    "Polish",
    "Ukrainian",
    "Romanian",
    "Turkish",
]

def generate_languages_spoken():
    num_languages = random.randint(1, 5)  
    languages_spoken = random.sample(most_spoken_languages, min(num_languages, len(most_spoken_languages)))
    return languages_spoken

for _ in range(users):
    Languages_Preferred.append(generate_languages_spoken())


In [14]:
# Create "Last Seen" Data
# Define possible answers
# Define probabilities for each answer
answer_probabilities = {
    "Recently": 0.1,
    "Today": 0.1,
    "Yesterday": 0.1,
    "Within a Week": 0.5,  
    "Within a Month": 0.15,
    "Over a Month Ago": 0.05
}

count = 0
while count < users:
    answer = random.choices(list(answer_probabilities.keys()), weights=list(answer_probabilities.values()))[0]
    Last_Seen.append(answer)
    count += 1
    if count >= users:
        break
    
Last_Seen = Last_Seen[:users]


In [15]:
# Create "Response Time" Data
# Define possible answers
# Generate the list "Response Time"

answer_probabilities = {
    "Within 24 hours": 0.2,
    "Within a week": 0.5,  
    "Within a month": 0.2,
    "More than a month": 0.1
}

count = 0
while count < users:
    answer = random.choices(list(answer_probabilities.keys()), weights=list(answer_probabilities.values()))[0]
    Response_Time.append(answer)
    count += 1
    if count >= users:
        break

Response_Time = Response_Time[:users]



In [16]:
# Create Data frame

table = {
    'First_Name': First_Name,
    'EMail': EMail,
    'Age': Age,
    'Gender': Gender,
    'Location_Preferences' : Location_Preferences,
    'Latitude' : Latitude,
    'Longitude' : Longitude,
    'Country': Country,
    'City': City,
    'Languages_Preferred': Languages_Preferred,
    'Last_Seen': Last_Seen,
    'Response_Time': Response_Time
}

Personal_Info = pd.DataFrame(table)
Personal_Info.head(10)

Unnamed: 0,First_Name,EMail,Age,Gender,Location_Preferences,Latitude,Longitude,Country,City,Languages_Preferred,Last_Seen,Response_Time
0,Tymon,tymon.giolitti@1und1.de,26,Male,Local or Remote,52.425343,13.695938,Germany,Berlin,"[Punjabi, Polish]",Within a Week,Within a week
1,Benito,benito.guichard@gmail.com,23,Female,Local or Remote,52.472685,13.562493,Germany,Berlin,"[Portuguese, Arabic]",Within a Month,Within a week
2,Eryk,eryk.renault@t-online.de,30,Female,Local or Remote,52.651681,13.713052,Germany,Berlin,[French],Today,Within 24 hours
3,Graeme,graeme.pizziol@protonmail.com,30,Female,Local or Remote,52.459891,13.192676,Germany,Berlin,[Mandarin Chinese],Yesterday,More than a month
4,Antonino,antonino.miranda@protonmail.com,25,Male,Local or Remote,52.567783,13.705778,Germany,Berlin,"[Arabic, Spanish, Punjabi, Turkish]",Recently,Within a week
5,Gerald,gerald.oleszko@outlook.com,26,Female,Local or Remote,52.649837,13.291504,Germany,Berlin,"[Arabic, Turkish, Portuguese, Japanese, Bengali]",Within a Month,More than a month
6,Filipe,filipe.zorlu@yahoo.com,25,Female,Local or Remote,52.358479,13.480191,Germany,Berlin,"[German, Portuguese, Hindi]",Within a Month,Within a week
7,Valeria,valeria.paiva@gmail.com,25,Female,Local or Remote,52.36655,13.608009,Germany,Berlin,[Russian],Today,Within 24 hours
8,Elgin,elgin.blin@gmx.de,29,Female,Local or Remote,52.463684,13.6011,Germany,Berlin,[Portuguese],Within a Week,Within 24 hours
9,John,john.sousa@aol.com,28,Female,Local or Remote,52.504652,13.194376,Germany,Berlin,"[German, Arabic, Punjabi, Polish]",Within a Week,Within a week


In [17]:
Personal_Info.to_csv('Personal_Info.csv', index=False)