In [1]:
from faker import Faker
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Create list of locales to account for the diversity of names in Berlin in faker (not representative)
# Restriction to Latin alphabet for simplicity (note that this leads to underrepresentation)
# Create an object for the Faker library 
locales = [
    'de_DE',   # German (Germany)
    'tr_TR',   # Turkish (Turkey)
    'en_GB',   # English (United Kingdom)
    'pl_PL',   # Polish (Poland)
    'it_IT',   # Italian (Italy)
    'fr_FR',   # French (France)
    'es_ES',   # Spanish (Spain)
    'pt_PT'    # Portuguese (Portugal)
]
# Representation of different locales for generating representative name set (guess)
probabilities = [0.6, 0.1, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05]

# Initialize Faker with custom probabilities
fake = Faker(random_choices=(locales, probabilities))
fake = Faker(locales)

In [3]:
users = 800

In [4]:
# Initalize lists to store the data
First_Name = []
EMail = []
Age = []
Gender = []
Location_Preferences = []
Latitude = []
Longitude = []
Country = []
City = []
Languages_Preferred = []
Last_Seen = []
Response_Time = []

In [5]:
# Create realistic e-mail providers including German providers
email_providers = [
    'gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'aol.com',
    'icloud.com', 'zoho.com', 'protonmail.com', 'mail.com', 'gmx.de',
    'web.de', 't-online.de', 'freenet.de', '1und1.de', 'arcor.de', 'unitybox.de'
]


In [6]:
# Generate 800 fake names and corresponding email addresses
for _ in range(users):
    first_names = fake.first_name()
    last_names = fake.last_name()
    email = f"{first_names.lower()}.{last_names.lower()}@{fake.random_element(email_providers)}"
    First_Name.append(first_names)
    EMail.append(email)

In [7]:
# Generate "Ages" 
# Filter out values below 20
mean = 27
std_dev = 2
min_value = 20
max_value = 65

while len(Age) < users:
    age = np.random.normal(mean, std_dev)
    if min_value <= age <= max_value and not np.isnan(age):
        Age.append(int(age))

Age = [age for age in Age if age >= 20]



In [8]:
# Generate "Genders"
# Randomly choose a gender from "Male", "Female", "Diverse" and "Prefer not to say"

gender_probabilities = {
    "Male": 0.4,
    "Female": 0.4,
    "Diverse": 0.1,
    "Prefer not to say": 0.1
}

count = 0
while count < users:
    genders = random.choices(list(gender_probabilities.keys()), weights=list(gender_probabilities.values()))[0]
    Gender.append(genders)
    count += 1
    if count >= users:
        break

Gender = Gender[:users]

In [9]:
# Generate "Location Preferences"
Location_Preferences = []

# Define probabilities for each preference
preference_probabilities = {
    "Local Only": 0.1,
    "Remote Only": 0.3,
    "Local or Remote": 0.6
}

# Generate Location Preferences
count = 0
while count < users:
    preference = random.choices(list(preference_probabilities.keys()), weights=list(preference_probabilities.values()))[0]
    Location_Preferences.append(preference)
    count += 1
    if count >= users:
        break

Location_Preferences = Location_Preferences[:users]


In [10]:
# Generate "Coordinates" within Berlin
# Berlin Borders
# West: 13.0883
# East: 13.7612
# North: 52.6755
# South: 52.3385

# Define function to generate random coordinates within Berlin
# with random latitude and longitude

def generate_random_coordinates():
    min_longitude = 13.0883
    max_longitude = 13.7612
    min_latitude = 52.3385
    max_latitude = 52.6755
    
    latitude = random.uniform(min_latitude, max_latitude)
    longitude = random.uniform(min_longitude, max_longitude)
    
    return latitude, longitude

In [11]:
# Generate 800 random coordinates in Berlin
for _ in range(users):
    latitude, longitude = generate_random_coordinates()
    Latitude.append(latitude)
    Longitude.append(longitude)
    
print(Latitude)
print(Longitude)

[52.434681118411945, 52.54069805627097, 52.41589837245669, 52.468768062334085, 52.37483297536614, 52.390638597028385, 52.352691156451904, 52.34590961017228, 52.643342453049854, 52.40571195946874, 52.63560685989832, 52.597342272944005, 52.42993151816981, 52.4707001142194, 52.57353892868263, 52.381984138859714, 52.37838914275136, 52.355638770626285, 52.366578868518324, 52.44077593951681, 52.46307841102323, 52.41382607698789, 52.523945036510575, 52.52453129739231, 52.5220775112723, 52.58643003678288, 52.62402197007457, 52.58879033207179, 52.44762145161107, 52.67239041690637, 52.56472215445694, 52.47881530044347, 52.515579047414505, 52.47618421402547, 52.636114867351836, 52.60089905700991, 52.43088242839768, 52.60770402101076, 52.417059910966906, 52.51676914768199, 52.4967031100121, 52.35239122893485, 52.37429034587512, 52.593563451125235, 52.35391351992194, 52.51528844598568, 52.50949125837761, 52.41563813634923, 52.5051446410747, 52.3960019028648, 52.65551565687497, 52.39899288829792, 52

In [12]:
# Generate "Country" and "City" (Only Germany and Berlin)
for _ in range(users):
    Country.append("Germany")
    City.append("Berlin")

In [13]:
# Generate "preferred languages" while you can't select more than 5 languages

# Some of the most spoken languages globally and in Europe
most_spoken_languages = [
    "Mandarin Chinese",
    "Spanish",
    "English",
    "Hindi",
    "Arabic",
    "Bengali",
    "Portuguese",
    "Russian",
    "Japanese",
    "Punjabi",
    "German",
    "French",
    "Italian",
    "Polish",
    "Ukrainian",
    "Romanian",
    "Turkish",
]

def generate_languages_spoken():
    num_languages = random.randint(1, 5)  
    languages_spoken = random.sample(most_spoken_languages, min(num_languages, len(most_spoken_languages)))
    return languages_spoken

for _ in range(users):
    Languages_Preferred.append(generate_languages_spoken())


In [14]:
# Create "Last Seen" Data

# Define possible answers
# and probabilities for each answer
answer_probabilities = {
    "Recently": 0.1,
    "Today": 0.1,
    "Yesterday": 0.1,
    "Within a Week": 0.5,  
    "Within a Month": 0.15,
    "Over a Month Ago": 0.05
}

count = 0
while count < users:
    answer = random.choices(list(answer_probabilities.keys()), weights=list(answer_probabilities.values()))[0]
    Last_Seen.append(answer)
    count += 1
    if count >= users:
        break
    
Last_Seen = Last_Seen[:users]


In [15]:
# Create "Response Time" Data

# Define possible answers
# and probabilities for each answer
answer_probabilities = {
    "Within 24 hours": 0.2,
    "Within a week": 0.5,  
    "Within a month": 0.2,
    "More than a month": 0.1
}

count = 0
while count < users:
    answer = random.choices(list(answer_probabilities.keys()), weights=list(answer_probabilities.values()))[0]
    Response_Time.append(answer)
    count += 1
    if count >= users:
        break

Response_Time = Response_Time[:users]



In [16]:
# Create Data frame with all the information from above

table = {
    'First_Name': First_Name,
    'EMail': EMail,
    'Age': Age,
    'Gender': Gender,
    'Location_Preferences' : Location_Preferences,
    'Latitude' : Latitude,
    'Longitude' : Longitude,
    'Country': Country,
    'City': City,
    'Languages_Preferred': Languages_Preferred,
    'Last_Seen': Last_Seen,
    'Response_Time': Response_Time
}

Personal_Info = pd.DataFrame(table)
Personal_Info.head(10)

Unnamed: 0,First_Name,EMail,Age,Gender,Location_Preferences,Latitude,Longitude,Country,City,Languages_Preferred,Last_Seen,Response_Time
0,Rigo,rigo.ertaş@t-online.de,32,Female,Local or Remote,52.434681,13.580691,Germany,Berlin,[Punjabi],Recently,Within a month
1,Olivia,olivia.powell@1und1.de,25,Diverse,Local or Remote,52.540698,13.685628,Germany,Berlin,[Japanese],Within a Month,Within a week
2,Güçlüer,güçlüer.piwowarek@protonmail.com,27,Female,Remote Only,52.415898,13.248988,Germany,Berlin,"[Punjabi, Japanese, Romanian]",Within a Week,More than a month
3,Aurelia,aurelia.tavares@arcor.de,27,Female,Local or Remote,52.468768,13.330759,Germany,Berlin,"[Hindi, Japanese]",Within a Week,Within a week
4,Urbano,urbano.gülen@zoho.com,29,Male,Local or Remote,52.374833,13.718741,Germany,Berlin,[Hindi],Within a Week,Within a week
5,Göknur,göknur.bermudez@gmx.de,26,Female,Remote Only,52.390639,13.298867,Germany,Berlin,"[Turkish, German]",Today,Within a week
6,Radosław,radosław.kwapisiewicz@yahoo.com,29,Male,Local or Remote,52.352691,13.383603,Germany,Berlin,"[Punjabi, Hindi, Spanish, Russian, Bengali]",Today,Within a week
7,Daniel,daniel.turati@zoho.com,28,Prefer not to say,Local or Remote,52.34591,13.104398,Germany,Berlin,"[Turkish, Punjabi]",Within a Month,Within a week
8,Ludovico,ludovico.dzierwa@1und1.de,30,Female,Local or Remote,52.643342,13.150619,Germany,Berlin,"[Russian, Polish, Hindi, Turkish]",Today,Within a month
9,Sonia,sonia.walker@yahoo.com,25,Female,Local or Remote,52.405712,13.277239,Germany,Berlin,"[Spanish, Japanese]",Within a Month,Within a week


In [17]:
Personal_Info.to_csv('Personal_Info.csv', index=False)