In [1]:
from faker import Faker
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Create list of locales to account for the diversity of names in Berlin in faker (not representative)
# Restriction to Latin alphabet for simplicity 
locales = [
    'de_DE',   # German (Germany)
    'tr_TR',   # Turkish (Turkey)
    'en_GB',   # English (United Kingdom)
    'pl_PL',   # Polish (Poland)
    'it_IT',   # Italian (Italy)
    'fr_FR',   # French (France)
    'es_ES',   # Spanish (Spain)
    'pt_PT'    # Portuguese (Portugal)
]
# Representation of different locales for generating representative name set (guess)
probabilities = [0.6, 0.1, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05]

# Initialize faker with custom probabilities
fake = Faker(random_choices=(locales, probabilities))
fake = Faker(locales)

In [3]:
users = 800

In [4]:
# Initalize lists to store the data
First_Name = []
EMail = []
Age = []
Gender = []
Location_Preferences = []
Latitude = []
Longitude = []
Country = []
City = []
Languages_Preferred = []
Last_Seen = []
Response_Time = []

In [5]:
# Create realistic e-mail providers including German providers
email_providers = [
    'gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'aol.com',
    'icloud.com', 'zoho.com', 'protonmail.com', 'mail.com', 'gmx.de',
    'web.de', 't-online.de', 'freenet.de', '1und1.de', 'arcor.de', 'unitybox.de'
]


In [6]:
# Generate 800 fake names and corresponding email addresses
for _ in range(users):
    first_names = fake.first_name()
    last_names = fake.last_name()
    email = f"{first_names.lower()}.{last_names.lower()}@{fake.random_element(email_providers)}"
    First_Name.append(first_names)
    EMail.append(email)

In [7]:
# Generate "Ages" 
# Filter out values below 20
mean = 27
std_dev = 2
min_value = 20
max_value = 65

while len(Age) < users:
    age = np.random.normal(mean, std_dev)
    if min_value <= age <= max_value and not np.isnan(age):
        Age.append(int(age))

Age = [age for age in Age if age >= 20]



In [8]:
# Generate "Genders"
# Randomly choose a gender from "Male", "Female", "Diverse" and "Prefer not to say"

gender_probabilities = {
    "Male": 0.4,
    "Female": 0.4,
    "Diverse": 0.1,
    "Prefer not to say": 0.1
}

count = 0
while count < users:
    genders = random.choices(list(gender_probabilities.keys()), weights=list(gender_probabilities.values()))[0]
    Gender.append(genders)
    count += 1
    if count >= users:
        break

Gender = Gender[:users]

In [9]:
# Generate "Location Preferences"
Location_Preferences = []

# Define probabilities for each preference
preference_probabilities = {
    "Local Only": 0.1,
    "Remote Only": 0.3,
    "Local or Remote": 0.6
}

# Generate Location Preferences
count = 0
while count < users:
    preference = random.choices(list(preference_probabilities.keys()), weights=list(preference_probabilities.values()))[0]
    Location_Preferences.append(preference)
    count += 1
    if count >= users:
        break

Location_Preferences = Location_Preferences[:users]


In [10]:
# Generate "Coordinates" within Berlin
# Berlin Borders
# West: 13.0883
# East: 13.7612
# North: 52.6755
# South: 52.3385

# Define function to generate random coordinates within Berlin
# with random latitude and longitude

def generate_random_coordinates():
    min_longitude = 13.0883
    max_longitude = 13.7612
    min_latitude = 52.3385
    max_latitude = 52.6755
    
    latitude = random.uniform(min_latitude, max_latitude)
    longitude = random.uniform(min_longitude, max_longitude)
    
    return latitude, longitude

In [11]:
# Generate 800 random coordinates in Berlin
for _ in range(users):
    latitude, longitude = generate_random_coordinates()
    Latitude.append(latitude)
    Longitude.append(longitude)
    
print(Latitude)
print(Longitude)

[52.548727321080975, 52.38077472701648, 52.420732772650496, 52.49765632403929, 52.36548725440916, 52.54703262082508, 52.53088704861723, 52.36135986149129, 52.50045332978895, 52.67456012349887, 52.556924165950754, 52.665011588719295, 52.420660036318175, 52.5036232570869, 52.55987476398559, 52.40850035366705, 52.35744032981194, 52.67164156530251, 52.37525940170078, 52.55396176289201, 52.41176490714188, 52.514741402839505, 52.49943354127753, 52.429692037619894, 52.62334482933526, 52.51795225519464, 52.60486159455185, 52.43146776478816, 52.42740623941393, 52.59998809176846, 52.525732103603026, 52.55790080162932, 52.386386610636244, 52.61138748955933, 52.659820388324796, 52.53835262016708, 52.57028823939651, 52.47073130569004, 52.381063540117324, 52.49359614611225, 52.51696553842061, 52.429730847611545, 52.51770361093739, 52.359552868532276, 52.41277172773549, 52.35067937419744, 52.46814529615228, 52.52290554018557, 52.585144708482744, 52.52728246683954, 52.47077366759972, 52.54068694626288

In [12]:
# Generate "Country" and "City" (Only Germany and Berlin)
for _ in range(users):
    Country.append("Germany")
    City.append("Berlin")

In [13]:
# Generate "preferred languages" while you can't select more than 5 languages

# Some of the most spoken languages globally and in Europe
most_spoken_languages = [
    "Mandarin Chinese",
    "Spanish",
    "English",
    "Hindi",
    "Arabic",
    "Bengali",
    "Portuguese",
    "Russian",
    "Japanese",
    "Punjabi",
    "German",
    "French",
    "Italian",
    "Polish",
    "Ukrainian",
    "Romanian",
    "Turkish",
]

def generate_languages_spoken():
    num_languages = random.randint(1, 5)  
    languages_spoken = random.sample(most_spoken_languages, min(num_languages, len(most_spoken_languages)))
    return languages_spoken

for _ in range(users):
    Languages_Preferred.append(generate_languages_spoken())


In [14]:
# Create "Last Seen" Data

# Define possible answers
# and probabilities for each answer
answer_probabilities = {
    "Recently": 0.1,
    "Today": 0.1,
    "Yesterday": 0.1,
    "Within a Week": 0.5,  
    "Within a Month": 0.15,
    "Over a Month Ago": 0.05
}

count = 0
while count < users:
    answer = random.choices(list(answer_probabilities.keys()), weights=list(answer_probabilities.values()))[0]
    Last_Seen.append(answer)
    count += 1
    if count >= users:
        break
    
Last_Seen = Last_Seen[:users]


In [15]:
# Create "Response Time" Data

# Define possible answers
# and probabilities for each answer
answer_probabilities = {
    "Within 24 hours": 0.2,
    "Within a week": 0.5,  
    "Within a month": 0.2,
    "More than a month": 0.1
}

count = 0
while count < users:
    answer = random.choices(list(answer_probabilities.keys()), weights=list(answer_probabilities.values()))[0]
    Response_Time.append(answer)
    count += 1
    if count >= users:
        break

Response_Time = Response_Time[:users]



In [16]:
# Create Data frame with all the information from above

table = {
    'First_Name': First_Name,
    'EMail': EMail,
    'Age': Age,
    'Gender': Gender,
    'Location_Preferences' : Location_Preferences,
    'Latitude' : Latitude,
    'Longitude' : Longitude,
    'Country': Country,
    'City': City,
    'Languages_Preferred': Languages_Preferred,
    'Last_Seen': Last_Seen,
    'Response_Time': Response_Time
}

Personal_Info = pd.DataFrame(table)
Personal_Info.head(10)

Unnamed: 0,First_Name,EMail,Age,Gender,Location_Preferences,Latitude,Longitude,Country,City,Languages_Preferred,Last_Seen,Response_Time
0,Kandef,kandef.fechner@freenet.de,28,Prefer not to say,Local Only,52.548727,13.534825,Germany,Berlin,[Bengali],Within a Week,Within a week
1,Sonia,sonia.conte@hotmail.com,25,Prefer not to say,Local or Remote,52.380775,13.596353,Germany,Berlin,"[Polish, Turkish, Japanese]",Over a Month Ago,Within a week
2,Marek,marek.gough@gmail.com,24,Male,Local or Remote,52.420733,13.722983,Germany,Berlin,[Punjabi],Within a Week,Within a week
3,Seve,seve.manjón@freenet.de,29,Female,Local or Remote,52.497656,13.244198,Germany,Berlin,"[Japanese, French]",Within a Month,Within a month
4,Bernard,bernard.hançer@unitybox.de,28,Male,Local or Remote,52.365487,13.121122,Germany,Berlin,[Romanian],Within a Week,More than a month
5,Kaja,kaja.williamson@gmx.de,29,Male,Local or Remote,52.547033,13.331416,Germany,Berlin,[Romanian],Yesterday,Within a week
6,Luigi,luigi.agustí@1und1.de,24,Prefer not to say,Local Only,52.530887,13.361834,Germany,Berlin,"[Mandarin Chinese, Turkish]",Within a Week,Within a week
7,Ewelina,ewelina.ramos@web.de,24,Female,Local or Remote,52.36136,13.341903,Germany,Berlin,"[French, Hindi, Punjabi, English, Spanish]",Today,Within a week
8,Fabiana,fabiana.gill@protonmail.com,26,Female,Local Only,52.500453,13.207391,Germany,Berlin,"[French, Polish, Russian, Turkish, Punjabi]",Today,Within a month
9,Virginie,virginie.green@protonmail.com,23,Female,Local Only,52.67456,13.139732,Germany,Berlin,"[German, Arabic, Japanese]",Within a Week,Within a month


In [17]:
Personal_Info.to_csv('Personal_Info.csv', index=False)