In [77]:
from faker import Faker
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt

In [78]:
# Create list of locales to account for the diversity of names in Berlin in faker (not representative)
# Restriction to Latin alphabet for simplicity (note that this leads to underrepresentation)
# Create an object for the Faker library 
locales = [
    'de_DE',   # German (Germany)
    'tr_TR',   # Turkish (Turkey)
    'en_GB',   # English (United Kingdom)
    'pl_PL',   # Polish (Poland)
    'it_IT',   # Italian (Italy)
    'fr_FR',   # French (France)
    'es_ES',   # Spanish (Spain)
    'pt_PT'    # Portuguese (Portugal)
]
# Representation of different locales for generating representative name set (guess)
probabilities = [0.6, 0.1, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05]

# Initialize Faker with custom probabilities
fake = Faker(random_choices=(locales, probabilities))
fake = Faker(locales)

In [79]:
# Initalize lists to store the data
First_Name = []
EMail = []
Age = []
Gender = []
Location_Preferences = []
Coordinate = []
Country = []
City = []
Languages_Preferred = []
Last_Seen = []
Response_Time = []

In [80]:
# Create realistic e-mail providers including German providers
email_providers = [
    'gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'aol.com',
    'icloud.com', 'zoho.com', 'protonmail.com', 'mail.com', 'gmx.de',
    'web.de', 't-online.de', 'freenet.de', '1und1.de', 'arcor.de', 'unitybox.de'
]


In [81]:
# Generate 750 fake names and corresponding email addresses
for _ in range(750):
    first_names = fake.first_name()
    last_names = fake.last_name()
    email = f"{first_names.lower()}.{last_names.lower()}@{fake.random_element(email_providers)}"
    First_Name.append(first_names)
    EMail.append(email)

In [82]:
# Generate "Ages" 
# Filter out values below 20
mean = 27
std_dev = 2
min_value = 20
max_value = 65

while len(Age) < 750:
    age = np.random.normal(mean, std_dev)
    if min_value <= age <= max_value and not np.isnan(age):
        Age.append(int(age))

Age = [age for age in Age if age >= 20]



In [83]:
# Generate "Genders"
# Randomly choose a gender from "Male", "Female", "Diverse" and "Prefer not to say"

gender_probabilities = {
    "Male": 0.4,
    "Female": 0.4,
    "Diverse": 0.1,
    "Prefer not to say": 0.1
}

count = 0
while count < 750:
    genders = random.choices(list(gender_probabilities.keys()), weights=list(gender_probabilities.values()))[0]
    Gender.append(genders)
    count += 1
    if count >= 750:
        break

Gender = Gender[:750]

In [84]:
# Generate "Location Preferences"

import random

# Initialize lists
Location_Preferences = []

# Define probabilities for each preference
preference_probabilities = {
    "Local Only": 0.1,
    "Remote Only": 0.3,
    "Local or Remote": 0.6
}

# Generate Location Preferences
count = 0
while count < max_instances:
    preference = random.choices(list(preference_probabilities.keys()), weights=list(preference_probabilities.values()))[0]
    Location_Preferences.append(preference)
    count += 1
    if count >= max_instances:
        break

Location_Preferences = Location_Preferences[:750]


In [85]:
# Generate "Coordinates" within Berlin
# Berlin Borders
# West: 13.0883
# East: 13.7612
# North: 52.6755
# South: 52.3385

# Define function to generate random coordinates within Berlin
# with random latitude and longitude

def generate_random_coordinates():
    min_longitude = 13.0883
    max_longitude = 13.7612
    min_latitude = 52.3385
    max_latitude = 52.6755
    
    latitude = random.uniform(min_latitude, max_latitude)
    longitude = random.uniform(min_longitude, max_longitude)
    
    return latitude, longitude

In [86]:
# Generate 750 random coordinates in Berlin
for _ in range(750):
    latitude, longitude = generate_random_coordinates()
    Coordinate.append((latitude, longitude))

In [87]:

# Generate "Country" and "City" (Only Germany and Berlin)
for _ in range(750):
    Country.append("Germany")
    City.append("Berlin")

In [88]:
# Generate "preferred languages" while you can't select more than 5 languages
# Some of the most spoken languages globally and in Europe
# Generate a list of languages spoken
# Take random number of languages between 1 and 5
# Generate languages spoken for 750 individuals

most_spoken_languages = [
    "Mandarin Chinese",
    "Spanish",
    "English",
    "Hindi",
    "Arabic",
    "Bengali",
    "Portuguese",
    "Russian",
    "Japanese",
    "Punjabi",
    "German",
    "French",
    "Italian",
    "Polish",
    "Ukrainian",
    "Romanian",
    "Turkish",
]

def generate_languages_spoken():
    num_languages = random.randint(1, 5)  
    languages_spoken = random.sample(most_spoken_languages, min(num_languages, len(most_spoken_languages)))
    return languages_spoken

for _ in range(750):
    Languages_Preferred.append(generate_languages_spoken())


In [89]:
# Create "Last Seen" Data
# Define possible answers
# Define probabilities for each answer
answer_probabilities = {
    "Recently": 0.1,
    "Today": 0.1,
    "Yesterday": 0.1,
    "Within a Week": 0.5,  
    "Within a Month": 0.15,
    "Over a Month Ago": 0.05
}

count = 0
while count < 750:
    answer = random.choices(list(answer_probabilities.keys()), weights=list(answer_probabilities.values()))[0]
    Last_Seen.append(answer)
    count += 1
    if count >= 750:
        break
    
Last_Seen = Last_Seen[:750]


In [90]:
# Create "Response Time" Data
# Define possible answers
# Generate the list "Response Time"

answer_probabilities = {
    "Within 24 hours": 0.2,
    "Within a week": 0.5,  
    "Within a month": 0.2,
    "More than a month": 0.1
}

count = 0
while count < 750:
    answer = random.choices(list(answer_probabilities.keys()), weights=list(answer_probabilities.values()))[0]
    Response_Time.append(answer)
    count += 1
    if count >= 750:
        break

Response_Time = Response_Time[:750]



In [91]:
# Create Data frame

table = {
    'First_Name': First_Name,
    'EMail': EMail,
    'Age': Age,
    'Gender': Gender,
    'Location_Preferences' : Location_Preferences,
    'Coordinate': Coordinate,
    'Country': Country,
    'City': City,
    'Languages_Preferred': Languages_Preferred,
    'Last_Seen': Last_Seen,
    'Response_Time': Response_Time
}

Personal_Info = pd.DataFrame(table)
Personal_Info.head(10)

Unnamed: 0,First_Name,EMail,Age,Gender,Location_Preferences,Coordinate,Country,City,Languages_Preferred,Last_Seen,Response_Time
0,Avunç,avunç.santos@web.de,23,Male,Local or Remote,"(52.403010937141616, 13.536818568223497)",Germany,Berlin,"[Spanish, Mandarin Chinese, Hindi, Polish, Eng...",Within a Week,More than a month
1,Özüdoğru,özüdoğru.ossola@gmx.de,26,Prefer not to say,Remote Only,"(52.39643123462186, 13.478119345675024)",Germany,Berlin,"[Arabic, Romanian, Polish, French, German]",Within a Week,Within a week
2,Vittorio,vittorio.kamela@1und1.de,28,Male,Local or Remote,"(52.35074370232088, 13.737720404874722)",Germany,Berlin,"[German, Turkish, Bengali, Mandarin Chinese]",Over a Month Ago,Within a week
3,Tymoteusz,tymoteusz.beyer@yahoo.com,28,Prefer not to say,Local or Remote,"(52.514799224781996, 13.682851320445437)",Germany,Berlin,"[Ukrainian, Portuguese, Spanish, Turkish]",Within a Week,Within a week
4,Melissa,melissa.antunes@outlook.com,27,Prefer not to say,Local or Remote,"(52.41444021788198, 13.493164763748588)",Germany,Berlin,"[French, Turkish]",Within a Week,Within a week
5,Mohamed,mohamed.cristoforetti@protonmail.com,25,Female,Local or Remote,"(52.35638857623807, 13.10877022217662)",Germany,Berlin,[Japanese],Within a Month,Within 24 hours
6,Colette,colette.yılmaz@freenet.de,27,Male,Remote Only,"(52.62410714489154, 13.134837038852567)",Germany,Berlin,"[Mandarin Chinese, Turkish, Hindi]",Within a Week,More than a month
7,Mahsun,mahsun.sędek@yahoo.com,28,Female,Local or Remote,"(52.67301832613645, 13.097000655236885)",Germany,Berlin,"[Ukrainian, Punjabi, Portuguese, Japanese]",Yesterday,Within a month
8,Martim,martim.dörr@gmail.com,32,Prefer not to say,Local or Remote,"(52.37997167430406, 13.143094903938998)",Germany,Berlin,"[Ukrainian, Punjabi, Russian, Turkish, French]",Today,More than a month
9,Alison,alison.kaur@yahoo.com,23,Female,Local or Remote,"(52.41107838693445, 13.451840234647404)",Germany,Berlin,"[Portuguese, French]",Within a Week,Within a week


In [32]:
Personal_Info.to_csv('Personal_Info.csv', index=False)

In [33]:
# Create Overview over dtypes
# Insert the "Factor" column as the first column

column_info = []

for column, dtype in Personal_Info.dtypes.items():
    column_info.append((column, str(dtype)))
    
df_column_info = pd.DataFrame(column_info, columns=['User Data', 'Data Type'])
df_column_info['Factor'] = 'Personal_Info'
factor_column = df_column_info.pop('Factor')

df_column_info.insert(0, 'Factor', factor_column)
df_column_info.head()


Unnamed: 0,Factor,User Data,Data Type
0,Personal_Info,First_Name,object
1,Personal_Info,EMail,object
2,Personal_Info,Age,int64
3,Personal_Info,Gender,object
4,Personal_Info,Location_Preferences,object


In [34]:
df_column_info.to_csv('dtype_Personal_Info.csv', index=False)