In [None]:
import datetime
import os
import random

import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 60)

In [None]:
# Fixed seed for both random sources used in this notebook (NumPy's global
# generator and the stdlib `random` module), so that a fresh
# Restart & Run All regenerates exactly the same data.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Column layout of the "infected" dataset and the number of rows to generate.
columns = [
    "User_ID", "Geschlecht", "Gewicht", "Größe", "Augenfarbe",
    "Haarfarbe", "Alter", "Aktivität", "Chance", "Infiziert",
]
length = 100000
df = pd.DataFrame(columns=columns)

In [None]:
def set_column(values, weights, count=None):
    """Draw a categorical column by weighted sampling with replacement.

    Parameters
    ----------
    values : sequence
        Candidate values to sample from.
    weights : sequence of float
        Probability of each entry in ``values``; forwarded to
        ``np.random.choice`` as ``p``.
    count : int, optional
        Number of samples to draw. When omitted, the module-level ``length``
        is used.

    Returns
    -------
    numpy.ndarray
        ``count`` samples taken from ``values``.
    """
    # Identity test: `== None` would go through the argument's __eq__, which
    # is not what "no value supplied" means.
    if count is None:
        count = length
    return np.random.choice(values, size=count, p=weights)

def set_random_number(start, end, count):
    """Return ``count`` uniform draws between ``start`` and ``end``.

    The draws come from ``np.random.uniform`` and are rounded to two decimals.
    """
    draws = np.random.uniform(start, end, count)
    return np.round(draws, 2)
    
def set_distribution_column(mean, std, count, astype = 'int32'):
    """Sample ``count`` values from a normal distribution and cast them.

    Parameters
    ----------
    mean, std : float
        Location and scale passed to ``np.random.normal``.
    count : int
        Number of samples.
    astype : str, default 'int32'
        Target dtype. The exact string ``'float'`` keeps two decimals; any
        other value rounds to whole numbers before the cast.

    Returns
    -------
    numpy.ndarray
        The rounded samples converted with ``ndarray.astype(astype)``.
    """
    samples = np.random.normal(mean, std, count)
    decimals = 2 if astype == 'float' else 0
    return np.round(samples, decimals).astype(astype)

def set_random_corona_date(count):
    """Build ``count`` random date strings of the form ``day.month.2020``.

    ``np.random.randint`` excludes its upper bound, so the day lies in 1..19
    and the month in 1..12. Neither part is zero-padded.
    """
    # The day is drawn before the month for every entry (left-to-right
    # evaluation of the concatenation).
    return [
        str(np.random.randint(1, 20)) + '.' + str(np.random.randint(1, 13)) + '.2020'
        for _ in range(count)
    ]

def _apply_offset_date(row, rel, mean, std):
    split = row[rel].split('.')
    
    res = np.random.normal(mean, std)
    res = np.round(res).astype('int32')
    
    split[0] = str(int(split[0]) + res)
    
    return '.'.join(split)

def _apply_infiziert(row):
    prob = (row['Aktivität'] / 5.0 + row['Chance'] / 5.0) / 2
    if prob > 0.5:
        return '1'
    else:
        return '0'
    weights = [prob, 1 - prob]
    print(weights)
    return np.random.choice(['1', '0'], p=weights)

In [None]:
# Build the User_ID column: `length` IDs in ascending order, where one user
# may own several rows.
diff_length = True        # True: random number of rows per user; False: fixed number
entriesPerClient = 100    # rows per user (fixed mode) / exclusive upper bound of the draw (random mode)
lIDs = []
clients = length // entriesPerClient

if diff_length:
    # np.random.choice(n) draws one integer from 0..n-1, so an ID may end up
    # with zero rows.
    # NOTE(review): IDs are taken from range(length) here, not range(clients),
    # so the number of distinct users in the result is not tied to `clients`
    # (which is still used in the output file name) — confirm this is intended.
    tmp = [[i] * np.random.choice(entriesPerClient) for i in range(0, length)]
else:
    tmp = [[i] * entriesPerClient for i in range(0, clients)]

# Flatten the per-user lists into one flat list of IDs.
for row in tmp:
    lIDs += row

if diff_length:
    # Keep a random subset of at most `length` rows, then restore ID order.
    random.shuffle(lIDs)
    lIDs = sorted(lIDs[:length])

# Fail loudly instead of silently leaving User_ID unset or mis-sized: the
# following cells all generate exactly `length` values per column.
if len(lIDs) != length:
    raise ValueError(
        "generated %d user IDs but %d rows are required" % (len(lIDs), length)
    )
df['User_ID'] = lIDs

### Create "Infected"-Dataset

In [None]:
# Sex is drawn with equal probability; encoding per the original note: M = 0, W = 1.
sex_codes, sex_weights = [0, 1], [0.5, 0.5]
df['Geschlecht'] = set_column(sex_codes, sex_weights)

In [None]:
# Eye colour codes 0-3; the original note lists 'Blau', 'Braun', 'Grün', 'Grau'
# (presumably in code order — confirm).
eye_codes = [0, 1, 2, 3]
eye_weights = [7/100, 85/100, 7/100, 1/100]
df['Augenfarbe'] = set_column(eye_codes, eye_weights)

In [None]:
# Hair colour is stored as a code 0-3, drawn with weights 80 % / 10 % / 8 % / 2 %.
hair_codes = [0, 1, 2, 3]
hair_weights = [80/100, 10/100, 8/100, 2/100]
df['Haarfarbe'] = set_column(hair_codes, hair_weights)

In [None]:
# Age: whole numbers drawn from a normal distribution (mean 50, std 12).
age_mean, age_std = 50, 12
df['Alter'] = set_distribution_column(age_mean, age_std, length, astype='int32')

In [None]:
# Weight is sampled separately per sex code, one normal distribution each:
# code 1 -> mean 80 / std 10, code 0 -> mean 100 / std 15 (drawn in that order).
for sex_code, (weight_mean, weight_std) in ((1, (80, 10)), (0, (100, 15))):
    in_group = df['Geschlecht'] == sex_code
    group_size = len(df.loc[in_group])
    df.loc[in_group, 'Gewicht'] = set_distribution_column(weight_mean, weight_std, group_size, 'int32')

In [None]:
# Height is sampled separately per sex code, one normal distribution each:
# code 1 -> mean 173 / std 3, code 0 -> mean 185 / std 6 (drawn in that order).
for sex_code, (height_mean, height_std) in ((1, (173, 3)), (0, (185, 6))):
    in_group = df['Geschlecht'] == sex_code
    group_size = len(df.loc[in_group])
    df.loc[in_group, 'Größe'] = set_distribution_column(height_mean, height_std, group_size, 'int32')

In [None]:
# Activity: normal(2.5, 0.5) kept as floats with two decimals.
activity_values = set_distribution_column(2.5, 0.5, length, 'float')
df['Aktivität'] = activity_values

# Chance: uniform between 0 and 5, two decimals.
chance_values = set_random_number(0, 5, length)
df['Chance'] = chance_values

In [None]:
# Row-wise label: _apply_infiziert turns each row's 'Aktivität' and 'Chance'
# into the string '1' or '0'.
df['Infiziert'] = df.apply(_apply_infiziert, axis=1)

In [None]:
# Print the column / non-null / dtype summary. `info` is a method: without the
# call parentheses the cell only shows the bound-method repr instead.
df.info()

In [None]:
# Write the finished frame next to the notebook; the file name is prefixed
# with the value of `clients`.
infected_csv_path = "./" + str(clients) + "_infected_shuffled.csv"
df.to_csv(infected_csv_path, index=False)

### Create random datasets

In [None]:
# Normally distributed dummy features that share the User_ID column.
# This rebinds the module-level `columns` that the infected-dataset cell
# defined earlier.
# NOTE(review): the list jumps from "R4" to "R6" (there is no "R5") — confirm
# that the gap is intended.
columns = ["User_ID", "R1", "R2", "R3", "R4", "R6", "R7", "R8"]
df_r = pd.DataFrame(columns=columns)
df_r["User_ID"] = lIDs

# Every feature column except the last: normal(mu, sigma) floats, two decimals.
mu, sigma = 0, 0.1
for col in columns[1:-1]:
    df_r[col] = set_distribution_column(mu, sigma, length, astype="float")

# Last column ("R8"): whole numbers drawn from normal(100, 50).
df_r[columns[-1]] = set_distribution_column(100, 50, length)

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
os.makedirs("./data", exist_ok=True)
df_r.to_csv("./data/normal_dist.csv", index=False)

# Number of distinct values in the integer column.
print(df_r["R8"].nunique())

In [None]:
# Uniformly distributed dummy features that share the User_ID column.
# This rebinds the module-level `columns` and `df_r`.
# NOTE(review): the list jumps from "R4" to "R6" (there is no "R5") — confirm
# that the gap is intended.
columns = ["User_ID", "R1", "R2", "R3", "R4", "R6", "R7", "R8"]
df_r = pd.DataFrame(columns=columns)
df_r["User_ID"] = lIDs

# Every feature column except the last: uniform draws between start and end,
# two decimals.
start, end = 0., 100
for col in columns[1:-1]:
    df_r[col] = set_random_number(start, end, length)

# Last column ("R8"): whole numbers drawn from normal(30, 15).
df_r[columns[-1]] = set_distribution_column(30, 15, length)

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
os.makedirs("./data", exist_ok=True)
df_r.to_csv("./data/random_dist.csv", index=False)

# Number of distinct values in the integer column.
print(df_r["R8"].nunique())