In [2]:
import pandas as pd
import numpy as np

from itertools import cycle

Порой возникает необходимость протестировать код. Для этого полезно уметь генерировать синтетические данные. В этом ноутбуке я привёл ряд практик, которые позволяют делать это быстро и легко.

# Полезные ресурсы:
https://towardsdatascience.com/generating-fake-data-with-pandas-very-quickly-b99467d4c618

## Создание данных вручную

In [6]:
# Tiny hand-written table: one tuple per person, with the column labels
# supplied separately instead of as dictionary keys.
df = pd.DataFrame(
    [
        ('Tom', 20, 6.1),
        ('Brad', 21, 5.9),
        ('Kyle', 19, 6.0),
        ('Jerry', 18, 6.1),
    ],
    columns=['Name', 'Age', 'Height'],
)

df

Unnamed: 0,Name,Age,Height
0,Tom,20,6.1
1,Brad,21,5.9
2,Kyle,19,6.0
3,Jerry,18,6.1


Такой способ подойдет, если нужно быстро проверить функциональность, которая не требует большого разнообразия  и объема данных. И даже в этом случае 

## Рабочий вариант
Но долго считается, надо бы оптимизировать

In [3]:
N = 300

# One timestamp per second across the sampling window. Building this range is
# cheap; the expensive step is converting millions of timestamps into Python
# date/time objects. So the values are de-duplicated while still in
# datetime64/timedelta64 form, and only the few distinct ones are converted.
# Lowercase 's' is the seconds alias (uppercase 'S' is deprecated since pandas 2.2).
datetimes = pd.Series(pd.date_range(start='2021-07-01', end='2021-07-30', freq='s'))
midnights = datetimes.dt.normalize()

# Distinct calendar days and distinct times of day, in order of first appearance.
dates = midnights.drop_duplicates().dt.date.tolist()
times = ((datetimes - midnights).drop_duplicates() + pd.Timestamp(0)).dt.time.tolist()

groups_list = ['A', 'B', 'C']
subgroup_list = ['a', 'b', 'c', 'd']

value_min = 1
value_max = 100

# Each column is drawn with a single vectorised call (sampling with
# replacement) instead of one np.random.choice call per row.
df = pd.DataFrame({
    'date': np.random.choice(dates, size=N, replace=True),
    'time': np.random.choice(times, size=N, replace=True),
    'group': np.random.choice(groups_list, size=N, replace=True),
    'sub_group': np.random.choice(subgroup_list, size=N, replace=True),
    # randint excludes its upper bound, hence +1 so value_max stays reachable.
    'value': np.random.randint(value_min, value_max + 1, size=N),
}).sort_values(['date', 'time'])

## Сгенерировать имена

In [29]:
from faker.providers.person.en import Provider

def random_names(name_type, size):
    """
    Sample person names at random and return them as an ndarray.

    name_type: name of the `Provider` attribute that holds the candidate
               names; a string, either first_names or last_names
    size: output shape, handed straight to np.random.choice
    """
    pool = getattr(Provider, name_type)
    picked = np.random.choice(pool, size=size)
    return picked

## Сгенерировать пол

In [24]:
def random_genders(size, p=(0.49, 0.49, 0.01, 0.01)):
    """
    Draw `size` gender codes at random and return them as an ndarray.

    The possible codes are "M", "F", "O" and "" (empty string); `p` holds the
    probability of each code, in that same order.
    """
    codes = ("M", "F", "O", "")
    sample = np.random.choice(codes, size=size, p=p)
    return sample

## Сгенерировать дату

In [25]:
def random_dates(start, end, size):
    """
    Generate random dates within range between start and end.
    Adapted from: https://stackoverflow.com/a/50668285

    start: first day that can be drawn (inclusive). A pd.Timestamp or
           anything pd.Timestamp() accepts, e.g. the string '1940-01-01'.
    end:   upper bound, exclusive -- the day of `end` itself is never drawn.
           Same accepted types as `start`.
    size:  number of dates (or output shape) passed to np.random.randint.

    Returns a DatetimeIndex of whole days, each day in [start, end) being
    equally likely.

    Raises ValueError (from np.random.randint) when `start` does not fall on
    an earlier day than `end`.
    """
    # Timestamp.value is nanoseconds since the Unix epoch, so floor-divide it
    # by the number of nanoseconds in a day to get a whole-day count.
    ns_per_day = 24 * 60 * 60 * 10**9
    # pd.Timestamp() leaves an existing Timestamp unchanged and parses strings
    # or datetime objects, so callers are not forced to pre-convert their bounds.
    start_day = pd.Timestamp(start).value // ns_per_day
    end_day = pd.Timestamp(end).value // ns_per_day
    return pd.to_datetime(np.random.randint(start_day, end_day, size), unit="D")

In [27]:
size = 100

# Assemble the four synthetic columns in a single constructor call; the dict
# entries are evaluated top to bottom, so the random draws happen in the
# order First, Last, Gender, Birthdate.
df = pd.DataFrame({
    'First': random_names('first_names', size),
    'Last': random_names('last_names', size),
    'Gender': random_genders(size),
    'Birthdate': random_dates(
        start=pd.to_datetime('1940-01-01'),
        end=pd.to_datetime('2008-01-01'),
        size=size,
    ),
})

df.head()

Unnamed: 0,First,Last,Gender,Birthdate
0,Abdul,Heathcote,M,1941-07-01
1,Alfred,Schulist,F,1984-03-03
2,Alonza,Blanda,F,1947-04-19
3,Maryanne,Willms,F,1943-05-01
4,Lott,Vandervort,F,2002-09-09


# Генерирующая функция

In [7]:
def generate_fake_dataframe(size, cols, col_names=None, intervals=None, seed=None):
    """
    Build a DataFrame of random data with one column per code in `cols`.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    cols : str
        One character per column: "i" (integer), "f" (float),
        "c" (categorical) or "d" (date).
    col_names : sequence of str, optional
        One name per column. When omitted, names are generated as
        ``column_<position>_<int|float|cat|date>``.
    intervals : list or dict, optional
        Value specification for the columns.

        * list -- one entry per column, in order; an entry of ``None`` falls
          back to the default for that column's type.
        * dict -- overrides of the defaults, keyed by type code.

        An entry has one of these forms, depending on the column type:

        * "i": ``(start, end)`` passed to ``rng.integers`` (``end`` excluded).
        * "f": ``(start, end)`` passed to ``rng.uniform``.
        * "d": ``(start, end)`` passed to ``pd.date_range`` (daily steps).
        * "c": either a list of explicit category values, or a tuple
          ``(family, n)`` where ``family`` is one of "animals", "names",
          "cities", "colors" and ``n`` distinct values are picked from it.

        Defaults: "i" -> (0, 10), "f" -> (0, 100),
        "d" -> ("2020-01-01", "2020-12-31"), "c" -> 5 values from a family,
        rotating through the families (starting with "names") from one
        default categorical column to the next.
    seed : optional
        Passed to ``np.random.default_rng``; use a fixed value for
        reproducible output.

    Returns
    -------
    pandas.DataFrame
        ``size`` rows and ``len(cols)`` columns.

    Raises
    ------
    AssertionError
        If `col_names` or a list `intervals` does not have one entry per
        column, if a dict `intervals` has an unknown key, if an interval is
        neither a 2-tuple nor a list, or if a category family is unknown.
    KeyError
        If `cols` contains a code other than "i", "f", "c", "d".
    """
    # Built-in sample pools for categorical columns, keyed by family name.
    categories_dict = {'animals': ['cow', 'rabbit', 'duck', 'shrimp', 'pig', 'goat', 'crab', 'deer', 'bee', 'sheep', 'fish', 'turkey', 'dove', 'chicken', 'horse'],
                       'names'  : ['James', 'Mary', 'Robert', 'Patricia', 'John', 'Jennifer', 'Michael', 'Linda', 'William', 'Elizabeth', 'Ahmed', 'Barbara', 'Richard', 'Susan', 'Salomon', 'Juan Luis'],
                       'cities' : ['Stockholm', 'Denver', 'Moscow', 'Marseille', 'Palermo', 'Tokyo', 'Lisbon', 'Oslo', 'Nairobi', 'Río de Janeiro', 'Berlin', 'Bogotá', 'Manila', 'Madrid', 'Milwaukee'],
                       'colors' : ['red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'purple', 'pink', 'silver', 'gold', 'beige', 'brown', 'grey', 'black', 'white']
                      }

    default_intervals = {"i": (0, 10), "f": (0, 100), "c": ("names", 5), "d": ("2020-01-01", "2020-12-31")}
    rng = np.random.default_rng(seed)

    # Replace the default family name with an endless iterator over all
    # families (default one first), so that consecutive default "c" columns
    # draw from different pools.
    first_c = default_intervals["c"][0]
    categories_names = cycle([first_c] + [c for c in categories_dict.keys() if c != first_c])
    default_intervals["c"] = (categories_names, default_intervals["c"][1])

    if col_names is None:
        suffix = {"c": "cat", "i": "int", "f": "float", "d": "date"}
        col_names = [f"column_{str(i)}_{suffix.get(col)}" for i, col in enumerate(cols)]
    else:
        # Check the length for any sequence, not just lists: zip() below
        # would otherwise silently drop columns when a tuple is too short.
        col_names = list(col_names)
        assert len(col_names) == len(cols), f"The fake DataFrame should have {len(cols)} columns but col_names is a list with {len(col_names)} elements"

    # Reject unknown type codes up front. Without this, a list `intervals`
    # lets them through and the column is silently missing from the result.
    unknown_codes = [col for col in cols if col not in default_intervals]
    if unknown_codes:
        raise KeyError(f"Unknown column type code(s) {unknown_codes}; expected one of {list(default_intervals)}")

    if isinstance(intervals, list):
        assert len(intervals) == len(cols), f"The fake DataFrame should have {len(cols)} columns but intervals is a list with {len(intervals)} elements"
    else:
        if isinstance(intervals, dict):
            assert len(set(intervals.keys()) - set(default_intervals.keys())) == 0, "The intervals parameter has invalid keys"
            default_intervals.update(intervals)
        intervals = [default_intervals[col] for col in cols]

    df = pd.DataFrame()
    for col, col_name, interval in zip(cols, col_names, intervals):

        if interval is None:
            interval = default_intervals[col]
        # Type checks come first so that a scalar fails with this message
        # rather than with a TypeError from len().
        assert isinstance(interval, list) or (isinstance(interval, tuple) and len(interval) == 2), f"This interval {interval} is neither a tuple of two elements nor a list of strings."

        if col in ("i", "f", "d"):
            start, end = interval
        if col == "i":
            df[col_name] = rng.integers(start, end, size)
        elif col == "f":
            df[col_name] = rng.uniform(start, end, size)

        elif col == "c":
            if isinstance(interval, list):
                # Explicit list of category values supplied by the caller.
                categories = np.array(interval)
            else:
                cat_family, length = interval
                if isinstance(cat_family, cycle):
                    # Default spec: advance to the next built-in family.
                    cat_family = next(cat_family)
                assert cat_family in categories_dict.keys(), f"There are no samples for category '{cat_family}'. Consider passing a list of samples or use one of the available categories: {categories_dict.keys()}"
                # Pick `length` distinct values from the family ...
                categories = rng.choice(categories_dict[cat_family], length, replace = False, shuffle = True)
            # ... then fill the column by sampling them with replacement.
            df[col_name] = rng.choice(categories, size, shuffle = True)

        elif col == "d":
            df[col_name] = rng.choice(pd.date_range(start, end), size)

    return df

In [8]:
# Ten columns, one per code in the string, all with default names and
# default value ranges; no seed, so the output differs from run to run.
generate_fake_dataframe(cols="cififficcd", size=1000)

Unnamed: 0,column_0_cat,column_1_int,column_2_float,column_3_int,column_4_float,column_5_float,column_6_int,column_7_cat,column_8_cat,column_9_date
0,Barbara,9,80.849928,0,6.012882,98.317174,3,rabbit,Manila,2020-08-23
1,Linda,6,33.455242,9,95.104453,78.024633,5,shrimp,Berlin,2020-10-02
2,James,5,21.705653,9,4.823146,86.502068,7,rabbit,Oslo,2020-10-22
3,Linda,4,48.691467,0,12.747958,72.226896,1,rabbit,Manila,2020-09-02
4,Mary,6,44.842061,0,34.605267,64.637698,3,fish,Lisbon,2020-04-18
...,...,...,...,...,...,...,...,...,...,...
995,Barbara,2,76.069030,9,0.301378,75.732919,1,duck,Lisbon,2020-06-06
996,Barbara,2,72.717920,2,43.057862,81.271704,0,fish,Oslo,2020-10-05
997,Linda,9,44.116419,4,72.840126,6.645776,7,turkey,Berlin,2020-02-28
998,Barbara,6,87.901017,8,98.724163,59.319201,3,duck,Palermo,2020-10-22


In [9]:
# Custom column names plus a dict that overrides the default ranges for the
# float ("f") and date ("d") columns only; seeded for a repeatable result.
df1 = generate_fake_dataframe(
    seed=42,
    size=10,
    cols="cccfd",
    col_names=["name", "pet", "city", "height", "birthday"],
    intervals={
        "f": (1.72, 1.95),
        "d": ("1996-01-01", "1996-12-31"),
    },
)

df1

Unnamed: 0,name,pet,city,height,birthday
0,Ahmed,crab,Oslo,1.877101,1996-02-26
1,Elizabeth,crab,Marseille,1.891295,1996-07-25
2,Juan Luis,crab,Marseille,1.942527,1996-09-02
3,Mary,chicken,Marseille,1.79494,1996-12-10
4,Mary,pig,Milwaukee,1.805206,1996-06-08
5,Mary,crab,Berlin,1.827998,1996-02-28
6,Mary,duck,Moscow,1.763578,1996-10-31
7,Elizabeth,pig,Moscow,1.749882,1996-08-18
8,Ahmed,duck,Milwaukee,1.829412,1996-09-13
9,Juan Luis,duck,Moscow,1.772189,1996-02-05


In [14]:
# Per-column specification given as a list, one entry per column in order;
# the trailing None makes the date column use the default date range.
df2 = generate_fake_dataframe(
    seed=None,
    size=30,
    cols="cicffcd",
    col_names=["user", "age", "residence", "weight", "height", "pet", "registered"],
    intervals=[
        ("names", 15),
        (18, 25),
        ("cities", 15),
        (73.2, 95.0),
        (1.65, 1.95),
        ("animals", 11),
        None,
    ],
)

df2.head(10)

Unnamed: 0,user,age,residence,weight,height,pet,registered
0,Mary,19,Palermo,90.333065,1.919427,pig,2020-07-01
1,Elizabeth,21,Manila,91.802986,1.676081,duck,2020-07-29
2,Ahmed,20,Oslo,92.548809,1.707564,shrimp,2020-07-28
3,William,24,Denver,84.665517,1.949388,pig,2020-12-31
4,Richard,18,Bogotá,94.091287,1.774938,deer,2020-03-27
5,Susan,20,Río de Janeiro,91.723516,1.944154,goat,2020-03-21
6,John,18,Bogotá,78.751711,1.677751,chicken,2020-10-22
7,Ahmed,22,Madrid,77.060393,1.668359,goat,2020-03-10
8,Elizabeth,22,Palermo,91.924544,1.78655,duck,2020-07-18
9,William,19,Marseille,82.330672,1.809691,deer,2020-10-04


In [17]:
# Categorical columns may also take an explicit list of values (strings or
# booleans here) instead of a (family, count) tuple.
df3 = generate_fake_dataframe(
    cols="cfcc",
    size=20,
    intervals=[
        ("names", 15),
        (70, 100),
        ["stamina", "endurance", "speed"],
        [True, False],
    ],
    col_names=["runner", "total_time", "skill", "adult"],
)

df3.head(10)

Unnamed: 0,runner,total_time,skill,adult
0,Barbara,96.832613,endurance,False
1,Robert,98.62194,stamina,False
2,Elizabeth,90.850014,endurance,False
3,Susan,79.51677,endurance,True
4,William,98.178075,speed,False
5,Ahmed,87.319557,speed,True
6,Linda,82.272694,stamina,True
7,Juan Luis,77.42533,speed,False
8,Barbara,96.55205,speed,False
9,Robert,95.423712,endurance,False


In [18]:
# Overriding the "c" default with the list [0, 1] makes every categorical
# column a 0/1 indicator.
df4 = generate_fake_dataframe(
    cols="ifccc",
    size=20,
    intervals={"c": [0, 1]},
    col_names=["id", "target", "one_hot_1", "one_hot_2", "one_hot_3"],
)

df4

Unnamed: 0,id,target,one_hot_1,one_hot_2,one_hot_3
0,1,54.514018,0,1,1
1,7,50.004263,0,0,1
2,4,99.574484,1,0,1
3,6,49.298873,1,1,1
4,4,39.752972,0,0,1
5,9,27.386997,1,1,0
6,0,5.189211,0,0,0
7,8,28.502998,1,0,0
8,0,74.035495,0,1,1
9,4,52.992017,1,0,1
