### Installing Gecko (https://github.com/ul-mds/gecko/tree/main)

In [None]:
!pip install gecko-syndata

### Importing libraries

In [179]:
from gecko import generator, mutator
import pandas as pd
import numpy as np
import random

`generate_phone_number_poland` --> custom function created as an example for the `generator.from_function` option.

`generate_data_frame` --> creates the attributes (columns). Each `generator.from_frequency_table` call needs a CSV file; `value_column` and `freq_column` are the names of the headers in that CSV file.

`mutate_data_frame` --> corrupts the DataFrame returned by `generate_data_frame`. The available mutator options are ["with_cldr_keymap_file",
    "with_phonetic_replacement_table",
    "with_replacement_table",
    "with_missing_value",
    "with_insert",
    "with_delete",
    "with_transpose",
    "with_substitute",
    "with_edit",
    "with_noop",
    "with_categorical_values",
    "with_function",
    "with_permute",
    "with_lowercase",
    "with_uppercase",
    "with_datetime_offset",]

In [185]:
def generate_phone_number_poland(rng: "np.random.Generator | None" = None):
  """Randomly generate a Polish telephone number made of a two-digit prefix and seven-digit number.

  Args:
    rng: Optional NumPy random generator. When given, every random draw comes
      from it, so a seeded generator yields reproducible numbers. When omitted
      (the default), the global ``random`` module is used, exactly as before.

  Returns:
    A nine-character string of digits: one of the fixed two-digit prefixes
    followed by seven random digits.
  """
  prefixes = ["45", "50", "51", "53", "57", "60", "66", "69", "72", "73", "78", "79", "88"]

  if rng is None:
    # Backward-compatible path: draws from the process-wide `random` state.
    prefix = random.choice(prefixes)
    digits = [random.randint(0, 9) for _ in range(7)]
  else:
    # Reproducible path: all draws come from the caller-supplied generator.
    prefix = str(rng.choice(prefixes))
    digits = rng.integers(0, 10, size=7)  # upper bound is exclusive -> digits 0..9

  return prefix + "".join(str(digit) for digit in digits)


def generate_data_frame(count: int, rng: np.random.Generator, data_dir: str = "/content"):
  """Build a synthetic data frame of person records using Gecko generators.

  Columns produced: first_name, last_name, city, postcode, age, income and
  phone_number.

  Args:
    count: Number of rows to generate.
    rng: NumPy random generator passed to every Gecko generator that takes one.
    data_dir: Directory (without trailing slash) containing the frequency-table
      CSV files. Defaults to "/content", the path that was previously
      hard-coded, so existing calls behave exactly as before.

  Returns:
    The result of ``generator.to_data_frame`` for the columns listed above.
  """

  def frequency_generator(file_name: str, value_column: str):
    # Every frequency table used here stores its frequencies in a 'count' column.
    return generator.from_frequency_table(
        f"{data_dir}/{file_name}",
        value_column=value_column,
        freq_column='count',
        rng=rng,
    )

  gen_first_name = frequency_generator("polish_female_firstnames.csv", "first_name")
  gen_last_name = frequency_generator("polish_female_surnames.csv", "surname")
  gen_city = frequency_generator("city_freq.csv", "city")
  gen_postcode = frequency_generator("pl_state_district_mapping.csv", "postcode")

  # Age: uniform draw configured with low=18, high=100, no decimal places.
  gen_age = generator.from_uniform_distribution(
      low=18,
      high=100,
      precision=0,
      rng=rng,
  )

  # Income: normal draw with mean 7000 and standard deviation 2000, two decimals.
  gen_income = generator.from_normal_distribution(
      mean=7000,
      sd=2000,
      precision=2,
      rng=rng,
  )

  # NOTE: called without arguments, generate_phone_number_poland draws from the
  # global `random` module, so this column is not controlled by `rng`.
  gen_phone_number = generator.from_function(
      func=generate_phone_number_poland
  )

  return generator.to_data_frame(
      {
          'first_name': gen_first_name,
          'last_name': gen_last_name,
          'city': gen_city,
          'postcode': gen_postcode,
          'age': gen_age,
          'income': gen_income,
          'phone_number': gen_phone_number,
      },
      count
  )

def mutate_data_frame(df: pd.DataFrame, rng: np.random.Generator):
  """Corrupt ``df`` column by column with Gecko mutators and return the result.

  Args:
    df: Data frame holding the columns first_name, last_name, city, postcode,
      age, income and phone_number.
    rng: NumPy random generator handed to the mutators that accept one and to
      ``mutator.mutate_data_frame`` itself.

  Returns:
    Whatever ``mutator.mutate_data_frame`` returns for the specification below.
  """
  # Maps a column name to its list of (probability of mutation, mutator) pairs.
  # A single column may carry more than one pair.
  mutators_by_column = {
      'first_name': [(0.2, mutator.with_delete(rng=rng))],
      'last_name': [(0.2, mutator.with_insert(rng=rng))],
      'city': [(0.3, mutator.with_lowercase())],
      'postcode': [(0.2, mutator.with_delete(rng=rng))],
      'age': [(0.3, mutator.with_missing_value(np.nan, "all"))],
      'income': [(0.2, mutator.with_edit(rng=rng))],
      'phone_number': [(0.2, mutator.with_missing_value(np.nan, 'all'))],
  }

  # Gecko is given a list of (column, mutators) tuples, in insertion order.
  column_specs = list(mutators_by_column.items())
  return mutator.mutate_data_frame(df, column_specs, rng)

In [186]:
count = 1000 # number of rows to generate
seed = 42

# Seed both random sources: the Gecko generators/mutators draw from `rng`, while
# generate_phone_number_poland draws from the global `random` module. Without
# this, phone numbers differ on every Restart & Run All.
random.seed(seed)
rng = np.random.default_rng(seed)

generated_df = generate_data_frame(count, rng=rng)
mutated_df = mutate_data_frame(df=generated_df, rng=rng)

In [187]:
# Preview the clean, generated records (the notebook shows a truncated view of the frame).
generated_df

Unnamed: 0,first_name,last_name,city,postcode,age,income,phone_number
0,Mirela,Brzezińska,Piła,61-556,77,4802.15,450617008
1,Nina,Maj,Konin,60-858,47,5917.37,694145740
2,Cyryla,Czerwińska,Szamotuły,60-142,88,7544.00,502517822
3,Balbina,Domańska,Gniezno,61-387,38,5743.36,504209735
4,Genowefa,Przybylska,Poznań,63-740,62,6443.27,781818985
...,...,...,...,...,...,...,...
995,Nina,Szewczyk,Konin,62-213,36,4441.22,669017802
996,Kunegunda,Kamińska,Gniezno,64-412,32,9221.13,530083184
997,Maria,Kowalska,Poznań,60-572,57,5995.75,518741917
998,Ludwina,Walczak,Ostrów Wielkopolski,60-328,47,5705.90,785076560


In [188]:
# Preview the corrupted records; compare with the generated_df preview to spot the mutations.
mutated_df

Unnamed: 0,first_name,last_name,city,postcode,age,income,phone_number
0,Mirela,Brzezińska,Piła,61-556,77,4802.15d,450617008
1,Nin,Mpaj,konin,60-858,47,5917.37,
2,Cyryla,Czerwińska,Szamotuły,60-142,88,7544.00,
3,Balbina,DomańskaL,Gniezno,61-387,38,5743.36,504209735
4,Genowefa,PrSzybylska,poznań,63-740,62,64432.7,781818985
...,...,...,...,...,...,...,...
995,Nina,Szewczyk,konin,62-213,36,4441.22,669017802
996,Knegunda,Kamińska,gniezno,64-412,,9221.13d,530083184
997,Mria,Kowalska,Poznań,60-572,57,5995.7G5,518741917
998,Ludwina,Walczak,ostrów wielkopolski,60-328,47,5705.90,785076560


In [191]:
# Persist the clean generated records; the row index is written as well (pandas default).
generated_df.to_csv("/content/GECKO_GENERATION_RESULT.csv")

In [189]:
# Persist the corrupted records; the row index is written as well (pandas default).
mutated_df.to_csv("/content/GECKO_CORRUPTION_RESULT.csv")