### Installing https://github.com/T-Strojny/geco3/tree/main

In [None]:
!pip install git+https://github.com/T-Strojny/geco3.git

Importing libraries, setting seed

In [None]:
from geco3 import basefunctions, attrgenfunct, contdepfunct
from geco3 import generator, corruptor

# Seed the standard-library RNG once, up front, so repeated runs start from the
# same state.
# NOTE(review): presumably geco3 draws from the stdlib `random` module - confirm;
# any other RNG it uses would need its own seed.
import random

random.seed(42)

Unicode encoding used for reading and writing files; available options --> https://docs.python.org/3/library/codecs.html#standard-encodings

In [None]:
# Encoding name handed to every geco3 object in this notebook that takes a
# `unicode_encoding` argument.
unicode_encoding_used = "utf-8"

`rec_id_attr_name` --> name of the record-ID column; `out_file_name` --> CSV file the generated records are written to


In [None]:
# Name of the record-identifier attribute; passed to GenerateDataSet below.
rec_id_attr_name = "rec-id"

# Output file; passed to GenerateDataSet as `output_file_name`.
out_file_name = "GECO3_CORRUPTION_RESULT_2.csv"

Set how many original records and how many corrupted duplicate records to generate

In [None]:
# Number of original records to generate (GenerateDataSet `number_of_records`,
# CorruptDataSet `number_of_org_records`).
num_org_rec = 1_000

# Number of corrupted duplicates to derive from them
# (CorruptDataSet `number_of_mod_records`).
num_dup_rec = 1_000

In [None]:
# Upper bound on duplicates per original record
# (CorruptDataSet `max_num_dup_per_rec`).
max_duplicate_per_record = 3

# Name of the distribution for the number of duplicates per record
# (CorruptDataSet `num_dup_dist`).
num_duplicates_distribution = "zipf"

In [None]:
# Maximum number of modifications applied to one attribute
# (CorruptDataSet `max_num_mod_per_attr`).
max_modification_per_attr = 1

# Number of modifications applied to one duplicate record
# (CorruptDataSet `num_mod_per_rec`).
num_modification_per_record = 5

In [None]:
# Validate the configured encoding name before any file is opened with it.
# NOTE(review): presumably raises if the encoding is unknown - confirm against
# geco3.basefunctions.
basefunctions.check_unicode_encoding_exists(unicode_encoding_used)

### Creating attributes

Data for female given names --> https://www.kaggle.com/datasets/djablo/list-of-polish-first-and-last-names?resource=download

Surnames generated by AI

In [None]:
# 'given-name' attribute backed by the Polish female first-names file.
# NOTE(review): presumably values are drawn according to the frequencies in the
# file - confirm against geco3.generator.GenerateFreqAttribute.
given_name_attr = generator.GenerateFreqAttribute(
    attribute_name="given-name",
    freq_file_name="/content/polish_female_firstnames.csv",
    has_header_line=True,
    unicode_encoding=unicode_encoding_used,
)

In [None]:
# 'surname' attribute backed by the Polish female surnames file (first line of
# the file is a header).
surname_attr = generator.GenerateFreqAttribute(
    attribute_name="surname",
    freq_file_name="/content/polish_female_surnames.csv",
    has_header_line=True,
    unicode_encoding=unicode_encoding_used,
)

postcode data --> https://www.kaggle.com/datasets/ravikanth/poland-postal-codes-zip-state-counties-districts?select=pl_state_district_mapping.csv

In [None]:
# 'postcode' attribute backed by the state/district mapping file.
# NOTE(review): the file is used here as a frequency file - verify its columns
# match what GenerateFreqAttribute expects.
postcode_attr = generator.GenerateFreqAttribute(
    attribute_name="postcode",
    freq_file_name="/content/pl_state_district_mapping.csv",
    has_header_line=True,
    unicode_encoding=unicode_encoding_used,
)

custom function created for the example

In [None]:
# 'telephone-number' attribute produced by calling the Polish phone-number
# generator function; no extra parameters are supplied.
phone_number_attribute = generator.GenerateFuncAttribute(
    attribute_name="telephone-number",
    function=attrgenfunct.generate_phone_number_poland,
)

In [None]:
# 'credit-card-number' attribute produced by a parameterless generator function.
credit_card_attr = generator.GenerateFuncAttribute(
    attribute_name="credit-card-number",
    function=attrgenfunct.generate_credit_card_number,
)

# 'age-uniform' attribute; the generator function receives [0, 100].
# NOTE(review): presumably the minimum and maximum age - confirm.
age_uniform_attr = generator.GenerateFuncAttribute(
    attribute_name="age-uniform",
    function=attrgenfunct.generate_uniform_age,
    parameters=[0, 100],
)

In [None]:
# 'income-normal' attribute produced by the normal-value generator function.
# NOTE(review): the parameters look like (mean, std-dev, minimum, maximum,
# output format) - confirm the order against attrgenfunct.generate_normal_value.
income_normal_attr = generator.GenerateFuncAttribute(
    attribute_name="income-normal",
    function=attrgenfunct.generate_normal_value,
    parameters=[50000, 20000, 0, 1000000, "float2"],
)

In [None]:
# 'city' attribute backed by the city frequency file.
# The header flag is spelled `has_header_line`, matching the other
# GenerateFreqAttribute calls in this notebook (it was `has_header` here).
city_attr = generator.GenerateFreqAttribute(
    attribute_name="city",
    freq_file_name="/content/city_freq.csv",
    has_header_line=True,
    unicode_encoding=unicode_encoding_used,
)

### Creating variables for corruptor

In [None]:
# Edit corruptor restricted to insertions and deletions (50/50); substitutions
# and transpositions are disabled. Uses the `position_mod_normal` position
# function.
edit_corruptor = corruptor.CorruptValueEdit(
    position_function=corruptor.position_mod_normal,
    char_set_funct=basefunctions.char_set_ascii,
    insert_prob=0.5,
    delete_prob=0.5,
    substitute_prob=0.0,
    transpose_prob=0.0,
)

# Edit corruptor with all four operations equally likely. Uses the
# `position_mod_uniform` position function.
edit_corruptor2 = corruptor.CorruptValueEdit(
    position_function=corruptor.position_mod_uniform,
    char_set_funct=basefunctions.char_set_ascii,
    insert_prob=0.25,
    delete_prob=0.25,
    substitute_prob=0.25,
    transpose_prob=0.25,
)

In [None]:
# Categorical corruptor driven by the surname misspellings lookup file (first
# line of the file is a header).
surname_misspell_corruptor = corruptor.CorruptCategoricalValue(
    lookup_file_name="/content/misspell_polish_surnames.csv",
    has_header_line=True,
    unicode_encoding=unicode_encoding_used,
)

In [None]:
# OCR-style corruptor driven by the OCR variations lookup file.
ocr_corruptor = corruptor.CorruptValueOCR(
    position_function=corruptor.position_mod_normal,
    lookup_file_name="/content/ocr-variations.csv",
    has_header_line=True,
    unicode_encoding=unicode_encoding_used,
)

# Keyboard-typo corruptor; row and column probabilities are equal.
keyboard_corruptor = corruptor.CorruptValueKeyboard(
    position_function=corruptor.position_mod_normal,
    row_prob=0.5,
    col_prob=0.5,
)

# Phonetic corruptor driven by the phonetic variations lookup file.
phonetic_corruptor = corruptor.CorruptValuePhonetic(
    lookup_file_name="/content/phonetic-variations.csv",
    has_header_line=True,
    unicode_encoding=unicode_encoding_used,
)

# Missing-value corruptor with the library's default replacement value.
missing_val_corruptor = corruptor.CorruptMissingValue()

# Missing-value corruptors with explicit replacement strings.
# NOTE(review): the keyword is spelled `missing_val` in the first call and
# `missing_value` in the second - confirm geco3 accepts both spellings.
postcode_missing_val_corruptor = corruptor.CorruptMissingValue(
    missing_val="missing",
)

given_name_missing_val_corruptor = corruptor.CorruptMissingValue(
    missing_value="unknown",
)

In [None]:
# Attribute names handed to GenerateDataSet and CorruptDataSet.
attr_name_list = [
    "given-name",
    "surname",
    "postcode",
    "city",
    "telephone-number",
    "credit-card-number",
    "income-normal",
    "age-uniform",
]

# Attribute generator objects handed to GenerateDataSet.
# NOTE(review): the last two entries are in the opposite order to
# `attr_name_list` (age before income). The sample output lists income before
# age, so generators are presumably matched by attribute name - confirm.
attr_data_list = [
    given_name_attr,
    surname_attr,
    postcode_attr,
    city_attr,
    phone_number_attribute,
    credit_card_attr,
    age_uniform_attr,
    income_normal_attr,
]

Configure the dataset generator

In [None]:
# Dataset generator wired up from the settings and attribute definitions above.
# Nothing is generated here; records are produced when `.generate()` is called.
test_data_generator = generator.GenerateDataSet(
    output_file_name=out_file_name,
    write_header_line=True,
    rec_id_attr_name=rec_id_attr_name,
    number_of_records=num_org_rec,
    attribute_name_list=attr_name_list,
    attribute_data_list=attr_data_list,
    unicode_encoding=unicode_encoding_used,
)

Set the probability of each attribute being selected for modification; the probabilities must sum to 1

In [None]:
# Per-attribute modification probabilities, passed to CorruptDataSet as
# `attr_mod_prob_dict`. The eight values add up to 1.0.
attr_mod_prob_dictionary = {
    "given-name": 0.2,
    "surname": 0.2,
    "postcode": 0.1,
    "city": 0.1,
    "income-normal": 0.1,
    "telephone-number": 0.10,
    "credit-card-number": 0.15,
    "age-uniform": 0.05,
}

For each attribute, set the probability of each corruptor being used; the probabilities for one attribute must sum to 1

In [None]:
# For every attribute: a list of (probability, corruptor) pairs, passed to
# CorruptDataSet as `attr_mod_data_dict`. The probabilities within each list
# add up to 1.0.
attr_mod_data_dictionary = {
    "surname": [
        (0.1, surname_misspell_corruptor),
        (0.1, ocr_corruptor),
        (0.1, keyboard_corruptor),
        (0.7, phonetic_corruptor),
    ],
    "given-name": [
        (0.1, edit_corruptor2),
        (0.1, ocr_corruptor),
        (0.1, keyboard_corruptor),
        (0.7, phonetic_corruptor),
    ],
    "postcode": [
        (0.8, keyboard_corruptor),
        (0.2, postcode_missing_val_corruptor),
    ],
    "city": [
        (0.1, edit_corruptor),
        (0.1, missing_val_corruptor),
        (0.4, keyboard_corruptor),
        (0.4, phonetic_corruptor),
    ],
    "income-normal": [
        (0.4, missing_val_corruptor),
        (0.6, edit_corruptor),
    ],
    "age-uniform": [(1.0, edit_corruptor2)],
    "telephone-number": [(1.0, missing_val_corruptor)],
    "credit-card-number": [(1.0, edit_corruptor)],
}


Configure the dataset corruptor

In [None]:
# Dataset corruptor wired up from the settings and probability dictionaries
# above. Nothing is corrupted here; that happens when `.corrupt_records()` is
# called.
test_data_corruptor = corruptor.CorruptDataSet(
    number_of_org_records=num_org_rec,
    number_of_mod_records=num_dup_rec,
    attribute_name_list=attr_name_list,
    max_num_dup_per_rec=max_duplicate_per_record,
    num_dup_dist=num_duplicates_distribution,
    max_num_mod_per_attr=max_modification_per_attr,
    num_mod_per_rec=num_modification_per_record,
    attr_mod_prob_dict=attr_mod_prob_dictionary,
    attr_mod_data_dict=attr_mod_data_dictionary,
)

Run everything


In [None]:
# Step 1: generate the original records.
rec_dict = test_data_generator.generate()
assert len(rec_dict) == num_org_rec  # exactly the requested number of originals

# Step 2: corrupt originals into duplicates. The returned dictionary holds the
# originals together with the new duplicates, as the size check below requires.
rec_dict = test_data_corruptor.corrupt_records(rec_dict)
assert len(rec_dict) == num_org_rec + num_dup_rec  # originals plus duplicates

# Step 3: write the generated data to the output file.
test_data_generator.write()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
      Modified attribute value: ''
  Selected attribute for modification: given-name
    Selected corruptor: OCR value
      Original attribute value: 'Amelia'
      Modified attribute value: 'Amelja'
  Selected attribute for modification: postcode
    Selected corruptor: Keybord value
      Original attribute value: '60-332'
      Modified attribute value: '60-33w'
  Selected attribute for modification: credit-card-number
    Selected corruptor: Edit operation
      Original attribute value: '0465 7482 4154 5876'
      Modified attribute value: '0465 7482 4154 58576'
Original record:
  ['Amelia', 'Nowicka', '60-332', 'Gniezno', '539205146', '0465 7482 4154 5876', '34436.26', '82']
Record with 5 modified attributes
(1 in given-name, 1 in surname, 1 in postcode, 1 in telephone-number, 1 in credit-card-number,):
  ['Amelja', 'Noicka', '60-33w', 'Gniezno', '', '0465 7482 4154 58576', '34436.26', '82']
829 of 1000 duplicate r

In [None]:
import pandas as pd
import numpy as np

# Read back the file this notebook wrote (`out_file_name`), not a hard-coded
# path: the previous literal pointed at a different file name, so the preview
# could show stale data from an earlier run.
# Every column is loaded as text so identifiers such as phone numbers are not
# turned into floats (e.g. 691416335.0) by numeric type inference; corrupted
# values are arbitrary strings anyway.
df = pd.read_csv(out_file_name, dtype=str, encoding=unicode_encoding_used)
df.head(15)

Unnamed: 0,rec-id,given-name,surname,postcode,city,telephone-number,credit-card-number,income-normal,age-uniform
0,rec-0000-org,Eugenia,Baranowska,60-139,Gniezno,691416335.0,6758 5786 8463 8777,65161.02,1.0
1,rec-0001-org,Julianna,Rutkowska,61-059,Poznań,508275539.0,4409 2995 3116 8176,37997.72,74.0
2,rec-0002-dup-0,fecylia,Wajilewska,62-120,,,0109 48872 0693 7902,80722.63,37.0
3,rec-0002-org,Cecylia,Wasilewska,62-120,Gniezno,539521117.0,0109 8872 0693 7902,80722.63,37.0
4,rec-0003-dup-0,Lupomira,Góreka,6o-543,Poznań,,78737 8860 0534 8233,48330.41,35.0
5,rec-0003-dup-1,Lupomira,Góreka,60-543,Pojnań,576392086.0,7837 8860 05348233,48330.41,53.0
6,rec-0003-org,Lubomira,Górecka,60-543,Poznań,576392086.0,7837 8860 0534 8233,48330.41,35.0
7,rec-0004-dup-0,Wiejława,Zaadzka,60-76t,Poznań,,4255 3067 87988 4307,47458.99,24.0
8,rec-0004-dup-1,Wisława,Zaeadzka,60-765,Poznań,,4255 3067 988 4307,47458.99,42.0
9,rec-0004-org,Wiesława,Zawadzka,60-765,Poznań,453691559.0,4255 3067 8988 4307,47458.99,24.0
