In [1]:
%pip install SDV

Note: you may need to restart the kernel to use updated packages.


In [22]:
import sdv

print(sdv.version.public)

1.11.0


In [66]:
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()
metadata.detect_from_csv(filepath='data/CompetenceContractorFullJoin.csv')

python_dict = metadata.to_dict()
metadata.remove_primary_key() #satte primarykey till firstname
print(metadata)
#metadata.visualize(
#    show_table_details='full',
#    output_filepath='my_metadata.png'
#)

{
    "columns": {
        "ContractorId": {
            "sdtype": "numerical"
        },
        "ContractorGUID": {
            "sdtype": "categorical"
        },
        "ContractorSource": {
            "sdtype": "categorical"
        },
        "ContractorSourceId": {
            "sdtype": "numerical"
        },
        "FirstName": {
            "sdtype": "first_name",
            "pii": true
        },
        "LastName": {
            "sdtype": "last_name",
            "pii": true
        },
        "PhotoUrl": {
            "sdtype": "categorical"
        },
        "Nationality": {
            "sdtype": "numerical"
        },
        "BirthDay": {
            "sdtype": "numerical"
        },
        "ContractorUpdated": {
            "sdtype": "datetime",
            "datetime_format": "%Y-%m-%d %H:%M:%S.%f"
        },
        "CompetenceId": {
            "sdtype": "numerical"
        },
        "CompetenceGUID": {
            "sdtype": "unknown",
            "pii": true
   

In [67]:
from sdv.datasets.local import load_csvs
real_data_dict = load_csvs(
    folder_name='data',
    read_csv_parameters={
        'skipinitialspace': True,
        'encoding': 'utf_8'
    })
table_key = 'CompetenceContractorFullJoin'
real_data = real_data_dict[table_key] #ingen aning om varför





In [68]:
from sdv.single_table import CTGANSynthesizer
synthesizer = CTGANSynthesizer(metadata)

synthesizer.fit(real_data)




In [6]:
synthesizer.get_transformers()

{'ContractorId': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'ContractorGUID': None,
 'ContractorSource': None,
 'ContractorSourceId': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'FirstName': AnonymizedFaker(provider_name='person', function_name='first_name', locales=['en_US']),
 'LastName': AnonymizedFaker(provider_name='person', function_name='last_name', locales=['en_US']),
 'PhotoUrl': None,
 'Nationality': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'BirthDay': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'ContractorUpdated': UnixTimestampEncoder(datetime_format='%Y-%m-%d %H:%M:%S.%f', enforce_min_max_values=True),
 'CompetenceId': IDGenerator(),
 'CompetenceGUID': AnonymizedFaker(function_name='bothify', function_kwargs={'text': 'sdv-pii-?????', 'letters': '0123456789abcdefghijklmnopqrstuvwxyz'}),
 'CompetenceSourceId': AnonymizedFaker(function_name='bothify', fu

In [74]:
synthetic_data = synthesizer.sample(num_rows=100)
synthetic_data.head()


Unnamed: 0,ContractorId,ContractorGUID,ContractorSource,ContractorSourceId,FirstName,LastName,PhotoUrl,Nationality,BirthDay,ContractorUpdated,...,CompetenceSource,Acquired,Expires,Lang,Qualification_Id,Contractor_Id,Created,CreatedBy,Modified,ModifiedBy
0,405485,21c5119d-e8a1-43ec-a7a0-0045b1a98ae1,0,214612,Jill,Garcia,https://ssgcardissuestaging.blob.core.windows....,,,2023-09-20 15:18:29.192236,...,,2014-11-25 17:00:28.757724,sdv-pii-qh1w2,,31.0,405374.0,2023-09-20 17:45:33.050339,siteaccesscontrol,,
1,405337,967858cc-5e00-474d-9b7c-003fcc428dc8,0,52440,Ronald,Simmons,https://ssgcardissuestaging.blob.core.windows....,,,2023-09-20 15:18:29.192236,...,Ssg,2013-03-20 20:07:19.370738,sdv-pii-crxx8,,135.0,405453.0,2023-09-20 19:26:33.979373,siteaccesscontrol,,
2,405464,53dab2b6-5d3b-4b95-9638-0002199a45cb,0,148890,Rebecca,Hernandez,https://ssgcardissuestaging.blob.core.windows....,,,2023-09-20 15:18:29.192236,...,Ssg,2017-12-11 03:24:46.858807,sdv-pii-bxsv1,46.0,194.0,405425.0,2023-09-20 20:00:29.616774,siteaccesscontrol,,
3,405459,e3e9ef2a-ebe2-40fd-8bf9-00c2ebab46d7,0,210365,Andrew,Anderson,https://ssgcardissuestaging.blob.core.windows....,,,2023-09-20 15:18:29.192236,...,Ssg,2019-07-19 11:56:11.528611,sdv-pii-gpazd,46.0,199.0,405330.0,2023-09-20 18:14:56.465353,siteaccesscontrol,,
4,405494,bb84a135-9a57-4bbf-afa8-0094f376a11b,0,6253,Abigail,Proctor,https://ssgcardissuestaging.blob.core.windows....,,,2023-09-30 21:18:54.301026,...,Ssg,2019-11-25 07:52:51.492999,sdv-pii-rpd66,44.0,454.0,405375.0,2023-09-20 20:26:37.192335,siteaccesscontrol,,


In [75]:
synthetic_data.to_csv('synthetic_data.csv', index=False)

In [73]:
from sdv.evaluation.single_table import run_diagnostic

diagnostic = run_diagnostic(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata
)

Generating report ...
(1/2) Evaluating Data Validity: : 100%|██████████| 23/23 [00:00<00:00, 1140.04it/s]
(2/2) Evaluating Data Structure: : 100%|██████████| 1/1 [00:00<00:00, 486.07it/s]

Overall Score: 99.61%

Properties:
- Data Validity: 99.21%
- Data Structure: 100.0%


  return valid.sum() / len(synthetic_data)


In [74]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    real_data,
    synthetic_data,
    metadata
)

Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 23/23 [00:00<00:00, 1852.61it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 253/253 [00:01<00:00, 140.27it/s]

Overall Score: 80.04%

Properties:
- Column Shapes: 82.85%
- Column Pair Trends: 77.23%


In [16]:
quality_report.get_details('Column Shapes')

Unnamed: 0,Column,Metric,Score,Error
0,ContractorId,KSComplement,0.73,
1,ContractorGUID,TVComplement,0.87,
2,ContractorSource,TVComplement,1.0,
3,ContractorSourceId,KSComplement,0.8,
4,PhotoUrl,TVComplement,0.94,
5,Nationality,KSComplement,,ValueError: Data passed to ks_2samp must not b...
6,BirthDay,KSComplement,,ValueError: Data passed to ks_2samp must not b...
7,ContractorUpdated,KSComplement,0.56,
8,CompetenceSource,TVComplement,1.0,
9,Acquired,KSComplement,0.83,


In [76]:
import pandas as pd
import uuid
from datetime import timedelta


synthetic_data = pd.read_csv('synthetic_data.csv')


competence_columns = [
    'CompetenceId', 'CompetenceGUID', 'CompetenceSourceId', 'CompetenceSource',
    'Acquired', 'Expires', 'Lang', 'Qualification_Id', 'Contractor_Id', 'Created',
    'CreatedBy', 'Modified', 'ModifiedBy'
]

contractor_columns = [
    'ContractorId', 'ContractorGUID', 'ContractorSource', 'ContractorSourceId',
    'FirstName', 'LastName', 'PhotoUrl', 'Nationality', 'BirthDay', 'ContractorUpdated'
]

competence_data = synthetic_data[competence_columns].rename(columns={
    'CompetenceId': 'Id',
    'CompetenceGUID': 'CompetenceId',
    'CompetenceSourceId': 'SourceId',
    'CompetenceSource': 'Source'
})

contractor_data = synthetic_data[contractor_columns].rename(columns={
    'ContractorId': 'Id',
    'ContractorGUID' : 'ContractorID',
    'ContractorSource': 'Source',
    'ContractorSourceId': 'SourceId'
})


In [77]:
def format_columns_as_uuids(table, column_names):
    for column in column_names:
        table[column] = [str(uuid.uuid4()) for _ in range(len(table))]
    return table

competence_data = format_columns_as_uuids(competence_data, [ 'SourceId'])
competence_data = format_columns_as_uuids(competence_data, ['CompetenceId'])
contractor_data = format_columns_as_uuids(contractor_data, ['ContractorId', 'SourceId'])

competence_data['Acquired'] = pd.to_datetime(competence_data['Acquired'])

competence_data['Expires'] = competence_data['Acquired'] + pd.DateOffset(years=2)

contractor_data['BirthDay'] = 'NULL'
competence_data['Modified'] = 'NULL'
competence_data['ModifiedBy'] = 'NULL'

competence_data['CreatedBy'] = 'CTGAN'
contractor_data['PhotoUrl'] = 'CTGAN'

In [64]:

contractor_data.head(3)

Unnamed: 0,Id,ContractorID,Source,SourceId,FirstName,LastName,PhotoUrl,Nationality,BirthDay,ContractorUpdated,ContractorId
0,406984,04379f5a-4042-495c-b558-06b3100cff44,0,b5006490-a5e4-461e-9dd4-a47ab505c567,Crystal,Martinez,CTGAN,,,2023-09-20 15:18:29.192236,ded51f0d-7844-4ec3-97ea-454c5bd19793
1,406309,30a7feb5-ba6c-4284-813e-053b9935b748,0,f1da01b5-adaf-48a3-a265-f47f470cd41e,Rachel,Erickson,CTGAN,,,2023-09-20 15:18:29.192236,c63f39db-20df-4b2c-aa2e-b339b28d7904
2,405729,ce9b421f-e7fc-4ef7-b2e0-05f57dfe1c26,0,4255be15-72b4-4a67-a220-f51f19b120e8,Antonio,Cisneros,CTGAN,,,2023-09-20 15:18:29.192236,673d8281-509c-4a0f-b72d-c569e95e3c5d


In [79]:
competence_data.to_csv('competence_CTGANdata_1000_100.csv', index=False)
contractor_data.to_csv('contractor_CTGANdata_1000_100.csv', index=False)