### import libraries

In [1]:
import pandas as pd
from sdv.single_table import CTGANSynthesizer
from ctgan import CTGAN

  from .autonotebook import tqdm as notebook_tqdm


### prepare data

In [5]:
# Read the geocoded address table and, in the same step, add
# years_to_be_approved = ShnatHryter - ShnatBakasha as a new column.
tama_df = pd.read_csv('geocode_tama_address_data.csv').assign(
    years_to_be_approved=lambda frame: frame['ShnatHryter'] - frame['ShnatBakasha']
)

### choose features

In [7]:
# Columns kept from here on; edit this list to experiment with other features.
chosen_features = [
    'ShnatBakasha',
    'latitude',
    'longitude',
    'years_to_be_approved',
]
# Keep only those columns, then discard every row with a missing value in any of them.
tama_df = tama_df.loc[:, chosen_features].dropna(how='any')

Unnamed: 0,ShnatBakasha,latitude,longitude,years_to_be_approved
0,2013.0,31.852940,34.845640,2.0
1,2014.0,31.854477,34.845127,7.0
10,2006.0,31.796681,34.657435,1.0
11,2008.0,31.812108,34.646415,1.0
12,2010.0,31.811985,34.647032,0.0
...,...,...,...,...
10501,2018.0,32.160547,34.911997,2.0
10502,2015.0,32.158662,34.890109,5.0
10503,2017.0,32.153654,34.890050,3.0
10504,2014.0,32.153311,34.890399,8.0


### create the synthetic data

In [11]:
# Train a CTGAN model on the cleaned real rows and sample synthetic rows from it.
RANDOM_SEED = 42  # fixed so a fresh Restart & Run All regenerates the same synthetic rows
SYNTHETIC_MULTIPLIER = 10  # synthetic rows generated per real row; consider changing the size

# Initialize synthesizer
synthesizer = CTGAN()
# Seed the model's random state; training and sampling are otherwise non-deterministic.
synthesizer.set_random_state(RANDOM_SEED)

# Fit synthesizer to data.
# Select chosen_features explicitly so that re-running this cell after the
# "mix" cell has added an is_real column to tama_df does not train on that flag.
synthesizer.fit(tama_df[chosen_features])

# Generate synthetic data
size_synthetic_data = len(tama_df) * SYNTHETIC_MULTIPLIER
synthetic_data = synthesizer.sample(size_synthetic_data)

# No discrete columns are passed to fit(), so every column is sampled as a
# continuous value. Both year-valued columns hold whole numbers in the real data
# shown above, so round both of them (the original rounded only the second).
whole_year_columns = ['ShnatBakasha', 'years_to_be_approved']
synthetic_data[whole_year_columns] = synthetic_data[whole_year_columns].round(0)
synthetic_data.shape

(49840, 4)

### Save synthetic data to file

In [14]:
# Write the generated rows to CSV; index=False leaves the DataFrame index out of the file.
synthetic_data.to_csv(path_or_buf='tama_synthetic_data.csv', index=False)

### Mix real data with the synthetic data (flagged by `is_real`)

In [15]:
# Tag each row with its origin (is_real) and stack the two sets into one frame.
# .assign() returns tagged copies, so synthetic_data and tama_df themselves are
# left without an is_real column and the fit/save cells stay safe to re-run
# after this one.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# carrying over each input's own row labels.
mix_tama_df = pd.concat(
    [synthetic_data.assign(is_real=False), tama_df.assign(is_real=True)],
    ignore_index=True,
)

### Save mix data to file

In [16]:
# Write the combined real + synthetic rows to CSV; index=False leaves the DataFrame index out of the file.
mix_tama_df.to_csv(path_or_buf='mix_tama_data.csv', index=False)