### import libraries

In [3]:
import pandas as pd
from sdv.single_table import CTGANSynthesizer
from ctgan import CTGAN

### prepare data

In [5]:
# Read the engineered dataset from the CSV in the working directory.
# The bare name on the last line makes the frame the cell's displayed output.
tama_df = pd.read_csv(filepath_or_buffer='engineered_tama.csv')
tama_df

Unnamed: 0.1,Unnamed: 0,MezheYeshut,GISId,MisparYeshut,MisparYeshutAv,AchuzHachtamot,AchuzHachtamotMitcham,Bayt,GushChelka,KamutDirot,...,Yeshut,Yeshuv,BakashaLeHeter,Heter1,Ichlus1,address,coordinates,latitude,longitude,years_to_be_approved
0,0,16-640988,640988,16000146,,,,1,,,...,"תמ""א",מזכרת בתיה,2013-08-28,2015-11-17,2020-03-24,"אלון יגאל 1, מזכרת בתיה","(31.8529404, 34.8456398)",31.852940,34.845640,2.0
1,1,16-640984,640984,16002161,,,,23,,,...,"תמ""א",מזכרת בתיה,2014-03-24,2021-10-24,,"שד אליהו 23, מזכרת בתיה","(31.854477, 34.8451273)",31.854477,34.845127,7.0
2,2,16-640985,640985,16000145,,,0.0,32,,,...,"תמ""א",מזכרת בתיה,2016-11-14,,,"שד אליהו 32, מזכרת בתיה","(31.854044, 34.8453213)",31.854044,34.845321,
3,3,16-640989,640989,16000147,,,,5,,,...,"תמ""א",מזכרת בתיה,2017-06-06,,,"אלון יגאל 5, מזכרת בתיה","(31.8528759, 34.84637980000001)",31.852876,34.846380,
4,4,16-649233,649233,16001567,,,,28,,,...,"תמ""א",מזכרת בתיה,2019-09-16,,,"שד אליהו 28, מזכרת בתיה","(31.8541366, 34.8447116)",31.854137,34.844712,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10548,10548,16-646926,646926,16009122,,,,34,,,...,"תמ""א",הוד השרון,2016-03-17,,,"ישורון 34, הוד השרון","(32.1587622, 34.8930285)",32.158762,34.893028,
10549,10549,16-648376,648376,16009219,,57.14,8.0,13,,,...,"תמ""א",הוד השרון,,,,"בן גמלא יהושע 13, הוד השרון","(32.1432137, 34.8763687)",32.143214,34.876369,
10550,10550,16-654519,654519,16009567,,,,3,,,...,"תמ""א",הוד השרון,,,,"פדויים 3, הוד השרון","(32.1577059, 34.9006436)",32.157706,34.900644,
10551,10551,16-655843,655843,16010236,,,,6,,,...,"תמ""א",הוד השרון,,,,"עין חי 6, הוד השרון","(32.1567005, 34.8937613)",32.156700,34.893761,


### choose features

In [7]:
# Columns kept for the synthesizer (consider changing this list).
chosen_features = [
    'ShnatBakasha',
    'latitude',
    'longitude',
    'years_to_be_approved',
]
# Narrow the frame to the chosen columns, then discard every row that has a
# missing value in any of them. Note that `tama_df` is rebound here, so the
# full table loaded from the CSV is no longer reachable under this name.
tama_df = tama_df[chosen_features].dropna(how='any')

Unnamed: 0,ShnatBakasha,latitude,longitude,years_to_be_approved
0,2013.0,31.852940,34.845640,2.0
1,2014.0,31.854477,34.845127,7.0
10,2006.0,31.796681,34.657435,1.0
11,2008.0,31.812108,34.646415,1.0
12,2010.0,31.811985,34.647032,0.0
...,...,...,...,...
10501,2018.0,32.160547,34.911997,2.0
10502,2015.0,32.158662,34.890109,5.0
10503,2017.0,32.153654,34.890050,3.0
10504,2014.0,32.153311,34.890399,8.0


### create the synthetic data

In [11]:
# Seed used for both fitting and sampling; change it to draw a different synthetic set.
RANDOM_SEED = 42
# Number of synthetic rows generated per real row (consider changing the size).
SYNTHETIC_ROWS_PER_REAL_ROW = 10

# Initialize synthesizer with the library's default settings
synthesizer = CTGAN()
# Pin the synthesizer's random state so that re-running the notebook from a
# fresh kernel trains and samples with the same random draws instead of
# producing a different synthetic dataset on every run.
synthesizer.set_random_state(RANDOM_SEED)
# Fit synthesizer to data
synthesizer.fit(tama_df)
# Generate synthetic data
size_synthetic_data = len(tama_df) * SYNTHETIC_ROWS_PER_REAL_ROW
synthetic_data = synthesizer.sample(size_synthetic_data)
# Round the generated approval durations to whole numbers (kept as floats).
# NOTE(review): 'ShnatBakasha' shows only whole values in the real data above
# but is left unrounded here - confirm whether it should be rounded as well.
synthetic_data['years_to_be_approved'] = synthetic_data['years_to_be_approved'].round(0)
synthetic_data.shape

(49840, 4)

### Save synthetic data to file

In [14]:
# Write the generated rows to disk; the DataFrame index is not written.
synthetic_data.to_csv(path_or_buf='tama_synthetic_data.csv', index=False)

### Mix real data with the synthetic data

In [15]:
# Tag every row with its origin and stack the two sets into one frame.
# The `is_real` flag is added on copies (via `assign`) rather than written into
# `synthetic_data` / `tama_df` themselves: mutating those frames in place would
# make earlier cells non-repeatable (re-fitting the synthesizer would train on
# the flag, and re-saving the synthetic data would write the extra column).
# `ignore_index=True` gives the combined frame a fresh 0..n-1 index, because the
# two inputs otherwise contribute overlapping index labels.
mix_tama_df = pd.concat(
    [
        synthetic_data.assign(is_real=False),
        tama_df.assign(is_real=True),
    ],
    ignore_index=True,
)

### Save mixed data to file

In [16]:
# Write the combined real + synthetic rows to disk; the DataFrame index is not written.
mix_tama_df.to_csv(path_or_buf='mix_tama_data.csv', index=False)