In [62]:
import glob
from pprint import pprint
from pathlib import Path
import string
import random

import pandas as pd
from argon2 import PasswordHasher
from faker import Faker

from config import CONFIG
from utils import Utils

In [67]:
# Glob pattern matching every entry exactly five levels below datasets/
# (the recorded output shows owner/dataset/versions/<n>/<file>).
DATASET_PATH = "datasets/*/*/*/*/*"
# Wrap each match in a Path so later cells can use .stem for naming.
datasets = [Path(match) for match in glob.glob(DATASET_PATH)]

pprint(f"Datasets:{datasets}")

("Datasets:[PosixPath('datasets/aprabowo/indonesia-tourism-destination/versions/1/package_tourism.csv'), "
 "PosixPath('datasets/aprabowo/indonesia-tourism-destination/versions/1/tourism_rating.csv'), "
 "PosixPath('datasets/aprabowo/indonesia-tourism-destination/versions/1/user.csv'), "
 "PosixPath('datasets/aprabowo/indonesia-tourism-destination/versions/1/tourism_with_id.csv')]")


In [68]:
# Read every discovered file into a DataFrame, keyed by the file name without
# its extension. Despite the name, `df` is a dict of DataFrames.
df = dict((csv_path.stem, pd.read_csv(csv_path)) for csv_path in datasets)
print(df.keys())

dict_keys(['package_tourism', 'tourism_rating', 'user', 'tourism_with_id'])


## User Data

In [69]:
# Preview the first rows of the raw user table before any columns are derived.
df["user"].head()

Unnamed: 0,User_Id,Location,Age
0,1,"Semarang, Jawa Tengah",20
1,2,"Bekasi, Jawa Barat",21
2,3,"Cirebon, Jawa Barat",23
3,4,"Bekasi, Jawa Barat",21
4,5,"Lampung, Sumatera Selatan",20


In [70]:
# Derive a `dob` column by passing each Age value to the project helper.
# NOTE(review): the recorded output shows different dates for equal ages, so
# the helper presumably randomises the day within the year -- confirm.
df['user']['dob'] = df['user']['Age'].apply(Utils.calculate_dob_from_age)
df['user'].head()

Unnamed: 0,User_Id,Location,Age,dob
0,1,"Semarang, Jawa Tengah",20,2006-05-05
1,2,"Bekasi, Jawa Barat",21,2004-11-25
2,3,"Cirebon, Jawa Barat",23,2003-07-13
3,4,"Bekasi, Jawa Barat",21,2005-02-03
4,5,"Lampung, Sumatera Selatan",20,2006-03-07


In [82]:
# Location is formatted as "<city>, <province>"; keep the part after the first
# comma, trimmed. Missing locations are passed through unchanged.
df['user']['Province'] = df['user']['Location'].apply(
    lambda location: location if pd.isnull(location) else location.split(',')[1].strip()
)
df['user'].head()

Unnamed: 0,User_Id,Location,Age,dob,password,full_name,username,Province
0,1,"Semarang, Jawa Tengah",20,2006-05-05,"$argon2id$v=19$m=65536,t=3,p=4$fPGg47n2ymHwC4c...","Ade Nasyiah, M.M.",adenasyia89,Jawa Tengah
1,2,"Bekasi, Jawa Barat",21,2004-11-25,"$argon2id$v=19$m=65536,t=3,p=4$fPGg47n2ymHwC4c...",Nadine Mulyani,nadinemul25,Jawa Barat
2,3,"Cirebon, Jawa Barat",23,2003-07-13,"$argon2id$v=19$m=65536,t=3,p=4$fPGg47n2ymHwC4c...",Elvin Gunawan,elvinguna55,Jawa Barat
3,4,"Bekasi, Jawa Barat",21,2005-02-03,"$argon2id$v=19$m=65536,t=3,p=4$fPGg47n2ymHwC4c...","drg. Marwata Waskita, S.E.I",drgmarwa87,Jawa Barat
4,5,"Lampung, Sumatera Selatan",20,2006-03-07,"$argon2id$v=19$m=65536,t=3,p=4$fPGg47n2ymHwC4c...",Drs. Lutfan Narpati,drslutfa82,Sumatera Selatan


In [71]:
# Plaintext starter password shared by all seeded users, read from project config.
# NOTE(review): get_secret_value() suggests a secret-wrapper type on CONFIG -- confirm.
password = CONFIG.USER_STARTER_PASSWORD.get_secret_value() # type: ignore


In [72]:
# Argon2 hasher with library defaults; used below to hash and verify the starter password.
ph = PasswordHasher()

In [73]:
# Hash the starter password once; the resulting string is reused for every user row.
hashed = ph.hash(password)

In [74]:
# Sanity check: the fresh hash must verify against the plaintext (recorded output: True).
ph.verify(hashed, password)

True

In [75]:
# Broadcast the single hash to all rows: every seeded user gets the same password hash.
df['user']['password'] = hashed

In [76]:
# Faker instance for the "id_ID" locale, used to generate the users' full names.
# NOTE(review): no seed is set, so generated names differ between runs
# (the two recorded user previews show different names for the same rows).
fake = Faker("id_ID")  

In [83]:
# One freshly generated fake name per row; the row content itself is not used.
df['user']['full_name'] = df['user'].apply(lambda _row: fake.name(), axis=1)
# Username = first 10 characters of the name, lower-cased, spaces turned into
# underscores, then ALL punctuation stripped, followed by a random two-digit
# suffix. '_' is part of string.punctuation, so the underscores are removed
# again and the result contains no separators.
# NOTE(review): the random suffix does not guarantee unique usernames.
df['user']['username'] = df['user']['full_name'].apply(
    lambda full_name: "{}{}".format(
        full_name[:10].lower().replace(' ', '_').translate(str.maketrans('', '', string.punctuation)),
        random.randint(10, 99),
    )
)
df['user'].head()

Unnamed: 0,User_Id,Location,Age,dob,password,full_name,username,Province
0,1,"Semarang, Jawa Tengah",20,2006-05-05,"$argon2id$v=19$m=65536,t=3,p=4$fPGg47n2ymHwC4c...",Hardana Tarihoran,hardanata25,Jawa Tengah
1,2,"Bekasi, Jawa Barat",21,2004-11-25,"$argon2id$v=19$m=65536,t=3,p=4$fPGg47n2ymHwC4c...",Dt. Bala Mulyani,dtbalam70,Jawa Barat
2,3,"Cirebon, Jawa Barat",23,2003-07-13,"$argon2id$v=19$m=65536,t=3,p=4$fPGg47n2ymHwC4c...",Vicky Prasetya,vickypras41,Jawa Barat
3,4,"Bekasi, Jawa Barat",21,2005-02-03,"$argon2id$v=19$m=65536,t=3,p=4$fPGg47n2ymHwC4c...",Genta Zulaika,gentazula74,Jawa Barat
4,5,"Lampung, Sumatera Selatan",20,2006-03-07,"$argon2id$v=19$m=65536,t=3,p=4$fPGg47n2ymHwC4c...",Ida Saptono,idasapton33,Sumatera Selatan


In [120]:
# Header of the bulk INSERT for public.users. The column order here must match
# the order of the values emitted for each row when the VALUES list is built.
user_insert_sql = (
    "\ninsert into public.users (\n"
    "  user_id,\n"
    "  username,\n"
    "  full_name,\n"
    "  password,\n"
    "  dob,\n"
    "  province\n"
    ") values\n"
)

In [121]:
def _sql_quote(value):
    """Render a cell as a single-quoted SQL string literal.

    Missing values (None/NaN/NaT) become the bare keyword NULL instead of the
    text 'nan'. Embedded single quotes are doubled so that generated names or
    locations containing an apostrophe cannot terminate the literal early.
    """
    if pd.isnull(value):
        return "NULL"
    return "'" + str(value).replace("'", "''") + "'"


# Build one "(...)" tuple per user, in the column order declared by user_insert_sql.
rows = []
for _, row in df['user'].iterrows():
    rows.append(
        "("
        f"{row['User_Id']}, "
        f"{_sql_quote(row['username'])}, "
        f"{_sql_quote(row['full_name'])}, "
        f"{_sql_quote(row['password'])}, "
        f"{_sql_quote(row['dob'])}, "
        f"{_sql_quote(row['Province'])}"
        ")"
    )
# Tuples are separated by ",\n"; the terminating ";" is appended when writing the file.
values_str = ",\n".join(rows)

In [122]:
# Write the complete users seed statement (header + VALUES list + terminator).
# The encoding is pinned to UTF-8 so the file content does not depend on the
# platform's default encoding.
with open("supabase/seeder/user.sql", "w", encoding="utf-8") as f:
    f.write(user_insert_sql + values_str + ";")

## Tourism Place

In [97]:
# Preview the first rows of the places table to see which columns are usable for seeding.
df['tourism_with_id'].head()

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Time_Minutes,Coordinate,Lat,Long,Unnamed: 11,Unnamed: 12
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,15.0,"{'lat': -6.1753924, 'lng': 106.8271528}",-6.175392,106.827153,,1
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...",Budaya,Jakarta,0,4.6,90.0,"{'lat': -6.137644799999999, 'lng': 106.8171245}",-6.137645,106.817125,,2
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...,Taman Hiburan,Jakarta,270000,4.6,360.0,"{'lat': -6.125312399999999, 'lng': 106.8335377}",-6.125312,106.833538,,3
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...,Taman Hiburan,Jakarta,10000,4.5,,"{'lat': -6.302445899999999, 'lng': 106.8951559}",-6.302446,106.895156,,4
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Taman Hiburan,Jakarta,94000,4.5,60.0,"{'lat': -6.12419, 'lng': 106.839134}",-6.12419,106.839134,,5


In [104]:
# List the column names; the recorded output includes two trailing
# "Unnamed: 11" / "Unnamed: 12" columns that the seed script below does not use.
df['tourism_with_id'].columns 

Index(['Place_Id', 'Place_Name', 'Description', 'Category', 'City', 'Price',
       'Rating', 'Time_Minutes', 'Coordinate', 'Lat', 'Long', 'Unnamed: 11',
       'Unnamed: 12'],
      dtype='object')

In [107]:
# Count places per city; the distinct cities shown here are the keys needed
# for the city -> province mapping defined next.
df['tourism_with_id']['City'].value_counts()

City
Yogyakarta    126
Bandung       124
Jakarta        84
Semarang       57
Surabaya       46
Name: count, dtype: int64

In [108]:
# City -> province lookup for the five cities present in the places table.
# NOTE(review): confirm these province labels match the ones derived from the
# users' Location column if the two tables are ever joined on province.
city_province_mapping = dict(
    [
        ("Jakarta", "Jakarta"),
        ("Bandung", "Jawa Barat"),
        ("Yogyakarta", "Yogyakarta"),
        ("Surabaya", "Jawa Timur"),
        ("Semarang", "Jawa Tengah"),
    ]
)

In [109]:
# Add a Province column to the places table by looking each City up in the
# mapping; cities without an entry are labelled "Unknown".
df['tourism_with_id']['Province'] = df['tourism_with_id']['City'].apply(
    lambda city: city_province_mapping[city] if city in city_province_mapping else "Unknown"
)

In [123]:
# Header of the bulk INSERT for public.place. The column order here must match
# the order of the values emitted for each row when the VALUES list is built.
place_insert_sql = (
    "\ninsert into public.place (\n"
    "  place_id,\n"
    "  place_name,\n"
    "  description,\n"
    "  category,\n"
    "  province,\n"
    "  price,\n"
    "  rating,\n"
    "  time_minutes,\n"
    "  latitude,\n"
    "  longitude\n"
    ") values\n"
)

def escape_sql(value):
    """Double every single quote in a string so it can sit inside a SQL literal.

    Non-string values (numbers, None, NaN, ...) are returned unchanged.
    """
    return value.replace("'", "''") if isinstance(value, str) else value
def _sql_text(value):
    """Render a text cell as a quoted SQL literal, or NULL when it is missing.

    Without this guard a missing Description/Category would be written as the
    literal text 'nan'.
    """
    if pd.isnull(value):
        return "NULL"
    return f"'{escape_sql(value)}'"


def _sql_number(value):
    """Render a numeric cell as a bare SQL number, or NULL when it is missing.

    A bare `nan` is not a valid numeric literal in an INSERT statement.
    """
    return "NULL" if pd.isnull(value) else f"{value}"


# Build one "(...)" tuple per place, in the column order declared by place_insert_sql.
rows = []
for _, row in df['tourism_with_id'].iterrows():
    rows.append(
        f"({row['Place_Id']}, "
        f"{_sql_text(row['Place_Name'])}, "
        f"{_sql_text(row['Description'])}, "
        f"{_sql_text(row['Category'])}, "
        # The target column is `province`, so use the Province value derived
        # from the city mapping rather than the raw City name.
        f"{_sql_text(row['Province'])}, "
        f"{_sql_number(row['Price'])}, "
        f"{_sql_number(row['Rating'])}, "
        f"{_sql_number(row['Time_Minutes'])}, "
        f"{_sql_number(row['Lat'])}, "
        f"{_sql_number(row['Long'])}"
        ")"
    )
# Tuples are separated by ",\n"; the terminating ";" is appended when writing the file.
values_str = ",\n".join(rows)

# Write the complete places seed statement (header + VALUES list + terminator).
# The encoding is pinned to UTF-8 so free-text descriptions are written the
# same way regardless of the platform's default encoding.
with open("supabase/seeder/place.sql", "w", encoding="utf-8") as f:
    f.write(place_insert_sql + values_str + ";")