# Create dummy data

In [None]:
from faker import Faker
import random
import pandas as pd

# Faker configuration: the 'id_ID' locale selects Faker's Indonesian data providers
fake = Faker('id_ID')

# Number of rows to generate for each table
num_records = 150

# Generate fake rows for one dimension table, following that table's schema
def generate_data(num_records, table_name):
    """Build a list of fake rows for one dimension table.

    Args:
        num_records: Number of rows to generate. Zero or a negative value
            yields an empty list.
        table_name: One of 'reports', 'location_reports', 'users',
            'waste_materials', 'task_users_challange', 'task_challange',
            'task_steps', 'admins', 'contents' or 'categories'.

    Returns:
        A list of dicts, one per row, keyed by column name. An unknown
        ``table_name`` yields an empty list.

    Notes:
        * Integer ids are sampled without replacement so they are unique
          within the table. Drawing each id independently from 1..10000
          produces duplicate primary keys in most 150-row tables.
        * ``created_at``/``updated_at`` and ``start_date``/``end_date`` are
          emitted in chronological order.
        * ``deleted_at`` is still drawn independently from the current year
          and is set on every row of the tables that have it.
          NOTE(review): confirm that all-rows-soft-deleted is intended.
        * The column names 'longtitude' and 'langtitude' are kept exactly
          as they were; presumably they mirror the target schema.
    """
    count = max(num_records, 0)

    def unique_int_ids():
        # Keep the historical 1..10000 range, widening it only when more
        # rows than that are requested so sampling cannot fail.
        upper = max(10000, count)
        return random.sample(range(1, upper + 1), count)

    def uuid_ids():
        return [fake.uuid4() for _ in range(count)]

    def ordered_datetimes():
        # Two independent draws, returned earliest first.
        return sorted((fake.date_time(), fake.date_time()))

    def audit_columns(soft_delete=False):
        created_at, updated_at = ordered_datetimes()
        columns = {'created_at': created_at, 'updated_at': updated_at}
        if soft_delete:
            columns['deleted_at'] = fake.date_time_this_year(before_now=True, after_now=False)
        return columns

    report_types = ['rubbish', 'littering']
    waste_types = {
        'rubbish': ['sampah basah', 'sampah kering', 'sampah basah, sampah kering'],
        'littering': ['organik', 'anorganik', 'berbahaya'],
    }
    status_types = ['need review', 'approve', 'reject']
    genders = ['laki-laki', 'perempuan']
    badges = ['bronze', 'silver', 'gold', 'platinum']
    material_types = [
        'plastik', 'kaca', 'kayu', 'kertas', 'baterai', 'besi',
        'limbah berbahaya', 'limbah beracun', 'sisa makanan', 'tak terdeteksi',
    ]
    roles = ['admin', 'super_admin']

    def reports_row(row_id):
        # The waste type must belong to the report type chosen for this row.
        report_type = random.choice(report_types)
        return {
            'id': row_id,
            'report_type': report_type,
            'waste_type': random.choice(waste_types[report_type]),
            'title': fake.sentence(nb_words=6),
            'description': fake.text(),
            'status': random.choice(status_types),
            'reason': fake.text(),
            **audit_columns(soft_delete=True),
        }

    def location_reports_row(row_id):
        return {
            'id': row_id,
            'longtitude': fake.longitude(),
            'langtitude': fake.latitude(),
            'address': fake.address(),
            'city': fake.city(),
            'province': fake.state(),
        }

    def users_row(row_id):
        return {
            'id': row_id,
            'name': fake.name(),
            'email': fake.email(),
            'phone_number': fake.random_number(digits=16, fix_len=True),
            'point': fake.random_int(min=0, max=1000),
            'gender': random.choice(genders),
            'birth_date': fake.date_of_birth(),
            'address': fake.address(),
            'badge': random.choice(badges),
            **audit_columns(soft_delete=True),
        }

    def waste_materials_row(row_id):
        return {
            'id': row_id,
            'type': random.choice(material_types),
            **audit_columns(soft_delete=True),
        }

    def task_users_challange_row(row_id):
        return {
            'id': row_id,
            'status': random.choice(status_types),
            **audit_columns(),
        }

    def task_challange_row(row_id):
        start_date, end_date = ordered_datetimes()
        return {
            'id': row_id,
            'title': fake.sentence(nb_words=6),
            'description': fake.text(max_nb_chars=255),
            'point': fake.random_int(min=0, max=1000),
            'start_date': start_date,
            'end_date': end_date,
            'status': fake.boolean(),
            **audit_columns(),
        }

    def task_steps_row(row_id):
        return {
            'id': row_id,
            'title': fake.sentence(nb_words=6),
            'description': fake.text(max_nb_chars=255),
            **audit_columns(),
        }

    def admins_row(row_id):
        return {
            'id': row_id,
            'name': fake.name(),
            'role': random.choice(roles),
            **audit_columns(soft_delete=True),
        }

    def contents_row(row_id):
        return {
            'id': row_id,
            'title_content': fake.sentence(nb_words=6),
            'description': fake.text(),
            **audit_columns(),
        }

    def categories_row(row_id):
        return {
            'id': row_id,
            'name': fake.word(),
            **audit_columns(),
        }

    # table name -> (id generator, row builder)
    builders = {
        'reports': (uuid_ids, reports_row),
        'location_reports': (unique_int_ids, location_reports_row),
        'users': (unique_int_ids, users_row),
        'waste_materials': (uuid_ids, waste_materials_row),
        'task_users_challange': (unique_int_ids, task_users_challange_row),
        'task_challange': (unique_int_ids, task_challange_row),
        'task_steps': (unique_int_ids, task_steps_row),
        'admins': (unique_int_ids, admins_row),
        'contents': (unique_int_ids, contents_row),
        'categories': (unique_int_ids, categories_row),
    }

    entry = builders.get(table_name)
    if entry is None:
        # Unknown tables have always produced an empty result rather than an error.
        return []

    make_ids, build_row = entry
    return [build_row(row_id) for row_id in make_ids()]

# Generate fake rows for one fact table, referencing existing dimension ids
def generate_fact_data(num_records, fact_table_name, dimensions):
    """Build a list of fake rows for one fact table.

    Args:
        num_records: Number of rows to generate. Zero or a negative value
            yields an empty list.
        fact_table_name: 'fact_reporting', 'fact_challange' or 'fact_conten'.
        dimensions: Mapping of dimension table name to an object whose
            ``['id'].values`` gives the ids available for that table (the
            script passes pandas DataFrames read back from CSV).

    Returns:
        A list of dicts, one per row: 'id', then one '<dimension>_id'
        column per referenced dimension, then the count column. An unknown
        ``fact_table_name`` yields an empty list.

    Notes:
        Row ids are sampled without replacement so the fact table's primary
        key is unique; independent draws from 1..10000 collide frequently.
        Foreign keys are still drawn with replacement, since many fact rows
        may point at the same dimension row.
    """
    # fact table -> (referenced dimension tables in column order, count column)
    fact_specs = {
        'fact_reporting': (
            ('users', 'reports', 'waste_materials', 'location_reports'),
            'count_reporting',
        ),
        'fact_challange': (
            ('users', 'task_users_challange', 'task_steps', 'task_challange', 'admins'),
            'count_challange',
        ),
        'fact_conten': (
            ('contents', 'categories', 'admins'),
            'count_conten',
        ),
    }

    spec = fact_specs.get(fact_table_name)
    if spec is None or num_records <= 0:
        return []

    dimension_names, count_column = spec

    # Look up each id pool once instead of on every iteration.
    id_pools = {name: dimensions[name]['id'].values for name in dimension_names}

    # Keep the historical 1..10000 range, widening it only when needed.
    upper = max(10000, num_records)
    row_ids = random.sample(range(1, upper + 1), num_records)

    data = []
    for row_id in row_ids:
        row = {'id': row_id}
        for name in dimension_names:
            row[f'{name}_id'] = random.choice(id_pools[name])
        row[count_column] = fake.random_int(min=1, max=100)
        data.append(row)
    return data

# Generate every dimension table and save it to CSV
import os

tables = [
    'reports', 'location_reports', 'users', 'waste_materials', 'task_users_challange',
    'task_challange', 'task_steps', 'admins', 'contents', 'categories'
]

# Create the output folder up front: DataFrame.to_csv does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs('data_dummy', exist_ok=True)

for table in tables:
    df = pd.DataFrame(generate_data(num_records, table))
    df.to_csv(f'data_dummy/{table}.csv', index=False)

# Read the dimension tables back from the CSV files just written, so the
# foreign keys generated below are drawn from what is stored on disk.
dimensions = {table: pd.read_csv(f'data_dummy/{table}.csv') for table in tables}

# Generate every fact table and save it to CSV
fact_tables = ['fact_reporting', 'fact_challange', 'fact_conten']
for fact_table in fact_tables:
    df = pd.DataFrame(generate_fact_data(num_records, fact_table, dimensions))
    df.to_csv(f'data_dummy/{fact_table}.csv', index=False)
print("Data successfully created")

# Load to BigQuery

In [None]:
import os
from dotenv import load_dotenv
from google.cloud import bigquery

# Load environment variables from the .env file into the process environment
load_dotenv()

# Path to the JSON key file, read from the environment
key_path = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')

# Fail fast with a clear message when the variable is unset or empty
if not key_path:
    raise ValueError("GOOGLE_APPLICATION_CREDENTIALS key path not found. Please check your .env file and environment variables.")

# Set the Google Cloud credentials (writes back the value that was just read
# from the environment, so the variable is left unchanged)
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_path

# BigQuery client; no arguments are passed, so the library resolves the
# project and credentials itself (presumably from the variable above — confirm)
client = bigquery.Client()

# Name of the target dataset; the code below never creates it, so it must already exist
dataset_name = 'data_recything'

# Folder containing the CSV files
data_folder = 'data_dummy'

# CSV files to upload; each file name without '.csv' becomes the table name
csv_files = [
    'admins.csv', 'categories.csv', 'contents.csv', 'fact_challange.csv',
    'fact_conten.csv', 'fact_reporting.csv', 'location_reports.csv', 'reports.csv',
    'task_challange.csv', 'task_steps.csv', 'task_users_challange.csv', 'users.csv',
    'waste_materials.csv'
]

# Function to load one CSV file into BigQuery
def load_csv_to_bigquery(file_name):
    """Load one CSV file from ``data_folder`` into a BigQuery table.

    The destination table is ``<client.project>.<dataset_name>.<table>``,
    where ``<table>`` is ``file_name`` with a trailing '.csv' removed.

    Args:
        file_name: Name of the CSV file inside ``data_folder``.

    Notes:
        Blocks until the load job finishes. No job configuration is passed,
        so the library's default write behaviour applies.
        NOTE(review): confirm that re-running the upload does not append
        duplicate rows to existing tables.
    """
    csv_suffix = '.csv'
    # Strip only a trailing '.csv'; str.replace would also remove the text
    # from the middle of a name such as 'a.csv_backup.csv'.
    if file_name.endswith(csv_suffix):
        table_name = file_name[:-len(csv_suffix)]
    else:
        table_name = file_name

    # Load the CSV file into a pandas DataFrame
    df = pd.read_csv(os.path.join(data_folder, file_name))

    # Fully qualified destination table ID
    table_id = f"{client.project}.{dataset_name}.{table_name}"

    # Load the DataFrame into the BigQuery table and wait for completion
    job = client.load_table_from_dataframe(df, table_id)
    job.result()

    print(f"Loaded {file_name} into {table_id}")

# Load each CSV file into BigQuery, one at a time, in the order listed in csv_files
for csv_file in csv_files:
    load_csv_to_bigquery(csv_file)
# Reached only when no load above raised an exception
print("Data successfully uploaded")