<a href="https://colab.research.google.com/github/SarveshKanneganti/Crypto-Analytics---PSQL/blob/main/Synthetic_Data_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
  pip install pandas numpy faker

Collecting faker
  Downloading faker-37.8.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.8.0-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.8.0


In [4]:

"""
generate_crypto_data.py
Generates synthetic CSV datasets:
 - members_fresh.csv
 - prices_fresh.csv
 - transactions_fresh.csv

Usage:
  pip install pandas numpy faker
  python generate_crypto_data.py
"""

import random
import csv
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
from faker import Faker
import os

# -----------------------
# Configuration
# -----------------------
# Tunables for the whole generator. The generate_* functions below bind
# N_MEMBERS, SYMBOLS, PRICES_START, PRICES_END and TX_COUNT as their default
# arguments, so edit the values here rather than inside the functions.
OUT_DIR = os.path.abspath(".")          # output folder (current dir)
# Number of member rows to create (member_id runs from 1 to N_MEMBERS).
N_MEMBERS = 5000
# Ticker symbols that receive a daily price series.
SYMBOLS = ["BTC", "ETH", "XRP", "LTC", "ADA"]
# Inclusive date range of the daily price series, as YYYY-MM-DD strings.
PRICES_START = "2023-01-01"
PRICES_END = "2023-12-31"
# Number of transaction rows to create.
TX_COUNT = 20000
# Pool from which each member's region is drawn at random.
REGIONS = ["North America", "Europe", "Asia", "South America", "Africa", "Oceania", "Middle East"]

# Reproducibility
# One seed is applied to Python's `random`, NumPy's global RNG and Faker;
# generate_prices also passes it as `random_state` when shuffling rows.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Shared Faker instance; generate_members uses it to produce member names.
fake = Faker()
Faker.seed(SEED)

# -----------------------
# 1) Members
# -----------------------
def generate_members(n=N_MEMBERS):
    """Build the synthetic members table.

    Each member gets a sequential id starting at 1, a fake full name and a
    region drawn at random from REGIONS.

    Args:
        n: number of members to generate. Values <= 0 yield an empty table.

    Returns:
        pandas.DataFrame with columns ``member_id``, ``name`` and ``region``.
        The columns are present even when the table is empty, so the CSV
        written from it always carries a header row.
    """
    columns = ["member_id", "name", "region"]
    # Dict values are evaluated left to right, so every row draws the name
    # first and the region second.
    rows = [
        {"member_id": member_id, "name": fake.name(), "region": random.choice(REGIONS)}
        for member_id in range(1, n + 1)
    ]
    return pd.DataFrame(rows, columns=columns)

# -----------------------
# 2) Prices (daily OHLCV)
# -----------------------
def generate_prices(symbols=SYMBOLS, start=PRICES_START, end=PRICES_END):
    """Simulate a daily OHLCV series for every symbol.

    Each symbol starts from a random base price and then follows a random
    walk: the day's open is within +/-1.5% of the previous close (or of the
    base price on the first day), the high is up to 8% above the open, the
    low is up to 8% below it, and the close is drawn uniformly between low
    and high.

    Args:
        symbols: iterable of ticker symbols to generate a series for.
        start: first day of the series (inclusive); passed to pd.to_datetime.
        end: last day of the series (inclusive); passed to pd.to_datetime.

    Returns:
        pandas.DataFrame with columns ``timestamp`` (YYYY-MM-DD string),
        ``symbol``, ``open_price``, ``high_price``, ``low_price``,
        ``close_price`` and ``volume``, all numeric values rounded to two
        decimals. Rows are shuffled with a fixed seed. The columns are
        present even when no rows are produced (no symbols, or end < start).
    """
    columns = [
        "timestamp", "symbol", "open_price", "high_price",
        "low_price", "close_price", "volume",
    ]
    days = pd.date_range(pd.to_datetime(start), pd.to_datetime(end), freq="D")
    # Format the day labels once instead of once per symbol per day.
    day_labels = days.strftime("%Y-%m-%d").tolist()

    rows = []
    for symbol in symbols:
        # The starting price is drawn uniformly from [0.5, 40000] for every
        # symbol alike; the symbol itself does not influence it.
        price = random.uniform(0.5, 40000)
        for day_label in day_labels:
            # simulate open/high/low/close with small daily volatility
            # (draw order open, high, low, close, volume is kept fixed so a
            # given seed always reproduces the same series)
            open_p = price * random.uniform(0.985, 1.015)
            high_p = open_p * random.uniform(1.0, 1.08)
            low_p = open_p * random.uniform(0.92, 1.0)
            close_p = random.uniform(low_p, high_p)
            volume = random.uniform(1000, 2_000_000)
            rows.append({
                "timestamp": day_label,
                "symbol": symbol,
                "open_price": round(open_p, 2),
                "high_price": round(high_p, 2),
                "low_price": round(low_p, 2),
                "close_price": round(close_p, 2),
                "volume": round(volume, 2)
            })
            price = close_p  # unrounded close is the next day's base

    df = pd.DataFrame(rows, columns=columns)
    # Shuffle rows for more realistic loading order (optional)
    return df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# -----------------------
# 3) Transactions
# -----------------------
def generate_transactions(prices_df, members_df, n=TX_COUNT):
    """Generate synthetic trades priced off the daily price table.

    Every transaction picks a random price row (which fixes the symbol, the
    calendar day and the close price), a random minute within that day, a
    random member, a side (BUY with probability 0.55, otherwise SELL), a
    quantity in [0.01, 5.0] and a fee of 0.05%-0.2% of the notional.

    Args:
        prices_df: DataFrame with ``symbol``, ``timestamp`` and
            ``close_price`` columns. ``timestamp`` may hold YYYY-MM-DD
            strings or datetime values; any time-of-day part is discarded.
        members_df: DataFrame with a ``member_id`` column.
        n: number of transactions to generate. Values <= 0 yield an empty
            table.

    Returns:
        pandas.DataFrame with columns ``tx_id``, ``member_id``, ``symbol``,
        ``tx_timestamp`` (YYYY-MM-DD HH:MM:SS string), ``side``,
        ``quantity``, ``price`` and ``fee``. The columns are present even
        when the table is empty.

    Raises:
        IndexError: if n > 0 and either input table has no rows (raised by
            random.choice on an empty sequence).
    """
    columns = [
        "tx_id", "member_id", "symbol", "tx_timestamp",
        "side", "quantity", "price", "fee",
    ]

    # Parse the price dates once, up front, rather than once per transaction.
    # pd.to_datetime accepts both string and datetime columns; normalize()
    # snaps every value to that day's midnight.
    price_points = []
    if len(prices_df):
        day_starts = pd.to_datetime(prices_df["timestamp"]).dt.normalize()
        price_points = list(zip(
            prices_df["symbol"].tolist(),
            day_starts.tolist(),
            prices_df["close_price"].tolist(),
        ))
    member_ids = members_df["member_id"].tolist()
    last_minute_of_day = 23 * 60 + 59

    tx_rows = []
    for tx_id in range(1, n + 1):
        # pick a random price row (gives symbol and close price and date)
        symbol, day_start, close_price = random.choice(price_points)
        # make a timestamp by adding random minutes to the date's midnight
        tx_ts = day_start + timedelta(minutes=random.randint(0, last_minute_of_day))
        member_id = random.choice(member_ids)
        side = "BUY" if random.random() < 0.55 else "SELL"
        quantity = round(random.uniform(0.01, 5.0), 6)
        price = round(close_price, 2)
        fee = round(price * quantity * random.uniform(0.0005, 0.002), 6)
        tx_rows.append({
            "tx_id": tx_id,
            "member_id": member_id,
            "symbol": symbol,
            "tx_timestamp": tx_ts.strftime("%Y-%m-%d %H:%M:%S"),
            "side": side,
            "quantity": quantity,
            "price": price,
            "fee": fee
        })
    return pd.DataFrame(tx_rows, columns=columns)

# -----------------------
# Run generation and save CSVs
# -----------------------
def main():
    """Generate members, prices and transactions and write each to OUT_DIR.

    The three tables are produced in dependency order (transactions are
    derived from the other two) and saved as CSV without the index column.
    Progress and the resulting file paths are printed to stdout.
    """
    # Resolve all three output locations before any work starts.
    members_csv, prices_csv, transactions_csv = (
        os.path.join(OUT_DIR, filename)
        for filename in ("members_fresh.csv", "prices_fresh.csv", "transactions_fresh.csv")
    )

    print("Generating members...")
    members_df = generate_members()
    members_df.to_csv(members_csv, index=False)
    print(f"Saved members -> {members_csv}  ({len(members_df)} rows)")

    print("Generating prices (OHLCV)...")
    prices_df = generate_prices()
    prices_df.to_csv(prices_csv, index=False)
    print(f"Saved prices  -> {prices_csv}  ({len(prices_df)} rows)")

    print("Generating transactions...")
    tx_df = generate_transactions(prices_df, members_df)
    tx_df.to_csv(transactions_csv, index=False)
    print(f"Saved transactions -> {transactions_csv}  ({len(tx_df)} rows)")

    # Recap of everything that was written, in generation order.
    print("\nDone. Files:")
    for written_path in (members_csv, prices_csv, transactions_csv):
        print(f" - {written_path}")

# Run the generation when this code is executed directly (as a script or as
# a notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()


Generating members...
Saved members -> /content/members_fresh.csv  (5000 rows)
Generating prices (OHLCV)...
Saved prices  -> /content/prices_fresh.csv  (1825 rows)
Generating transactions...
Saved transactions -> /content/transactions_fresh.csv  (20000 rows)

Done. Files:
 - /content/members_fresh.csv
 - /content/prices_fresh.csv
 - /content/transactions_fresh.csv


In [None]:
from google.colab import files

# Hand each generated CSV to the Colab download helper, in generation order.
for csv_path in (
    '/content/members_fresh.csv',
    '/content/prices_fresh.csv',
    '/content/transactions_fresh.csv',
):
    files.download(csv_path)