# Data Generation

In [10]:
import os
from datetime import datetime, timedelta
from getpass import getpass

import mysql.connector
import pandas as pd

# Create connection.
# Connection settings are read from the environment so that credentials are
# never stored in the notebook. Non-secret settings fall back to the local
# development defaults; the password has no default and is prompted for
# interactively when MYSQL_PASSWORD is not set.
conn = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: "),
    port=int(os.environ.get("MYSQL_PORT", "3306")),
    database=os.environ.get("MYSQL_DATABASE", "pilot"),
)

# Shared cursor reused by the cells below
cursor = conn.cursor()


In [2]:
# Sample users: the field names are declared once and each user is one row
# of values in that same order.
user_columns = [
    "firstname",
    "lastname",
    "phone_number",
    "email",
    "card_id",
    "created_date",
]
user_rows = [
    ("Alice", "Smith", "123-456-7890", "alice.smith@example.com", "CARD001", datetime(2024, 6, 1)),
    ("Bob", "Johnson", "234-567-8901", "bob.johnson@example.com", "CARD002", datetime(2024, 6, 2)),
    ("Charlie", "Williams", "345-678-9012", "charlie.williams@example.com", "CARD003", datetime(2024, 6, 3)),
]

# Turn every row into a record keyed by field name
data = [dict(zip(user_columns, row)) for row in user_rows]

df_users = pd.DataFrame(data)
print(df_users)

  firstname  lastname  phone_number                         email  card_id  \
0     Alice     Smith  123-456-7890       alice.smith@example.com  CARD001   
1       Bob   Johnson  234-567-8901       bob.johnson@example.com  CARD002   
2   Charlie  Williams  345-678-9012  charlie.williams@example.com  CARD003   

  created_date  
0   2024-06-01  
1   2024-06-02  
2   2024-06-03  


In [3]:

# Make sure the users table exists before any rows are loaded into it
create_users_table = """
    CREATE TABLE IF NOT EXISTS users (
        id INT AUTO_INCREMENT PRIMARY KEY,
        firstname VARCHAR(255),
        lastname VARCHAR(255),
        phone_number VARCHAR(255),
        email VARCHAR(255),
        card_id VARCHAR(255),
        created_date DATETIME
    )
"""
cursor.execute(create_users_table)

# Parameterised insert: one placeholder per df_users column, in column order
insert_query = """
    INSERT INTO users (firstname, lastname, phone_number, email, card_id, created_date)
    VALUES (%s, %s, %s, %s, %s, %s)
"""

# One list of cell values per DataFrame row
values = df_users.to_numpy().tolist()

# Bulk-insert all users and commit.
# The connection is deliberately left open: the following cells reuse it.
cursor.executemany(insert_query, values)
conn.commit()


In [4]:
from pathlib import Path


def read_with_source(file):
    """Read one raw CSV file and tag every row with its source.

    The source is the part of the file stem that precedes the first
    '_data' (for example a file named 'tencent_data.csv' gives 'tencent').

    Args:
        file: path of the CSV file to read.

    Returns:
        The file's contents as a DataFrame with an added 'source' column.
    """
    frame = pd.read_csv(file)
    frame['source'] = Path(file).stem.split('_data')[0]
    return frame


# List all matching files in the rawdata folder.
# glob() yields entries in an arbitrary, filesystem-dependent order, so the
# list is sorted to keep the row order of the combined frame reproducible.
# Directories that happen to match the pattern are skipped because they
# cannot be read as CSV.
file_list = sorted(
    path for path in Path('rawdata').glob('*_data*') if path.is_file()
)

# One DataFrame per file, each carrying its source column
dfs = [read_with_source(file) for file in file_list]

# Combine all dataframes
if dfs:
    combined_df = pd.concat(dfs, ignore_index=True)
    print(combined_df.head())
else:
    # Always (re)bind combined_df so that later cells never fail with a
    # NameError or silently reuse a stale frame from an earlier run.
    combined_df = pd.DataFrame()
    print("No files found in rawdata folder")


                        Date       Open       High        Low      Close  \
0  2020-01-02 00:00:00-05:00  45.530533  46.169069  45.530533  46.159817   
1  2020-01-03 00:00:00-05:00  45.160365  45.604565  45.160365  45.373211   
2  2020-01-06 00:00:00-05:00  44.725428  45.160372  44.697663  45.132610   
3  2020-01-07 00:00:00-05:00  45.845185  46.252368  45.521287  46.067284   
4  2020-01-08 00:00:00-05:00  45.539786  46.261614  45.512025  45.946972   

    Volume  Dividends  Stock Splits   source  
0  4293200        0.0           0.0  tencent  
1  2728500        0.0           0.0  tencent  
2  4712100        0.0           0.0  tencent  
3  4667000        0.0           0.0  tencent  
4  2925200        0.0           0.0  tencent  


In [5]:
# Create stocks table if not exists
cursor.execute("""
    CREATE TABLE IF NOT EXISTS stocks (
        id INT AUTO_INCREMENT PRIMARY KEY,
        date DATETIME,
        open FLOAT,
        high FLOAT,
        low FLOAT,
        close FLOAT,
        volume BIGINT,
        dividends FLOAT,
        stock_splits FLOAT,
        source VARCHAR(255)
    )
""")

# Insert data
insert_query = """
    INSERT INTO stocks (date, open, high, low, close, volume, dividends, stock_splits, source)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

# DataFrame columns listed in the same order as the INSERT column list.
# Selecting them by name keeps the values aligned with the nine placeholders
# even if a raw file carries extra columns or orders them differently.
stock_columns = [
    "Date", "Open", "High", "Low", "Close",
    "Volume", "Dividends", "Stock Splits", "source",
]

if combined_df.empty:
    # Nothing was loaded from the raw files, so there is nothing to insert
    values = []
    print("No stock rows to insert")
else:
    stock_rows = combined_df[stock_columns]
    # Missing cells are NaN in pandas; convert them to None so they are
    # stored as SQL NULL instead of being passed to the driver as NaN.
    values = (
        stock_rows.astype(object)
        .where(stock_rows.notna(), None)
        .values.tolist()
    )
    cursor.executemany(insert_query, values)

conn.commit()

## Bridge Tables

In [6]:
import random

# Bridge table: every row ties one user to one stock
create_collections_table = """
    CREATE TABLE IF NOT EXISTS collections (
        id INT AUTO_INCREMENT PRIMARY KEY,
        user_id INT,
        stock_id INT,
        created_date DATETIME DEFAULT CURRENT_TIMESTAMP,
        FOREIGN KEY (user_id) REFERENCES users(id),
        FOREIGN KEY (stock_id) REFERENCES stocks(id)
    )
"""
cursor.execute(create_collections_table)
conn.commit()

# Primary keys available on the user side of the link
cursor.execute("SELECT id FROM users")
user_ids = [user_pk for (user_pk,) in cursor.fetchall()]

# Primary keys available on the stock side of the link
cursor.execute("SELECT id FROM stocks")
stock_ids = [stock_pk for (stock_pk,) in cursor.fetchall()]

# 100 random (user, stock) pairs; the user is drawn first, then the stock
collection_data = [
    (random.choice(user_ids), random.choice(stock_ids))
    for _ in range(100)
]

# Store the pairs; created_date is filled in by the column default
insert_query = """
    INSERT INTO collections (user_id, stock_id)
    VALUES (%s, %s)
"""
cursor.executemany(insert_query, collection_data)
conn.commit()

In [11]:
import random
from random import randint

# Create trades table if not exists
cursor.execute("""
    CREATE TABLE IF NOT EXISTS trades (
        id INT AUTO_INCREMENT PRIMARY KEY,
        user_id INT,
        stock_id INT,
        action VARCHAR(4),
        price FLOAT,
        quantity INT,
        total_amount FLOAT,
        created_date DATETIME,
        FOREIGN KEY (user_id) REFERENCES users(id),
        FOREIGN KEY (stock_id) REFERENCES stocks(id)
    )
""")

# Map every stocks row id to that row's date and closing price.
# The loop names deliberately avoid `id`, which would shadow the builtin
# for the rest of the kernel session.
cursor.execute("SELECT id, date, close FROM stocks")
stock_prices = {
    stock_row_id: {'date': price_date, 'price': close_price}
    for stock_row_id, price_date, close_price in cursor.fetchall()
}

# Only rows with a stored close can be priced: a NULL close comes back as
# None and would make `price * quantity` raise a TypeError.
priced_stock_ids = [
    stock_row_id
    for stock_row_id in stock_ids
    if stock_prices.get(stock_row_id, {}).get('price') is not None
]

if not user_ids or not priced_stock_ids:
    raise ValueError(
        "Cannot generate trades: need at least one user and one stock row with a close price"
    )

trade_data = []
for _ in range(500):
    user_id = random.choice(user_ids)
    stock_id = random.choice(priced_stock_ids)
    action = random.choice(['buy', 'sell'])
    # Trade timestamp: 2024-06-01 plus a random offset of 0 to 365 days
    trade_date = datetime(2024, 6, 1) + timedelta(days=randint(0, 365))

    # The trade is priced at the close stored on the chosen stocks row.
    # NOTE(review): this price is not matched to trade_date; the row's own
    # date is kept in stock_prices[stock_id]['date'] but is not used here.
    price = stock_prices[stock_id]['price']
    quantity = randint(1, 100)
    total_amount = price * quantity

    trade_data.append((user_id, stock_id, action, price, quantity, total_amount, trade_date))

# Insert trades
insert_query = """
    INSERT INTO trades (user_id, stock_id, action, price, quantity, total_amount, created_date)
    VALUES (%s, %s, %s, %s, %s, %s, %s)
"""
cursor.executemany(insert_query, trade_data)
conn.commit()
