In [7]:
import requests
from pyarrow.table import table_to_blocks

from src.load_data import load_data

# Endpoint of the prediction service this cell talks to.
url = "http://127.0.0.1:8080/predict"

# Without a timeout, requests waits indefinitely; bound the wait so a dead
# service cannot hang the kernel.
REQUEST_TIMEOUT_SECONDS = 30

# Build a small sample with no missing values: drop every row that contains
# a missing value, then keep the first ten remaining rows.
data = load_data(local=True)
clean_data = data.dropna(axis=0, how='any').iloc[0:10]

# JSON payload: column names plus the row values as nested lists.
payload = {
    "columns": clean_data.columns.tolist(),
    "data": clean_data.to_numpy().tolist(),
}

# Request headers
headers = {
    "Content-Type": "application/json"
}

# Send the POST request
response = requests.post(
    url,
    json=payload,
    headers=headers,
    timeout=REQUEST_TIMEOUT_SECONDS,
)

# Show the status first, then stop on 4xx/5xx so an error response is not
# passed to response.json() (which would fail with a less helpful error).
print("Status Code:", response.status_code)
response.raise_for_status()
print("Response Body:", response.json())

Status Code: 200
Response Body: {'predictions': [57.96398162841797, 19.628793716430664, 9.540163040161133, 47.59981918334961, 17.712688446044922, 46.04148483276367, 3.5623881816864014, 57.91485595703125, 11.787161827087402, 9.88700008392334]}


In [16]:
import json
from google.cloud import bigquery
import pandas as pd


def load_data_test(local=True, size=1000):
    """Load taxi trip data from a local CSV file or from BigQuery.

    Args:
        local (bool): If True, read up to the first 10000 rows of
            "../taxi_trips.csv". Otherwise query BigQuery using the project
            and dataset named in "../configs/config.json".
        size (int): Number of trailing rows to keep from the concatenation
            of the ``raw_data`` and ``prediction`` tables. Not applied when
            ``local`` is True.

    Returns:
        tuple: ``(data, None)`` in local mode; otherwise
        ``(result, results_new)`` where ``result`` is the last ``size`` rows
        of the concatenated frames and ``results_new`` holds the rows read
        from the ``prediction`` table.

    Raises:
        FileNotFoundError: The configuration file does not exist.
        ValueError: The configuration file is not valid JSON.
        KeyError: The configuration lacks "PROJECT_ID" or "DATASET_NAME".
    """
    if local:
        data = pd.read_csv("../taxi_trips.csv", nrows=10000)
        return data, None

    # Load the configuration; chain the original error so the root cause
    # stays visible in the traceback.
    try:
        with open("../configs/config.json", "r") as file:
            config = json.load(file)
    except FileNotFoundError as err:
        raise FileNotFoundError("Configuration file not found at '../configs/config.json'") from err
    except json.JSONDecodeError as err:
        raise ValueError("Invalid JSON format in configuration file.") from err

    PROJECT_ID = config["PROJECT_ID"]
    DATASET_NAME = config["DATASET_NAME"]

    # Initialize the BigQuery client
    client = bigquery.Client()

    # Fully-qualified ids, format: `project_id.dataset_id.table_id`
    table_id_old = f"{PROJECT_ID}.{DATASET_NAME}.raw_data"
    table_id_new = f"{PROJECT_ID}.{DATASET_NAME}.prediction"

    # Download the original data as a pandas DataFrame
    results_old = client.query(f"SELECT * FROM `{table_id_old}`").result().to_dataframe()

    # Look the schema up by the same fully-qualified id that is queried
    # below, so both always refer to the configured project (this also
    # avoids the deprecated client.dataset() helper).
    table = client.get_table(table_id_new)

    print(table_id_new)
    # Everything except the model output and its bookkeeping column.
    columns = [field.name for field in table.schema if field.name not in ("prediction", "timestamp")]
    print("columns", columns)
    selected = ", ".join(f"`{column}`" for column in columns)
    # Backtick-quote the table id, matching the raw_data query above.
    query = f"SELECT {selected} FROM `{table_id_new}`"
    print(query)
    results_new = client.query(query).result().to_dataframe()

    # NOTE(review): concat keeps each frame's own index labels, so labels
    # may repeat in the result; confirm callers do not rely on unique labels.
    result = pd.concat([results_old, results_new]).tail(size)
    return result, results_new

In [17]:
# Fetch from BigQuery (local=False). `df` is the last `size` rows of the
# combined data; `new` holds only the rows read from the prediction table.
df, new = load_data_test(local=False)

carbon-relic-439014-t0.chicago_taxi.prediction
columns ['Trip ID', 'Taxi ID', 'Trip Start Timestamp', 'Trip End Timestamp', 'Trip Seconds', 'Trip Miles', 'Pickup Census Tract', 'Dropoff Census Tract', 'Pickup Community Area', 'Dropoff Community Area', 'Fare', 'Tips', 'Tolls', 'Extras', 'Trip Total', 'Payment Type', 'Company', 'Pickup Centroid Latitude', 'Pickup Centroid Longitude', 'Pickup Centroid Location', 'Dropoff Centroid Latitude', 'Dropoff Centroid Longitude', 'Dropoff Centroid Location']
SELECT `Trip ID`, `Taxi ID`, `Trip Start Timestamp`, `Trip End Timestamp`, `Trip Seconds`, `Trip Miles`, `Pickup Census Tract`, `Dropoff Census Tract`, `Pickup Community Area`, `Dropoff Community Area`, `Fare`, `Tips`, `Tolls`, `Extras`, `Trip Total`, `Payment Type`, `Company`, `Pickup Centroid Latitude`, `Pickup Centroid Longitude`, `Pickup Centroid Location`, `Dropoff Centroid Latitude`, `Dropoff Centroid Longitude`, `Dropoff Centroid Location` FROM carbon-relic-439014-t0.chicago_taxi.predict

In [20]:
# Sanity check: the loader keeps result.tail(size), so with the default
# size=1000 at most 1000 rows are expected.
df.shape

(1000, 24)

In [26]:
from google.cloud import bigquery
import pandas as pd
import json
import logging
from pathlib import Path

# Set the root logger threshold to INFO. basicConfig does nothing if the
# root logger already has handlers configured.
logging.basicConfig(level=logging.INFO)

def execute_bigquery_query(client, query):
    """Run ``query`` through ``client`` and return the rows as a DataFrame.

    Args:
        client: Object exposing ``query(sql)`` whose returned job provides
            ``result().to_dataframe()``.
        query (str): SQL text to execute.

    Returns:
        Whatever ``query_job.result().to_dataframe()`` returns.

    Raises:
        Exception: Any error raised while running the query is logged and
            then re-raised unchanged.
    """
    try:
        query_job = client.query(query)  # API request
        return query_job.result().to_dataframe()
    except Exception:
        # logging.exception logs at ERROR level and attaches the active
        # traceback, so the log shows why the query failed, not only which
        # query it was. Arguments are passed lazily rather than pre-formatted.
        logging.exception("Error executing query: %s", query)
        # Bare raise re-raises the original exception as-is.
        raise

def load_data_test(local=True, size=1000, use_new_data=True,
                   csv_path="../taxi_trips.csv",
                   config_path="../configs/config.json"):
    """
    Load taxi trip data from a local CSV or BigQuery.

    Args:
        local (bool): If True, load data from a local CSV file; otherwise, fetch from BigQuery.
        size (int): Number of trailing rows to keep from the BigQuery result.
            Not applied in local mode, which reads up to the first 10000 CSV rows.
        use_new_data (bool): If True, append the rows of the ``prediction`` table
            (minus its "prediction" and "timestamp" columns) to the ``raw_data``
            rows before trimming; if False, use ``raw_data`` only.
        csv_path (str | Path): Location of the local CSV file.
        config_path (str | Path): Location of the JSON configuration that
            provides "PROJECT_ID" and "DATASET_NAME".

    Returns:
        pandas.DataFrame: The loaded rows (a single frame, never a tuple).

    Raises:
        FileNotFoundError: The CSV file (local mode) or the configuration
            file (BigQuery mode) does not exist.
        Exception: Any other error in BigQuery mode is logged and re-raised.
    """
    if local:
        try:
            return pd.read_csv(csv_path, nrows=10000)
        except FileNotFoundError as exc:
            # Chain the original error so the underlying OS details survive.
            raise FileNotFoundError(f"Local CSV file not found at '{csv_path}'") from exc

    try:
        # Load config
        config_file = Path(config_path)
        if not config_file.is_file():
            raise FileNotFoundError("Configuration file not found.")

        with open(config_file, "r") as file:
            config = json.load(file)

        PROJECT_ID = config["PROJECT_ID"]
        DATASET_NAME = config["DATASET_NAME"]

        # Initialize BigQuery client
        client = bigquery.Client()

        # Query old data
        raw_table_id = f"{PROJECT_ID}.{DATASET_NAME}.raw_data"
        result_old = execute_bigquery_query(client, f"SELECT * FROM `{raw_table_id}`")

        # Only return old training data
        if not use_new_data:
            return result_old.tail(size)

        # Query new data. The schema is looked up by the same fully-qualified
        # id that is queried, so both always refer to the configured project
        # (this also avoids the deprecated client.dataset() helper).
        prediction_table_id = f"{PROJECT_ID}.{DATASET_NAME}.prediction"
        table = client.get_table(prediction_table_id)
        columns = [field.name for field in table.schema if field.name not in ("prediction", "timestamp")]
        column_list = ", ".join(f"`{column}`" for column in columns)
        query_new = f"SELECT {column_list} FROM `{prediction_table_id}`"
        results_new = execute_bigquery_query(client, query_new)

        # Concatenate and limit size.
        # NOTE(review): concat keeps each frame's own index labels, so labels
        # may repeat in the result; confirm callers do not rely on unique labels.
        return pd.concat([result_old, results_new]).tail(size)

    except Exception as e:
        logging.error("Error loading data: %s", e)
        # Bare raise re-raises the original exception as-is.
        raise

In [32]:
# Call the loader defined in the preceding cell, which returns a single
# DataFrame (old + new rows, trimmed to the last `size` rows).
df = load_data_test(local=False, size=1000, use_new_data=True)


In [33]:
# Sanity check: size=1000 was requested above, so at most 1000 rows are expected.
df.shape

(1000, 24)