In [1]:
import pymongo
import pandas as pd
import os

In [5]:
def build_collection(df, collection, column_mapping, indices, ind_comp=None, batch_size=1000):
    """
    Load a pandas DataFrame into a MongoDB collection and create its indices.

    Each row becomes one document whose fields are the DataFrame columns named in
    ``column_mapping``, renamed to the mapped field names. Columns that are not in the
    mapping are ignored. Rows are written in DataFrame order with ``insert_many``, in
    batches of at most ``batch_size`` documents, instead of one server round trip per row.

    :param df: pandas DataFrame containing the data to be inserted.
    :param collection: MongoDB collection object where the data will be inserted
        (must provide ``insert_many`` and ``create_index``).
    :param column_mapping: Dictionary mapping the DataFrame column names to MongoDB document field names.
    :param indices: List of field names; a single-field index is created for each.
    :param ind_comp: Optional list of lists of field names; one ascending compound index
        is created per inner list.
    :param batch_size: Maximum number of documents sent per ``insert_many`` call.
    :raises KeyError: If a column named in ``column_mapping`` is missing from ``df``.
    :raises ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    source_columns = list(column_mapping)
    target_fields = [column_mapping[column] for column in source_columns]

    # Selecting the mapped columns up front fails fast (KeyError) on a bad mapping,
    # before anything has been written to the collection.
    selected = df[source_columns]

    # Load documents
    for start in range(0, len(selected), batch_size):
        chunk = selected.iloc[start:start + batch_size]
        # itertuples keeps every column's own dtype; iterrows would coerce a whole row to
        # a single dtype (e.g. turning integer ids into floats in an all-numeric frame).
        documents = [
            dict(zip(target_fields, values))
            for values in chunk.itertuples(index=False, name=None)
        ]
        # insert_many must not be called with an empty list.
        if documents:
            collection.insert_many(documents)

    # Create indices
    for field in indices:
        collection.create_index(field)

    # Create compound indices
    ascending = 1  # ascending sort direction (the value of pymongo.ASCENDING)
    for combo in (ind_comp or []):
        # Explicit (field, direction) pairs are the key format every PyMongo release accepts.
        collection.create_index([(field, ascending) for field in combo])


In [6]:
# MongoDB connection
## Host and port default to the local server; set the SPARKPLUG_MONGO_HOST /
## SPARKPLUG_MONGO_PORT environment variables to target another instance
## without editing the notebook.
mongo_host = os.environ.get('SPARKPLUG_MONGO_HOST', 'localhost')
mongo_port = int(os.environ.get('SPARKPLUG_MONGO_PORT', '27017'))

## Create a connection to the MongoDB server
client = pymongo.MongoClient(mongo_host, mongo_port)

## Connect to a database (it will be created if it doesn't exist)
db = client['sparkplug']

In [7]:
# Build transactions collection

## CSV header -> document field name
column_mapping_transactions = dict([
    ("Station Name", "station_name"),
    ("station_id", "station_id"),
    ("Start Date", "start_date"),
    ("End Date", "end_date"),
    ("Transaction Date (Pacific Time)", "transaction_date"),
    ("Total Duration (hh:mm:ss)", "total_duration"),
    ("Charging Time (hh:mm:ss)", "charging_time"),
    ("Energy (kWh)", "energy"),
    ("GHG Savings (kg)", "ghg_savings"),
    ("Gasoline Savings (gallons)", "gas_savings"),
    ("Port Type", "charge_level"),
    ("Port Number", "port_number"),
    ("Plug Type", "plug_type"),
    ("City", "city"),
    ("State/Province", "state"),
    ("Postal Code", "postal_code"),
    ("Country", "country"),
    ("Currency", "currency"),
    ("Fee", "fee"),
    ("Ended By", "ended_by"),
    ("Plug In Event Id", "plug_in_event_id"),
    ("User ID", "user_id"),
])

## Single-field indices
indices_transactions = [
    "station_id",
    "charge_level",
    "plug_type",
    "postal_code",
    "country",
    "user_id",
]

## Compound indices, from coarsest to finest location field
## NOTE(review): ['country', 'state'] and the single 'country' index are prefixes of
## ['country', 'state', 'city'] and may be redundant - confirm before removing.
ind_comp_transactions = [
    ["country", "state", "city"],
    ["country", "state"],
]

## Source data and target collection
df_transactions = pd.read_csv("data/transactions.csv")
collection_transactions = db["transactions"]

## Run builder
build_collection(
    df_transactions,
    collection_transactions,
    column_mapping_transactions,
    indices_transactions,
    ind_comp_transactions,
)

In [8]:
# Build stations collection

## CSV header -> document field name (only 'id' is renamed)
column_mapping_stations = dict([
    ("id", "station_id"),
    ("price", "price"),
    ("site_id", "site_id"),
    ("mech_status", "mech_status"),
    ("elec_status", "elec_status"),
    ("net_status", "net_status"),
    ("update_log", "update_log"),
])
# IMPORTANT: need to store update log for each station, to calculate use time and downtime

## Single-field indices; no compound indices are created for stations
indices_stations = [
    "site_id",
    "mech_status",
    "elec_status",
    "net_status",
]

## Source data and target collection
df_stations = pd.read_csv("data/stations.csv")
collection_stations = db["stations"]

## Run builder
build_collection(
    df_stations,
    collection_stations,
    column_mapping_stations,
    indices_stations,
)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [10]:
# Cleanup
## Drop every collection this notebook builds, so that re-running the build cells
## starts from empty collections instead of appending duplicate documents.
collection_transactions.drop()
collection_stations.drop()