## Setup

In [1]:
import pandas as pd
import os
from dotenv import load_dotenv
import random
import mysql.connector
from mysql.connector import Error
import random

In [2]:
# Pull the settings stored in the .env file into the process environment
load_dotenv()

# Database settings are read from the environment; os.getenv yields None for
# any variable that is not set
host, user, password, database = (
    os.getenv(env_name)
    for env_name in ("DB_HOST", "DB_USER", "DB_PASSWORD", "DB_DATABASE")
)

# Open the MySQL connection shared by the insert cells further down
connection = mysql.connector.connect(host=host, user=user, password=password, database=database)

In [3]:
# Load the station export, blank out missing values and order the rows by ID
file_path = 'data/g6stations.csv'
df = (
    pd.read_csv(file_path)
    .fillna('')              # every NaN becomes an empty string
    .sort_values(by='ID')
)

# Preview the first few rows
print(df.head())

                         Station Name           Street Address         City  \
33723   Los Angeles Convention Center       1201 S Figueroa St  Los Angeles   
36593  California Air Resources Board         9530 Telstar Ave     El Monte   
37097          Scripps Green Hospital  10666 N Torrey Pines Rd     La Jolla   
35965                   Galpin Motors        15421 Roscoe Blvd    Sepulveda   
38171               Galleria at Tyler   1299 Galleria at Tyler    Riverside   

      State    ZIP EV Level1 EVSE Num EV Level2 EVSE Num EV DC Fast Count  \
33723    CA  90015                                   7.0                    
36593    CA  91731                                   3.0                    
37097    CA  92037                                   1.0                    
35965    CA  91343                                   2.0                    
38171    CA  92503                                   4.0                    

       lvl 1 present  lvl 2 present  ...   Longitude    ID  \


## Data Exploration

In [7]:
# Count how many rows of df share each (street address, city, state) combination
address_columns = ['Street Address', 'City', 'State']
grouped = df.groupby(address_columns).size()

# Same counts as a flat frame with a 'Count' column
counts_df = grouped.reset_index(name='Count')

# Split the addresses into those seen exactly once and those seen more than once
occurrences = counts_df['Count']
single_entry_addresses = counts_df[occurrences.eq(1)]
multiple_entries_addresses = counts_df[occurrences.gt(1)]

# Repeated addresses, most frequent first (exported by a later cell)
sorted_multiple_entries = multiple_entries_addresses.sort_values(by='Count', ascending=False)

# Summary figures
print(f"Number of addresses with a single entry: {len(single_entry_addresses)}")
print(f"Number of addresses with multiple entries: {len(multiple_entries_addresses)}")

# The addresses themselves
print(single_entry_addresses)
print(sorted_multiple_entries)


Number of addresses with a single entry: 31080
Number of addresses with multiple entries: 7579
                     Street Address               City State  Count
0          #1 Business Loop 70 East           Columbia    MO      1
1            #42 Montana Highway 84             Norris    MT      1
2                            #NAME?  Carmel-By-The-Sea    CA      1
3      - Skyland Upper Loop Mile 41              Luray    VA      1
4                       0 Asbury St           Hamilton    MA      1
...                             ...                ...   ...    ...
38653       York St & Thornton Pkwy           Thornton    CO      1
38654         York St Thornton pkwy           Thornton    CO      1
38655                   Yosemite Rd             Fresno    CA      1
38657               atlas cedar way         Sacramento    CA      1
38658                   highway 190       Death Valley    CA      1

[31080 rows x 4 columns]
                              Street Address           City Sta

In [5]:
# Destination of the exported table (relative to the notebook's working directory)
output_file_path = 'sorted_multiple_entries.csv'

# Write the repeated-address counts without the dataframe index column
sorted_multiple_entries.to_csv(path_or_buf=output_file_path, index=False)

# Confirm where the file went
print(f"Data exported to '{output_file_path}' successfully.")


Data exported to 'sorted_multiple_entries.csv' successfully.


## Approach 1: Each row is a site, generate stations
### Previous approach (superseded by Approach 2) — kept for reference only; do not run

In [6]:
# Disabled implementation of Approach 1 (one Site per CSV row, Stations generated
# from the per-level EVSE counts). The whole thing is wrapped in a triple-quoted
# string, so none of it executes: the cell simply evaluates to a str, which is why
# the notebook echoes the text back as the cell output.
# NOTE(review): if this is ever re-enabled, be aware that its `finally` clause closes
# the shared `connection`, which the Approach 2 cells below also use.
"""
def calculate_station_coordinates(base_lat, base_lon, station_index, total_stations):
    # Calculate row and column in a square grid
    grid_size = int(total_stations**0.5) + 1
    row = (station_index - 1) // grid_size
    col = (station_index - 1) % grid_size

    # Calculate the offset
    offset = 0.000022
    new_lat = base_lat + (offset * row) - (offset * grid_size / 2)
    new_lon = base_lon + (offset * col) - (offset * grid_size / 2)

    return new_lat, new_lon

statuses = ['Available', 'Occupied', 'Unavailable', 'Faulted', 'Offline']
weights = [0.6, 0.32, 0.04, 0.02, 0.02]

try:
    # Create a cursor object
    cursor = connection.cursor()

    # Iterate through each row of the dataframe
    for index, row in df.iterrows():
        # Generate a random owner_id between 1 and 10
        owner_id = random.randint(1, 10)
        
        # Extract the values from the row
        ID = row['ID']
        Latitude = row['Latitude']
        Longitude = row['Longitude']
        Station_Name = row['Station Name']
        Street_Address = row['Street Address']
        ZIP = row['ZIP']
        City = row['City']
        State = row['State']
        
        # Insert the values into the Site table
        query = "INSERT INTO Site (id, owner_id, latitude, longitude, name, street_address, zip_code, city, state) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
        values = (ID, owner_id, Latitude, Longitude, Station_Name, Street_Address, ZIP, City, State)
        
        try:
            cursor.execute(query, values)
        except Error as e:
            print(f"Error: {e}")
        
        # Commit in batches
        if index % 100 == 0:
            connection.commit()

        # Calculate the total number of stations for this site
        level1_count = int(row['EV Level1 EVSE Num'] or 0)
        level2_count = int(row['EV Level2 EVSE Num'] or 0)
        dc_fast_count = int(row['EV DC Fast Count'] or 0)
        total_stations = level1_count + level2_count + dc_fast_count

        # Connector types mapping based on charge level
        connector_types_map = {
            1: ['J1772', 'NEMA515', 'NEMA520'],
            2: ['J1772', 'TESLA'],
            3: ['CHADEMO', 'J1772COMBO', 'TESLA']
        }

        # Current site connector types
        site_connector_types = row['EV Connector Types'].split()

        # Initialize station index
        station_index = 1

        # Assign a random status
        status = random.choices(statuses, weights, k=1)[0]

        # Generate stations
        for level, count in [(1, level1_count), (2, level2_count), (3, dc_fast_count)]:
            for _ in range(count):

                # Assign a random status
                status = random.choices(statuses, weights, k=1)[0]
                
                # Station details
                name = f"{row['Station Name']} {station_index}"
                charge_level = level
                connector_types = ' '.join([ct for ct in connector_types_map[level] if ct in site_connector_types])
                latitude, longitude = calculate_station_coordinates(row['Latitude'], row['Longitude'], station_index, total_stations)
                site_id = row['ID']

                # Insert station into the database
                station_query = "INSERT INTO Station (name, charge_level, connector_type, latitude, longitude, site_id, status) VALUES (%s, %s, %s, %s, %s, %s, %s)"
                station_values = (name, charge_level, connector_types, latitude, longitude, site_id, status)
                cursor.execute(station_query, station_values)

                # Increment station index
                station_index += 1

        # Commit after handling each site
        connection.commit()

    # Final commit for remaining rows
    connection.commit()
finally:
    # Close cursor and connection
    cursor.close()
    connection.close()
"""

'\ndef calculate_station_coordinates(base_lat, base_lon, station_index, total_stations):\n    # Calculate row and column in a square grid\n    grid_size = int(total_stations**0.5) + 1\n    row = (station_index - 1) // grid_size\n    col = (station_index - 1) % grid_size\n\n    # Calculate the offset\n    offset = 0.000022\n    new_lat = base_lat + (offset * row) - (offset * grid_size / 2)\n    new_lon = base_lon + (offset * col) - (offset * grid_size / 2)\n\n    return new_lat, new_lon\n\nstatuses = [\'Available\', \'Occupied\', \'Unavailable\', \'Faulted\', \'Offline\']\nweights = [0.6, 0.32, 0.04, 0.02, 0.02]\n\ntry:\n    # Create a cursor object\n    cursor = connection.cursor()\n\n    # Iterate through each row of the dataframe\n    for index, row in df.iterrows():\n        # Generate a random owner_id between 1 and 10\n        owner_id = random.randint(1, 10)\n        \n        # Extract the values from the row\n        ID = row[\'ID\']\n        Latitude = row[\'Latitude\']\n     

## Approach 2: Each row is a station, generate sites

In [4]:
def common_name(names):
    """Return the leading words shared by every name, title-cased.

    Each name is lower-cased and split on whitespace, so the comparison is
    case-insensitive and works on whole words. The result is the longest run
    of leading words common to all names, joined with single spaces and passed
    through ``str.title()``. It is an empty string when the names share no
    leading word or when ``names`` is empty.

    Args:
        names: iterable of strings (e.g. the 'Station Name' values of one group).

    Returns:
        str: the shared prefix, or '' if there is none.
    """
    word_lists = [label.lower().split() for label in names]

    shared_words = []
    # zip(*...) walks the names position by position and stops at the shortest one
    for words_at_position in zip(*word_lists):
        if len(set(words_at_position)) != 1:
            break
        shared_words.append(words_at_position[0])

    return ' '.join(shared_words).title()

# Build one site per physical address: average the coordinates of its stations
# and derive the site name from the words their station names have in common.
#
# The grouping key is (Street Address, City, State) because that is exactly what
# the insert cells below use to build their site_ids lookup key. Grouping on ZIP
# as well would create several Site rows for an address whose rows disagree on
# the ZIP, all colliding on the same lookup key, so every station would attach
# to the last one inserted and the others would be left without stations.
# ZIP is therefore carried along as a plain attribute (first value in the group).
unique_sites_df = (
    df.groupby(['Street Address', 'City', 'State'])
    .agg({
        'ZIP': 'first',
        'Latitude': 'mean',
        'Longitude': 'mean',
        'Station Name': common_name,
    })
    .reset_index()
    # Returns a new frame instead of mutating in place, so re-running the cell is safe
    .rename(columns={'Station Name': 'Name'})
)

In [5]:
# Add sites to the Site table and remember the generated primary key of each one.
#
# All inserts run in a single transaction: cursor.lastrowid is available right
# after execute(), so committing once at the end gives the same IDs without one
# commit round trip per row. If anything fails the transaction is rolled back
# and site_ids is emptied, so the mapping never points at rows that were not
# stored.

# "<street address>_<city>_<state>" -> Site.id. The station-insert cell rebuilds
# the same key from each dataframe row to find its site.
site_ids = {}

try:
    # Created inside the outer try so that a failure here is reported like any
    # other database error, but closed by the inner finally only if it exists.
    cursor = connection.cursor()
    try:
        site_query = "INSERT INTO Site (owner_id, name, street_address, city, state, zip_code, latitude, longitude) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"

        for _, site in unique_sites_df.iterrows():
            # Random owner for each site (presumably owners 1-10 already exist - TODO confirm)
            owner_id = random.randint(1, 10)

            site_values = (owner_id, site['Name'], site['Street Address'], site['City'], site['State'], site['ZIP'], site['Latitude'], site['Longitude'])
            cursor.execute(site_query, site_values)

            site_key = f"{site['Street Address']}_{site['City']}_{site['State']}"
            site_ids[site_key] = cursor.lastrowid

        # One commit for the whole batch
        connection.commit()
    finally:
        cursor.close()

except Error as e:
    # Undo the partial batch (only possible while the connection is still usable)
    if connection.is_connected():
        connection.rollback()
    site_ids.clear()
    print(f"Error: {e}")


In [6]:
# Insert one Station row per dataframe row, linked to the Site created earlier.

# Possible station states and the probability of drawing each one
statuses = ['Available', 'Occupied', 'Unavailable', 'Faulted', 'Offline']
weights = [0.6, 0.32, 0.04, 0.02, 0.02]


def _evse_count(value):
    """Return ``value`` as an int, or 0 when it cannot be converted.

    Missing counts were replaced by '' when the CSV was loaded, and int('')
    raises ValueError; None (TypeError) is treated the same way.
    """
    try:
        return int(value)
    except (ValueError, TypeError):
        return 0


try:
    # Created inside the outer try so a failure is reported like any other
    # database error; the inner finally closes it only once it exists.
    cursor = connection.cursor()
    try:
        station_query = "INSERT INTO Station (latitude, longitude, name, site_id, status, charge_level, connector_type) VALUES (%s, %s, %s, %s, %s, %s, %s)"

        # counter is the 1-based number of rows processed so far
        for counter, (_, row) in enumerate(df.iterrows(), start=1):
            Latitude = row['Latitude']
            Longitude = row['Longitude']
            Station_Name = row['Station Name']
            Connector_Type = row['EV Connector Types']

            # Same key format as the one used when the sites were inserted;
            # site_id is None when no site was recorded for this address
            site_key = f"{row['Street Address']}_{row['City']}_{row['State']}"
            site_id = site_ids.get(site_key)

            # Assign a random status
            status = random.choices(statuses, weights, k=1)[0]

            # Number of chargers per level; unparsable values count as 0
            ev_level1 = _evse_count(row['EV Level1 EVSE Num'])
            ev_level2 = _evse_count(row['EV Level2 EVSE Num'])
            ev_dc_fast = _evse_count(row['EV DC Fast Count'])

            # The lowest level that is present wins; 0 means no level at all
            if ev_level1 > 0:
                charge_level = 1
            elif ev_level2 > 0:
                charge_level = 2
            elif ev_dc_fast > 0:
                charge_level = 3
            else:
                charge_level = 0

            # Insert station into the database
            station_values = (Latitude, Longitude, Station_Name, site_id, status, charge_level, Connector_Type)
            cursor.execute(station_query, station_values)

            # Commit in batches of 100 rows
            if counter % 100 == 0:
                connection.commit()

        # Commit any remaining rows
        connection.commit()
    finally:
        cursor.close()

except Error as e:
    print(f"Error: {e}")
finally:
    # This is the last cell that writes to the database, so the connection is
    # released here whether or not the inserts succeeded
    connection.close()

In [7]:
# List every column name of the stations dataframe - a reference for the
# row['...'] lookups used in the cells above
print(df.columns)


Index(['Station Name', 'Street Address', 'City', 'State', 'ZIP',
       'EV Level1 EVSE Num', 'EV Level2 EVSE Num', 'EV DC Fast Count',
       'lvl 1 present', 'lvl 2 present', 'lvl 3 present', 'EV Network',
       'Latitude', 'Longitude', 'ID', 'Updated At', 'Owner Type Code',
       'EV Connector Types', 'Facility Type', 'EV Pricing',
       'EV On-Site Renewable Source', 'Restricted Access',
       'Maximum Vehicle Class'],
      dtype='object')
