In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import time
from geopy.geocoders import Nominatim, ArcGIS
import re
from shapely.geometry import Point
import geopandas as gpd
from tqdm.auto import tqdm

## 1. Loading data + manipulation

In [6]:
# Locate the scraped CSV relative to the notebook's working directory
raw_datacenters_path = Path.cwd().parent.parent.joinpath(
    'data', 'raw', 'brasil', 'datacenter_map_scraped_brasil.csv'
)

# Read the file and discard the 'Unnamed: 0' column in one chain
raw_datacenters = pd.read_csv(raw_datacenters_path).drop(columns='Unnamed: 0')

# The provider is the text in front of the colon in 'name'
# (the pattern is greedy, so it captures up to the last colon)
provider_regex = r'(.*):'
raw_datacenters['provider'] = raw_datacenters['name'].str.extract(provider_regex)

## 2. Defining the unscrambling functions

In [14]:
def parse_field_value(value):
    """
    Parse a string in the format "{number} {category}" into a tuple (number, category).

    Args:
        value: The raw cell value, e.g. "10 Servers" or "5.5 MW". Missing values
            (NaN/None) and non-string values are tolerated.

    Returns:
        tuple: (number, category). The number is an int when it contains no
               decimal point and a float otherwise; if it cannot be converted
               (e.g. "1.2.3") it is returned unchanged as a string.
               Returns (None, None) when the value is missing or does not
               match the format.
    """
    if pd.isna(value):
        return None, None

    # Cells from a column that was read as numeric arrive as int/float. They
    # have no .strip() and carry no category, so turn them into text and let
    # the pattern below reject them instead of raising AttributeError.
    text = str(value).strip()

    # Leading number (digits and dots), whitespace, then the category text
    match = re.match(r'^\s*([\d.]+)\s+(.+)$', text)
    if match is None:
        return None, None

    number_str, category = match.groups()
    try:
        number = float(number_str) if '.' in number_str else int(number_str)
    except ValueError:
        # e.g. "1.2.3" or "." -- keep the raw text rather than drop the field
        number = number_str
    return number, category.strip()

def extract_fields_from_row(row):
    """
    Collect the "{number} {category}" values found in a row's field columns.

    Reads the module-level ``field_columns`` list and parses each of those
    cells with ``parse_field_value``.

    Args:
        row (pd.Series): A row from the DataFrame.

    Returns:
        pd.Series: Indexed by category, holding the number parsed for it.
        When a category shows up in several field columns, the first one wins.
    """
    by_category = {}
    for column in field_columns:
        number, category = parse_field_value(row[column])
        if category is None:
            continue
        # setdefault leaves an existing entry untouched, so the first hit is kept
        by_category.setdefault(category, number)
    return pd.Series(by_category)

## 2b. Applying the unscrambling functions

In [17]:
# Columns whose name begins with 'field' hold the "{number} {category}" values
field_columns = list(filter(lambda name: name.startswith('field'), raw_datacenters.columns))

# Parse every row; the result has one column per category found
extracted_fields = raw_datacenters.apply(extract_fields_from_row, axis=1)

# Replace the raw field columns with the parsed ones
clean_datacenters = pd.concat(
    [raw_datacenters.drop(columns=field_columns), extracted_fields],
    axis='columns',
)

## 3. Collapsing MW columns into a single one + adjusting column names

In [21]:
# Coerce the power column to numeric; values that cannot be parsed become NaN
clean_datacenters["MW total power"] = pd.to_numeric(
    clean_datacenters["MW total power"], errors="coerce"
)

# The colocation products column holds unusable data, so discard it
clean_datacenters = clean_datacenters.drop(columns=["colocation products"])

# Swap every space in the column names for an underscore
clean_datacenters.columns = list(
    map(lambda colname: colname.replace(' ', '_'), clean_datacenters.columns)
)

In [36]:
# Pull the first NNNNN-NNN pattern (zip code format 12345-678) out of each address
clean_datacenters['zipcode'] = clean_datacenters['address'].str.extract(r'(\d{5}-\d{3})', expand=False)


In [56]:
# Initialize tqdm for pandas operations so that the `progress_apply` call
# used for geocoding below is available on DataFrames.
tqdm.pandas()

def robust_geocode(address, retries=3):
    """
    Geocode an address, trying Nominatim first and ArcGIS as a fallback.

    ", Brazil" is appended to the query when the address does not already
    contain "Brazil".

    Args:
        address (str): Free-text address to geocode.
        retries (int): Maximum attempts per geocoder when a request raises.

    Returns:
        tuple: (latitude, longitude, full_geocoded_address), or
               (None, None, None) if the address is missing/blank or every
               geocoder fails.
    """
    # Missing values (NaN/None) and blank strings cannot be geocoded. Bail out
    # before building a query: NaN would raise TypeError on the "in" test and
    # an empty string would end up geocoding just ", Brazil".
    if not isinstance(address, str) or not address.strip():
        return None, None, None

    geocoders = [
        Nominatim(user_agent="datacenters_geocoder"),
        ArcGIS()
    ]
    # Add "Brazil" if not present
    address_mod = address if "Brazil" in address else address + ", Brazil"

    for geocoder in geocoders:
        for _ in range(retries):
            try:
                location = geocoder.geocode(address_mod, timeout=10)
            except Exception:
                # Best effort: treat any failure as transient, pause, retry
                time.sleep(1)
                continue
            if location:
                return location.latitude, location.longitude, location.address
            # The service answered with "no match"; sending the identical
            # query again would not change that, so try the next geocoder.
            break
    return None, None, None

def extract_zipcode(text):
    """
    Extract a Brazilian CEP (zip code) in the format NNNNN-NNN from the text.

    Args:
        text: The text to search. Non-string values (e.g. a NaN address or
            None) are accepted and treated as containing no CEP.

    Returns:
        str or None: The first CEP found, else None.
    """
    # re.search raises TypeError on NaN/None, which missing addresses produce
    if not isinstance(text, str):
        return None
    m = re.search(r'(\d{5}-\d{3})', text)
    return m.group(0) if m else None

def geocode_row(row):
    """
    Geocode one DataFrame row and look for its CEP.

    The CEP is searched in the row's own address first; only when that yields
    nothing is the address returned by the geocoder searched as well.

    Returns:
        pd.Series: Three positional values: latitude, longitude, zipcode.
    """
    original_address = row['address']
    lat, lon, full_address = robust_geocode(original_address)

    zipcode = extract_zipcode(original_address)
    needs_fallback = not zipcode
    if needs_fallback and full_address:
        # Nothing in the original text: fall back to the geocoder's address
        zipcode = extract_zipcode(full_address)

    return pd.Series([lat, lon, zipcode])


In [57]:
# Geocode every row of clean_datacenters (uses its "address" column) with a progress bar.
# geocode_row returns exactly three values (latitude, longitude, zipcode), so the
# target list must name exactly three columns. A fourth name makes pandas raise
# "Columns must be same length as key".
clean_datacenters[['latitude', 'longitude', 'zipcode']] = clean_datacenters.progress_apply(geocode_row, axis=1, result_type='expand')

  0%|          | 0/142 [00:00<?, ?it/s]

In [68]:
# Flag the rows whose address text contains at least one digit
clean_datacenters['has_number'] = clean_datacenters['address'].str.contains(r'\d', regex=True)

In [127]:
# Remove rows whose address carries city-level information only
bad_addresses = [
    "São Paulo, State of São Paulo, Brazil",
    "São Paulo, Brazil",
    "Vinhedo, State of São Paulo, Brazil",
    "Rio de Janeiro, RJ, Brazil",
    "Fortaleza - CE, Brazil"
]
has_usable_address = ~clean_datacenters['address'].isin(bad_addresses)
clean_datacenters = clean_datacenters[has_usable_address].reset_index(drop=True)

# Exclude every row of this provider
clean_datacenters = clean_datacenters[clean_datacenters.provider != 'MOD Mission Critical']

# Hand-set [latitude, longitude] pairs, keyed by the exact 'name' value
coordinate_overrides = {
    'TAKODA: Takoda Rio de Janeiro - RJ1': [-22.98414658606165, -43.43158945539098],
    'Digital Realty: São Paulo SUM01 Data Center': [-23.50341, -46.75334],
    'Digital Realty: São Paulo SUM02 Data Center': [-23.49610, -46.75504],
    'Digital Realty: São Paulo SUM03 Data Center': [-23.49610, -46.75504],
    'Digital Realty: São Paulo SUM04 Data Center': [-23.49610, -46.75504],
    'Scala Data Centers: SGRUTB04': [-23.496797, -46.815908],
    'Elea Data Centers: BSB1 Brasília Data Center': [-15.788539799910135, -47.88601920163078],
    'Digital Realty: Fortaleza FTZ01 Data Center': [-3.830654303491639, -38.61262835299939],
}
for datacenter_name, lat_lon in coordinate_overrides.items():
    clean_datacenters.loc[clean_datacenters['name'] == datacenter_name, ['latitude', 'longitude']] = lat_lon

# Hand-set total power for one listing
clean_datacenters.loc[clean_datacenters['name'] == 'Latitude.sh: São Paulo I', 'MW_total_power'] = 1


In [131]:
# Destination CSV for the cleaned table, two directory levels above the working directory
path = Path.cwd().parent.parent.joinpath('data', 'processed', 'brasil', 'clean_datacenters-com_br.csv')

In [132]:
# Write the selected columns to `path`
export_columns = ['name', 'provider', 'MW_total_power', 'sqft_total_space', 'latitude', 'longitude', 'zipcode']
clean_datacenters[export_columns].to_csv(path)

# Datacentermap.com

In [77]:
# Locate the input CSV under data/processed/brasil
raw_datacenters_path = Path.cwd().parent.parent.joinpath(
    'data', 'processed', 'brasil', 'first_successes_datacentermap.csv'
)

# Read it, discard the 'Unnamed: 0' column, and keep only rows that have an address
raw_datacenters = (
    pd.read_csv(raw_datacenters_path)
    .drop(columns='Unnamed: 0')
    .dropna(subset=['address'])
)

In [78]:
import pandas as pd
import ast
import re

def parse_stat_string(s):
    """
    Split one statistic string into a (key, value) tuple.

    Examples:
      - "6 MW"          --> ("MW", "6")
      - "3,493 sq.m."   --> ("sq.m", "3,493")   (only trailing dots are removed)
      - "Est.  2017"    --> ("Est", "2017")
      - "unknown"       --> ("unknown", None)

    The value is always returned as text; no numeric conversion is done.
    """
    text = s.strip()

    # Anything beginning with "Est" is stored under the key "Est",
    # with every "Est." occurrence removed from the value
    if text.startswith("Est"):
        return "Est", text.replace("Est.", "").strip()

    # A leading run of digits, commas and dots is the value; the rest is the key
    number_match = re.match(r'([\d,\.]+)\s*(.*)', text)
    if number_match is None:
        return text, None

    amount, label = number_match.groups()
    return label.strip().rstrip('.'), amount

def parse_statistics(stat_str):
    """
    Turn the text form of a list (e.g. "['6 MW', '3,493 sq.m.']") into a
    Series mapping each statistic key to its value.

    Input that cannot be evaluated as a literal, or that evaluates to
    something other than a list, yields an empty Series. Entries whose key is
    empty or whose value is None are skipped; when a key repeats, the last
    value is kept.
    """
    try:
        parsed = ast.literal_eval(stat_str)
    except Exception:
        parsed = []

    entries = parsed if isinstance(parsed, list) else []
    pairs = (parse_stat_string(entry) for entry in entries)
    return pd.Series(
        {label: amount for label, amount in pairs if label and amount is not None}
    )

# Expand each "statistics" cell into one column per statistic key
stats_df = raw_datacenters["statistics"].apply(parse_statistics)

# Append the parsed statistics, discard the two source columns, and let
# 'transformed_address' take over the 'address' name
raw_datacenters = (
    pd.concat([raw_datacenters, stats_df], axis=1)
    .drop(['statistics', 'address'], axis=1)
    .rename({'transformed_address': 'address'}, axis=1)
)

In [91]:
# Initialize tqdm for pandas operations so that the `progress_apply` call
# used for geocoding later in this cell is available on DataFrames.
tqdm.pandas()

def robust_geocode(address, retries=3):
    """
    Geocode an address, trying Nominatim first and ArcGIS as a fallback.

    ", Brazil" is appended to the query when the address does not already
    contain "Brazil".

    Args:
        address (str): Free-text address to geocode.
        retries (int): Maximum attempts per geocoder when a request raises.

    Returns:
        tuple: (latitude, longitude, full_geocoded_address), or
               (None, None, None) if the address is missing/blank or every
               geocoder fails.
    """
    # Check that address is a non-empty string
    if not isinstance(address, str) or not address.strip():
        return None, None, None

    # Geocoders in order of preference
    geocoders = [
        Nominatim(user_agent="datacenters_geocoder", timeout=10),
        ArcGIS(timeout=10)
    ]

    # Append "Brazil" if not already present
    address_mod = address if "Brazil" in address else address + ", Brazil"

    for geocoder in geocoders:
        for _ in range(retries):
            try:
                location = geocoder.geocode(address_mod, timeout=10)
            except Exception:
                # Best effort: treat any failure as transient, pause, retry
                time.sleep(1)
                continue
            if location:
                return location.latitude, location.longitude, location.address
            # The service answered with "no match"; sending the identical
            # query again would not change that, so try the next geocoder.
            break
    return None, None, None

def geocode_row(row):
    """
    Geocode the 'address' of one DataFrame row.

    Returns:
        pd.Series: Indexed by 'latitude' and 'longitude'; the geocoder's
        full address is discarded.
    """
    coordinates = robust_geocode(row['address'])[:2]
    return pd.Series(list(coordinates), index=['latitude', 'longitude'])

# Geocode every row of raw_datacenters and store the coordinates
raw_datacenters[['latitude', 'longitude']] = raw_datacenters.progress_apply(geocode_row, axis=1)

# Hand-set [latitude, longitude] pairs for two listings, keyed by their URL
url_coordinate_overrides = {
    'https://www.datacentermap.com/brazil/tambore/scala-sgrutb01/specs/': [-23.493382568654592, -46.81059902826906],
    'https://www.datacentermap.com/brazil/tambore/scala-sgrutb03/specs/': [-23.492998707280595, -46.81080363272966],
}
for listing_url, lat_lon in url_coordinate_overrides.items():
    raw_datacenters.loc[raw_datacenters.url == listing_url, ['latitude', 'longitude']] = lat_lon

  0%|          | 0/158 [00:00<?, ?it/s]

In [103]:
# Output CSV for the geocoded table, two directory levels above the working directory
processed_datacenters_path = Path.cwd().parent.parent.joinpath(
    'data', 'processed', 'brasil', 'processed_datacentermap.csv'
)

# NOTE(review): this name receives the return value of to_csv, not a DataFrame.
# pandas documents that return value as None when a path is given -- confirm intent.
processed_datacenters = raw_datacenters.to_csv(processed_datacenters_path)