In [None]:
import csv
import datetime as dt
import json
import os
import random
import sqlite3
import time
from urllib.request import urlretrieve

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

random.seed(9)

## Nations

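The following code uses the public Nominatim (OpenStreetMap) geocoding API to look up a latitude and longitude for each nation, and caches the results in `nations.csv`.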

In [None]:
# Build `nations_data` as a list of [nation, latitude, longitude] rows.
# `nations.csv` acts as a cache so the geocoding API is only queried when no
# usable cached rows exist.
file_name = 'nations.csv'
nation_columns = ['nation', 'latitude', 'longitude']

nations_data = []
if os.path.exists(file_name):
    with open(file_name, newline='') as fin:
        reader = csv.reader(fin)
        next(reader, None)  # Skip the header row so it is not treated as a nation.
        # Restore coordinates to floats so cached rows look like freshly fetched ones.
        nations_data = [[row[0], float(row[1]), float(row[2])] for row in reader if row]

# A missing file and a header-only file (e.g. left by an interrupted run) are
# both treated as a cache miss.
if not nations_data:
    # NOTE(review): spellings such as 'Columbia' and 'Trinadad&Tobago' are presumably
    # kept as they appear in the incomes data so the tables can be joined on the
    # name -- confirm before "correcting" them.
    nations = ['Cambodia', 'Canada', 'China', 'Columbia', 'Cuba', 'Dominican Republic', 'Ecuador', 'El Salvador',
        'England', 'France', 'Germany', 'Greece', 'Guatemala', 'Haiti', 'Honduras', 'Hong', 'Hungary', 'India', 'Iran',
        'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos', 'Mexico', 'Netherlands', 'Nicaragua', 'Outlying US', 'Peru',
        'Philippines', 'Poland', 'Portugal', 'Puerto Rico', 'Scotland', 'South', 'Taiwan', 'Thailand', 'Trinadad&Tobago',
        'United States', 'Vietnam', 'Yugoslavia']
    search_url = 'https://nominatim.openstreetmap.org/search'
    headers = {'User-Agent': 'ColabNotebook/1.0'}
    for nation in nations:
        try:
            # Passing `params` lets requests URL-encode the name; a raw '&' or space
            # in the nation would otherwise corrupt the query string.
            response = requests.get(
                search_url,
                params={'q': nation, 'format': 'json', 'limit': 1},
                headers=headers,
                timeout=10,  # Never let one stalled request hang the notebook.
            )
            response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx).
            data = response.json()
            if data:
                nations_data.append([nation, float(data[0]['lat']), float(data[0]['lon'])])
            else:
                print(f'No data for {nation}')
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on older versions of requests.
            print(f'Error fetching data for {nation}: {e}')
        # The public Nominatim service allows at most one request per second.
        time.sleep(1)

    # Only write the cache when something was fetched, so a failed run is retried.
    if nations_data:
        with open(file_name, 'wt', newline='') as fout:
            writer = csv.writer(fout, lineterminator='\n')
            writer.writerow(nation_columns)
            writer.writerows(nations_data)
print(*nations_data[:3], sep='\n')

In [None]:
# Persist `nations_data` to nations.csv.
# Each entry is a [nation, latitude, longitude] sequence (not a dict), so the rows
# are handed to csv.writer directly. The rows are prepared *before* the file is
# opened: opening with 'wt' truncates the file, and a failure after that point
# would otherwise leave a header-only cache behind.
nation_header = ['nation', 'latitude', 'longitude']
# Drop a header row if `nations_data` was loaded from a CSV that included one.
nation_rows = [list(row) for row in nations_data if list(row) != nation_header]
with open('nations.csv', 'wt', newline='') as fout:
    writer = csv.writer(fout, lineterminator='\n')
    writer.writerow(nation_header)
    writer.writerows(nation_rows)

## Incomes

The following code uses the UCI `adult` dataset. It includes both numerical and categorical data. It also has missing data.

In [None]:
# Download the adult census income CSV once, normalise its country names, and
# overwrite the local copy with the cleaned version. Later runs reuse that file.
url = 'https://huggingface.co/datasets/scikit-learn/adult-census-income/resolve/main/adult.csv'
file_name = os.path.basename(url)
if not os.path.exists(file_name):
    urlretrieve(url, file_name)

    # Update the "native.country" column to use the nations from the `nations` data.
    incomes_df = pd.read_csv(file_name)
    country = incomes_df['native.country']
    # `regex=False` makes every replacement literal on all pandas versions; with
    # regex matching the parentheses in '(Guam USVI etc)' would be treated as a group.
    # '&' is intentionally left untouched: the nations list uses 'Trinadad&Tobago',
    # so padding it with spaces would stop those rows from matching in a join.
    for old, new in [('Holand-', ''), ('-', ' '), ('(Guam USVI etc)', '')]:
        country = country.str.replace(old, new, regex=False)
    incomes_df['native.country'] = country
    # The dataset marks missing values with '?'; blank them so they are written
    # out as empty CSV fields.
    incomes_df = incomes_df.replace('?', '')
    incomes_df.to_csv(file_name, index=False)

# Preview the header and the first two data rows without reading the whole file.
with open(file_name, newline='') as fin:
    reader = csv.reader(fin)
    preview_rows = [row for _, row in zip(range(3), reader)]
print(*preview_rows, sep='\n')

## SQL Database

The following code uses Pandas to import the data into a SQLite database.

In [None]:
# Make an in-memory SQLite database.
conn = sqlite3.connect(':memory:')

# Make a Pandas DataFrame for the nations and add its data to a SQL table.
# The columns must be named explicitly: `nations_data` is a list of plain rows, and
# without names the SQL table would get columns "0", "1", "2", which the later
# queries on `nation` / `latitude` / `longitude` cannot find.
nations_df = pd.DataFrame(nations_data, columns=['nation', 'latitude', 'longitude'])
# Drop a header row if `nations_data` was read from a CSV that included one, then
# make sure the coordinates are numeric rather than text.
nations_df = nations_df[nations_df['nation'] != 'nation']
nations_df = nations_df.astype({'latitude': float, 'longitude': float})
nations_df.to_sql('nations', conn, if_exists='replace', index=False)

# Make a Pandas DataFrame for the income data and add its data to a SQL table.
# NOTE: `file_name` must still hold the incomes CSV path set by the incomes cell;
# re-running the nations cell afterwards rebinds it to 'nations.csv'.
incomes_df = pd.read_csv(file_name)
incomes_df.to_sql('incomes', conn, if_exists='replace', index=False)

# Inspect the tables.
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type = 'table';")
tables = cursor.fetchall()
print('Tables in database:', tables)

# Show the first few rows of the 'nations' table.
print("\nFirst three rows of 'nations' table:")
query_nations = pd.read_sql_query("SELECT * FROM nations LIMIT 3;", conn)
display(query_nations)

# Show the first few rows of the 'incomes' table.
print("\nFirst three rows of 'incomes' table:")
query_incomes = pd.read_sql_query("SELECT * FROM incomes LIMIT 3;", conn)
display(query_incomes)

## Joining and aggregating

The following code joins the two tables and aggregates the count by occupation.

In [None]:
# Pull a single column back out of the `nations` table, report how many rows
# came back, and display the resulting frame.
sql_query = """
SELECT latitude
FROM nations;
"""
df = pd.read_sql_query(sql=sql_query, con=conn)
print(f'{len(df)} rows')
df

In [None]:
# Join each income record to the coordinates of its native country.
# The incomes table has no `date` or `amount` columns, so select columns that
# exist in it (`occupation`, `income`). `[native.country]` is bracket-quoted
# because the column name contains a dot.
sql_query = """
SELECT n.nation, n.latitude, n.longitude, i.occupation, i.income
FROM nations AS n
INNER JOIN incomes AS i
ON n.nation = i.[native.country];
"""
joined_df = pd.read_sql_query(sql_query, conn)
print(f'{len(joined_df)} rows')
# Show every 50th row as a small sample of the join.
joined_df[::50]

In [None]:
# Count the joined income records per occupation.
# This queries the `nations` and `incomes` tables loaded into this database;
# no `cities` or `orders` tables are ever created here.
# Records whose occupation is missing form their own (NULL) group.
sql_query = """
SELECT i.occupation, COUNT(*) AS occupation_count
FROM nations AS n
INNER JOIN incomes AS i
ON n.nation = i.[native.country]
GROUP BY i.occupation
ORDER BY occupation_count DESC;
"""
aggregated_df = pd.read_sql_query(sql_query, conn)
print(f'{len(aggregated_df)} rows')
aggregated_df

## API Access

The following code uses a public API to get current temperature data for each nation's coordinates.

In [None]:
def get_current_temperature(latitude, longitude, timeout=10):
    """Return the current temperature reported by Open-Meteo for a coordinate.

    Args:
        latitude: Latitude of the location to query.
        longitude: Longitude of the location to query.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The `temperature` value from the response's `current_weather` object, or
        None if the request fails or the response does not contain that value.
    """
    url = "https://api.open-meteo.com/v1/forecast"
    # Let requests build and encode the query string from the parameters.
    params = {'latitude': latitude, 'longitude': longitude, 'current_weather': 'true'}
    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx).
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on older versions of requests.
        print(f"Error fetching weather data: {e}")
        return None

    # Tolerate unexpected payload shapes instead of raising on them.
    current_weather = data.get('current_weather') if isinstance(data, dict) else None
    if isinstance(current_weather, dict):
        return current_weather.get('temperature')
    return None

# Look up the current temperature for every nation.
# This notebook defines `nations_df`, not a `cities_df`. The first three columns
# are relabelled by position so the lookup works whether or not the frame was
# built with column names.
nation_coords = nations_df.iloc[:, :3].copy()
nation_coords.columns = ['nation', 'latitude', 'longitude']

# One API call per row; failed lookups come back as None and are stored as NaN.
temperature_series = pd.Series(
    [get_current_temperature(lat, lon)
     for lat, lon in zip(nation_coords['latitude'], nation_coords['longitude'])],
    index=nation_coords.index,
    name='temperature',
    dtype='float64',
)
temperature_df = pd.concat([nation_coords['nation'], temperature_series], axis=1)
temperature_df

## Data Persistence

The following code persists the temperature data as a CSV file.

In [None]:
# Save the temperature table without its index column, then read the file back
# and print it to confirm what was written.
temperature_df.to_csv(path_or_buf='temperatures.csv', index=False)
with open('temperatures.csv', mode='rt') as saved_file:
    saved_text = saved_file.read()
print(saved_text)