# API Data Extraction Process and Code

---

Exploring processes and code to easily access data relevant to study/project.

### Project Hypothesis:
Socioeconomic status, as indicated by income levels, education attainment, and race/ethnicity, is a significant predictor of air quality and health outcomes. Communities with lower socioeconomic status are hypothesized to experience poorer air quality, which in turn leads to a higher prevalence of adverse health outcomes. This relationship is expected to persist even when controlling for potential confounding variables such as geographic location and access to healthcare services.

### Defining Data Collection Parameters
- **Geographic Scope:** Define countries or cities of interest
- **Time Frame:** Define time period coverage
- **Socioeconomic Indicators:** Define indicators of interest (e.g., median income, education level)



In [7]:
# Dependencies

import requests
import pandas as pd
import numpy as np
import time
import os


# Import the relevant API keys ( you will need )
from api_keys import weather_api_key
from api_keys import geoapify_key
from api_keys import aqicn_api_key
from api_keys import gho_who_api_key
from api_keys import api_ninjas_key

# Import citipy to determine the cities based on latitude and longitude
from citipy import citipy

#### Geographic scope


In [2]:
# Empty list for holding the latitude and longitude combinations
lat_lngs = []

# Empty dictionary for holding the city names and country codes
city_details = {}

# Range of latitudes and longitudes
lat_range = (-90, 90) # Min and Max bounds for latitude range
lng_range = (-180, 180) # Min and Max bounds for longitude range

# Create a set of random lat and lng combinations
lats = np.random.uniform(lat_range[0], lat_range[1], size=1500)
lngs = np.random.uniform(lng_range[0], lng_range[1], size=1500)
lat_lngs = zip(lats, lngs) # Aggregate into tuple - pairing latitudes and longitudes

# Identify nearest city, country, and record their coordinates for each lat, lng combination
for lat, lng in lat_lngs:
    city = citipy.nearest_city(lat, lng)
    city_name = city.city_name
    country_code = city.country_code
    coords = (lat, lng)
    
    # If the city is unique, then add it along with the country code and coordinates
    if city_name not in city_details:
        city_details[city_name] = (country_code, coords)

# Print the city count to confirm sufficient count
print(f"Number of unique cities in the list: {len(city_details)}")

Number of unique cities in the list: 618


In [3]:
# Create a DataFrame from the collected data
cities_selected_df = pd.DataFrame({
    'City': [k for k in city_details.keys()],
    'Country': [v[0] for v in city_details.values()],
    'Coords': [v[1] for v in city_details.values()]
})

# Reset index to make sure it starts from 0 and acts as an index column
cities_selected_df.reset_index(inplace=True)
cities_selected_df.rename(columns={'index': 'Index'}, inplace=True)

# Print the DataFrame
print(cities_selected_df)

     Index                         City Country  \
0        0                      malpica      es   
1        1                       bethel      us   
2        2               ribeira grande      pt   
3        3  edinburgh of the seven seas      sh   
4        4                       albany      au   
..     ...                          ...     ...   
613    613                       murzuq      ly   
614    614             charlotte amalie      vi   
615    615                     kangding      cn   
616    616                    dingcheng      cn   
617    617                    groningen      sr   

                                        Coords  
0     (46.058636950605745, -9.777245982901945)  
1      (68.68194407511422, -164.0566600183777)  
2      (48.51075858434373, -32.88817223429896)  
3     (-37.98593329638902, -1.392334171241373)  
4    (-58.607549635501265, 124.70233480599364)  
..                                         ...  
613    (23.98361572585003, 13.64147703766013

#### Time frame

#### Socio-economic indicators

## API Data Extraction

### API Ninja

In [5]:
# Note: this is the code used - it uses about 5% of the month allocation of 10000 requests. One can batch into groups of 30 to reduce the number of requests.

cities_selected_df1=cities_selected_df.copy()

headers = {'X-Api-Key': api_ninjas_key}

# Empty list to store the results
results = []

# Function to fetch data for a city
def get_city_data(city):
    url = f'https://api.api-ninjas.com/v1/city?name={city}'
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to retrieve data for {city}, status code {response.status_code}")
        return None

# Loop through the DataFrame and call the API for each city
for row in cities_selected_df1.itertuples():
    # Convert city names to Title Case
    city_name_formatted = row.City.title()  # Corrected to use 'City' instead of 'city_name'
    data = get_city_data(city_name_formatted)
    if data:
        results.extend(data)  # Extend in case data is a list of multiple cities
    # Implement a delay between API calls to avoid hitting the rate limit
    time.sleep(1)

# Convert results to a DataFrame
df_cities_results = pd.DataFrame(results)

# Create the directory if it does not exist
if not os.path.exists('Data'):
    os.makedirs('Data')

# Save the DataFrame to a CSV file
df_cities_results.to_csv("Data/city_data_ww.csv", index =False)

In [14]:
# Add 'is_rural' variable to the dataframe

# 1. Make a copy of the DataFrame
df_cities_results1 = df_cities_results.copy()

# 2. Generate a column called is_rural
# Assuming the population column is named "Population"
df_cities_results1['is_rural'] = df_cities_results1['population'] < 15000  # This will return True for rural and False for urban

# 3. Loop through the populations against measures
# This step is not needed as the vectorized operation above handles the classification based on population

# 4. Output to "Data/city_data.csv"
df_cities_results1.to_csv("Data/city_data1_ww.csv", index=False)  # index=False to avoid writing row indices in the CSV file

### AQICN API

In [18]:
cities_selected_df2=df_cities_results1.copy()

# Define the endpoint and parameters for your request
endpoint = 'http://api.waqi.info/feed/'
city = 'name'
params = {
    'token': aqicn_api_key
}

# Make the request and collect the data
response = requests.get(f'{endpoint}/{city}/', params=params)
data = response.json()

data


{'status': 'ok',
 'data': {'aqi': 28,
  'idx': 12842,
  'attributions': [{'url': 'https://www.irceline.be/en/',
    'name': 'IRCEL-CELINE - Belgian Interregional Environment Agency',
    'logo': 'Beligium-irceline.png'},
   {'url': 'https://waqi.info/', 'name': 'World Air Quality Index Project'}],
  'city': {'geo': [50.462427939605, 4.8652609460116],
   'name': 'Namur, Belgium',
   'url': 'https://aqicn.org/city/belgium/wal/namur',
   'location': ''},
  'dominentpol': 'pm25',
  'iaqi': {'h': {'v': 82},
   'p': {'v': 1003.3},
   'pm10': {'v': 14},
   'pm25': {'v': 28},
   't': {'v': 8.3},
   'w': {'v': 1.5},
   'wg': {'v': 5.5}},
  'time': {'s': '2024-04-26 10:00:00',
   'tz': '+02:00',
   'v': 1714125600,
   'iso': '2024-04-26T10:00:00+02:00'},
  'forecast': {'daily': {'o3': [{'avg': 31,
      'day': '2024-04-25',
      'max': 39,
      'min': 21},
     {'avg': 29, 'day': '2024-04-26', 'max': 39, 'min': 24},
     {'avg': 27, 'day': '2024-04-27', 'max': 36, 'min': 20},
     {'avg': 34, 

In [20]:
# Assuming df_cities_results1 is the original DataFrame
df_city_results_aqicn_ww = cities_selected_df.copy()

# Define the endpoint and the API key
endpoint = 'http://api.waqi.info/feed/'
api_key = aqicn_api_key  # Make sure to replace this with your actual API key

# Prepare to collect the AQI data
aqi_data = []

# Loop through each city in the DataFrame
for city in df_city_results_aqicn_ww['City']:
    # Make the request and collect the data
    response = requests.get(f'{endpoint}/{city}/', params={'token': api_key})
    data = response.json()
    
    # Check if the response is successful and data is available
    if data['status'] == 'ok' and 'data' in data and 'iaqi' in data['data']:
        # Extract air quality information
        aq_data = data['data']['iaqi']
        aq_data['city'] = city  # Add the city name to the dictionary
        aqi_data.append(aq_data)

# Convert the list of dictionaries to a DataFrame
df_aq = pd.DataFrame.from_records(aqi_data)

# If you want each pollutant in its own column, you may need to transform the data
# Normalize the data to expand the nested dictionaries into their own columns
df_aqicn_expanded = pd.json_normalize(df_aq.drop(columns=['city']).to_dict(orient='records'))
df_aqicn_expanded['city'] = df_aq['city']  # Add the city column back after normalization

# Output the DataFrame to verify
print(df_aqicn_expanded.head())

# Optional: Save the DataFrame to a CSV file
df_aqicn_expanded.to_csv('Data/aqi_data.csv', index=False)


   so2  aqi  dew  pm10   r  uvi   d  wd  co.v   h.v  ...  pm10.v   h   p   t  \
0  NaN  NaN  NaN   NaN NaN  NaN NaN NaN   3.2   3.0  ...     NaN NaN NaN NaN   
1  NaN  NaN  NaN   NaN NaN  NaN NaN NaN   2.3  60.3  ...     NaN NaN NaN NaN   
2  NaN  NaN  NaN   NaN NaN  NaN NaN NaN   2.7  84.5  ...     NaN NaN NaN NaN   
3  NaN  NaN  NaN   NaN NaN  NaN NaN NaN   NaN  66.0  ...     NaN NaN NaN NaN   
4  NaN  NaN  NaN   NaN NaN  NaN NaN NaN   NaN  97.0  ...    58.0 NaN NaN NaN   

    w  r.v  uvi.v  d.v  wd.v        city  
0 NaN  NaN    NaN  NaN   NaN      bethel  
1 NaN  NaN    NaN  NaN   NaN      albany  
2 NaN  NaN    NaN  NaN   NaN    hamilton  
3 NaN  NaN    NaN  NaN   NaN  georgetown  
4 NaN  NaN    NaN  NaN   NaN      kourou  

[5 rows x 35 columns]


### OpenWeatherMap API

In [21]:
df_city_results_openweather=df_cities_results.copy()

# API endpoint and your OpenWeatherMap API key
api_key = weather_api_key
endpoint = 'http://api.openweathermap.org/data/2.5/air_pollution'

# Prepare to collect the air pollution data
air_pollution_data = []

# Loop through each row in the DataFrame
for index, row in df_city_results_openweather.iterrows():
    # Extract latitude and longitude
    lat, lon = row['latitude'], row['longitude']
    
    # Make the request
    response = requests.get(f"{endpoint}?lat={lat}&lon={lon}&appid={api_key}")
    data = response.json()
    
    # Check if the response is successful and data is available
    if response.status_code == 200 and 'list' in data:
        # Extract air pollution details (taking the first item in 'list' as example)
        pollution_info = data['list'][0]
        
        # Add the pollution data to the DataFrame row
        for key, value in pollution_info.items():
            if isinstance(value, dict):  # This check is to flatten nested dictionaries
                for subkey, subvalue in value.items():
                    df_city_results_openweather.at[index, f"{key}_{subkey}"] = subvalue
            else:
                df_city_results_openweather.at[index, key] = value

# Optional: Drop the original latitude and longitude columns if you don't want them duplicated
# df = df.drop(columns=['latitude', 'longitude'])

# Save the DataFrame with air pollution data to a CSV file
df_city_results_openweather.to_csv('Data/combined_city_pollution_data.csv', index=False)

print("Data saved successfully.")

Data saved successfully.
