# Historical Data Fetching

Fetch historical weather and air quality data from APIs

## Import Libraries

In [15]:
import openmeteo_requests
import pandas as pd
import requests
import requests_cache
from retry_requests import retry
from datetime import datetime
from pathlib import Path
import os
from dotenv import load_dotenv
import logging

# Load environment variables
load_dotenv()

# Setup paths
BASE_DIR = Path.cwd().parent
RAW_DATA_PATH = BASE_DIR / 'data' / 'raw'
LOGS_PATH = BASE_DIR / 'logs'

# Create directories
RAW_DATA_PATH.mkdir(parents=True, exist_ok=True)
LOGS_PATH.mkdir(parents=True, exist_ok=True)

# Setup logging
log_file = LOGS_PATH / f'fetch_data_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler()
    ]
)

logging.info('Initialization complete')

2026-02-05 21:08:29,297 - INFO - Initialization complete


## Fetch Historical Weather Data from Open-Meteo

In [16]:
def fetch_historical_weather(lat, lon, start_date, end_date):
    """
    Fetch historical weather data from Open-Meteo API
    
    Parameters:
        lat (float): Latitude
        lon (float): Longitude
        start_date (str): Start date in format 'YYYY-MM-DD'
        end_date (str): End date in format 'YYYY-MM-DD'
    
    Returns:
        pd.DataFrame: Historical weather data
    """
    logging.info(f'Fetching weather data from {start_date} to {end_date}')
    
    try:
        # Setup API client
        cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
        retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
        openmeteo = openmeteo_requests.Client(session=retry_session)
        
        url = "https://archive-api.open-meteo.com/v1/archive"
        params = {
            "latitude": lat,
            "longitude": lon,
            "start_date": start_date,
            "end_date": end_date,
            "hourly": [
                "temperature_2m",
                "relative_humidity_2m",
                "dew_point_2m",
                "precipitation",
                "wind_speed_10m",
                "wind_direction_10m",
                "surface_pressure"
            ]
        }
        
        responses = openmeteo.weather_api(url, params=params)
        response = responses[0]
        
        logging.info(f'Coordinates: {response.Latitude()}N {response.Longitude()}E')
        logging.info(f'Elevation: {response.Elevation()}m')
        
        # Process hourly data
        hourly = response.Hourly()
        
        hourly_data = {
            "timestamp": pd.date_range(
                start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
                end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
                freq=pd.Timedelta(seconds=hourly.Interval()),
                inclusive="left"
            ),
            "temperature": hourly.Variables(0).ValuesAsNumpy(),
            "humidity": hourly.Variables(1).ValuesAsNumpy(),
            "dew_point": hourly.Variables(2).ValuesAsNumpy(),
            "precipitation": hourly.Variables(3).ValuesAsNumpy(),
            "wind_speed": hourly.Variables(4).ValuesAsNumpy(),
            "wind_direction": hourly.Variables(5).ValuesAsNumpy(),
            "pressure": hourly.Variables(6).ValuesAsNumpy()
        }
        
        df = pd.DataFrame(data=hourly_data)
        logging.info(f'Fetched {len(df)} weather records')
        
        return df
        
    except Exception as e:
        logging.error(f'Error fetching weather data: {e}')
        return None

# Fetch data for Hyderabad, Sindh
lat = 25.3960
lon = 68.3578
start_date = "2025-11-01"
end_date = "2026-01-31"

weather_df = fetch_historical_weather(lat, lon, start_date, end_date)

2026-02-05 21:09:00,776 - INFO - Fetching weather data from 2025-11-01 to 2026-01-31
2026-02-05 21:09:02,174 - INFO - Coordinates: 25.41300392150879N 68.35319519042969E
2026-02-05 21:09:02,174 - INFO - Elevation: 29.0m
2026-02-05 21:09:02,267 - INFO - Fetched 2208 weather records


In [17]:
# Save weather data to CSV
if weather_df is not None:
    output_file = RAW_DATA_PATH / 'historical_weather_data.csv'
    weather_df.to_csv(output_file, index=False)
    logging.info(f'Saved weather data to {output_file}')
    print(f'Weather data shape: {weather_df.shape}')
    print(weather_df.head())
else:
    logging.error('Failed to fetch weather data')

2026-02-05 21:09:02,428 - INFO - Saved weather data to d:\Internships and Jobs Data\10 Pearls Shine Internship\Project\AQI Predictor\data\raw\historical_weather_data.csv


Weather data shape: (2208, 8)
                  timestamp  temperature   humidity  dew_point  precipitation  \
0 2025-11-01 00:00:00+00:00    22.450001  53.880459      12.65            0.0   
1 2025-11-01 01:00:00+00:00    21.350000  56.498665      12.35            0.0   
2 2025-11-01 02:00:00+00:00    21.200001  62.887741      13.85            0.0   
3 2025-11-01 03:00:00+00:00    23.299999  51.681416      12.80            0.0   
4 2025-11-01 04:00:00+00:00    26.549999  42.433483      12.75            0.0   

   wind_speed  wind_direction     pressure  
0    1.548419      125.537766  1006.721191  
1    0.360000      270.000000  1007.007690  
2    0.569210      341.564941  1008.102417  
3    3.362677      105.524178  1008.923706  
4    2.174948      114.443947  1009.458496  


## Fetch Historical Air Pollution Data from OpenWeather

In [18]:
def fetch_historical_air_quality(lat, lon, start_date, end_date):
    """
    Fetch historical air quality data from OpenWeather API
    
    Parameters:
        lat (float): Latitude
        lon (float): Longitude
        start_date (str): Start date in format 'YYYY-MM-DD'
        end_date (str): End date in format 'YYYY-MM-DD'
    
    Returns:
        pd.DataFrame: Historical air quality data
    """
    logging.info(f'Fetching air quality data from {start_date} to {end_date}')
    
    try:
        # Get API key
        api_key = os.getenv('OPENWEATHER_API_KEY')
        if not api_key:
            raise ValueError('OPENWEATHER_API_KEY not found in environment variables')
        
        # Convert dates to Unix timestamps
        start_dt = datetime.strptime(start_date, '%Y-%m-%d')
        end_dt = datetime.strptime(end_date, '%Y-%m-%d')
        start_time = int(start_dt.timestamp())
        end_time = int(end_dt.timestamp())
        
        # OpenWeather Air Pollution History API
        url = f'http://api.openweathermap.org/data/2.5/air_pollution/history'
        params = {
            'lat': lat,
            'lon': lon,
            'start': start_time,
            'end': end_time,
            'appid': api_key
        }
        
        logging.info(f'Requesting data from OpenWeather API')
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        
        data = response.json()
        
        if 'list' not in data:
            logging.error(f'Unexpected API response: {data}')
            return None
        
        # Parse response data
        records = []
        for item in data['list']:
            record = {
                'timestamp': pd.to_datetime(item['dt'], unit='s', utc=True),
                'aqi': item['main']['aqi'],
                'co': item['components']['co'],
                'no': item['components'].get('no', 0),
                'no2': item['components']['no2'],
                'o3': item['components']['o3'],
                'so2': item['components']['so2'],
                'pm2_5': item['components']['pm2_5'],
                'pm10': item['components']['pm10'],
                'nh3': item['components']['nh3']
            }
            records.append(record)
        
        df = pd.DataFrame(records)
        logging.info(f'Fetched {len(df)} air quality records')
        
        return df
        
    except requests.exceptions.RequestException as e:
        logging.error(f'Error fetching air quality data: {e}')
        return None
    except Exception as e:
        logging.error(f'Error processing air quality data: {e}')
        return None

# Fetch data for Hyderabad, Sindh
lat = 25.3960
lon = 68.3578
start_date = "2025-11-01"
end_date = "2026-01-31"

# Fetch data for Hyderabad, Sindh
pollution_df = fetch_historical_air_quality(lat, lon, start_date, end_date)

2026-02-05 21:09:08,997 - INFO - Fetching air quality data from 2025-11-01 to 2026-01-31
2026-02-05 21:09:09,010 - INFO - Requesting data from OpenWeather API
2026-02-05 21:09:12,243 - INFO - Fetched 2137 air quality records


In [19]:
# Save pollution data to CSV
if pollution_df is not None:
    output_file = RAW_DATA_PATH / 'historical_pollution_data.csv'
    pollution_df.to_csv(output_file, index=False)
    logging.info(f'Saved pollution data to {output_file}')
    print(f'Pollution data shape: {pollution_df.shape}')
    print(pollution_df.head())
else:
    logging.error('Failed to fetch pollution data')

2026-02-05 21:09:15,424 - INFO - Saved pollution data to d:\Internships and Jobs Data\10 Pearls Shine Internship\Project\AQI Predictor\data\raw\historical_pollution_data.csv


Pollution data shape: (2137, 10)
                  timestamp  aqi      co   no   no2     o3   so2  pm2_5  \
0 2025-10-31 19:00:00+00:00    5  469.14  0.0  5.76  90.53  4.86  76.24   
1 2025-10-31 20:00:00+00:00    5  474.46  0.0  5.48  80.74  3.73  75.22   
2 2025-10-31 21:00:00+00:00    4  477.31  0.0  4.92  71.45  2.70  74.28   
3 2025-10-31 22:00:00+00:00    4  484.54  0.0  4.49  63.97  2.00  74.11   
4 2025-10-31 23:00:00+00:00    5  498.30  0.0  4.20  59.45  1.62  75.13   

     pm10   nh3  
0  126.14  4.67  
1  122.29  4.85  
2  119.34  4.93  
3  119.01  5.06  
4  121.99  5.33  


## Verify Data Quality

In [20]:
# Check for missing values
if weather_df is not None:
    print('Weather Data Info:')
    print(f'Total records: {len(weather_df)}')
    print(f'Date range: {weather_df["timestamp"].min()} to {weather_df["timestamp"].max()}')
    print(f'Missing values:\n{weather_df.isnull().sum()}')
    print()

if pollution_df is not None:
    print('Pollution Data Info:')
    print(f'Total records: {len(pollution_df)}')
    print(f'Date range: {pollution_df["timestamp"].min()} to {pollution_df["timestamp"].max()}')
    print(f'Missing values:\n{pollution_df.isnull().sum()}')

logging.info('Data fetching complete')

2026-02-05 21:09:17,997 - INFO - Data fetching complete


Weather Data Info:
Total records: 2208
Date range: 2025-11-01 00:00:00+00:00 to 2026-01-31 23:00:00+00:00
Missing values:
timestamp         0
temperature       0
humidity          0
dew_point         0
precipitation     0
wind_speed        0
wind_direction    0
pressure          0
dtype: int64

Pollution Data Info:
Total records: 2137
Date range: 2025-10-31 19:00:00+00:00 to 2026-01-30 19:00:00+00:00
Missing values:
timestamp    0
aqi          0
co           0
no           0
no2          0
o3           0
so2          0
pm2_5        0
pm10         0
nh3          0
dtype: int64


## Fetch Current Weather Data from OpenWeather

In [21]:
def fetch_current_weather(lat, lon):
    """
    Fetch current weather data from OpenWeather API
    
    Parameters:
        lat (float): Latitude
        lon (float): Longitude
    
    Returns:
        pd.DataFrame: Current weather data
    """
    logging.info('Fetching current weather data')
    
    try:
        api_key = os.getenv('OPENWEATHER_API_KEY')
        if not api_key:
            raise ValueError('OPENWEATHER_API_KEY not found in environment variables')
        
        url = 'https://api.openweathermap.org/data/2.5/weather'
        params = {
            'lat': lat,
            'lon': lon,
            'appid': api_key,
            'units': 'metric'
        }
        
        logging.info('Requesting current weather from OpenWeather API')
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        
        data = response.json()
        
        record = {
            'timestamp': pd.to_datetime(data['dt'], unit='s', utc=True),
            'city': data.get('name'),
            'country': data.get('sys', {}).get('country'),
            'latitude': data.get('coord', {}).get('lat'),
            'longitude': data.get('coord', {}).get('lon'),
            'temperature': data.get('main', {}).get('temp'),
            'feels_like': data.get('main', {}).get('feels_like'),
            'temp_min': data.get('main', {}).get('temp_min'),
            'temp_max': data.get('main', {}).get('temp_max'),
            'pressure': data.get('main', {}).get('pressure'),
            'humidity': data.get('main', {}).get('humidity'),
            'wind_speed': data.get('wind', {}).get('speed'),
            'wind_deg': data.get('wind', {}).get('deg'),
            'cloudiness': data.get('clouds', {}).get('all'),
            'visibility': data.get('visibility'),
            'weather_main': data.get('weather', [{}])[0].get('main'),
            'weather_description': data.get('weather', [{}])[0].get('description')
        }
        
        df = pd.DataFrame([record])
        logging.info('Fetched current weather data successfully')
        
        return df
        
    except requests.exceptions.RequestException as e:
        logging.error(f'Error fetching current weather: {e}')
        return None
    except Exception as e:
        logging.error(f'Error processing current weather: {e}')
        return None

current_weather_df = fetch_current_weather(lat, lon)

2026-02-05 21:09:25,615 - INFO - Fetching current weather data
2026-02-05 21:09:25,619 - INFO - Requesting current weather from OpenWeather API


2026-02-05 21:09:25,994 - INFO - Fetched current weather data successfully


In [22]:
if current_weather_df is not None:
    output_file = RAW_DATA_PATH / 'current_weather_data.csv'
    
    if output_file.exists():
        existing_df = pd.read_csv(output_file)
        current_weather_df = pd.concat([existing_df, current_weather_df], ignore_index=True)
    
    current_weather_df.to_csv(output_file, index=False)
    logging.info(f'Saved current weather to {output_file}')
    print(f'Current weather data shape: {current_weather_df.shape}')
    print(current_weather_df.tail())
else:
    logging.error('Failed to fetch current weather')

2026-02-05 21:09:26,735 - INFO - Saved current weather to d:\Internships and Jobs Data\10 Pearls Shine Internship\Project\AQI Predictor\data\raw\current_weather_data.csv


Current weather data shape: (2, 17)
                   timestamp       city country  latitude  longitude  \
0  2026-02-05 12:56:51+00:00  Hyder훮b훮d      PK    25.396    68.3578   
1  2026-02-05 16:09:45+00:00  Hyder훮b훮d      PK    25.396    68.3578   

   temperature  feels_like  temp_min  temp_max  pressure  humidity  \
0        25.56       24.69     25.56     25.56      1018        20   
1        21.99       20.87     21.99     21.99      1020        24   

   wind_speed  wind_deg  cloudiness  visibility weather_main  \
0        2.85         2           0       10000        Clear   
1        4.10        21           0       10000        Clear   

  weather_description  
0           clear sky  
1           clear sky  


## Fetch Current Air Pollution Data from OpenWeather

In [23]:
def fetch_current_pollution(lat, lon):
    """
    Fetch current air pollution data from OpenWeather API
    
    Parameters:
        lat (float): Latitude
        lon (float): Longitude
    
    Returns:
        pd.DataFrame: Current air pollution data
    """
    logging.info('Fetching current air pollution data')
    
    try:
        api_key = os.getenv('OPENWEATHER_API_KEY')
        if not api_key:
            raise ValueError('OPENWEATHER_API_KEY not found in environment variables')
        
        url = 'http://api.openweathermap.org/data/2.5/air_pollution'
        params = {
            'lat': lat,
            'lon': lon,
            'appid': api_key
        }
        
        logging.info('Requesting current pollution from OpenWeather API')
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        
        data = response.json()
        
        if 'list' not in data:
            logging.error(f'Unexpected API response: {data}')
            return None
        
        item = data['list'][0]
        record = {
            'timestamp': pd.to_datetime(item['dt'], unit='s', utc=True),
            'aqi': item['main']['aqi'],
            'co': item['components']['co'],
            'no': item['components'].get('no', 0),
            'no2': item['components']['no2'],
            'o3': item['components']['o3'],
            'so2': item['components']['so2'],
            'pm2_5': item['components']['pm2_5'],
            'pm10': item['components']['pm10'],
            'nh3': item['components']['nh3']
        }
        
        df = pd.DataFrame([record])
        logging.info('Fetched current pollution data successfully')
        
        return df
        
    except requests.exceptions.RequestException as e:
        logging.error(f'Error fetching current pollution: {e}')
        return None
    except Exception as e:
        logging.error(f'Error processing current pollution: {e}')
        return None

current_pollution_df = fetch_current_pollution(lat, lon)

2026-02-05 21:09:28,062 - INFO - Fetching current air pollution data
2026-02-05 21:09:28,065 - INFO - Requesting current pollution from OpenWeather API
2026-02-05 21:09:28,444 - INFO - Fetched current pollution data successfully


In [24]:
if current_pollution_df is not None:
    output_file = RAW_DATA_PATH / 'current_pollution_data.csv'
    
    if output_file.exists():
        existing_df = pd.read_csv(output_file)
        current_pollution_df = pd.concat([existing_df, current_pollution_df], ignore_index=True)
    
    current_pollution_df.to_csv(output_file, index=False)
    logging.info(f'Saved current pollution to {output_file}')
    print(f'Current pollution data shape: {current_pollution_df.shape}')
    print(current_pollution_df.tail())
else:
    logging.error('Failed to fetch current pollution')

2026-02-05 21:09:28,813 - INFO - Saved current pollution to d:\Internships and Jobs Data\10 Pearls Shine Internship\Project\AQI Predictor\data\raw\current_pollution_data.csv


Current pollution data shape: (2, 10)
                   timestamp  aqi      co    no   no2      o3   so2  pm2_5  \
0  2026-02-05 12:56:49+00:00    4  406.80  0.06  6.21  140.75  6.21  67.61   
1  2026-02-05 16:09:48+00:00    4  530.46  0.00  9.82  107.48  5.33  68.36   

     pm10   nh3  
0  117.05  7.37  
1  115.30  9.47  


## Summary

In [25]:
print('Data Collection Summary:')
print('=' * 60)
print(f'Historical Weather: {weather_df.shape if weather_df is not None else "Failed"}')
print(f'Historical Pollution: {pollution_df.shape if pollution_df is not None else "Failed"}')
print(f'Current Weather: {current_weather_df.shape if current_weather_df is not None else "Failed"}')
print(f'Current Pollution: {current_pollution_df.shape if current_pollution_df is not None else "Failed"}')
print('=' * 60)
print(f'Files saved in: {RAW_DATA_PATH}')
print(f'Log file: {log_file}')
print('=' * 60)

logging.info('All data collection tasks completed')

2026-02-05 21:09:34,151 - INFO - All data collection tasks completed


Data Collection Summary:
Historical Weather: (2208, 8)
Historical Pollution: (2137, 10)
Current Weather: (2, 17)
Current Pollution: (2, 10)
Files saved in: d:\Internships and Jobs Data\10 Pearls Shine Internship\Project\AQI Predictor\data\raw
Log file: d:\Internships and Jobs Data\10 Pearls Shine Internship\Project\AQI Predictor\logs\fetch_data_20260205_210828.log
