# Climate Data Integration and Analysis

The purpose of this project is to demonstrate technical proficiency in acquiring, cleaning, transforming, organizing, and presenting data. This involves leveraging multiple data acquisition techniques such as APIs, web scraping, and SQL queries to collect data from at least two sources. The project aims to wrangle these datasets into a cohesive and analysis-ready format while thoroughly documenting the entire process within a Python package. By completing this project, I intend to showcase my ability to handle complex data workflows and effectively present results in a user-friendly manner.

In [None]:
import requests
import pandas as pd
import sqlite3
from bs4 import BeautifulSoup
import re
import os
import pytest

# Constants for API and URLs
# The key can be supplied through the OPENWEATHERMAP_API_KEY environment
# variable so a real secret does not have to be written into the source; the
# original placeholder is kept as the fallback value.
API_KEY = os.environ.get("OPENWEATHERMAP_API_KEY", "your_openweathermap_api_key")
# HTTPS is used because the API key is sent as a query parameter and would
# otherwise travel in clear text.
BASE_API_URL = "https://api.openweathermap.org/data/2.5/forecast"
NOAA_URL = "https://www.ncdc.noaa.gov/cdo-web/datasets"
SQLITE_DB_PATH = "climate_data.db"

# Function to fetch data from OpenWeatherMap API
def fetch_weather_data(city, country, api_key=API_KEY, timeout=10):
    """Fetch forecast data for a city from the OpenWeatherMap forecast endpoint.

    Args:
        city: City name; combined with ``country`` into the ``q`` parameter.
        country: Country code appended to the city after a comma.
        api_key: Value sent as the ``appid`` parameter. Defaults to the
            module-level ``API_KEY``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    params = {
        "q": f"{city},{country}",
        "appid": api_key,
        "units": "metric",
    }
    response = requests.get(BASE_API_URL, params=params, timeout=timeout)
    # Surface 4xx/5xx answers as exceptions instead of parsing an error body.
    response.raise_for_status()
    return response.json()

# Function to scrape NOAA data
def scrape_noaa_data(url=NOAA_URL, timeout=10):
    """Scrape the text of HTML table cells from a page into a DataFrame.

    Args:
        url: Page to download. Defaults to the module-level ``NOAA_URL``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        A DataFrame with one row per ``<tr>`` that contains at least one
        ``<td>``, holding the stripped text of each cell. Column labels are
        the default integer positions.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    rows = []
    # The first <tr> is skipped on the assumption that it is a header row.
    for row in soup.find_all("tr")[1:]:
        cells = [cell.text.strip() for cell in row.find_all("td")]
        # Rows without any <td> (for example <th>-only rows) carry no data;
        # keeping them would add empty records to the frame.
        if cells:
            rows.append(cells)
    return pd.DataFrame(rows)

# Function to query SQLite database
def query_sqlite_data(db_path, query):
    """Run a SQL query against a SQLite database and return the result.

    Args:
        db_path: Path of the SQLite database file to open.
        query: SQL text passed to ``pandas.read_sql_query``.

    Returns:
        A DataFrame holding the rows produced by ``query``.
    """
    # Imported here so the function carries its own dependency.
    from contextlib import closing

    # A sqlite3 connection used directly as a context manager only commits or
    # rolls back the transaction; it does not close the connection. closing()
    # guarantees the handle is released when the block exits.
    with closing(sqlite3.connect(db_path)) as conn:
        return pd.read_sql_query(query, conn)

# Function to clean and merge data
def clean_and_merge_data(weather_data, noaa_data, sqlite_data):
    """Combine the three data sources side by side and clean the result.

    Args:
        weather_data: Mapping whose ``"list"`` entry holds the forecast
            records; it is flattened with ``pandas.json_normalize``.
        noaa_data: DataFrame of scraped data.
        sqlite_data: DataFrame of database query results.

    Returns:
        A DataFrame whose columns are those of the three inputs placed next
        to each other (aligned on the row index), with every row containing a
        missing value removed and every column label reduced to the
        characters ``a-z``, ``A-Z``, ``0-9`` and ``_``.

    Raises:
        KeyError: If ``weather_data`` has no ``"list"`` entry.
    """
    weather_df = pd.json_normalize(weather_data["list"])
    merged_df = pd.concat([weather_df, noaa_data, sqlite_data], axis=1)

    # Cleaning steps
    merged_df = merged_df.dropna()
    # Labels are converted with str() first: frames built without explicit
    # column names have integer labels, which re.sub cannot process.
    merged_df.columns = [
        re.sub(r"[^a-zA-Z0-9_]", "", str(col)) for col in merged_df.columns
    ]
    return merged_df

# Function to save the dataset
def save_dataset(df, output_path="cleaned_data.csv"):
    """Write ``df`` to ``output_path`` as CSV, leaving out the index column."""
    csv_options = {"index": False}
    df.to_csv(output_path, **csv_options)

# Python package functions
# exist_ok=True makes the call safe to repeat and avoids the gap between
# checking for the directory and creating it.
os.makedirs("climate_package", exist_ok=True)

# Writing README.md
# The Markdown text is kept flush-left inside the literal: lines indented by
# four spaces are treated as a code block by Markdown renderers, so the
# headings and fenced examples would not be displayed as intended.
with open("climate_package/README.md", "w", encoding="utf-8") as f:
    f.write("""\
# Climate Data Integration Package

This package provides functions for acquiring, cleaning, and analyzing climate data from multiple sources.

## Installation
```
pip install climate_package
```

## Usage
```python
from climate_package import fetch_weather_data, scrape_noaa_data, query_sqlite_data, clean_and_merge_data

weather_data = fetch_weather_data("Nairobi", "KE")
noaa_data = scrape_noaa_data()
sqlite_data = query_sqlite_data("climate_data.db", "SELECT * FROM climate")
merged_data = clean_and_merge_data(weather_data, noaa_data, sqlite_data)
```
""")

# Writing package functions
# Source text written to climate_package/__init__.py.
# The signatures carry the same defaults as the functions defined in this
# file, so the calls shown in the README (for example scrape_noaa_data() with
# no arguments) match them. The constants those defaults refer to are written
# into the package as well.
# NOTE: the function bodies are still `...` placeholders; the generated
# package exposes the signatures only.
package_code = """
API_KEY = "your_openweathermap_api_key"
NOAA_URL = "https://www.ncdc.noaa.gov/cdo-web/datasets"


def fetch_weather_data(city, country, api_key=API_KEY):
    ...
def scrape_noaa_data(url=NOAA_URL):
    ...
def query_sqlite_data(db_path, query):
    ...
def clean_and_merge_data(weather_data, noaa_data, sqlite_data):
    ...
def save_dataset(df, output_path="cleaned_data.csv"):
    ...
"""
with open("climate_package/__init__.py", "w", encoding="utf-8") as f:
    f.write(package_code)

# Pytest for unit testing
def test_fetch_weather_data():
    """The payload fetched for Nairobi, KE must contain a "list" entry."""
    forecast = fetch_weather_data(city="Nairobi", country="KE")
    assert "list" in forecast

def test_scrape_noaa_data():
    """Scraping the default URL must produce a non-empty DataFrame."""
    scraped = scrape_noaa_data()
    assert not scraped.empty

def test_query_sqlite_data():
    """Selecting everything from the climate table must return rows."""
    select_all = "SELECT * FROM climate"
    rows = query_sqlite_data(SQLITE_DB_PATH, select_all)
    assert not rows.empty

def test_clean_and_merge_data():
    """Merging the three freshly acquired sources must leave some rows."""
    forecast = fetch_weather_data("Nairobi", "KE")
    scraped = scrape_noaa_data()
    stored = query_sqlite_data(SQLITE_DB_PATH, "SELECT * FROM climate")
    merged = clean_and_merge_data(forecast, scraped, stored)
    assert not merged.empty

# Running the tests
if __name__ == "__main__":
    # Invoke pytest with no explicit arguments, so pytest applies its own
    # default test discovery. The value returned by pytest.main() is not used.
    pytest.main()


# THE END