# Exploratory Data Analysis 📁

## Get Data

In [2]:
import os
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import requests

In [None]:
def get_noaa_data(token, datasetid, stationid, startdate, enddate, datatypeid):
    """ 
    Uses NOAA's API to retrieve climate data based on given parameters.
    Returns JSON data if successful.
    """
    url = "https://www.ncei.noaa.gov/cdo-web/api/v2/data"
    headers = {"token": token}
    params = {
        "datasetid": datasetid,
        "stationid": stationid,
        "startdate": startdate,
        "enddate": enddate,
        "datatypeid": datatypeid,
        "limit": 1000,
        "units": "metric"
    }

    response = requests.get(url, headers=headers, params=params)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

In [19]:
# Get token
load_dotenv()
token = os.getenv("NOAA_TOKEN")

data = get_noaa_data(
    token=token,
    datasetid="GSOM",
    stationid="GHCND:USW00014989",
    startdate="2017-01-01",
    enddate="2024-12-31",
    datatypeid="TAVG"
)

## Clean Data

In [None]:
results = data.get("results", [])
df = pd.DataFrame(results)
df["date"] = pd.to_datetime(df["date"])
df["value"] = df["value"]/10
df.to_parquet("../Data/raw_data.parquet")

Note: you may need to restart the kernel to use updated packages.


In [45]:
df = df.sort_values("date")

In [46]:
df.set_index("date", inplace=True)

In [42]:
df.isnull().sum()
df[df.isnull().any(axis=1)]

Unnamed: 0_level_0,datatype,station,attributes,value
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [47]:
df = df["value"]

In [50]:
df.rename("AverageTemperature")

date
2017-02-01    0.47
2017-03-01    0.61
2017-04-01    1.06
2017-05-01    1.59
2017-06-01    2.30
              ... 
2024-08-01    2.14
2024-09-01    1.99
2024-10-01    1.45
2024-11-01    0.53
2024-12-01    0.00
Name: AverageTemperature, Length: 90, dtype: float64

In [52]:
df = df.to_frame()
df.to_parquet("../Data/clean_data.parquet")


## Exploratory Data

In [3]:
serie = pd.read_parquet("../Data/clean_data.parquet")
serie = pd.Series(serie["value"], name="AverageTemperature")

In [4]:
serie.info()

<class 'pandas.core.series.Series'>
DatetimeIndex: 90 entries, 2017-02-01 to 2024-12-01
Series name: AverageTemperature
Non-Null Count  Dtype  
--------------  -----  
90 non-null     float64
dtypes: float64(1)
memory usage: 1.4 KB


📁 EDA
Get Data

Clean Data

Exploración inicial

.head(), .info(), .describe()

Gráfica de la serie completa

Revisión de nulos y outliers

Análisis de Estacionariedad

Tendencia y estacionalidad (visual)

Test de Dickey-Fuller (ADF)

Diferenciación si es necesario

Identificación inicial de parámetros

ACF y PACF

STL decomposition

👉 Aquí terminas con una serie transformada (estacionaria) y con sugerencias iniciales de (p,d,q)(P,D,Q).