## HW0 ASSIGNMENT 1

### OBJECTIVES

- Familiarize yourself with Github and basic git
- Familiarize yourself with the JupyterLab environment, Markdown, and Python
- Explore the JupyterHub Linux console integrating what you learned in the prior parts of this homework
- Perform basic data engineering in Python using NOAA weather data

### Obtaining API Key 

1. Access URL: https://www.ncdc.noaa.gov/cdo-web/token
2. Enter your Email Address in the above URL.
3. The access token is delivered to that email address.

### Importing Libraries

In [2]:
import requests
import os
import json 
from datetime import datetime
import pandas as pd

### § Task: Use Python to make HTTP/API calls to NOAA services and obtain data.

#### Creation of JSON files

In [18]:
## Function to fetch data from NOAA in a given range
def get_noaa_data_range(start_date, end_date, api_token, timeout=30):
    """Fetch GHCND daily records for ZIP 80249 from the NOAA CDO v2 API.

    Parameters
    ----------
    start_date, end_date : str
        Values sent as the `startdate` / `enddate` query parameters
        (the callers in this notebook pass 'YYYY-MM-DD' strings).
    api_token : str
        NOAA CDO access token, sent in the `token` request header.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        `requests.get` can block indefinitely on a stalled connection.

    Returns
    -------
    dict or None
        The decoded JSON body when the server answers with status 200;
        otherwise the error is printed and None is returned.
    """
    data_url = 'https://www.ncdc.noaa.gov/cdo-web/api/v2/data'
    params = {
        'datasetid' : 'GHCND',
        'locationid' : 'ZIP:80249',
        'units' : 'standard',
        'startdate' : start_date,
        'enddate' : end_date,
        'limit' : 1000
    }
    headers = {'token': api_token}

    response = requests.get(data_url, headers = headers, params = params, timeout = timeout)

    # Anything other than 200 is reported and signalled to the caller with None
    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return None

    return response.json()

# Create the directory that stores the raw JSON files (no-op if it already exists)
os.makedirs('data', exist_ok=True)

# Read the NOAA CDO token from the environment instead of committing it to the
# notebook. Request a token at https://www.ncdc.noaa.gov/cdo-web/token and
# export it as NOAA_API_TOKEN before starting Jupyter.
api_token = os.environ.get('NOAA_API_TOKEN')
if not api_token:
    raise RuntimeError('Set the NOAA_API_TOKEN environment variable to your NOAA CDO API token.')

## Loop over the winters starting in 2008 through 2022; each window runs
## from 12/15 of `year` to 1/21 of the following year
for year in range(2008, 2023):
    start_date = f'{year}-12-15'
    end_date = f'{year+1}-01-21'
    data = get_noaa_data_range(start_date, end_date, api_token)

    # A failed request yields None; writing that would produce a JSON file
    # containing `null`, which breaks the later cells that read data['results'].
    if not data or 'results' not in data:
        print(f'No data retrieved for {start_date} to {end_date}; skipping.')
        continue

    with open(f'data/winter_{year}-{year+1}.json', 'w') as f:
        json.dump(data, f)

print("Data retrieval and storage complete.")

Data retrieval and storage complete.


### § Task: Extract, transform, and export JSON data

In [25]:
## Folder containing data
data_folder = 'data/'

# Per-file frames are collected here and concatenated once at the end;
# growing a DataFrame with pd.concat inside the loop is quadratic.
winter_frames = []

# Loop through all JSON files in the data folder (sorted so the order is deterministic)
for filename in sorted(os.listdir(data_folder)):
    if not filename.endswith('.json'):
        continue

    # Only the read needs the file handle; release it before processing
    with open(os.path.join(data_folder, filename), 'r') as f:
        data = json.load(f)

    # A failed download may have been stored as `null` or without 'results'
    results = data.get('results') if isinstance(data, dict) else None
    if not results:
        print(f'Skipping {filename}: no results found')
        continue

    df = pd.DataFrame(results)

    # Normalize the timestamp to a plain YYYY-MM-DD string
    df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')

    # Pull TMAX and TMIN into their own frames, renaming 'value' accordingly.
    # .loc + .rename (no inplace) avoids mutating a slice of df.
    # NOTE(review): if the ZIP maps to more than one station, a date can carry
    # several TMAX/TMIN rows and the merge below will pair every combination - confirm.
    df_tmax = df.loc[df['datatype'] == 'TMAX', ['date', 'value']].rename(columns = {'value' : 'TMAX'})
    df_tmin = df.loc[df['datatype'] == 'TMIN', ['date', 'value']].rename(columns = {'value' : 'TMIN'})

    # Merge df_tmax and df_tmin on the 'date' column (inner join: dates with both readings)
    df_max_min = pd.merge(df_tmax, df_tmin, on = 'date')

    # Calculate TAVG as the midpoint of the daily max and min
    df_max_min['TAVG'] = (df_max_min['TMAX'] + df_max_min['TMIN']) / 2

    winter_frames.append(df_max_min)

if winter_frames:
    transform_json_data = pd.concat(winter_frames, ignore_index=True)
else:
    # Keep the expected schema even when no usable file was found
    transform_json_data = pd.DataFrame(columns=['date', 'TMAX', 'TMIN', 'TAVG'])

# Sort by date. The index after concatenation is just the per-file row number,
# so sorting on it would interleave the winters instead of ordering by date.
transform_json_data = transform_json_data.sort_values('date').reset_index(drop=True)

# Saving to CSV
csv_file_path = os.path.join(data_folder, 'all_data_max_min_avg.csv')
transform_json_data.to_csv(csv_file_path)

print(f'The file has been successfully generated and saved to path {csv_file_path}')

The file has been successfully generated and saved to path data/all_data_max_min_avg.csv


### § Task: Filter, transform and export CSV data.

In [44]:
## Folder containing data
data_folder = 'data/'

## Initialize data frame to store data
# NOTE(review): compiled_data is never written to in the visible part of this cell.
compiled_data = pd.DataFrame()

# Loop through all JSON files in the data folder
# NOTE(review): `year` is not used inside the loop body, so every JSON file is
# re-read once per iteration (15 times) - presumably the intent was to select
# the file for each year; confirm before relying on this cell.
for year in range (2008, 2023):
    for filename in os.listdir(data_folder):
        if filename.endswith('.json'):
            # Load the list stored under the 'results' key of the JSON file
            with open(os.path.join(data_folder, filename), 'r') as f:
                data = json.load(f)['results']
                # Per-file lookup tables keyed by the date substring below
                tmax_values = {}
                tmin_values = {}
                for record in data:
                    # Characters 5..9 of the date string - presumably 'MM-DD'
                    # for ISO-formatted dates; the year part is dropped
                    date_str = record['date'][5:10]
                    if record['datatype'] == 'TMAX':
                        tmax_values[date_str] = record['value']
                    elif record['datatype'] == 'TMIN':
                        tmin_values[date_str] = record['value']

                # Average max and min for keys present in both tables
                # NOTE(review): tavg is overwritten on every iteration and never stored.
                for date_str in tmax_values:
                    if date_str in tmin_values:
                        tavg = (tmax_values[date_str] + tmin_values[date_str]) / 2


{'12-15': 54.0, '12-16': 52.0, '12-17': 44.0, '12-18': 45.0, '12-19': 61.0, '12-20': 60.0, '12-21': 63.0, '12-22': 64.0, '12-23': 62.0, '12-24': 52.0, '12-25': 55.0, '12-26': 53.0, '12-27': 48.0, '12-28': 33.0, '12-29': 33.0, '12-30': 49.0, '12-31': 39.0, '01-01': 8.0, '01-02': 40.0, '01-03': 51.0, '01-04': 47.0, '01-05': 34.0, '01-06': 27.0, '01-07': 57.0, '01-08': 43.0, '01-09': 34.0, '01-10': 56.0, '01-11': 57.0, '01-12': 58.0, '01-13': 62.0, '01-14': 45.0, '01-15': 50.0, '01-16': 57.0, '01-17': 62.0, '01-18': 58.0, '01-19': 31.0, '01-20': 39.0, '01-21': 36.0}
{'12-15': 54.0, '12-16': 52.0, '12-17': 44.0, '12-18': 45.0, '12-19': 61.0, '12-20': 60.0, '12-21': 63.0, '12-22': 64.0, '12-23': 62.0, '12-24': 52.0, '12-25': 55.0, '12-26': 53.0, '12-27': 48.0, '12-28': 33.0, '12-29': 33.0, '12-30': 49.0, '12-31': 39.0, '01-01': 8.0, '01-02': 40.0, '01-03': 51.0, '01-04': 47.0, '01-05': 34.0, '01-06': 27.0, '01-07': 57.0, '01-08': 43.0, '01-09': 34.0, '01-10': 56.0, '01-11': 57.0, '01-12': 5