## Assignment 2.2 Sourcing data with an API

In [None]:
## Import and concatenate

In [None]:
import pandas as pd 
import numpy as np
import os 
import requests
import json
from datetime import datetime

In [None]:
%%capture
import warnings
# Suppress all warnings for the rest of the session to keep cell output clean.
# NOTE(review): this also hides pandas parsing/deprecation warnings - confirm that is intended.
warnings.filterwarnings("ignore")

In [None]:
# Collect the path of every CSV file in the Data folder
import glob

# glob returns matches in arbitrary, OS-dependent order; sort them so the files
# are always concatenated in the same sequence.
filepaths = sorted(glob.glob("Data/*.csv"))  # Only select CSV files
filepaths

### We used a generator expression rather than a list comprehension because we only need to iterate through the files once, and a generator handles one file at a time

In [None]:
# Lazily read each CSV and stack the results into one dataframe with a fresh index.
csv_frames = (pd.read_csv(csv_path) for csv_path in filepaths)
df = pd.concat(csv_frames, ignore_index=True)
df.head()

### We identified the available CSV files in the data folder, then read each file in turn and concatenated the results into a single dataframe.

In [None]:
# Show the (rows, columns) dimensions of the concatenated dataframe.
df.shape

In [None]:
# Preview the first rows of the concatenated dataframe.
df.head()

In [None]:
# Preview the last rows of the concatenated dataframe.
df.tail()

### The head and tail both look as expected, so the files were concatenated correctly

In [None]:
# Re-read every CSV, forcing both station-name columns to be read as strings,
# then stack the frames into one dataframe with a fresh index.
station_name_dtypes = {"start_station_name": str, "end_station_name": str}
df = pd.concat(
    (
        pd.read_csv(csv_path, dtype=station_name_dtypes, low_memory=False)
        for csv_path in filepaths
    ),
    ignore_index=True,
)


In [None]:
# Check the column dtypes and preview the two station-name columns.
station_name_cols = ['start_station_name', 'end_station_name']
print(df.dtypes)  # Should show 'object' for start_station_name and end_station_name
print(df[station_name_cols].head())  # Preview data


In [None]:
# Preview the first rows after re-reading with explicit station-name dtypes.
df.head()

In [None]:
# Preview the last rows after re-reading with explicit station-name dtypes.
df.tail()

## Get weather data using NOAA's API

In [None]:
# NOAA API token and the weather station to query.
# The token is read from the NOAA_TOKEN environment variable so the secret is
# never stored in the notebook; set it before starting Jupyter.
# NOTE(review): the token that used to be hard-coded here should be treated as
# exposed and rotated.

Token = os.environ.get('NOAA_TOKEN', '')
if not Token:
    print('NOAA_TOKEN is not set - the NOAA API will reject requests without a token.')
stationid="GHCND:USW00014732"

In [None]:
# Request the 2022 daily average temperature (TAVG) readings for the station.
# Query values go through `params` so requests URL-encodes them, and the station
# id comes from `stationid` instead of being duplicated inside the URL.

r = requests.get(
    'https://www.ncdc.noaa.gov/cdo-web/api/v2/data',
    params={
        'datasetid': 'GHCND',
        'datatypeid': 'TAVG',
        'limit': 1000,
        'stationid': stationid,
        'startdate': '2022-01-01',
        'enddate': '2022-12-31',
    },
    headers={'token': Token},
    timeout=60,  # requests never times out by default; do not hang the notebook forever
)
# Fail right here on an HTTP error (bad token, rate limit, outage) instead of
# later on a response body that has no 'results' key.
r.raise_for_status()

In [None]:
# Parse the JSON body of the API response into Python objects

response_text = r.text
d = json.loads(response_text)

In [None]:
# Keep only the readings in the response whose datatype is TAVG

avg_temps = [
    reading
    for reading in d['results']
    if reading['datatype'] == 'TAVG'
]

In [None]:
# Pull the date string out of every average-temperature reading

dates_temp = [reading['date'] for reading in avg_temps]

In [None]:
# Pull the temperature value out of every average-temperature reading

temps = [reading['value'] for reading in avg_temps]

In [None]:
# Peek at ten of the extracted values (indices 10-19), still unscaled.
temps[10:20]

In [None]:
# Start from an empty dataframe that will hold the temperature results

df_temp = pd.DataFrame(data=None)

In [None]:
# Fill df_temp: parse each timestamp string into a datetime, and scale the
# temperatures from tenths of a degree Celsius to degrees Celsius

df_temp['date'] = [
    datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S") for stamp in dates_temp
]
df_temp['avgTemp'] = [float(reading) / 10.0 for reading in temps]

In [None]:
# Preview the last rows of the temperature dataframe.
df_temp.tail()

In [None]:
# Preview the first rows of the temperature dataframe.
df_temp.head()

In [None]:
# Inspect the column dtypes of the trips dataframe before adding date columns.
df.dtypes

In [None]:
# Parse the trip start timestamps into a datetime column.
# `dayfirst=True` was dropped: the values are expected to be year-first
# (YYYY-MM-DD ...), where day-first parsing is at best ignored with a warning
# and at worst swaps day and month.
df['start_time'] = pd.to_datetime(df['started_at'])

In [None]:
# Derive the calendar date of each trip start.
# The format is left for pandas to infer: started_at presumably holds full
# timestamps (TODO confirm), and a strict '%Y-%m-%d' format then fails on
# pandas 2.x with "unconverted data remains".
df['date'] = pd.to_datetime(df['started_at']).dt.date

In [None]:
# Convert the `date` column back to pandas datetimes.
# Presumably so its dtype matches df_temp['date'] for the merge - verify.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Re-check the temperature dataframe before merging on `date`.
df_temp.head()

In [None]:
%%time
# Left-join the daily temperatures onto the trips by date; the indicator adds a
# `_merge` column recording whether each trip found a matching temperature row.
df_merged = pd.merge(df, df_temp, on='date', how='left', indicator=True)

In [None]:
# Preview the first rows of the merged dataframe.
df_merged.head()

### The average temperature has been merged with the trip start time and date.

In [None]:
# Count rows per merge outcome in the `_merge` indicator column
# (dropna=False also counts missing values, if any).
df_merged['_merge'].value_counts(dropna = False)

In [None]:
# Save the merged trips + temperature data without the index column.
# Create the output folder first so to_csv does not fail when it is missing.
os.makedirs('Data/output', exist_ok=True)
df_merged.to_csv('Data/output/newyork_data.csv', index = False)

In [None]:
# Show the dimensions of the trips dataframe (df, not df_merged).
df.shape

In [None]:
# Save the temperature data without the index column.
# Create the output folder first so to_csv does not fail when it is missing.
os.makedirs('Data/output', exist_ok=True)
df_temp.to_csv('Data/output/nytemperature.csv', index = False)