# Data Engineering your stocks

In [None]:
import os
import pandas as pd
import numpy as np
import time

# Task 1: Get stock price data from API

In [None]:
def get_stock_prices(stock: str, apikey: str):
    """Fetch prices for ``stock`` and return them as a labelled DataFrame.

    The raw text from ``get_stock_API`` is passed through
    ``transform_api_output``; the result gets a ``ticker`` column holding
    ``stock`` and its ``close`` column is cast to float.
    """
    prices = transform_api_output(get_stock_API(stock, apikey))
    prices['ticker'] = stock
    prices['close'] = prices['close'].astype(float)
    return prices

In [None]:
import requests
def get_stock_API(stock: str, apikey: str, timeout: float = 30.0):
    """Request the daily time series for ``stock`` from Alpha Vantage.

    The request asks for the ``compact`` output size in ``csv`` format and
    the response body is returned as text.

    Args:
        stock: Ticker symbol sent as the ``symbol`` query parameter.
        apikey: API key sent as the ``apikey`` query parameter.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The response body as a string.

    Raises:
        KeyError: If any response contains ``'Invalid API call'``.
    """
    API_URL = "https://www.alphavantage.co/query"
    query = {
        "function": "TIME_SERIES_DAILY",
        "symbol": stock,
        "outputsize": "compact",
        "datatype": "csv",
        "apikey": apikey,
    }
    while True:
        # A timeout keeps a stalled connection from blocking forever.
        response = requests.get(API_URL, params=query, timeout=timeout)
        body = response.text
        # Validate every response, including ones received after a
        # rate-limit wait, so an error message is never returned as data.
        if 'Invalid API call' in body:
            raise KeyError("Ticker not valid!")
        if '5 calls per minute' not in body:
            return body
        # Rate limit message received: wait a minute, then retry.
        time.sleep(60)

In [None]:
# Fetch the raw API response for AAPL; the key is read from the API_KEY
# environment variable (raises KeyError if it is not set).
api_result = get_stock_API('AAPL', os.environ['API_KEY'])
# Bare expression so the cell shows the raw response text.
api_result

### Your code

In [None]:
def transform_api_output(api_result: str) -> pd.DataFrame:
    """Parse the CSV text returned by ``get_stock_API`` into a DataFrame.

    The header row of the CSV supplies the column names, so a response with
    the header ``timestamp,open,high,low,close,volume`` yields exactly those
    six columns. Values are left as pandas infers them; ``timestamp`` stays
    a plain string column.

    Args:
        api_result: CSV text, header row first.

    Returns:
        One row per CSV record. An empty or whitespace-only input yields an
        empty DataFrame instead of raising.
    """
    import io

    # pd.read_csv raises EmptyDataError on empty input; return an empty
    # frame instead so callers get a DataFrame in every case.
    if not api_result or not api_result.strip():
        return pd.DataFrame()
    return pd.read_csv(io.StringIO(api_result))

Test your code

In [None]:
# Run the transformation on the raw response fetched above and print the
# actual shape and column names next to the expected ones.
result = transform_api_output(api_result)
print(f"Check 1: Actual shape: {result.shape}, expected shape: {(100,6)}")
print(f"Check 2: Actual columns: {list(result.columns)}, expected columns: {['timestamp', 'open', 'high', 'low', 'close', 'volume']}")

Check [Dashboard](http://127.0.0.1:8505)

# Task 2: Buffering Data

In [None]:
def get_or_load(stock: str, apikey: str):
    """Return prices for ``stock``, using a local CSV file as a buffer.

    If ``data/current/<stock>.csv`` exists it is read and returned.
    Otherwise the prices are fetched with ``get_stock_prices``, written to
    that path and returned.

    Args:
        stock: Ticker symbol; also the CSV file stem.
        apikey: API key forwarded to ``get_stock_prices`` on a cache miss.

    Returns:
        The price DataFrame, either read from disk or freshly fetched.

    Raises:
        KeyError: If the fetched frame has no ``timestamp`` column.
    """
    # Define folder, filename and file path
    file_name = f'{stock}.csv'
    data_dir = 'data/current/'
    file_path = f'{data_dir}{file_name}'

    # Check if stock already in data lake. isfile() is False when the
    # folder does not exist, where os.listdir() would raise.
    if os.path.isfile(file_path):
        return pd.read_csv(file_path, index_col=0)

    # If stock not found locally, return API call
    data = get_stock_prices(stock, apikey)
    if 'timestamp' not in data.columns:
        raise KeyError('Column timestamp not in dataframe!')

    # Create the folder on first use so the write cannot fail on a missing path.
    os.makedirs(data_dir, exist_ok=True)
    data.to_csv(file_path)
    return data
        
    

In [None]:
# Load AAPL prices: read from data/current/AAPL.csv when that file exists,
# otherwise fetch them via get_stock_prices.
get_or_load('AAPL', os.environ['API_KEY'])

### Your code

# Task 3: Get and clean historical data

In [None]:
from utils import get_historical_data
def get_stock_timeline(stock: str, apikey: str):
    """Combine current and historical prices for ``stock`` into one frame.

    Current prices come from ``get_or_load``; historical prices come from
    ``get_historical_data`` and are passed through ``transform_hist_data``.
    The two frames are concatenated, current rows first, and the ``close``
    column of the result is cast to float.
    """
    recent = get_or_load(stock, apikey)

    # TODO: define transformation function for historical data
    historical = transform_hist_data(get_historical_data(stock, apikey))

    timeline = pd.concat([recent, historical])
    timeline['close'] = timeline['close'].astype(float)
    return timeline

In [None]:
# Fetch the raw historical data for AAPL via utils.get_historical_data.
historical_prices = get_historical_data('AAPL', os.environ['API_KEY'])
# Bare expression so the cell shows the fetched data.
historical_prices

In [None]:
# Show the current AAPL prices for comparison with the historical data.
get_or_load('AAPL', os.environ['API_KEY'])

### Your code

In [None]:
def transform_hist_data(df: pd.DataFrame):
    """Placeholder for the historical-data clean-up step.

    No transformation is implemented yet: the input frame is handed back
    as-is (same object, no copy).
    """
    return df

Test your code

In [None]:
# Run the historical-data transformation on the frame fetched earlier and
# print the actual close mean and last timestamp next to the expected ones.
result = transform_hist_data(historical_prices)
# NOTE(review): result_current is not used by the checks below.
result_current = get_or_load('AAPL', os.environ['API_KEY'])
print(f"Check 1: Actual close mean: {result.close.mean()}, expected mean: ~{174}")
print(f"Check 2: Actual date format: {result.timestamp.to_list()[-1]}, expected date format: 1999-11-01")

Check [Dashboard](http://127.0.0.1:8505)

# Task 4: Get stock prices for multiple stocks

In [None]:
from pytickersymbols import PyTickerSymbols
def get_ticker_symbols():
    """Return the result of ``PyTickerSymbols().get_sp_100_nyc_yahoo_tickers()``."""
    return PyTickerSymbols().get_sp_100_nyc_yahoo_tickers()

In [None]:
# Preview the first ten ticker symbols.
get_ticker_symbols()[:10]

In [None]:
def get_stock_timeline(stock: str, apikey: str):
    """Combine current and historical prices for ``stock`` into one frame.

    Current prices come from ``get_or_load``; historical prices come from
    ``get_historical_data`` and are passed through ``transform_hist_data``.
    The two frames are concatenated with the current rows first.

    Args:
        stock: Ticker symbol.
        apikey: API key forwarded to both data sources.

    Returns:
        The concatenated DataFrame with ``close`` cast to float.
    """
    current_data = get_or_load(stock, apikey)

    hist_data_raw = get_historical_data(stock, apikey)
    # TODO: define transformation function for historical data
    hist_data = transform_hist_data(hist_data_raw)

    output = pd.concat([current_data, hist_data])
    # Keep the float cast that the earlier definition of this function
    # applies, so the redefinition does not change the dtype of `close`.
    output['close'] = output['close'].astype(float)
    return output

### Your code

In [None]:
from typing import List 

def get_stocks(stocks: List[str], apikey: str) -> pd.DataFrame:
    """Build one DataFrame holding the price timelines of several stocks.

    Args:
        stocks: Ticker symbols to load; each is passed to
            ``get_stock_timeline``.
        apikey: API key forwarded to ``get_stock_timeline``.

    Returns:
        The per-stock timelines concatenated in the order given, or an
        empty DataFrame when ``stocks`` is empty.
    """
    # Label every row with its symbol, following the `ticker` convention of
    # get_stock_prices. NOTE(review): historical rows may not carry a
    # ticker of their own - confirm against get_historical_data.
    frames = [
        get_stock_timeline(symbol, apikey).assign(ticker=symbol)
        for symbol in stocks
    ]
    # pd.concat raises ValueError on an empty list.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

Test your code

In [None]:
result = get_stocks(['AAPL', 'TSLA'], os.environ['API_KEY'])
# Report no tickers when the result has no 'ticker' column.
ticker = list(result.ticker.unique()) if 'ticker' in result.columns else []
print(f"Check 1: Actual stocks {ticker}, expected stocks: {['AAPL', 'TSLA']}")

Check [Dashboard](http://127.0.0.1:8505)

# Task 5: Feature Engineering
## Let's add additional features such as stock performance

In [None]:
# Load the current AAPL prices as input for the feature-engineering step.
df = get_or_load('AAPL', os.environ['API_KEY'])

### Your code

In [None]:
def get_performance(df: pd.DataFrame) -> pd.DataFrame:
    """Placeholder for the performance feature.

    Currently returns ``df`` unchanged. The check cell that follows lists a
    ``performance`` column among the expected columns, which this stub does
    not add yet.
    """
    return df

Test your code

In [None]:
# Run the feature step and print the actual columns next to the expected ones.
result = get_performance(df)
# NOTE(review): labelled "Check 2" although it is the only check in this cell.
print(f"Check 2: Actual columns: {list(result.columns)}, expected columns: {['timestamp', 'open', 'high', 'low', 'close', 'volume', 'ticker', 'performance']}")

Check [Dashboard](http://127.0.0.1:8505)