# Data Engineering your stocks

In [None]:
%%capture
!pip install -r requirements.txt

In [10]:
import os
import pandas as pd
import numpy as np
from utils import get_or_load, get_stocks, get_ticker_symbols, get_performance, get_stock_timeline

# Task 1: Get stock price data from API

In [17]:
def get_stock_prices(stock: str, apikey: str):
    api_result = get_stock_API(stock, apikey)
    output = transform_api_output(api_result)
    output['ticker'] = stock
    output['close'] = output['close'].astype(float)
    return output

In [18]:
import requests
def get_stock_API(stock: str, apikey: str):
    API_URL = "https://www.alphavantage.co/query"
    data = {
        "function": "TIME_SERIES_DAILY",
        "symbol": stock,
        "outputsize": "compact",
        "datatype": "csv",
        "apikey": apikey,
    }
    response = requests.get(API_URL, data)
    if 'Invalid API call' in response.text:
        raise KeyError("Ticker not valid!")
    while '5 calls per minute' in response.text:
        time.sleep(60)
        response = requests.get(API_URL, data)
    return response.text

In [None]:
get_stock_API('AAPL', os.environ['API_KEY'])

### Your code

In [8]:
def transform_api_output(api_result: str):
    # TODO: Transform code here
    return pd.DataFrame()

Test your code

In [11]:
result = transform_api_output(get_stock_API('AAPL', os.environ['API_KEY']))
print(f"Check 1: Actual shape: {result.shape}, expected shape: {(100,6)}")
print(f"Check 2: Actual columns: {list(result.columns)}, expected columns: {['timestamp', 'open', 'high', 'low', 'close', 'volume']}")

Check 1: Actual shape: (0, 0), expected shape: (100, 6)
Check 2: Actual columns: [], expected columns: ['timestamp', 'open', 'high', 'low', 'close', 'volume']


# Task 2: Buffering Data

In [15]:
def get_or_load(stock: str, apikey: str):
    # Define folder, filename and file path
    file_name = f'{stock}.csv'
    data_dir = 'data/current/'
    file_path = f'{data_dir}{file_name}'
    
    # Check if stock already in data lake
    if file_name in os.listdir(data_dir):
        data = pd.read_csv(file_path)
        return data
    
    # If stock not found locally, return API call
    data = get_stock_prices(stock, apikey)
    if 'timestamp' in data.columns:
        data.to_csv(file_path)
        return data
    else:
        raise KeyError('Column timestamp not in dataframe!')
        
    

In [16]:
get_or_load('AAPL', os.environ['API_KEY'])

KeyError: 'close'

### Your code

# Task 3: Get and clean historical data

In [19]:
from hints import get_historical_data
def get_stock_timeline(stock: str, apikey: str):
    current_data = get_or_load(stock, apikey)
    
    hist_data_raw = get_historical_data(stock, apikey)
    # TODO: define transformation function for historical data
    hist_data = transform_hist_data(hist_data_raw)
    
    output = pd.concat([current_data,hist_data])
    return output

In [21]:
historical_prices = get_historical_data('AAPL', os.environ['API_KEY'])
historical_prices

Unnamed: 0,timestamp,open,high,low,close,volume,ticker
100,2022/05/31,149.0700,150.6600,146.8400,14884.0,103718416,AAPL
101,2022/05/27,145.3900,149.6800,145.2600,14964.0,90978503,AAPL
102,2022/05/26,137.3900,144.3400,137.1400,14378.0,90601548,AAPL
103,2022/05/25,138.4300,141.7850,138.3400,14052.0,92482696,AAPL
104,2022/05/24,140.8050,141.9700,137.3300,14036.0,104132746,AAPL
...,...,...,...,...,...,...,...
5777,1999/11/05,84.6200,88.3700,84.0000,8831.0,3721500,AAPL
5778,1999/11/04,82.0600,85.3700,80.6200,8362.0,3384700,AAPL
5779,1999/11/03,81.6200,83.2500,81.0000,8150.0,2932700,AAPL
5780,1999/11/02,78.0000,81.6900,77.3100,8025.0,3564600,AAPL


In [22]:
get_or_load('AAPL', os.environ['API_KEY'])

KeyError: 'close'

### Your code

In [None]:
def transform_hist_data(df: pd.DataFrame):
    output = df

    return output

Test your code

In [None]:
result = transform_hist_data(historical_prices)
result_current = get_or_load('AAPL', os.environ['API_KEY'])
print(f"Check 1: Actual close mean: {result.close.mean()}, expected mean: ~{174}")
print(f"Check 2: Actual date format: {result.timestamp.to_list()[-1]}, expected date format: 1999-11-01")

# Task 4: Get stock prices for multiple stocks

In [23]:
from pytickersymbols import PyTickerSymbols
def get_ticker_symbols():
    stock_data = PyTickerSymbols()
    stocks = stock_data.get_sp_100_nyc_yahoo_tickers()
    return stocks

In [24]:
get_ticker_symbols()[:10]

['LIN', 'MMM', 'AXP', 'AAPL', 'BA', 'CAT', 'CVX', 'CSCO', 'KO', 'DD']

In [None]:
def get_stock_timeline(stock: str, apikey: str):
    current_data = get_or_load(stock, apikey)
    
    hist_data_raw = get_historical_data(stock, apikey)
    # TODO: define transformation function for historical data
    hist_data = transform_hist_data(hist_data_raw)
    
    output = pd.concat([current_data, hist_data])
    return output

### Your code

In [25]:
from typing import List 

def get_stocks(stocks: List[str], apikey: str) -> pd.DataFrame:
    # TODO: Expand the output to multiple stocks, use get_stock_timeline
    return pd.DataFrame()

Check 1: Actual stocks [], expected stocks: ['AAPL', 'TSLA']


Test your code

In [None]:
result = get_stocks(['AAPL', 'TSLA'], os.environ['API_KEY'])
if 'ticker' in result.columns:
    ticker = list(result.ticker.unique())
else:
    ticker = []
print(f"Check 1: Actual stocks {ticker}, expected stocks: {['AAPL', 'TSLA']}")

# Task 5: Feature Engineering
## Lets add additonal features such as stock performance

In [None]:
get_or_load('AAPL', os.environ['API_KEY'])

### Your code

In [None]:
def get_performance(df: pd.DataFrame):
    return df

Test your code

In [None]:
result = get_performance(df)
print(f"Check 2: Actual columns: {list(result.columns)}, expected columns: {['timestamp', 'open', 'high', 'low', 'close', 'volume', 'ticker', 'performance']}")