In [1]:
import sqlite3
from typing import Optional

import pandas as pd

## Create the dataframe which contains all the historical fundamental data of a stock

In [2]:
# Open the SQLite database used by this notebook: every pd.read_sql call below
# reads through this connection and the final to_sql call writes through it.
conn = sqlite3.connect('../app/database/ibd.db')

In [3]:
def get_stock_fundamental_df(
    symbol: str,
    connection: Optional[sqlite3.Connection] = None,
) -> pd.DataFrame:
    """
    Load the joined income statement, balance sheet and cash flow rows of a stock.

    The three tables are inner-joined on ``fiscal_date_ending`` and each one is
    restricted to ``symbol``, so only fiscal dates present in all three tables
    for that symbol are returned.

    Args:
        symbol: Symbol to load. It is sent to SQLite as a bound parameter, so
            quotes or SQL fragments in it are compared literally instead of
            being executed as part of the statement.
        connection: SQLite connection to query. When omitted, the
            notebook-level ``conn`` is used.

    Returns:
        One row per matching fiscal date with the columns of the three tables.
        Where a column name occurs more than once (e.g. ``symbol``), only the
        first occurrence is kept.
    """
    db = conn if connection is None else connection

    # :symbol is a named placeholder; the value is bound by the driver rather
    # than pasted into the SQL text.
    query = '''
    SELECT income_statement.*, balance_sheet.*, cash_flow.*
    FROM income_statement
    INNER JOIN balance_sheet
    ON income_statement.fiscal_date_ending = balance_sheet.fiscal_date_ending AND balance_sheet.symbol = :symbol
    INNER JOIN cash_flow
    ON income_statement.fiscal_date_ending = cash_flow.fiscal_date_ending AND cash_flow.symbol = :symbol
    WHERE income_statement.symbol = :symbol
    '''
    stock_df = pd.read_sql(query, db, params={'symbol': symbol})

    # Selecting table.* from three tables repeats the shared column names;
    # keep the first occurrence of each name.
    return stock_df.loc[:, ~stock_df.columns.duplicated()]

In [4]:
# Load the joined fundamentals for NVDA and preview the first rows.
stock_df = get_stock_fundamental_df('NVDA')
stock_df.head()

Unnamed: 0,symbol,fiscal_date_ending,reported_currency,gross_profit,total_revenue,cost_of_revenue,cost_of_goods_and_services_sold,operating_income,selling_general_and_administrative,research_and_development,...,dividend_payout,dividend_payout_common_stock,dividend_payout_preferred_stock,proceeds_from_issuance_of_common_stock,proceeds_from_issuance_of_long_term_debt_and_capital_securities_net,proceeds_from_issuance_of_preferred_stock,proceeds_from_repurchase_of_equity,proceeds_from_sale_of_treasury_stock,change_in_cash_and_cash_equivalents,change_in_exchange_rate
0,NVDA,2023-07-30,USD,9462000000.0,13507000000.0,4045000000.0,343000000.0,6800000000.0,622000000.0,2040000000.0,...,99000000.0,99000000.0,,,,,0.0,,803000000.0,
1,NVDA,2023-04-30,USD,4648000000.0,7192000000.0,2544000000.0,2544000000.0,2140000000.0,633000000.0,1875000000.0,...,99000000.0,99000000.0,,,,,0.0,,1690000000.0,
2,NVDA,2023-01-29,USD,3833000000.0,6051000000.0,2218000000.0,2218000000.0,1256000000.0,625000000.0,1952000000.0,...,98000000.0,98000000.0,,0.0,0.0,0.0,-1213000000.0,,,
3,NVDA,2022-10-30,USD,3177000000.0,5931000000.0,2754000000.0,2754000000.0,601000000.0,631000000.0,1945000000.0,...,100000000.0,100000000.0,,,,,-3485000000.0,,,
4,NVDA,2022-07-31,USD,2915000000.0,6704000000.0,3789000000.0,570000000.0,499000000.0,592000000.0,1824000000.0,...,100000000.0,100000000.0,,0.0,0.0,0.0,0.0,,,


## Create dataframe that contains interest rates historical data

In [5]:
# Load every row of economic_indicator_time_series whose indicator_name is
# 'Interest_Rate'. The query has no ORDER BY, so the row order is whatever
# SQLite returns.
query = '''
SELECT  *
FROM economic_indicator_time_series
WHERE indicator_name = 'Interest_Rate'
'''

interest_rate_df = pd.read_sql(query, conn)
# Bare last expression so the notebook renders the frame.
interest_rate_df

Unnamed: 0,indicator_name,value,unit,registered_date,registered_date_ts
0,Interest_Rate,5.12,percent,01-07-2023,1688158800
1,Interest_Rate,5.08,percent,01-06-2023,1685566800
2,Interest_Rate,5.06,percent,01-05-2023,1682888400
3,Interest_Rate,4.83,percent,01-04-2023,1680296400
4,Interest_Rate,4.65,percent,01-03-2023,1677621600
...,...,...,...,...,...
824,Interest_Rate,0.83,percent,01-11-1954,-478663200
825,Interest_Rate,0.85,percent,01-10-1954,-481341600
826,Interest_Rate,1.07,percent,01-09-1954,-483933600
827,Interest_Rate,1.22,percent,01-08-1954,-486612000


## Create dataframe that contains treasury yield (10-year) historical data

In [6]:
# Load every row of economic_indicator_time_series whose indicator_name is
# 'Treasury_Yield'. The query has no ORDER BY, so the row order is whatever
# SQLite returns.
query = '''
SELECT  *
FROM economic_indicator_time_series
WHERE indicator_name = 'Treasury_Yield'
'''

treasury_yield_df = pd.read_sql(query, conn)
# Bare last expression so the notebook renders the frame.
treasury_yield_df

Unnamed: 0,indicator_name,value,unit,registered_date,registered_date_ts
0,Treasury_Yield,3.90,percent,01-07-2023,1688158800
1,Treasury_Yield,3.75,percent,01-06-2023,1685566800
2,Treasury_Yield,3.57,percent,01-05-2023,1682888400
3,Treasury_Yield,3.46,percent,01-04-2023,1680296400
4,Treasury_Yield,3.66,percent,01-03-2023,1677621600
...,...,...,...,...,...
839,Treasury_Yield,2.95,percent,01-08-1953,-518148000
840,Treasury_Yield,2.93,percent,01-07-1953,-520826400
841,Treasury_Yield,3.11,percent,01-06-1953,-523418400
842,Treasury_Yield,3.05,percent,01-05-1953,-526096800


## Create dataframe that contains global commodities index historical data

In [7]:
# Load every row of economic_indicator_time_series whose indicator_name is
# 'Global_Commodities_Index'. The query has no ORDER BY, so the row order is
# whatever SQLite returns.
query = '''
SELECT  *
FROM economic_indicator_time_series
WHERE indicator_name = 'Global_Commodities_Index'
'''

commodities_index_df = pd.read_sql(query, conn)
# Bare last expression so the notebook renders the frame.
commodities_index_df

Unnamed: 0,indicator_name,value,unit,registered_date,registered_date_ts
0,Global_Commodities_Index,154.695420,index 2016=100,01-06-2023,1685566800
1,Global_Commodities_Index,157.292251,index 2016=100,01-05-2023,1682888400
2,Global_Commodities_Index,170.972547,index 2016=100,01-04-2023,1680296400
3,Global_Commodities_Index,168.294185,index 2016=100,01-03-2023,1677621600
4,Global_Commodities_Index,174.564801,index 2016=100,01-02-2023,1675202400
...,...,...,...,...,...
241,Global_Commodities_Index,63.065977,index 2016=100,01-05-2003,1051736400
242,Global_Commodities_Index,61.887199,index 2016=100,01-04-2003,1049144400
243,Global_Commodities_Index,65.782768,index 2016=100,01-03-2003,1046469600
244,Global_Commodities_Index,70.229514,index 2016=100,01-02-2003,1044050400


## Create dataframe that contains Unemployment historical data

In [8]:
# Load every row of economic_indicator_time_series whose indicator_name is
# 'Unemployment'. The query has no ORDER BY, so the row order is whatever
# SQLite returns.
query = '''
SELECT  *
FROM economic_indicator_time_series
WHERE indicator_name = 'Unemployment'
'''

unemployment_df = pd.read_sql(query, conn)
# Bare last expression so the notebook renders the frame.
unemployment_df

Unnamed: 0,indicator_name,value,unit,registered_date,registered_date_ts
0,Unemployment,3.5,percent,01-07-2023,1688158800
1,Unemployment,3.6,percent,01-06-2023,1685566800
2,Unemployment,3.7,percent,01-05-2023,1682888400
3,Unemployment,3.4,percent,01-04-2023,1680296400
4,Unemployment,3.5,percent,01-03-2023,1677621600
...,...,...,...,...,...
902,Unemployment,3.5,percent,01-05-1948,-683863200
903,Unemployment,3.9,percent,01-04-1948,-686455200
904,Unemployment,4.0,percent,01-03-1948,-689133600
905,Unemployment,3.8,percent,01-02-1948,-691639200


## Create dataframe that contains Inflation historical data (yearly)

In [9]:
# Load every row of economic_indicator_time_series whose indicator_name is
# 'Inflation', newest first (ORDER BY registered_date_ts DESC).
query = '''
SELECT  *
FROM economic_indicator_time_series
WHERE indicator_name = 'Inflation'
ORDER BY registered_date_ts DESC
'''

inflation_df = pd.read_sql(query, conn)
# Bare last expression so the notebook renders the frame.
inflation_df

Unnamed: 0,indicator_name,value,unit,registered_date,registered_date_ts
0,Inflation,4.642850,percent,01-01-2023,1693083600
1,Inflation,8.002800,percent,01-01-2022,1640988000
2,Inflation,4.697859,percent,01-01-2021,1609452000
3,Inflation,1.233584,percent,01-01-2020,1577829600
4,Inflation,1.812210,percent,01-01-2019,1546293600
...,...,...,...,...,...
59,Inflation,1.278912,percent,01-01-1964,-189396000
60,Inflation,1.239669,percent,01-01-1963,-220932000
61,Inflation,1.198773,percent,01-01-1962,-252468000
62,Inflation,1.070724,percent,01-01-1961,-284004000


## Create dataframe that contains the stock time series

In [10]:
def get_stock_time_series_df(
    symbol: str,
    connection: Optional[sqlite3.Connection] = None,
) -> pd.DataFrame:
    """
    Load the rows of ``stock_time_series`` for one stock, newest first.

    Args:
        symbol: Symbol to load. It is sent to SQLite as a bound parameter, so
            quotes or SQL fragments in it are compared literally instead of
            being executed as part of the statement.
        connection: SQLite connection to query. When omitted, the
            notebook-level ``conn`` is used.

    Returns:
        All columns of the matching rows, ordered by ``registered_date_ts``
        descending.
    """
    db = conn if connection is None else connection

    # :symbol is a named placeholder; the value is bound by the driver rather
    # than pasted into the SQL text.
    query = '''
    SELECT  *
    FROM stock_time_series
    WHERE symbol = :symbol
    ORDER BY registered_date_ts DESC
    '''
    return pd.read_sql(query, db, params={'symbol': symbol})

In [11]:
# Load the NVDA price/volume series (newest first) and display it.
stock_time_series_df = get_stock_time_series_df('NVDA')
stock_time_series_df

Unnamed: 0,symbol,open_price,high_price,low_price,close_price,volume,dividend_amount,registered_date,registered_date_ts
0,NVDA,497.620,498.00,481.416,485.09,4.638303e+07,0.00,01-09-2023,1693515600
1,NVDA,464.595,502.66,403.110,493.55,1.363638e+09,0.00,31-08-2023,1693429200
2,NVDA,425.170,480.88,413.460,467.29,8.709012e+08,0.00,31-07-2023,1690750800
3,NVDA,384.890,439.90,373.560,423.02,1.052802e+09,0.04,30-06-2023,1688072400
4,NVDA,278.400,419.38,272.400,378.34,1.169407e+09,0.00,31-05-2023,1685480400
...,...,...,...,...,...,...,...,...,...
281,NVDA,83.720,110.40,57.000,89.13,1.350360e+07,0.00,28-04-2000,956869200
282,NVDA,64.000,150.00,57.500,84.48,3.656700e+07,0.00,31-03-2000,954450000
283,NVDA,37.130,68.25,35.750,64.00,1.908510e+07,0.00,29-02-2000,951775200
284,NVDA,47.250,48.25,35.000,37.06,1.029470e+07,0.00,31-01-2000,949269600


## Helper function to calculate average value from given time series in a given timeframe

In [12]:
def calculate_time_series_avg_value(
    start_date: str,
    time_series_df: pd.DataFrame,
    target_column: str,
    days: int = 31
) -> Optional[float]:
    """
    Average ``target_column`` over the window that starts at ``start_date``
    and ends ``days`` days later (both ends inclusive).

    Args:
        start_date: Start of the window; anything ``pd.Timestamp`` accepts.
        time_series_df: Frame with a numeric ``registered_date_ts`` column
            (compared against epoch seconds) and the column to average.
        target_column: Name of the column whose values are averaged.
        days: Length of the window in days.

    Returns:
        The mean of the non-missing values inside the window, or ``None`` when
        the window holds no rows or only missing values.
    """
    window_start = pd.Timestamp(start_date)
    window_end = window_start + pd.DateOffset(days=days)

    # NOTE(review): Timestamp.timestamp() on a timezone-naive value appears to
    # be interpreted as UTC — confirm registered_date_ts was produced with the
    # same convention, otherwise rows at the window edges can be misclassified.
    in_window = time_series_df['registered_date_ts'].between(
        window_start.timestamp(), window_end.timestamp()
    )

    # Drop missing values so "no usable observation" is reported as None
    # instead of leaking out as NaN.
    window_values = time_series_df.loc[in_window, target_column].dropna()
    if window_values.empty:
        return None

    return window_values.mean()

## Helper function to return inflation of given year

In [13]:
def get_inflation_value_by_date(date_string: str, inflation_df: pd.DataFrame) -> Optional[float]:
    """
    Return the inflation value recorded for the calendar year of ``date_string``.

    Args:
        date_string: Date in ``YYYY-MM-DD`` format; only its year is used.
        inflation_df: Frame with a ``registered_date`` column in ``DD-MM-YYYY``
            format and a ``value`` column. It is not modified.

    Returns:
        The ``value`` of the first row whose ``registered_date`` falls in the
        same year, or ``None`` when no row matches, a date cannot be parsed,
        or a required column is missing.
    """
    try:
        target_year = pd.to_datetime(date_string, format='%Y-%m-%d').year
        # Derive the years in a local Series rather than writing a helper
        # column into the caller's frame on every call.
        record_years = pd.to_datetime(inflation_df['registered_date'], format='%d-%m-%Y').dt.year
        matching_values = inflation_df.loc[record_years == target_year, 'value']
    except (ValueError, KeyError):
        return None

    if matching_values.empty:
        return None
    return matching_values.iloc[0]

In [14]:
# Spot check of the lookup: the helper matches on the year of the date, so a
# late-2023 date should return the row registered for 2023.
result = get_inflation_value_by_date('2023-12-31', inflation_df)
print("Result is:", result)

Result is: 4.64285


## Create new columns with the average values of the economic indicators and the inflation of the given year

In [15]:
# Build the features on a copy so stock_df stays exactly as loaded.
temp_stock_df = stock_df.copy()

# Average interest rate over the 40-day window starting at each fiscal date.
temp_stock_df['avg_interest_rate'] = temp_stock_df['fiscal_date_ending'].apply(
    calculate_time_series_avg_value,
    time_series_df=interest_rate_df,
    target_column='value',
    days=40,
)

In [16]:
# Average treasury yield over the 40-day window starting at each fiscal date.
temp_stock_df['avg_treasury_yield'] = temp_stock_df['fiscal_date_ending'].apply(
    calculate_time_series_avg_value,
    time_series_df=treasury_yield_df,
    target_column='value',
    days=40,
)

In [17]:
# Average unemployment rate over the 40-day window starting at each fiscal date.
temp_stock_df['avg_unemployment_rate'] = temp_stock_df['fiscal_date_ending'].apply(
    calculate_time_series_avg_value,
    time_series_df=unemployment_df,
    target_column='value',
    days=40,
)

In [18]:
# Average commodities index over the 40-day window starting at each fiscal date.
temp_stock_df['avg_global_commodities_index_value'] = temp_stock_df['fiscal_date_ending'].apply(
    calculate_time_series_avg_value,
    time_series_df=commodities_index_df,
    target_column='value',
    days=40,
)

In [19]:
# Inflation value registered for the year of each fiscal date.
temp_stock_df['inflation'] = temp_stock_df['fiscal_date_ending'].apply(
    get_inflation_value_by_date,
    inflation_df=inflation_df,
)

## Create volume column

In [20]:
# Average traded volume over the 40-day window starting at each fiscal date.
temp_stock_df['volume'] = temp_stock_df['fiscal_date_ending'].apply(
    calculate_time_series_avg_value,
    time_series_df=stock_time_series_df,
    target_column='volume',
    days=40,
)

## Create target column

In [21]:
# Target: average close price over the 40-day window starting at each fiscal date.
temp_stock_df['price'] = temp_stock_df['fiscal_date_ending'].apply(
    calculate_time_series_avg_value,
    time_series_df=stock_time_series_df,
    target_column='close_price',
    days=40,
)

In [22]:
# Preview the fundamentals together with the newly added indicator, volume and price columns.
temp_stock_df.head()

Unnamed: 0,symbol,fiscal_date_ending,reported_currency,gross_profit,total_revenue,cost_of_revenue,cost_of_goods_and_services_sold,operating_income,selling_general_and_administrative,research_and_development,...,proceeds_from_sale_of_treasury_stock,change_in_cash_and_cash_equivalents,change_in_exchange_rate,avg_interest_rate,avg_treasury_yield,avg_unemployment_rate,avg_global_commodities_index_value,inflation,volume,price
0,NVDA,2023-07-30,USD,9462000000.0,13507000000.0,4045000000.0,343000000.0,6800000000.0,622000000.0,2040000000.0,...,,803000000.0,,,,,,4.64285,760307500.0,481.976667
1,NVDA,2023-04-30,USD,4648000000.0,7192000000.0,2544000000.0,2544000000.0,2140000000.0,633000000.0,1875000000.0,...,,1690000000.0,,5.07,3.66,3.65,155.993835,4.64285,1169407000.0,378.34
2,NVDA,2023-01-29,USD,3833000000.0,6051000000.0,2218000000.0,2218000000.0,1256000000.0,625000000.0,1952000000.0,...,,,,4.61,3.705,3.55,171.429493,4.64285,992452400.0,213.765
3,NVDA,2022-10-30,USD,3177000000.0,5931000000.0,2754000000.0,2754000000.0,601000000.0,631000000.0,1945000000.0,...,,,,3.94,3.755,3.55,193.215259,8.0028,1157644000.0,152.1
4,NVDA,2022-07-31,USD,2915000000.0,6704000000.0,3789000000.0,570000000.0,499000000.0,592000000.0,1824000000.0,...,,,,2.445,3.21,3.6,230.968689,8.0028,1206915000.0,150.94


In [23]:
# Show the ten most recent price rows, for comparison with the volume/price columns computed above.
stock_time_series_df.head(10)

Unnamed: 0,symbol,open_price,high_price,low_price,close_price,volume,dividend_amount,registered_date,registered_date_ts
0,NVDA,497.62,498.0,481.416,485.09,46383030.0,0.0,01-09-2023,1693515600
1,NVDA,464.595,502.66,403.11,493.55,1363638000.0,0.0,31-08-2023,1693429200
2,NVDA,425.17,480.88,413.46,467.29,870901200.0,0.0,31-07-2023,1690750800
3,NVDA,384.89,439.9,373.56,423.02,1052802000.0,0.04,30-06-2023,1688072400
4,NVDA,278.4,419.38,272.4,378.34,1169407000.0,0.0,31-05-2023,1685480400
5,NVDA,275.09,281.1,262.2,277.49,742662500.0,0.0,28-04-2023,1682629200
6,NVDA,231.92,278.3431,222.97,277.77,1126658000.0,0.04,31-03-2023,1680210000
7,NVDA,196.91,238.88,196.11,232.16,1039409000.0,0.0,28-02-2023,1677535200
8,NVDA,148.51,206.28,140.34,195.37,945495800.0,0.0,31-01-2023,1675116000
9,NVDA,169.99,187.9,138.84,146.14,894615100.0,0.0,30-12-2022,1672351200


In [24]:
# Persist the feature frame as table 'feature_store' through the same connection.
# if_exists='replace' overwrites any existing table of that name; index=False
# leaves the DataFrame index out of the written columns.
temp_stock_df.to_sql('feature_store', conn, index=False, if_exists='replace')

19