In [1]:
import pandas as pd
import sqlite3

In [2]:
# Open the SQLite database that stores the prepared dataset.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only runs on the
# author's machine; consider a configurable/relative path. The connection is also not closed
# in the cells visible here — confirm whether later cells still need it.
conn = sqlite3.connect('/Users/orestis/MyProjects/InvestorAPI/app/database/ibd.db')
# Select every column of the dataset table, ordered by the date part of fiscal_date_ending.
query = "SELECT * FROM price_prediction_dataset ORDER BY DATE(fiscal_date_ending)"

# Materialise the full query result as a DataFrame.
stocks_df = pd.read_sql(query, conn)

In [3]:
# Summarise the freshly loaded dataset: column names, non-null counts and dtypes.
stocks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58504 entries, 0 to 58503
Data columns (total 98 columns):
 #   Column                                                               Non-Null Count  Dtype  
---  ------                                                               --------------  -----  
 0   symbol                                                               58504 non-null  object 
 1   fiscal_date_ending                                                   58504 non-null  object 
 2   reported_currency                                                    58504 non-null  object 
 3   gross_profit                                                         58504 non-null  float64
 4   total_revenue                                                        58504 non-null  float64
 5   cost_of_revenue                                                      58504 non-null  float64
 6   cost_of_goods_and_services_sold                                      58504 non-null  float64
 7   oper

## Drop rows with null values

In [4]:
# Names of all columns that contain at least one missing value.
columns_with_null = stocks_df.columns[stocks_df.isna().any()].tolist()
# Drop, in place, every row that has a missing value in any of those columns.
stocks_df.dropna(subset=columns_with_null, inplace=True)
# Renumber the rows after the drop. Because drop=True is not passed, the previous index is
# kept as a new 'index' column; a later cell removes that column from the train/test sets.
# NOTE(review): both steps mutate stocks_df in place, so re-running this cell on its own
# does not reproduce the same frame (a second reset_index adds another index column).
stocks_df.reset_index(inplace=True)

In [5]:
# Re-inspect the dataset after the rows with missing values were dropped.
stocks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57105 entries, 0 to 57104
Data columns (total 99 columns):
 #   Column                                                               Non-Null Count  Dtype  
---  ------                                                               --------------  -----  
 0   index                                                                57105 non-null  int64  
 1   symbol                                                               57105 non-null  object 
 2   fiscal_date_ending                                                   57105 non-null  object 
 3   reported_currency                                                    57105 non-null  object 
 4   gross_profit                                                         57105 non-null  float64
 5   total_revenue                                                        57105 non-null  float64
 6   cost_of_revenue                                                      57105 non-null  float64
 7   cost

## Split the data into training and test sets

In [6]:
from typing import Tuple
import datetime as dt

def split_data_to_train_and_test(
    df: pd.DataFrame,
    cutoff_date: dt.datetime,
    cutoff_date_column_name: str = "fiscal_date_ending"
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split a dataframe chronologically into a train and a test set.

    Rows whose date is strictly before ``cutoff_date`` go to the train set; rows whose
    date is on or after ``cutoff_date`` go to the test set. Rows whose date cannot be
    compared (missing / NaT) end up in neither set.

    params:
    - df -> The dataframe to split. It is not modified.
    - cutoff_date -> First date that belongs to the test set.
    - cutoff_date_column_name -> Name of the column holding the dates used for the
    split. Its values are parsed with ``pd.to_datetime`` for the comparison only; the
    column itself is returned unchanged.

    Returns (train_set_df, test_set_df), both with a fresh 0..n-1 index.
    """
    # Parse the dates into a standalone Series (aligned with df's index) instead of a
    # temporary column, so the caller's dataframe is never mutated and no helper column
    # has to be dropped afterwards.
    dates = pd.to_datetime(df[cutoff_date_column_name])

    # Two explicit comparisons (rather than a mask and its negation) keep NaT rows out
    # of both sets.
    train_set = df[dates < cutoff_date].reset_index(drop=True)
    test_set = df[dates >= cutoff_date].reset_index(drop=True)

    return train_set, test_set

In [7]:
# Chronological split: rows dated before 2023-06-01 form the training set, rows dated on or
# after it form the test set. A copy of stocks_df is passed in so that stocks_df itself is
# left untouched by the split.
train_set, test_set = split_data_to_train_and_test(df=stocks_df.copy(), cutoff_date=dt.datetime(2023, 6, 1))

In [36]:
# Remove the leftover row index and the identifier/metadata columns from both splits,
# in place; every other column is left as it is.
non_feature_columns = ['index', 'symbol', 'fiscal_date_ending', 'reported_currency']
for split_df in (train_set, test_set):
    split_df.drop(columns=non_feature_columns, inplace=True)

In [37]:
# Inspect the training set after the identifier columns were removed.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53947 entries, 0 to 53946
Data columns (total 95 columns):
 #   Column                                                               Non-Null Count  Dtype  
---  ------                                                               --------------  -----  
 0   gross_profit                                                         53947 non-null  float64
 1   total_revenue                                                        53947 non-null  float64
 2   cost_of_revenue                                                      53947 non-null  float64
 3   cost_of_goods_and_services_sold                                      53947 non-null  float64
 4   operating_income                                                     53947 non-null  float64
 5   selling_general_and_administrative                                   53947 non-null  float64
 6   research_and_development                                             53947 non-null  float64
 7   oper

In [38]:
# Inspect the test set after the identifier columns were removed.
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3158 entries, 0 to 3157
Data columns (total 95 columns):
 #   Column                                                               Non-Null Count  Dtype  
---  ------                                                               --------------  -----  
 0   gross_profit                                                         3158 non-null   float64
 1   total_revenue                                                        3158 non-null   float64
 2   cost_of_revenue                                                      3158 non-null   float64
 3   cost_of_goods_and_services_sold                                      3158 non-null   float64
 4   operating_income                                                     3158 non-null   float64
 5   selling_general_and_administrative                                   3158 non-null   float64
 6   research_and_development                                             3158 non-null   float64
 7   operat

In [39]:
from typing import List, Optional

from sklearn.preprocessing import OneHotEncoder

def perform_one_hot_encoding(
    df: pd.DataFrame,
    encoder: "OneHotEncoder",
    fit: bool = False,
    categorical_columns: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Given a dataframe with categorical columns this function will return a new dataframe
    that will replace these columns with the one hot encoding ones and will leave the rest
    of the columns unchanged. The input dataframe is not modified and the result has a
    fresh 0..n-1 index.
    params:
    - df -> The dataframe that holds the categorical columns
    - encoder -> The encoder to use. It must already be fitted when fit is False
    - fit -> If fit is True then fit_transform is called , otherwise only transform
    - categorical_columns -> A list with the names of the categorical columns we want to
    transform. If not given then columns with type category are used
    """

    if categorical_columns is None:
        categorical_columns = df.select_dtypes(include='category').columns.tolist()

    categorical_values = df[categorical_columns]
    if fit:
        encoded = encoder.fit_transform(categorical_values)
    else:
        encoded = encoder.transform(categorical_values)

    # Encoders configured for sparse output return a sparse matrix that has to be
    # densified; encoders configured for dense output already return an array.
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()

    # Reuse df's index for the encoded frame: join aligns on index labels, so a default
    # 0..n-1 index here would produce NaN rows whenever df's index is not 0..n-1
    # (e.g. after filtering without a reset_index).
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(),
        index=df.index,
    )

    # Drop the original categorical columns, then append the one-hot columns
    final_df = df.drop(columns=categorical_columns).join(encoded_df)

    return final_df.reset_index(drop=True)


In [40]:
from typing import List, Optional

from sklearn.preprocessing import MinMaxScaler

def perform_min_max_scaling(
    df: pd.DataFrame,
    min_max_scaler: "MinMaxScaler",
    fit: bool = False,
    columns_to_scale: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Given a dataframe with numerical columns this function will return a new dataframe
    that will perform min max scaling on these columns and will leave the rest
    of the columns unchanged. The input dataframe is not modified.
    params:
    - df -> The dataframe that holds the columns to scale
    - min_max_scaler -> The scaler to use. It must already be fitted when fit is False
    - fit -> If fit is True then fit_transform is called , otherwise only transform
    - columns_to_scale -> A list with the names of the numerical columns we want to
    transform. If not given then columns with type float64 are used
    """

    if columns_to_scale is None:
        columns_to_scale = df.select_dtypes(include='float64').columns.tolist()

    scaled_df = df.copy()
    values_to_scale = df[columns_to_scale]
    # Use the scaler that was passed in. Relying on a notebook-level variable instead would
    # make the result depend on whichever scaler object happens to exist in the kernel.
    if fit:
        scaled_df[columns_to_scale] = min_max_scaler.fit_transform(values_to_scale)
    else:
        scaled_df[columns_to_scale] = min_max_scaler.transform(values_to_scale)

    return scaled_df

In [26]:
# Restrict the cleaned dataset to its float64 columns; these are the scaling candidates.
float_columns = stocks_df.select_dtypes(include=['float64'])

# Get the column names of the selected columns
columns_to_scale = list(float_columns.columns)
# Remove target variable from the columns to scale
# (list.remove raises ValueError if 'price' is not among the float64 columns)
columns_to_scale.remove('price')
# Show the final list of columns that will be scaled
columns_to_scale

['gross_profit',
 'total_revenue',
 'cost_of_revenue',
 'cost_of_goods_and_services_sold',
 'operating_income',
 'selling_general_and_administrative',
 'research_and_development',
 'operating_expenses',
 'investment_income_net',
 'net_interest_income',
 'interest_income',
 'interest_expense',
 'non_interest_income',
 'other_non_operating_income',
 'depreciation',
 'depreciation_and_amortization',
 'income_before_tax',
 'income_tax_expense',
 'interest_and_debt_expense',
 'net_income_from_continuing_operations',
 'comprehensive_income_net_of_tax',
 'ebit',
 'ebitda',
 'net_income',
 'total_assets',
 'total_current_assets',
 'cash_and_cash_equivalents_at_carrying_value',
 'cash_and_short_term_investments',
 'inventory',
 'current_net_receivables',
 'total_non_current_assets',
 'property_plant_equipment',
 'accumulated_depreciation_amortization_ppe',
 'intangible_assets',
 'intangible_assets_excluding_goodwill',
 'goodwill',
 'investments',
 'long_term_investments',
 'short_term_investmen

In [22]:
# Names of the categorical columns that will be one-hot encoded.
categorical_columns = ['sector']

In [41]:
# Create a fresh encoder and fit it on the training set (fit=True); the listed categorical
# columns are replaced by their one-hot columns in the returned frame.
# NOTE(review): presumably the same fitted encoder is later applied to test_set with
# fit=False — confirm in the cells that follow.
encoder = OneHotEncoder()
one_hot_encoded_train_set = perform_one_hot_encoding(
    df=train_set,
    categorical_columns=categorical_columns,
    encoder=encoder,
    fit=True
)

In [42]:
# Display the training set after one-hot encoding.
one_hot_encoded_train_set

Unnamed: 0,gross_profit,total_revenue,cost_of_revenue,cost_of_goods_and_services_sold,operating_income,selling_general_and_administrative,research_and_development,operating_expenses,investment_income_net,net_interest_income,...,avg_global_commodities_index_value,inflation,price,sector_ENERGY & TRANSPORTATION,sector_FINANCE,sector_LIFE SCIENCES,sector_MANUFACTURING,sector_REAL ESTATE & CONSTRUCTION,sector_TECHNOLOGY,sector_TRADE & SERVICES
0,2.417031e+06,5.689043e+06,3.272012e+06,3.272012e+06,1658477.0,795212.0,0.0,794643.0,0.0,-26922.0,...,175.032280,3.156842,1.750000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.686513e+06,5.284750e+06,3.598237e+06,3.598237e+06,-269682.0,1992276.0,0.0,2725727.0,0.0,-28221.0,...,181.335364,3.156842,2.196667,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.241275e+06,3.315931e+06,2.074656e+06,2.074656e+06,454820.0,809392.0,0.0,1036063.0,0.0,-27163.0,...,171.818546,2.069337,2.100000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2.740468e+06,6.983128e+06,4.242660e+06,4.242660e+06,1928212.0,901732.0,0.0,832032.0,0.0,-37741.0,...,170.295168,2.069337,1.776667,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,2.067378e+06,5.529120e+06,3.461742e+06,3.461742e+06,1411658.0,749865.0,0.0,888682.0,0.0,-43061.0,...,170.093088,2.069337,1.760000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53942,1.582193e+07,1.674173e+07,9.197960e+05,-3.463000e+10,-2851160.0,830728.0,0.0,18673093.0,29557786.0,-1420162.0,...,157.781959,4.642850,26.935000,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53943,5.322100e+07,1.645530e+08,1.113320e+08,1.113320e+08,26957000.0,21742000.0,4522000.0,26264000.0,680000.0,-1225000.0,...,157.781959,4.642850,125.935000,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53944,7.839400e+07,1.864520e+08,1.080580e+08,5.420300e+07,-39629000.0,41113000.0,25899000.0,78652000.0,0.0,921000.0,...,157.781959,4.642850,14.245000,0.0,0.0,0.0,0.0,0.0,0.0,1.0
53945,1.231820e+08,4.711910e+08,2.373170e+08,1.186280e+08,23425000.0,92703000.0,7054000.0,99757000.0,0.0,-9854000.0,...,157.781959,4.642850,22.470000,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [43]:
# Create a fresh min-max scaler and fit it on the one-hot encoded training set (fit=True).
# Only the columns listed in columns_to_scale are passed for scaling; that list excludes the
# target column 'price'.
scaler = MinMaxScaler()
scaled_train_set = perform_min_max_scaling(
    df=one_hot_encoded_train_set,
    min_max_scaler=scaler,
    fit=True,
    columns_to_scale=columns_to_scale
)

In [44]:
# Display the training set after min-max scaling.
scaled_train_set

Unnamed: 0,gross_profit,total_revenue,cost_of_revenue,cost_of_goods_and_services_sold,operating_income,selling_general_and_administrative,research_and_development,operating_expenses,investment_income_net,net_interest_income,...,avg_global_commodities_index_value,inflation,price,sector_ENERGY & TRANSPORTATION,sector_FINANCE,sector_LIFE SCIENCES,sector_MANUFACTURING,sector_REAL ESTATE & CONSTRUCTION,sector_TECHNOLOGY,sector_TRADE & SERVICES
0,0.271768,0.001661,5.811489e-07,5.811489e-07,0.391889,0.000286,0.000242,0.305207,0.593343,0.326890,...,0.588252,0.385356,1.750000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.271768,0.001661,5.811493e-07,5.811493e-07,0.391889,0.000286,0.000242,0.305207,0.593343,0.326890,...,0.631043,0.385356,2.196667,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.271768,0.001661,5.811477e-07,5.811477e-07,0.391889,0.000286,0.000242,0.305207,0.593343,0.326890,...,0.566434,0.247421,2.100000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.271768,0.001661,5.811499e-07,5.811499e-07,0.391889,0.000286,0.000242,0.305207,0.593343,0.326890,...,0.556092,0.247421,1.776667,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.271768,0.001661,5.811491e-07,5.811491e-07,0.391889,0.000286,0.000242,0.305207,0.593343,0.326890,...,0.554720,0.247421,1.760000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53942,0.271768,0.001661,5.811466e-07,5.465157e-07,0.391889,0.000286,0.000242,0.305208,0.596195,0.326890,...,0.471141,0.573836,26.935000,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53943,0.271769,0.001665,5.812570e-07,5.812570e-07,0.391890,0.000286,0.000243,0.305208,0.593409,0.326890,...,0.471141,0.573836,125.935000,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53944,0.271770,0.001665,5.812537e-07,5.811999e-07,0.391888,0.000286,0.000247,0.305210,0.593343,0.326890,...,0.471141,0.573836,14.245000,0.0,0.0,0.0,0.0,0.0,0.0,1.0
53945,0.271771,0.001672,5.813830e-07,5.812643e-07,0.391890,0.000286,0.000243,0.305211,0.593343,0.326889,...,0.471141,0.573836,22.470000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
