# Stock Prediction Model

## Dependencies

### Library Installation (if needed)

In [6]:
#! pip install yfinance pandas numpy matplotlib lightgbm torch tensorflow keras

### Importing Required Libraries

In [7]:
from datetime import datetime
import matplotlib.pyplot as plt
import yfinance as yf
import pandas as pd
import numpy as np
import pickle
import gradio as gr
import ast
from tqdm import tqdm
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

### Variables

In [8]:
build_new_dataset = {'True': True, 'False': False}.get(input('Build New Dataset? (Bool)'))
if build_new_dataset:
    symbol_list = input("Symbols: ('simple', 'filtered' or 'all')") # 'simple' or 'all'. simple are the tickers from the screener notebook and all are all of the tickers in 'filtered_tickers'. You may also an 'Int' to get a % amount of random tickers from 'all'
else:
    symbol_list = 'filtered'

train_new_model = {'True': True, 'False': False}.get(input('Train New Model? (Bool)'))
if train_new_model:
    minimum_feature_threshold = 0.9
    param_grid = {
            'shuffle': [True], 
            'solver': ['adam'], 
            'learning_rate': ['adaptive'],
            'tol': [0.0001],
            'max_iter': [200],
            'alpha': [0.0001],
            'hidden_layer_sizes': [ast.literal_eval(input('Hidden Layers: eg. "(100, 100), (500, 500)"'))],
            'learning_rate_init': [0.001],
            'n_iter_no_change': [9]
        }
    verticle_jobs = 1 #'-1' for max

if build_new_dataset or train_new_model:
    debugging = {'True': True, 'False': False}.get(input('Debug? (Bool)'))
else:
    debugging = False

## Data Preparation

In [9]:
symbols = []
if symbol_list == 'filtered':
    symbols = pd.read_csv('../data/filtered_tickers.csv')['Ticker'].tolist()
elif symbol_list == 'all':
    symbols = pd.read_csv('../data/tickers.csv')['Ticker'].tolist()
else:
    all_symbols = pd.read_csv('../data/tickers.csv')['Ticker'].tolist()
    num_symbols = max(1, round(len(all_symbols) * (int(symbol_list) / 100)))  
    symbols = np.random.choice(all_symbols, num_symbols, replace=False).tolist()
    symbols = symbols + pd.read_csv("../data/filtered_tickers.csv")["Ticker"].tolist()

symbols = symbols + pd.read_csv('../data/simple_screener_results.csv')['Ticker'].tolist()

symbols = pd.Series(symbols).unique()
symbols

array(['GOF', 'BRID', 'CTWS', 'SGMA', 'MIXT', 'MJI', 'DRAM', 'MUC',
       'GFNCP', 'HME', 'ESXB', 'TNH', 'SPW', 'KYN', 'SCCO', 'DSWL', 'KWK',
       'DGIT', 'GPRC', 'NL', 'PCQ', 'STT-C', 'LDL', 'MNGA', 'RDEN',
       'AHPI', 'CLMS', 'ASDR', 'QLTY', 'FOR', 'QUAD', 'MYI', 'ABR-PB',
       'DNP', 'EEA', 'SIVB', 'TRN', 'HCN-I', 'EEME', 'PNM', 'STRZA',
       'AOS', 'LINTB', 'SQNM', 'CEMP', 'FORD', 'SBCF', 'RECN', 'BPFHW',
       'COF', 'NMY', 'KFH', 'HTH', 'GALTW', 'MZOR', 'HSIC', 'NW-C',
       'MODN', 'BWG', 'CRTX', 'NIQ', 'SCE-F', 'OUTR', 'ANGO', 'TSYS',
       'ITC', 'RMGN', 'OSHC', 'SZC', 'TCBK', 'BPFHP', 'CEMI', 'RFMD',
       'RAS', 'GGT', 'ENI', 'CPE', 'PKH', 'NMY-D', 'LKQ', 'ATR', 'VOLC',
       'WFD', 'GY', 'DXPE', 'YGE', 'HOFT', 'WRLD', 'CSGS', 'QCOR', 'PFO',
       'LRAD', 'CCG-A', 'CPSI', 'SYNM', 'CLDX', 'CAMT', 'IDCC', 'NM',
       'CHLN', 'KAMN', 'CASS', 'CBU', 'EBTC', 'DYNT', 'DRE-J', 'MSLI',
       'LFL', 'QQEW', 'MRTN', 'MASC', 'NWL', 'HP', 'GFED', 'GLCH', 'MARK',
      

#### Download annual financial data

In [10]:
if build_new_dataset:
    df = pd.DataFrame()
    filtered_pd = pd.read_csv('../data/filtered_tickers.csv')
    for symbol in tqdm(symbols):
        ticker_df = pd.DataFrame()
        ticker = yf.Ticker(symbol)
        try:
            # Latest earning data
            latest_earning_date = ticker.quarterly_cash_flow.columns.tolist()[0]
            latest_data = {'Ticker': symbol, 'Date': latest_earning_date}
            latest_data['3M Future Change'] = np.nan
            latest_data['6M Future Change'] = np.nan
            latest_data['1Y Future Change'] = np.nan
            try:
                latest_data['Sector'] = ticker.info['sector']
            except:
                latest_data['Sector'] = 'Unknown'
            try:
                latest_data['Industry'] = ticker.info['industry']
            except:
                latest_data['Industry'] = 'Unknown'
            for column in ticker.quarterly_cash_flow[latest_earning_date].keys().tolist():
                latest_data[column] = ticker.quarterly_cash_flow[latest_earning_date][column]
            for column in ticker.quarterly_balance_sheet[latest_earning_date].keys().tolist():
                latest_data[column] = ticker.quarterly_balance_sheet[latest_earning_date][column]
            for column in ticker.quarterly_income_stmt[latest_earning_date].keys().tolist():
                latest_data[column] = ticker.quarterly_income_stmt[latest_earning_date][column]
            ticker_df = pd.concat([ticker_df, pd.DataFrame([latest_data])], ignore_index=True)

            # Annual data
            earning_dates = ticker.cash_flow.columns.tolist()
            price_data = yf.download(symbol, period='10y', rounding=False, progress=False)
            cash_flow = ticker.cash_flow
            cash_flow_columns = cash_flow[earning_dates[0]].keys().tolist()
            balance_sheet = ticker.balance_sheet
            balance_sheet_columns = balance_sheet[earning_dates[0]].keys().tolist()
            income_statement = ticker.income_stmt
            income_statement_columns = income_statement[earning_dates[0]].keys().tolist()
            for earning_date in earning_dates:
                earning_date_data = {'Ticker': symbol, 'Date': earning_date}
                got_price = False
                day_offset = 0
                while(got_price==False and day_offset > -6):
                    try:           
                        earning_date_data['3M Future Change'] = (
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset, weeks=13) , ('Close', symbol)]) / 
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset), ('Close', symbol)]) - 1
                        )
                        earning_date_data['6M Future Change'] = (
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset, weeks=26) , ('Close', symbol)]) / 
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset), ('Close', symbol)]) - 1
                        )
                        earning_date_data['1Y Future Change'] = (
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset, weeks=52) , ('Close', symbol)]) / 
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset), ('Close', symbol)]) - 1
                        )
                        got_price = True
                    except:
                        day_offset += -1
                try:
                    earning_date_data['Sector'] = ticker.info['sector']
                except:
                    earning_date_data['Sector'] = 'Unknown'
                try:
                    earning_date_data['Industry'] = ticker.info['industry']
                except:
                    earning_date_data['Industry'] = 'Unknown'
                for column in cash_flow_columns:
                    earning_date_data[column] = cash_flow[earning_date][column]
                for column in balance_sheet_columns:
                    earning_date_data[column] = balance_sheet[earning_date][column]
                for column in income_statement_columns:
                    earning_date_data[column] = income_statement[earning_date][column]
                if got_price:
                    ticker_df = pd.concat([ticker_df, pd.DataFrame([earning_date_data])], ignore_index=True)

            if len(ticker_df) == 5 or ticker_df.isna().sum().sum() >= round(352 * minimum_feature_threshold):
                if symbol not in filtered_pd['Ticker'].tolist():
                    filtered_pd = pd.concat([filtered_pd, pd.DataFrame([{'Ticker': symbol}])])
                imputer = SimpleImputer()
                for column in ticker_df.columns.drop(['Ticker', 'Date', '3M Future Change', '6M Future Change', '1Y Future Change', 'Sector', 'Industry']):
                    if not ticker_df[column].isna().all():
                        ticker_df[column] = imputer.fit_transform(ticker_df[[column]])
            else:
                if symbol in filtered_pd['Ticker'].tolist():
                    filtered_pd = filtered_pd[filtered_pd['Ticker'] != symbol]
                    if debugging:
                        print(f'Removed {symbol} from filtered tickers. Datapoints: {ticker_df.isna().sum().sum()}, Needed: {round(352 * minimum_feature_threshold)}')
                continue
            df = pd.concat([df, ticker_df], ignore_index=True)
        except Exception as error:
            if symbol in filtered_pd['Ticker'].tolist():
                filtered_pd = filtered_pd[filtered_pd['Ticker'] != symbol]
                if debugging:
                    print(f'Removed {symbol} from filtered tickers because an exception was raised')
            continue
    filtered_pd.to_csv('../data/filtered_tickers.csv', index=False)
    df.to_csv('../data/earnings_data.csv', index=False)
else:
    df = pd.read_csv('../data/earnings_data.csv')

  0%|          | 1/909 [00:00<14:47,  1.02it/s]

YF.download() has changed argument auto_adjust default to True


 32%|███▏      | 294/909 [07:39<15:01,  1.47s/it]
1 Failed download:
['BWP']: ValueError('Length mismatch: Expected axis has 2 elements, new values have 1 elements')
 52%|█████▏    | 474/909 [12:41<20:53,  2.88s/it]
1 Failed download:
['FFBCW']: YFPricesMissingError('possibly delisted; no price data found  (period=10y) (Yahoo error = "No data found, symbol may be delisted")')
 80%|███████▉  | 723/909 [20:10<04:17,  1.39s/it]
1 Failed download:
['NYX']: YFPricesMissingError('possibly delisted; no price data found  (period=10y)')
100%|██████████| 909/909 [26:22<00:00,  1.74s/it]


### Short visualisation

In [11]:
df

Unnamed: 0,Ticker,Date,3M Future Change,6M Future Change,1Y Future Change,Sector,Industry,Free Cash Flow,Repayment Of Debt,Capital Expenditure,...,Limited Partnership Capital,Interest Received Cfi,Depletion,Excess Tax Benefit From Stock Based Compensation,Change In Dividend Payable,Net Income Extraordinary,Excise Taxes,Dividend Paid Cfo,Duefrom Related Parties Non Current,Fixed Assets Revaluation Reserve
0,BRID,2025-01-31,,,,Consumer Defensive,Packaged Foods,-5.135000e+06,-442000.0,-531000.0,...,,,,,,,,,,
1,BRID,2023-10-31,0.052885,0.010577,-0.148077,Consumer Defensive,Packaged Foods,1.382000e+06,-2234000.0,-2603000.0,...,,,,,,,,,,
2,BRID,2022-10-31,0.023490,0.095638,-0.127517,Consumer Defensive,Packaged Foods,-1.160000e+07,-38557000.0,-3770000.0,...,,,,,,,,,,
3,BRID,2021-10-31,0.034101,0.075874,0.060529,Consumer Defensive,Packaged Foods,-1.223100e+07,-4591000.0,-6239000.0,...,,,,,,,,,,
4,BRID,2020-10-31,-0.082192,-0.219726,-0.357260,Consumer Defensive,Packaged Foods,-6.896000e+06,-11456000.0,-3285750.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1168,PARKEN.CO,2020-12-31,0.056338,0.121127,0.104225,Communication Services,Entertainment,-5.800500e+07,-364009000.0,-333595500.0,...,,,,,,,,,,5456000.0
1169,STG.CO,2025-03-31,,,,Consumer Defensive,Tobacco,1.164650e+09,-2000000.0,-314950000.0,...,,,,,,,,,,
1170,STG.CO,2022-12-31,0.115479,-0.011691,0.021398,Consumer Defensive,Tobacco,1.003000e+09,-4000000.0,-389500000.0,...,,,,,,,,,,
1171,STG.CO,2021-12-31,0.047341,0.066552,-0.055625,Consumer Defensive,Tobacco,1.326300e+09,0.0,-240400000.0,...,,,,,,,,,,


### Removal of low datapoint columns

In [12]:
# columns_to_remove = [col for col in df.columns if df[col].notna().sum() < feature_amount]

# df = df.drop(columns=columns_to_remove)
# print(columns_to_remove)
# df

### Impution and encoding

In [13]:
imputer = SimpleImputer()
scaler = StandardScaler()
for column in df.columns.drop(['Ticker', 'Date', '3M Future Change', '6M Future Change', '1Y Future Change', 'Sector', 'Industry']):
    df[column] = imputer.fit_transform(df[[column]])
    scaler.fit(df[[column]])
    df[column] = scaler.transform(df[[column]])

le = LabelEncoder()
for column in ['Ticker', 'Sector', 'Industry']:
    df[column] = df[column].astype(str)
    le.fit(df[column])
    df[column] = le.transform(df[column])

### Splitting

In [14]:
pred_data = pd.DataFrame()
for i in range(int(len(df) / 5)):
    pred_data = pd.concat([pred_data, df.iloc[[i*5]]]) 
print('Prediction Data:')
display(pred_data)

train_data = pd.DataFrame()
for i in range(int(len(df) / 5)):
    train_data = pd.concat([train_data, df.iloc[[1+i*5]]])
    train_data = pd.concat([train_data, df.iloc[[2+i*5]]])
    train_data = pd.concat([train_data, df.iloc[[3+i*5]]])
    train_data = pd.concat([train_data, df.iloc[[4+i*5]]])
print('Train and Test Data:')
display(train_data)

Prediction Data:


Unnamed: 0,Ticker,Date,3M Future Change,6M Future Change,1Y Future Change,Sector,Industry,Free Cash Flow,Repayment Of Debt,Capital Expenditure,...,Limited Partnership Capital,Interest Received Cfi,Depletion,Excess Tax Benefit From Stock Based Compensation,Change In Dividend Payable,Net Income Extraordinary,Excise Taxes,Dividend Paid Cfo,Duefrom Related Parties Non Current,Fixed Assets Revaluation Reserve
0,31,2025-01-31,,,,3,60,-0.068476,0.082292,0.183424,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
5,189,2025-01-31,,,,9,25,-0.068426,0.081270,0.183482,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
10,153,2025-03-31,,,,7,75,-0.068855,0.000000,0.183370,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
15,82,2025-03-31,,,,8,69,-0.068611,0.080679,0.183125,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
20,176,2025-03-31,,,,7,80,-0.069312,0.078864,0.181410,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,25,2024-12-31,,,,2,44,0.554222,-0.066629,-7.475388,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
1150,121,2025-03-31,,,,2,44,-0.249173,-0.186227,-0.250975,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-7.881124e-01,0.0
1155,117,2024-12-31,,,,1,27,-0.064274,0.065588,0.175004,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,3.233335e+01,0.0
1160,219,2025-03-31,,,,9,78,-0.048676,0.081772,0.169683,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0


Train and Test Data:


Unnamed: 0,Ticker,Date,3M Future Change,6M Future Change,1Y Future Change,Sector,Industry,Free Cash Flow,Repayment Of Debt,Capital Expenditure,...,Limited Partnership Capital,Interest Received Cfi,Depletion,Excess Tax Benefit From Stock Based Compensation,Change In Dividend Payable,Net Income Extraordinary,Excise Taxes,Dividend Paid Cfo,Duefrom Related Parties Non Current,Fixed Assets Revaluation Reserve
1,31,2023-10-31,0.052885,0.010577,-0.148077,3,60,-0.068419,0.082272,0.183037,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
2,31,2022-10-31,0.023490,0.095638,-0.127517,3,60,-0.068533,0.081867,0.182819,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
3,31,2021-10-31,0.034101,0.075874,0.060529,3,60,-0.068539,0.082246,0.182357,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
4,31,2020-10-31,-0.082192,-0.219726,-0.357260,3,60,-0.068492,0.082169,0.182909,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
6,189,2024-04-30,0.172170,-0.346698,-0.683962,9,25,-0.068203,0.077590,0.183187,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1164,219,2020-12-31,0.129412,-0.008039,-0.163529,9,78,-0.056887,0.071059,0.143917,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
1166,167,2022-12-31,0.370879,0.631868,1.024725,1,27,-0.070194,0.081644,0.114151,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
1167,167,2021-12-31,0.089744,0.089744,-0.066667,1,27,-0.067687,0.074818,0.128103,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
1168,167,2020-12-31,0.056338,0.121127,0.104225,1,27,-0.068940,0.078231,0.121127,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0


### Labeling

In [15]:
X_pred = pred_data.drop(['Date', '3M Future Change', '6M Future Change', '1Y Future Change'], axis=1)
X_train = train_data.drop(['Date', '3M Future Change', '6M Future Change', '1Y Future Change'], axis=1)
y_train = train_data[['3M Future Change', '6M Future Change', '1Y Future Change']]

X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    shuffle=True,
)

display(X_train)
display(y_train)
display(X_test)
display(y_test)

Unnamed: 0,Ticker,Sector,Industry,Free Cash Flow,Repayment Of Debt,Capital Expenditure,Interest Paid Supplemental Data,Income Tax Paid Supplemental Data,End Cash Position,Beginning Cash Position,...,Limited Partnership Capital,Interest Received Cfi,Depletion,Excess Tax Benefit From Stock Based Compensation,Change In Dividend Payable,Net Income Extraordinary,Excise Taxes,Dividend Paid Cfo,Duefrom Related Parties Non Current,Fixed Assets Revaluation Reserve
111,136,9,79,-0.068523,0.082297,0.183514,-0.072961,4.913786e-17,-0.078604,-0.079512,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
211,230,5,9,-0.062303,0.082297,0.174844,-0.056734,-9.380578e-02,-0.076912,-0.077401,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
296,115,2,46,-0.068504,0.082206,0.181994,-0.072864,-1.415340e-01,-0.078553,-0.079411,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
516,147,2,61,-0.067875,0.073623,0.179249,-0.072891,-1.387770e-01,-0.078483,-0.079412,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
887,59,7,28,-0.060432,-0.012040,-0.524992,-0.055551,2.582830e-01,-0.058853,-0.044583,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-1.996689e+00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
781,63,6,22,-0.067584,0.082297,0.182756,-0.072897,-1.411342e-01,-0.078003,-0.078642,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
1164,219,9,78,-0.056887,0.071059,0.143917,-0.065999,-1.131591e-01,-0.048538,-0.046751,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
654,232,2,73,-0.068837,0.070732,0.124007,-0.063018,-1.403574e-01,-0.067163,-0.066269,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
418,58,6,49,-0.068631,0.082297,0.183497,-0.072975,4.913786e-17,-0.078497,-0.079388,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0


Unnamed: 0,3M Future Change,6M Future Change,1Y Future Change
111,-0.580000,-0.760000,-0.820000
211,0.115864,0.027563,0.369314
296,-0.144619,-0.449271,-0.405269
516,0.182461,-0.318071,-0.398591
887,0.065577,-0.028882,-0.059078
...,...,...,...
781,-0.127899,-0.200984,-0.084329
1164,0.129412,-0.008039,-0.163529
654,0.135336,0.084818,-0.236462
418,-0.193548,-0.496774,-0.535484


Unnamed: 0,Ticker,Sector,Industry,Free Cash Flow,Repayment Of Debt,Capital Expenditure,Interest Paid Supplemental Data,Income Tax Paid Supplemental Data,End Cash Position,Beginning Cash Position,...,Limited Partnership Capital,Interest Received Cfi,Depletion,Excess Tax Benefit From Stock Based Compensation,Change In Dividend Payable,Net Income Extraordinary,Excise Taxes,Dividend Paid Cfo,Duefrom Related Parties Non Current,Fixed Assets Revaluation Reserve
394,89,6,11,-0.068642,0.000000,-4.459434e-17,1.510220e-17,4.913786e-17,-0.078513,-0.079404,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
1137,70,4,57,0.163175,0.064245,-1.454592e+00,1.510220e-17,4.913786e-17,-0.016329,-0.019383,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
1163,219,9,78,-0.074955,0.072891,1.277851e-01,-6.587580e-02,-1.236186e-01,-0.047405,-0.047739,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
753,93,5,6,-0.068555,0.078122,-4.459434e-17,-7.281351e-02,4.913786e-17,-0.078601,-0.079501,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
1016,142,3,10,-0.055410,0.082297,1.396204e-01,-7.298032e-02,-5.432316e-02,-0.069420,-0.073892,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307,177,2,33,-0.068322,0.081722,1.822701e-01,-7.270893e-02,-1.411515e-01,-0.078582,-0.079486,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
23,176,7,80,-0.067672,0.075785,1.741715e-01,-7.232414e-02,-1.406630e-01,-0.077885,-0.079274,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
141,217,7,3,-0.070714,0.034844,-1.157756e+00,-4.372151e-02,-1.401066e-01,-0.053285,-0.047610,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0
83,48,6,11,-0.068977,0.082297,1.832900e-01,1.510220e-17,4.913786e-17,-0.078448,-0.079323,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.143593e-15,0.0,-5.220057e-16,0.0


Unnamed: 0,3M Future Change,6M Future Change,1Y Future Change
394,-0.017857,0.455357,-0.058036
1137,-0.183009,-0.133890,-0.007779
1163,-0.141903,-0.491057,-0.410208
753,0.030394,0.079059,-0.202474
1016,0.032393,-0.119296,-0.082724
...,...,...,...
307,-0.017652,-0.098808,0.313688
23,0.757500,-0.302500,0.020000
141,0.140815,0.148439,1.403383
83,-0.117754,-0.412526,0.153468


## Model Training

In [16]:
if train_new_model:
    base_model = MLPRegressor()

    grid_search = GridSearchCV(
        estimator=base_model, 
        param_grid=param_grid, 
        scoring='r2', 
        cv=3, 
        n_jobs=3,
        verbose=verticle_jobs,
    )
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    model = grid_search.best_estimator_
    print('Best Parameters:')
    print(best_params)

    model.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib

ValueError: Input y contains NaN.

## Testing and benchmarking

In [None]:
if train_new_model:
    y_test_pred = model.predict(X_test)

    for i, target in enumerate(['3M Future Change', '6M Future Change', '1Y Future Change']):
        y_test_actual = y_test[target]
        y_test_pred_target = y_test_pred[:, i]

        plt.figure(figsize=(11, 6))
        plt.scatter(y_test_actual, y_test_pred_target, alpha=0.7, color='blue', label='Predictions')
        plt.plot([y_test_actual.min(), y_test_actual.max()], [y_test_actual.min(), y_test_actual.max()], 
            color='red', linestyle='--', label='Perfect Fit')
        plt.title(f'Predicted vs Actual Values ({target})')
        plt.xlabel('Actual Values')
        plt.ylabel('Predicted Values')
        plt.legend()
        plt.grid(True)
        plt.show()

        mae = mean_absolute_error(y_test_actual, y_test_pred_target)
        mse = mean_squared_error(y_test_actual, y_test_pred_target)
        r2 = r2_score(y_test_actual, y_test_pred_target)

        print(f'{target} - R²: {r2:.4f}')
        print(f'{target} - MSE: {mse:.4f}')
        print(f'{target} - MAE: {mae:.4f}')

    mae = mean_absolute_error(y_test, y_test_pred)
    mse = mean_squared_error(y_test, y_test_pred)
    r2 = r2_score(y_test, y_test_pred)

    print('\nOverall Scores:')
    print(f'Mean - R²: {r2:.4f}')
    print(f'Mean - MSE: {mse:.4f}')
    print(f'Mean - MAE: {mae:.4f}')

### Log test results

In [None]:
if train_new_model:
    test_results = pd.DataFrame({
        'R²': r2,
        'MSE': mse,
        'MAE': mae,
        'symbol_list': symbol_list,
        'hidden_layer_sizes': [model.hidden_layer_sizes],
        'max_iter': model.max_iter,
        'n_iter_no_change': model.n_iter_no_change,
        'learning_rate': model.learning_rate,
        'learning_rate_init': model.learning_rate_init,
        'batch_size': model.batch_size,
        'tol': model.tol,
        'alpha': model.alpha,
        'shuffle': model.shuffle,
    })
    test_results.to_csv('../data/test_results.csv', mode='a', index=False)

    # save model as new best if results are better than the current one
    best_r2 = pd.read_csv('../models/best_model_results.csv').loc[0, 'R²']
    if r2 > best_r2:
        print(f'Old best R²: {best_r2}')
        print(f'New best R²: {r2}')
        print('Saving new best model...')
        test_results.to_csv('../models/best_model_results.csv', mode='w', index=False)
        with open('../models/best_model.pkl','wb') as f:
            pickle.dump(model,f)
        

## Predictions on latest data

In [None]:
best_r2 = pd.read_csv('../models/best_model_results.csv').loc[0, 'R²']

with open('../models/best_model.pkl', 'rb') as f:
    model = pickle.load(f)
    print(f'Best model R²: {best_r2}')


Best model R²: -0.0893079795170797


In [None]:
df_raw = pd.read_csv('../data/earnings_data.csv')

results = []
for i in range(len(X_pred)):
    y_pred = model.predict(X_pred.iloc[[i]])[0]
    y_pred_3m, y_pred_6m, y_pred_1y = y_pred
    avg = (y_pred_3m + y_pred_6m + y_pred_1y) / 3
    results.append({
        'Ticker': df_raw.loc[i*5, 'Ticker'],
        'mean (%)': avg * 100,
        '3m (%)': y_pred_3m * 100,
        '6m (%)': y_pred_6m * 100,
        '1y (%)': y_pred_1y * 100
    })

results_df = pd.DataFrame(results)
results_df

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Amortization Of Securities
- Cash Cash Equivalents And Federal Funds Sold
- Cash Flow From Discontinued Operation
- Cash Flowsfromusedin Operating Activities Direct
- Cash From Discontinued Financing Activities
- ...


In [None]:
def predict_ticker(ticker_str):
    try:
        ticker_str = str(ticker_str).upper()
        ticker_str = ticker_str.replace("'", "")
        ticker_str = ticker_str.replace('"', "")
        row = results_df[results_df['Ticker'] == ticker_str]
        if row.empty:
            return (f"The author was stupid and forgot to cover this obvious, famous stock. Try another.", "", "", "")
        row = row.iloc[0]
        return (
            f"{row['3m (%)']:.2f}",
            f"{row['6m (%)']:.2f}",
            f"{row['1y (%)']:.2f}",
            f"{row['mean (%)']:.2f}",
        )
    except Exception as e:
        return (f"Error: {e}", "", "", "")

In [None]:
iface = gr.Interface(
    fn=predict_ticker,
    inputs=gr.Textbox(label="Ticker (e.g. 'TSLA' or 'MATAS.CO')"),
    outputs=[
        gr.Textbox(label="3 Month Change Prediction (%)"),
        gr.Textbox(label="6 Month Change Prediction (%)"),
        gr.Textbox(label="1 Year Change Prediction (%)"),
        gr.Textbox(label="Mean Change Prediction (%)"),
    ],
    title="Stock Price Prediction Model",
    description=" "
)

iface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://a8c968acf9d317b028.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


