# Stock Prediction Model

## Dependencies

### Library Installation (if needed)

In [48]:
#! pip install yfinance pandas numpy matplotlib lightgbm torch tensorflow keras

### Importing Required Libraries

In [49]:
from datetime import datetime
import matplotlib.pyplot as plt
import yfinance as yf
import pandas as pd
import numpy as np
import pickle
import gradio as gr
import ast
from tqdm import tqdm
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

### Variables

In [50]:
build_new_dataset = {'True': True, 'False': False}.get(input('Build New Dataset? (Bool)'))
if build_new_dataset:
    symbol_list = input("Symbols: ('simple', 'filtered' or 'all')") # 'simple' or 'all'. simple are the tickers from the screener notebook and all are all of the tickers in 'filtered_tickers'. You may also an 'Int' to get a % amount of random tickers from 'all'
else:
    symbol_list = 'filtered'

train_new_model = {'True': True, 'False': False}.get(input('Train New Model? (Bool)'))
if train_new_model:
    minimum_feature_threshold = 0.9
    param_grid = {
            'shuffle': [True], 
            'solver': ['adam'], 
            'learning_rate': ['adaptive'],
            'tol': [0.0001],
            'max_iter': [200],
            'alpha': [0.0001],
            'hidden_layer_sizes': [ast.literal_eval(input('Hidden Layers: eg. "(100, 100), (500, 500)"'))],
            'learning_rate_init': [0.001],
            'n_iter_no_change': [9]
        }
    verticle_jobs = 1 #'-1' for max

if build_new_dataset or train_new_model:
    debugging = {'True': True, 'False': False}.get(input('Debug? (Bool)'))
else:
    debugging = False

## Data Preparation

In [51]:
symbols = []
if symbol_list == 'filtered':
    symbols = pd.read_csv('../data/filtered_tickers.csv')['Ticker'].tolist()
elif symbol_list == 'all':
    symbols = pd.read_csv('../data/tickers.csv')['Ticker'].tolist()
elif symbol_list.isdigit():
    all_symbols = pd.read_csv('../data/tickers.csv')['Ticker'].tolist()
    num_symbols = max(1, round(len(all_symbols) * (int(symbol_list) / 100)))  
    symbols = np.random.choice(all_symbols, num_symbols, replace=False).tolist()
    symbols = symbols + pd.read_csv("../data/filtered_tickers.csv")["Ticker"].tolist()

symbols = symbols + pd.read_csv('../data/simple_screener_results.csv')['Ticker'].tolist()

symbols = pd.Series(symbols).unique()
symbols

array(['MATAS.CO', 'TRIFOR.CO', 'RNMBY', 'SAABF', 'BCKIY', 'BAESY',
       'IVSO.ST', 'NSKFF', 'GMAB', 'GN.CO', 'NVDA', 'LLY', 'DANSKE.CO',
       'CARL-B.CO', 'MAERSK-B.CO', 'RBREW.CO', 'ISS.CO', 'DSV.CO',
       'SCHO.CO', 'NETC.CO', 'JYSK.CO', 'ABBN.SW', 'TER', 'PARKEN.CO',
       'NFLX', 'TRMD-A.CO', 'STG.CO', 'NOVO-B.CO', 'EQNR', 'NKT.CO',
       'KCC.OL', 'TSLA', 'HEM.ST', 'DEMANT.CO', 'BAVA.CO ', 'BABA', 'JD',
       'PDD', 'BIDU', 'NTES', 'WB', 'IQ', 'SYDB.CO', 'UBER'], dtype=object)

#### Download annual financial data

In [52]:
if build_new_dataset:
    df = pd.DataFrame()
    filtered_pd = pd.read_csv('../data/filtered_tickers.csv')
    for symbol in tqdm(symbols):
        ticker_df = pd.DataFrame()
        ticker = yf.Ticker(symbol)
        try:
            # Latest earning data
            latest_earning_date = ticker.quarterly_cash_flow.columns.tolist()[0]
            latest_data = {'Ticker': symbol, 'Date': latest_earning_date}
            latest_data['3M Future Change'] = np.nan
            latest_data['6M Future Change'] = np.nan
            latest_data['1Y Future Change'] = np.nan
            try:
                latest_data['Sector'] = ticker.info['sector']
            except:
                latest_data['Sector'] = 'Unknown'
            try:
                latest_data['Industry'] = ticker.info['industry']
            except:
                latest_data['Industry'] = 'Unknown'
            for column in ticker.quarterly_cash_flow[latest_earning_date].keys().tolist():
                latest_data[column] = ticker.quarterly_cash_flow[latest_earning_date][column]
            for column in ticker.quarterly_balance_sheet[latest_earning_date].keys().tolist():
                latest_data[column] = ticker.quarterly_balance_sheet[latest_earning_date][column]
            for column in ticker.quarterly_income_stmt[latest_earning_date].keys().tolist():
                latest_data[column] = ticker.quarterly_income_stmt[latest_earning_date][column]
            ticker_df = pd.concat([ticker_df, pd.DataFrame([latest_data])], ignore_index=True)

            # Annual data
            earning_dates = ticker.cash_flow.columns.tolist()
            price_data = yf.download(symbol, period='10y', rounding=False, progress=False)
            cash_flow = ticker.cash_flow
            cash_flow_columns = cash_flow[earning_dates[0]].keys().tolist()
            balance_sheet = ticker.balance_sheet
            balance_sheet_columns = balance_sheet[earning_dates[0]].keys().tolist()
            income_statement = ticker.income_stmt
            income_statement_columns = income_statement[earning_dates[0]].keys().tolist()
            for earning_date in earning_dates:
                earning_date_data = {'Ticker': symbol, 'Date': earning_date}
                got_price = False
                day_offset = 0
                while(got_price==False and day_offset > -6):
                    try:           
                        earning_date_data['3M Future Change'] = (
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset, weeks=13) , ('Close', symbol)]) / 
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset), ('Close', symbol)]) - 1
                        )
                        earning_date_data['6M Future Change'] = (
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset, weeks=26) , ('Close', symbol)]) / 
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset), ('Close', symbol)]) - 1
                        )
                        earning_date_data['1Y Future Change'] = (
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset, weeks=52) , ('Close', symbol)]) / 
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset), ('Close', symbol)]) - 1
                        )
                        got_price = True
                    except:
                        day_offset += -1
                try:
                    earning_date_data['Sector'] = ticker.info['sector']
                except:
                    earning_date_data['Sector'] = 'Unknown'
                try:
                    earning_date_data['Industry'] = ticker.info['industry']
                except:
                    earning_date_data['Industry'] = 'Unknown'
                for column in cash_flow_columns:
                    earning_date_data[column] = cash_flow[earning_date][column]
                for column in balance_sheet_columns:
                    earning_date_data[column] = balance_sheet[earning_date][column]
                for column in income_statement_columns:
                    earning_date_data[column] = income_statement[earning_date][column]
                if got_price:
                    ticker_df = pd.concat([ticker_df, pd.DataFrame([earning_date_data])], ignore_index=True)

            if len(ticker_df) == 5 and ticker_df.isna().sum().sum() < round(352 * minimum_feature_threshold):
                if symbol not in filtered_pd['Ticker'].tolist():
                    filtered_pd = pd.concat([filtered_pd, pd.DataFrame([{'Ticker': symbol}])])
                imputer = SimpleImputer()
                for column in ticker_df.columns.drop(['Ticker', 'Date', '3M Future Change', '6M Future Change', '1Y Future Change', 'Sector', 'Industry']):
                    if not ticker_df[column].isna().all():
                        ticker_df[column] = imputer.fit_transform(ticker_df[[column]])
            else:
                if symbol in filtered_pd['Ticker'].tolist():
                    filtered_pd = filtered_pd[filtered_pd['Ticker'] != symbol]
                    if debugging:
                        print(f'Removed {symbol} from filtered tickers. Datapoints: {ticker_df.isna().sum().sum()}, Needed: {round(352 * minimum_feature_threshold)}')
                continue
            df = pd.concat([df, ticker_df], ignore_index=True)
        except Exception as error:
            if symbol in filtered_pd['Ticker'].tolist():
                filtered_pd = filtered_pd[filtered_pd['Ticker'] != symbol]
                if debugging:
                    print(f'Removed {symbol} from filtered tickers because an exception was raised')
            continue
    filtered_pd.to_csv('../data/filtered_tickers.csv', index=False)
    df.to_csv('../data/earnings_data.csv', index=False)
else:
    df = pd.read_csv('../data/earnings_data.csv')

100%|██████████| 44/44 [01:16<00:00,  1.75s/it]


### Short visualisation

In [53]:
df

Unnamed: 0,Ticker,Date,3M Future Change,6M Future Change,1Y Future Change,Sector,Industry,Free Cash Flow,Repurchase Of Capital Stock,Repayment Of Debt,...,Accrued Interest Receivable,Dividends Received Cfi,Change In Accrued Expense,Short Term Debt Payments,Unrealized Gain Loss On Investment Securities,Dueto Related Parties Non Current,Duefrom Related Parties Non Current,Non Current Note Receivables,Net Preferred Stock Issuance,Preferred Stock Issuance
0,MATAS.CO,2025-03-31,,,,Consumer Cyclical,Specialty Retail,448425000.0,-24025000.0,-437100000.0,...,,,,,,,,,,
1,MATAS.CO,2024-03-31,0.006267,0.065357,0.159205,Consumer Cyclical,Specialty Retail,239000000.0,-21000000.0,-189000000.0,...,,,,,,,,,,
2,MATAS.CO,2023-03-31,0.014033,0.205079,0.459568,Consumer Cyclical,Specialty Retail,424000000.0,0.0,-126000000.0,...,,,,,,,,,,
3,MATAS.CO,2022-03-31,-0.211179,-0.301726,-0.110513,Consumer Cyclical,Specialty Retail,327100000.0,-75100000.0,-708400000.0,...,,,,,,,,,,
4,MATAS.CO,2021-03-31,0.391748,0.478732,0.209816,Consumer Cyclical,Specialty Retail,803600000.0,0.0,-725000000.0,...,,,,,,,,,,
5,RNMBY,2025-03-31,,,,Industrials,Aerospace & Defense,267000000.0,,-494000000.0,...,,,,,,,,,,
6,RNMBY,2023-12-31,0.768973,0.616452,1.071535,Industrials,Aerospace & Defense,345000000.0,,-485000000.0,...,,,,,,,,,,
7,RNMBY,2022-12-31,0.493045,0.398488,0.633525,Industrials,Aerospace & Defense,-175000000.0,,-232000000.0,...,,,,,,,,,,
8,RNMBY,2021-12-31,1.329688,1.491433,1.219995,Industrials,Aerospace & Defense,419000000.0,,-231000000.0,...,,,,,,,,,,
9,RNMBY,2020-12-31,-0.075638,-0.041881,-0.106959,Industrials,Aerospace & Defense,214000000.0,,-360500000.0,...,,,,,,,,,,


### Removal of low datapoint columns

In [54]:
# columns_to_remove = [col for col in df.columns if df[col].notna().sum() < feature_amount]

# df = df.drop(columns=columns_to_remove)
# print(columns_to_remove)
# df

### Impution and encoding

In [55]:
imputer = SimpleImputer()
scaler = StandardScaler()
for column in df.columns.drop(['Ticker', 'Date', '3M Future Change', '6M Future Change', '1Y Future Change', 'Sector', 'Industry']):
    df[column] = imputer.fit_transform(df[[column]])
    scaler.fit(df[[column]])
    df[column] = scaler.transform(df[[column]])

le = LabelEncoder()
for column in ['Ticker', 'Sector', 'Industry']:
    df[column] = df[column].astype(str)
    le.fit(df[column])
    df[column] = le.transform(df[column])

### Splitting

In [56]:
pred_data = pd.DataFrame()
for i in range(int(len(df) / 5)):
    pred_data = pd.concat([pred_data, df.iloc[[i*5]]]) 
print('Prediction Data:')
display(pred_data)

train_data = pd.DataFrame()
for i in range(int(len(df) / 5)):
    train_data = pd.concat([train_data, df.iloc[[1+i*5]]])
    train_data = pd.concat([train_data, df.iloc[[2+i*5]]])
    train_data = pd.concat([train_data, df.iloc[[3+i*5]]])
    train_data = pd.concat([train_data, df.iloc[[4+i*5]]])
print('Train and Test Data:')
display(train_data)

Prediction Data:


Unnamed: 0,Ticker,Date,3M Future Change,6M Future Change,1Y Future Change,Sector,Industry,Free Cash Flow,Repurchase Of Capital Stock,Repayment Of Debt,...,Accrued Interest Receivable,Dividends Received Cfi,Change In Accrued Expense,Short Term Debt Payments,Unrealized Gain Loss On Investment Securities,Dueto Related Parties Non Current,Duefrom Related Parties Non Current,Non Current Note Receivables,Net Preferred Stock Issuance,Preferred Stock Issuance
0,5,2025-03-31,,,,1,9,-0.355948,0.423208,0.554447,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6,2025-03-31,,,,4,0,-0.360472,0.0,0.542468,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,7,2025-03-31,,,,4,0,-0.372244,0.415562,0.646466,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,2,2025-03-31,,,,3,2,-0.360722,0.424533,0.646045,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,4,2025-03-31,,,,3,3,-0.407056,0.358351,0.403284,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25,8,2025-03-31,,,,5,7,-0.364697,0.415848,0.614185,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30,1,2025-03-31,,,,2,6,-0.217146,0.394255,0.569836,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,9,2025-03-31,,,,1,1,-0.350571,0.0,0.362471,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,0,2024-12-31,,,,1,5,1.401436,-2.684507,-2.160169,...,0.0,-7.175517e-15,2.004496,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45,3,2024-12-31,,,,0,4,-0.355323,0.0,0.331565,...,0.0,-7.175517e-15,-1.049674,5.696487,-0.704384,-2.395034,6.421057,0.0,0.0,0.0


Train and Test Data:


Unnamed: 0,Ticker,Date,3M Future Change,6M Future Change,1Y Future Change,Sector,Industry,Free Cash Flow,Repurchase Of Capital Stock,Repayment Of Debt,...,Accrued Interest Receivable,Dividends Received Cfi,Change In Accrued Expense,Short Term Debt Payments,Unrealized Gain Loss On Investment Securities,Dueto Related Parties Non Current,Duefrom Related Parties Non Current,Non Current Note Receivables,Net Preferred Stock Issuance,Preferred Stock Issuance
1,5,2024-03-31,0.006267,0.065357,0.159205,1,9,-0.361171,0.423375,0.606677,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,2023-03-31,0.014033,0.205079,0.459568,1,9,-0.356557,0.424533,0.61994,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,2022-03-31,-0.211179,-0.301726,-0.110513,1,9,-0.358974,0.420391,0.497332,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,2021-03-31,0.391748,0.478732,0.209816,1,9,-0.34709,0.424533,0.493837,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,6,2023-12-31,0.768973,0.616452,1.071535,4,0,-0.358527,0.0,0.544363,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,6,2022-12-31,0.493045,0.398488,0.633525,4,0,-0.371496,0.0,0.597625,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,6,2021-12-31,1.329688,1.491433,1.219995,4,0,-0.356682,0.0,0.597835,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,6,2020-12-31,-0.075638,-0.041881,-0.106959,4,0,-0.361794,0.0,0.570573,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,7,2023-12-31,0.49819,0.678304,0.55913,4,0,-0.294209,0.415562,0.420154,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,7,2022-12-31,0.533624,0.443572,0.628164,4,0,-0.308674,0.424533,0.354682,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
for i in train_data.index:
    if pd.isna(train_data.loc[i, "3M Future Change"]):
        print(i)

### Labeling

In [58]:
X_pred = pred_data.drop(['Date', '3M Future Change', '6M Future Change', '1Y Future Change'], axis=1)
X_train = train_data.drop(['Date', '3M Future Change', '6M Future Change', '1Y Future Change'], axis=1)
y_train = train_data[['3M Future Change', '6M Future Change', '1Y Future Change']]

X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    shuffle=True,
)

display(X_train)
display(y_train)
display(X_test)
display(y_test)

Unnamed: 0,Ticker,Sector,Industry,Free Cash Flow,Repurchase Of Capital Stock,Repayment Of Debt,Issuance Of Debt,Issuance Of Capital Stock,Capital Expenditure,End Cash Position,...,Accrued Interest Receivable,Dividends Received Cfi,Change In Accrued Expense,Short Term Debt Payments,Unrealized Gain Loss On Investment Securities,Dueto Related Parties Non Current,Duefrom Related Parties Non Current,Non Current Note Receivables,Net Preferred Stock Issuance,Preferred Stock Issuance
21,4,3,3,-0.44574,0.383169,0.646466,-0.026195,-0.487968,-0.144722,-0.34671,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,1,2,6,0.052434,0.289769,0.11316,0.604459,0.0,-0.162465,-0.229716,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32,1,2,6,0.290716,0.241704,0.306262,-0.595056,0.0,-0.261296,-0.177546,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,2,3,2,-0.192457,0.393427,0.627308,0.0,-0.487968,0.454075,-0.186985,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,7,4,0,-0.315657,0.411186,0.383365,-0.419267,-0.487968,0.274358,-0.350791,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48,3,0,4,-0.52556,0.0,-1.236308,0.133448,-0.208181,0.451964,-0.343313,...,0.0,-7.175517e-15,-1.08677,0.718013,-0.629169,6.415034,-2.182198,0.0,0.0,0.0
49,3,0,4,-0.385666,0.0,-1.055478,0.250855,0.30866,0.468649,-0.31843,...,0.0,-7.175517e-15,-1.077734,0.0,-0.388196,0.0,0.0,0.0,0.0,0.0
23,4,3,3,-0.232771,0.355593,0.245336,-0.248609,-0.487968,0.317317,-0.333455,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44,0,1,5,4.336437,0.381901,-0.840034,4.924695,-0.390117,-3.199507,4.341622,...,-3.387058,-7.175517e-15,6.191313,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,4,3,3,-0.334492,0.35973,0.403284,-0.134108,-0.487968,0.15284,-0.344977,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,3M Future Change,6M Future Change,1Y Future Change
21,0.341695,0.570432,0.376126
34,0.2065,0.312575,0.667386
32,-0.183009,-0.13389,-0.007779
16,-0.06385,-0.199374,-0.355869
14,-0.002849,-0.058484,0.041659
48,0.048246,-0.041667,0.162281
49,-0.048055,-0.134439,-0.731121
23,0.063977,0.184433,0.342553
44,0.000221,-0.349094,-0.48582
24,0.101715,0.380721,0.666974


Unnamed: 0,Ticker,Sector,Industry,Free Cash Flow,Repurchase Of Capital Stock,Repayment Of Debt,Issuance Of Debt,Issuance Of Capital Stock,Capital Expenditure,End Cash Position,...,Accrued Interest Receivable,Dividends Received Cfi,Change In Accrued Expense,Short Term Debt Payments,Unrealized Gain Loss On Investment Securities,Dueto Related Parties Non Current,Duefrom Related Parties Non Current,Non Current Note Receivables,Net Preferred Stock Issuance,Preferred Stock Issuance
54,10,5,8,-0.334342,0.424533,0.43468,-0.375186,-0.314632,0.468093,-0.284364,...,0.0,-7.175517e-15,-0.926762,0.0,0.388196,0.0,0.0,0.0,-5.244044,-5.244044
26,8,5,7,-0.356517,0.402625,0.635884,-0.595056,0.0,0.472541,-0.374033,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
51,10,5,8,-0.283285,0.424533,0.042056,-0.18923,-0.487968,0.467133,-0.291225,...,0.0,-7.175517e-15,-0.840884,0.0,-2.389644,0.0,0.0,-0.60553,0.0,0.0
17,2,3,2,-0.277475,0.374455,0.631098,0.0,-0.487968,0.459111,-0.252925,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41,0,1,5,3.365377,-4.469927,-3.165046,2.360978,-0.016605,-2.324197,3.413038,...,5.856455,5.949035,1.574694,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,1,9,-0.361171,0.423375,0.606677,-0.433961,0.0,0.451515,-0.38234,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,2,3,2,-0.287126,0.398074,0.634677,0.0,-0.487968,0.465362,-0.266964,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,3M Future Change,6M Future Change,1Y Future Change
54,0.129412,-0.008039,-0.163529
26,0.022849,0.338933,0.189582
51,0.219354,0.114032,-0.025024
17,-0.109014,-0.103115,-0.248702
41,0.021875,0.483603,0.915115
1,0.006267,0.065357,0.159205
19,-0.189867,0.021889,-0.019675


## Model Training

In [59]:
if train_new_model:
    base_model = MLPRegressor()

    grid_search = GridSearchCV(
        estimator=base_model, 
        param_grid=param_grid, 
        scoring='r2', 
        cv=3, 
        n_jobs=3,
        verbose=verticle_jobs,
    )
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    model = grid_search.best_estimator_
    print('Best Parameters:')
    print(best_params)

    model.fit(X_train, y_train)

## Testing and benchmarking

In [60]:
if train_new_model:
    y_test_pred = model.predict(X_test)

    for i, target in enumerate(['3M Future Change', '6M Future Change', '1Y Future Change']):
        y_test_actual = y_test[target]
        y_test_pred_target = y_test_pred[:, i]

        plt.figure(figsize=(11, 6))
        plt.scatter(y_test_actual, y_test_pred_target, alpha=0.7, color='blue', label='Predictions')
        plt.plot([y_test_actual.min(), y_test_actual.max()], [y_test_actual.min(), y_test_actual.max()], 
            color='red', linestyle='--', label='Perfect Fit')
        plt.title(f'Predicted vs Actual Values ({target})')
        plt.xlabel('Actual Values')
        plt.ylabel('Predicted Values')
        plt.legend()
        plt.grid(True)
        plt.show()

        mae = mean_absolute_error(y_test_actual, y_test_pred_target)
        mse = mean_squared_error(y_test_actual, y_test_pred_target)
        r2 = r2_score(y_test_actual, y_test_pred_target)

        print(f'{target} - R²: {r2:.4f}')
        print(f'{target} - MSE: {mse:.4f}')
        print(f'{target} - MAE: {mae:.4f}')

    mae = mean_absolute_error(y_test, y_test_pred)
    mse = mean_squared_error(y_test, y_test_pred)
    r2 = r2_score(y_test, y_test_pred)

    print('\nOverall Scores:')
    print(f'Mean - R²: {r2:.4f}')
    print(f'Mean - MSE: {mse:.4f}')
    print(f'Mean - MAE: {mae:.4f}')

### Log test results

In [61]:
if train_new_model:
    test_results = pd.DataFrame({
        'R²': r2,
        'MSE': mse,
        'MAE': mae,
        'symbol_list': symbol_list,
        'hidden_layer_sizes': [model.hidden_layer_sizes],
        'max_iter': model.max_iter,
        'n_iter_no_change': model.n_iter_no_change,
        'learning_rate': model.learning_rate,
        'learning_rate_init': model.learning_rate_init,
        'batch_size': model.batch_size,
        'tol': model.tol,
        'alpha': model.alpha,
        'shuffle': model.shuffle,
    })
    test_results.to_csv('../data/test_results.csv', mode='a', index=False)

    # save model as new best if results are better than the current one
    best_r2 = pd.read_csv('../models/best_model_results.csv').loc[0, 'R²']
    if r2 > best_r2:
        print(f'Old best R²: {best_r2}')
        print(f'New best R²: {r2}')
        print('Saving new best model...')
        test_results.to_csv('../models/best_model_results.csv', mode='w', index=False)
        with open('../models/best_model.pkl','wb') as f:
            pickle.dump(model,f)
        

## Predictions on latest data

In [62]:
best_r2 = pd.read_csv('../models/best_model_results.csv').loc[0, 'R²']

with open('../models/best_model.pkl', 'rb') as f:
    model = pickle.load(f)
    print(f'Best model R²: {best_r2}')


FileNotFoundError: [Errno 2] No such file or directory: '../models/best_model.pkl'

In [None]:
df_raw = pd.read_csv('../data/earnings_data.csv')

results = []
for i in range(len(X_pred)):
    y_pred = model.predict(X_pred.iloc[[i]])[0]
    y_pred_3m, y_pred_6m, y_pred_1y = y_pred
    avg = (y_pred_3m + y_pred_6m + y_pred_1y) / 3
    results.append({
        'Ticker': df_raw.loc[i*5, 'Ticker'],
        'mean (%)': avg * 100,
        '3m (%)': y_pred_3m * 100,
        '6m (%)': y_pred_6m * 100,
        '1y (%)': y_pred_1y * 100
    })

results_df = pd.DataFrame(results)
results_df

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Amortization Of Securities
- Cash Cash Equivalents And Federal Funds Sold
- Cash Flow From Discontinued Operation
- Cash Flowsfromusedin Operating Activities Direct
- Cash From Discontinued Financing Activities
- ...


In [None]:
def predict_ticker(ticker_str):
    try:
        ticker_str = str(ticker_str).upper()
        ticker_str = ticker_str.replace("'", "")
        ticker_str = ticker_str.replace('"', "")
        row = results_df[results_df['Ticker'] == ticker_str]
        if row.empty:
            return (f"The author was stupid and forgot to cover this obvious, famous stock. Try another.", "", "", "")
        row = row.iloc[0]
        return (
            f"{row['3m (%)']:.2f}",
            f"{row['6m (%)']:.2f}",
            f"{row['1y (%)']:.2f}",
            f"{row['mean (%)']:.2f}",
        )
    except Exception as e:
        return (f"Error: {e}", "", "", "")

In [None]:
iface = gr.Interface(
    fn=predict_ticker,
    inputs=gr.Textbox(label="Ticker (e.g. 'TSLA' or 'MATAS.CO')"),
    outputs=[
        gr.Textbox(label="3 Month Change Prediction (%)"),
        gr.Textbox(label="6 Month Change Prediction (%)"),
        gr.Textbox(label="1 Year Change Prediction (%)"),
        gr.Textbox(label="Mean Change Prediction (%)"),
    ],
    title="Stock Price Prediction Model",
    description=" "
)

iface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://a8c968acf9d317b028.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


