# Stock Price Prediction Model

## Dependencies

### Library Installation (if needed)

In [1]:
#! pip install -Ur requirements.txt

### Importing Required Libraries

In [2]:
import matplotlib.pyplot as plt
import yfinance as yf
import pandas as pd
import numpy as np
import pickle
import gradio as gr
import ast
import warnings
from methods.model_methods import *
from datetime import datetime
from tqdm import tqdm
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=SyntaxWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
display(yf.Ticker("AAPL").get_financials(freq="yearly"))
display(yf.Ticker("AAPL").get_financials(freq="quarterly"))

Unnamed: 0,2024-09-30,2023-09-30,2022-09-30,2021-09-30,2020-09-30
TaxEffectOfUnusualItems,0.0,0.0,0.0,0.0,
TaxRateForCalcs,0.241,0.147,0.162,0.133,
NormalizedEBITDA,134661000000.0,125820000000.0,130541000000.0,123136000000.0,
NetIncomeFromContinuingOperationNetMinorityInterest,93736000000.0,96995000000.0,99803000000.0,94680000000.0,
ReconciledDepreciation,11445000000.0,11519000000.0,11104000000.0,11284000000.0,
ReconciledCostOfRevenue,210352000000.0,214137000000.0,223546000000.0,212981000000.0,
EBITDA,134661000000.0,125820000000.0,130541000000.0,123136000000.0,
EBIT,123216000000.0,114301000000.0,119437000000.0,111852000000.0,
NetInterestIncome,,-183000000.0,-106000000.0,198000000.0,890000000.0
InterestExpense,,3933000000.0,2931000000.0,2645000000.0,2873000000.0


Unnamed: 0,2025-06-30,2025-03-31,2024-12-31,2024-09-30,2024-06-30
TaxEffectOfUnusualItems,0.0,0.0,0.0,0.0,0.0
TaxRateForCalcs,0.163997,0.155,0.147,0.21,0.159
NormalizedEBITDA,31032000000.0,32250000000.0,45912000000.0,32502000000.0,28202000000.0
NetIncomeFromContinuingOperationNetMinorityInterest,23434000000.0,24780000000.0,36330000000.0,14736000000.0,21448000000.0
ReconciledDepreciation,2830000000.0,2661000000.0,3080000000.0,2911000000.0,2850000000.0
ReconciledCostOfRevenue,50318000000.0,50492000000.0,66025000000.0,51051000000.0,46099000000.0
EBITDA,31032000000.0,32250000000.0,45912000000.0,32502000000.0,28202000000.0
EBIT,28202000000.0,29589000000.0,42832000000.0,29591000000.0,25352000000.0
NormalizedIncome,23434000000.0,24780000000.0,36330000000.0,14736000000.0,21448000000.0
NetIncomeFromContinuingAndDiscontinuedOperation,23434000000.0,24780000000.0,36330000000.0,14736000000.0,21448000000.0


In [4]:
Stock("RDDT").get_annual_financials()

Unnamed: 0,2024-12-31,2023-12-31,2022-12-31,2021-12-31
TaxEffectOfUnusualItems,0.0,0.0,0.0,0.0
TaxRateForCalcs,0.001,0.21,0.21,0.27
NormalizedEBITDA,-544925000.0,-126459000.0,-164162000.0,-124400000.0
NetIncomeFromContinuingOperationNetMinorityInterest,-484276000.0,-90824000.0,-158550000.0,-127896000.0
ReconciledDepreciation,15643000.0,13702000.0,8000000.0,2813000.0
ReconciledCostOfRevenue,123595000.0,111011000.0,104799000.0,72565000.0
EBITDA,-544925000.0,-126459000.0,-164162000.0,-124400000.0
EBIT,-560568000.0,-140161000.0,-172162000.0,-127213000.0
NormalizedIncome,-484276000.0,-90824000.0,-158550000.0,-127896000.0
NetIncomeFromContinuingAndDiscontinuedOperation,-484276000.0,-90824000.0,-158550000.0,-127896000.0


In [5]:
Stock("RDDT").get_quarterly_financials()


Unnamed: 0,2025-06-30,2025-03-31,2024-09-30,2024-06-30
TaxEffectOfUnusualItems,0.0,0.0,0.0,0.0
TaxRateForCalcs,0.21,0.4,0.21,0.02094
NormalizedEBITDA,71645000.0,7866000.0,10823000.0,-27269000.0
NetIncomeFromContinuingOperationNetMinorityInterest,89297000.0,26158000.0,29853000.0,-10099000.0
ReconciledDepreciation,3934000.0,3963000.0,3969000.0,3770000.0
ReconciledCostOfRevenue,45900000.0,37089000.0,34633000.0,29501000.0
EBITDA,71645000.0,7866000.0,10823000.0,-27269000.0
EBIT,67711000.0,3903000.0,6854000.0,-31039000.0
NormalizedIncome,89297000.0,26158000.0,29853000.0,-10099000.0
NetIncomeFromContinuingAndDiscontinuedOperation,89297000.0,26158000.0,29853000.0,-10099000.0


In [8]:
display(Stock("AAPL").get_financials())

Unnamed: 0,2025-06-30,2025-03-31,2024-12-31,2024-09-30,2024-06-30,2023-09-30,2022-09-30,2021-09-30
TaxEffectOfUnusualItems,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TaxRateForCalcs,0.163997,0.155,0.147,0.241,0.159,0.147,0.162,0.133
NormalizedEBITDA,31032000000.0,32250000000.0,45912000000.0,134661000000.0,28202000000.0,125820000000.0,130541000000.0,123136000000.0
NetIncomeFromContinuingOperationNetMinorityInterest,23434000000.0,24780000000.0,36330000000.0,93736000000.0,21448000000.0,96995000000.0,99803000000.0,94680000000.0
ReconciledDepreciation,2830000000.0,2661000000.0,3080000000.0,11445000000.0,2850000000.0,11519000000.0,11104000000.0,11284000000.0
ReconciledCostOfRevenue,50318000000.0,50492000000.0,66025000000.0,210352000000.0,46099000000.0,214137000000.0,223546000000.0,212981000000.0
EBITDA,31032000000.0,32250000000.0,45912000000.0,134661000000.0,28202000000.0,125820000000.0,130541000000.0,123136000000.0
EBIT,28202000000.0,29589000000.0,42832000000.0,123216000000.0,25352000000.0,114301000000.0,119437000000.0,111852000000.0
NetInterestIncome,,,,,,-183000000.0,-106000000.0,198000000.0
InterestExpense,,,,,,3933000000.0,2931000000.0,2645000000.0


### Variables

In [7]:
symbol_list = input("Symbols: ('simple', 'filtered' or 'all')") # 'simple' or 'all'. simple are the tickers from the screener notebook and all are all of the tickers in 'filtered_tickers'. You may also an 'Int' to get a % amount of random tickers from 'all'
if symbol_list:
    build_new_dataset = True
else:
    build_new_dataset = False
    symbol_list = 'filtered'
minimum_feature_threshold = 0.6
outlier = 3
iterations = input("Iterations eg. '10', '20', '30'")
if iterations:
    iterations = ast.literal_eval(iterations)
    train_new_model = True
    search_params = {
            "hidden_layer_amount": Integer(2, 20),
            "neuron_amount": Integer(20, 4000),
            "warm_start": Categorical([False, True]),
            "activation": Categorical(['identity', 'logistic', 'tanh', 'relu']),
            "solver": Categorical(['sgd', 'adam', 'lbfgs']),
            "alpha": Real(0.000001, 1),
            "learning_rate_init": Real(0.00001, 0.1),
            "power_t": Real(0.0001, 100),
            "momentum": Real(0.0001, 100),
            "validation_fraction": Real(0.05, 0.20),
            "beta_1": Real(0.001, 10),
            "beta_2": Real(0.0001, 100),
            "epsilon": Real(0.0000000001, 0.000001),}
    cross_validations = 2  # will be set to 3 if not specified
    verticle_jobs = 1 #'-1' for max
else:
    train_new_model = False
if build_new_dataset or train_new_model:
    debugging = {'True': True, 'False': False}.get(input('Debug? (Bool)'))
else:
    debugging = False

KeyboardInterrupt: Interrupted by user

In [None]:
symbols = pd.read_csv('../data/simple_tickers.csv')['Ticker'].tolist()
if symbol_list == 'filtered':
    symbols = symbols + pd.read_csv('../data/filtered_tickers.csv')['Ticker'].tolist()
elif symbol_list == 'all':
    symbols = symbols + pd.read_csv('../data/tickers.csv')['Ticker'].tolist()
elif symbol_list.isdigit():
    all_symbols = pd.read_csv('../data/tickers.csv')['Ticker'].tolist()
    num_symbols = max(1, round(len(all_symbols) * (int(symbol_list) / 100)))  
    symbols = symbols + pd.read_csv("../data/filtered_tickers.csv")["Ticker"].tolist()
    symbols = symbols + np.random.choice(all_symbols, num_symbols, replace=False).tolist()

symbols = pd.Series(symbols).unique()
symbols

array(['MATAS.CO', 'TRIFOR.CO', 'RNMBY', 'SAABF', 'BCKIY', 'BAESY',
       'IVSO.ST', 'NSKFF', 'GMAB', 'GN.CO', 'NVDA', 'LLY', 'DANSKE.CO',
       'CARL-B.CO', 'MAERSK-B.CO', 'RBREW.CO', 'ISS.CO', 'DSV.CO',
       'SCHO.CO', 'NETC.CO', 'JYSK.CO', 'ABBN.SW', 'TER', 'PARKEN.CO',
       'NFLX', 'STG.CO', 'NOVO-B.CO', 'EQNR', 'NKT.CO', 'TSLA', 'HEM.ST',
       'DEMANT.CO', 'BAVA.CO ', 'BABA', 'JD', 'PDD', 'BIDU', 'NTES', 'WB',
       'IQ', 'SYDB.CO', 'UBER', 'COLO-B.CO'], dtype=object)

## Data Preparation

In [None]:
for symbol in tqdm(symbols):
    display(Stock(symbol).get_key_financials())

  0%|          | 0/43 [00:00<?, ?it/s]


NoSuchDriverException: Message: Unable to obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


#### Download annual financial data

In [None]:
if build_new_dataset:
    df = pd.DataFrame()
    filtered_pd = pd.read_csv('../data/filtered_tickers.csv')
    for symbol in tqdm(symbols, smoothing=0):
        ticker_df = pd.DataFrame()
        ticker = yf.Ticker(symbol)
        stock = Stock(symbol)
        try:
            ticker_df = Stock(symbol).get_data_key()
            if ticker_df.isna().sum().sum() < round(29 * minimum_feature_threshold):
                if symbol not in filtered_pd['Ticker'].tolist():
                    filtered_pd = pd.concat([filtered_pd, pd.DataFrame([{'Ticker': symbol}])])
                imputer = SimpleImputer()
                for column in ticker_df.columns.drop(['Ticker', 'Name', 'Date', '3M Future Change', '6M Future Change', '9M Future Change', '1Y Future Change', 'Sector', 'Industry']):
                    if not ticker_df[column].isna().all():
                        ticker_df[column] = imputer.fit_transform(ticker_df[[column]])
            else:
                if symbol in filtered_pd['Ticker'].tolist():
                    filtered_pd = filtered_pd[filtered_pd['Ticker'] != symbol]
                    if debugging:
                        print(f'Removed {symbol} from filtered tickers. Datapoints: {ticker_df.isna().sum().sum()}, Needed: {round(29 * minimum_feature_threshold)}')
                continue
            df = pd.concat([df, ticker_df], ignore_index=True)
        except Exception as error:
            if symbol in filtered_pd['Ticker'].tolist():
                filtered_pd = filtered_pd[filtered_pd['Ticker'] != symbol]
                if debugging:
                    print(f'Removed {symbol} from filtered tickers because an exception was raised \n {error}')
            else:
                if debugging:
                    print(f"{symbol}: exception raised: {error}")
            continue
    filtered_pd.to_csv('../data/filtered_tickers.csv', index=False)
    df.to_csv('../data/earnings_data.csv', index=False)
else:
    df = pd.read_csv('../data/earnings_data.csv')

 33%|███▎      | 14/43 [04:34<09:27, 19.58s/it]


KeyboardInterrupt: 

### Short visualisation

In [None]:
if debugging:
    display(df)

### Impution and encoding

In [None]:
imputer = SimpleImputer()
scaler = StandardScaler()
for column in df.columns.drop(['Ticker', 'Name', 'Date', '3M Future Change', '6M Future Change', '9M Future Change', '1Y Future Change', 'Sector', 'Industry']):
    df[column] = imputer.fit_transform(df[[column]])
    scaler.fit(df[[column]])
    df[column] = scaler.transform(df[[column]])

le = LabelEncoder()
for column in ['Sector', 'Industry']:
    df[column] = df[column].astype(str)
    le.fit(df[column])
    df[column] = le.transform(df[column])

### Splitting

In [None]:
pred_data = pd.DataFrame()
test_data = pd.DataFrame()
train_data = pd.DataFrame()
for i in tqdm(range(int(len(df) / 4)), smoothing=0):
    cont = False
    for j in range(4):
        if (df.loc[j+i*4, "3M Future Change"] > outlier or
            df.loc[j+i*4, "6M Future Change"] > outlier or
            df.loc[j+i*4, "9M Future Change"] > outlier or
            df.loc[j+i*4, "1Y Future Change"] > outlier):
            cont = True
    if cont:
        continue
    pred_data = pd.concat([pred_data, df.iloc[[i*4]]]) 
    test_data = pd.concat([test_data, df.iloc[[1+i*4]]])
    train_data = pd.concat([train_data, df.iloc[[2+i*4]]])
    train_data = pd.concat([train_data, df.iloc[[3+i*4]]])
if debugging:
    print('Prediction Data:')
    display(pred_data)
    print("Test Data:")
    display(test_data)
    print('Training Data:')
    display(train_data)

0it [00:00, ?it/s]


### Labeling

In [None]:
training_columns = train_data.columns.drop(["Ticker", "Name", "Sector", "Industry", "Date", '3M Future Change', '6M Future Change', '9M Future Change', '1Y Future Change'])
label_columns = ['3M Future Change', '6M Future Change', '9M Future Change', '1Y Future Change']
X_pred = pred_data[training_columns]
X_test = test_data[training_columns]
y_test = test_data[label_columns]
X_train = train_data[training_columns]
y_train = train_data[label_columns]
if debugging:
    print("X_pred:")
    display(X_pred)
    print("X_test:")
    display(X_test)
    print("y_test:")
    display(y_test)
    print("X_train:")
    display(X_train)
    print("y_train:")
    display(y_train)

KeyError: "['Ticker', 'Name', 'Sector', 'Industry', 'Date', '3M Future Change', '6M Future Change', '9M Future Change', '1Y Future Change'] not found in axis"

## Model Training

In [None]:
if train_new_model:
    opt = BayesSearchCV(
        MLPWrapper(),
        search_params,
        n_iter=iterations,
        random_state=42,
        cv=cross_validations
    )

    opt.fit(X_train, y_train.values)
    print(f"Best parameters: {opt.best_params_}")
    model = opt.best_estimator_
    
    # model = MLPRegressor(
    #     hidden_layer_sizes=hidden_layers,
    #     learning_rate="adaptive",
    #     early_stopping=True,
    #     verbose=True,
    #     tol=0.00001,
    #     n_iter_no_change=round(40000/hidden_layers[0]*4/len(hidden_layers))
    # )
    # print(f"iter_no_change: {model.n_iter_no_change}")
    # model.fit(X_train, y_train.values)

## Testing and benchmarking

In [None]:
if train_new_model:
    y_test_pred = model.predict(X_test)

    for i, target in enumerate(['3M Future Change', '6M Future Change', '9M Future Change', '1Y Future Change']):
        y_test_actual = y_test[target]
        y_test_pred_target = y_test_pred[:, i]

        plt.figure(figsize=(11, 6))
        plt.scatter(y_test_actual, y_test_pred_target, alpha=0.7, color='blue', label='Predictions')
        plt.plot([y_test_actual.min(), y_test_actual.max()], [y_test_actual.min(), y_test_actual.max()], 
            color='red', linestyle='--', label='Perfect Fit')
        plt.title(f'Predicted vs Actual Values ({target})')
        plt.xlabel('Actual Values')
        plt.ylabel('Predicted Values')
        plt.legend()
        plt.grid(True)
        plt.show()

        mae = mean_absolute_error(y_test_actual, y_test_pred_target)
        mse = mean_squared_error(y_test_actual, y_test_pred_target)
        r2 = r2_score(y_test_actual, y_test_pred_target)

        print(f'{target} - R²: {r2:.4f}')
        print(f'{target} - MSE: {mse:.4f}')
        print(f'{target} - MAE: {mae:.4f}')

    mae = mean_absolute_error(y_test, y_test_pred)
    mse = mean_squared_error(y_test, y_test_pred)
    r2 = r2_score(y_test, y_test_pred)

    print('\nOverall Scores:')
    print(f'Mean - R²: {r2:.4f}')
    print(f'Mean - MSE: {mse:.4f}')
    print(f'Mean - MAE: {mae:.4f}')

### Log test results

In [None]:
if train_new_model:
    test_results = pd.DataFrame({
        'R²': r2,
        'MSE': mse,
        'MAE': mae,
        'symbol_list': symbol_list,
        "iterations": iterations,
        'hidden_layer_sizes': [model.model.hidden_layer_sizes],
        'max_iter': model.model.max_iter,
        'n_iter_no_change': model.model.n_iter_no_change,
        'learning_rate': model.model.learning_rate,
        'learning_rate_init': model.model.learning_rate_init,
        'batch_size': model.model.batch_size,
        'tol': model.model.tol,
        'alpha': model.model.alpha,
        'shuffle': model.model.shuffle,
    })
    test_results.to_csv('../data/test_results.csv', mode='a', index=False)

    # save model as new best if results are better than the current one
    best_r2 = pd.read_csv('../models/best_model_results.csv').loc[0, 'R²']
    if r2 > best_r2:
        print(f'Old best R²: {best_r2}')
        print(f'New best R²: {r2}')
        print('Saving new best model...')
        test_results.to_csv('../models/best_model_results.csv', mode='w', index=False)
        with open('../models/best_model.pkl','wb') as f:
            pickle.dump(model,f)
        

## Predictions on latest data

In [None]:
best_r2 = pd.read_csv('../models/best_model_results.csv').loc[0, 'R²']

with open('../models/best_model.pkl', 'rb') as f:
    model = pickle.load(f)
    print(f'Best model R²: {best_r2}')


Best model R²: 0.0289336782535032


In [None]:
df_raw = pd.read_csv('../data/earnings_data.csv')

results = []
for i in range(len(X_pred)):
    y_pred = model.predict(X_pred.iloc[[i]])[0]
    y_pred_3m, y_pred_6m, y_pred_9m, y_pred_1y = y_pred
    avg = (y_pred_3m + y_pred_6m + y_pred_9m + y_pred_1y) / 4
    results.append({
        'Ticker': df_raw.loc[i*4, 'Ticker'],
        'Name': df_raw.loc[i*4, 'Name'],
        'mean (%)': avg * 100,
        '3m (%)': y_pred_3m * 100,
        '6m (%)': y_pred_6m * 100,
        '9m (%)': y_pred_9m * 100,
        '1y (%)': y_pred_1y * 100
    })

results_df = pd.DataFrame(results)
results_df

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Asset Turnover
- Book Value Per Share
- Current Ratio
- Days Sales In Receivables
- Debt\/Equity Ratio
- ...
Feature names seen at fit time, yet now missing:
- Accounts Payable
- Accounts Receivable
- Accrued Interest Receivable
- Accumulated Depreciation
- Additional Paid In Capital
- ...


In [None]:
def predict_ticker(ticker_str):
    try:
        ticker_str = str(ticker_str).upper()
        ticker_str = ticker_str.replace("'", "")
        ticker_str = ticker_str.replace('"', "")
        row = results_df[results_df['Ticker'] == ticker_str]
        if row.empty:
            return (f"Not enough data for this stock at this moment \n Try another", "", "", "", "")
        row = row.iloc[0]
        return (
            f"{row['3m (%)']:.2f}",
            f"{row['6m (%)']:.2f}",
            f"{row['9m (%)']:.2f}",
            f"{row['1y (%)']:.2f}",
            f"{row['mean (%)']:.2f}",
        )
    except Exception as e:
        return (f"Error: {e}", "", "", "", "")

In [None]:
iface = gr.Interface(
    fn=predict_ticker,
    inputs=gr.Textbox(label="Ticker e.g. 'TSLA' or 'NVDA'"),
    outputs=[
        gr.Textbox(label="3 Month Change Prediction (%)"),
        gr.Textbox(label="6 Month Change Prediction (%)"),
        gr.Textbox(label="9 Month Change Prediction (%)"),
        gr.Textbox(label="1 Year Change Prediction (%)"),
        gr.Textbox(label="Mean Change Prediction (%)"),
    ],
    title="Stock Price Prediction Model",
    description=" "
)

iface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://d6f7567b4876962a6e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


