# Stock Prediction Model

## Dependencies

### Library Installation (if needed)

In [168]:
#! pip install yfinance pandas numpy matplotlib lightgbm torch tensorflow keras

### Importing Required Libraries

In [169]:
from datetime import datetime
import matplotlib.pyplot as plt
import yfinance as yf
import pandas as pd
import numpy as np
import pickle
import gradio as gr
import ast
from tqdm import tqdm
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

### Variables

In [170]:
build_new_dataset = {'True': True, 'False': False}.get(input('Build New Dataset? (Bool)'))
if build_new_dataset:
    symbol_list = input("Symbols: ('simple', 'filtered' or 'all')") # 'simple' or 'all'. simple are the tickers from the screener notebook and all are all of the tickers in 'filtered_tickers'. You may also an 'Int' to get a % amount of random tickers from 'all'
else:
    symbol_list = 'filtered'

train_new_model = {'True': True, 'False': False}.get(input('Train New Model? (Bool)'))
if train_new_model:
    minimum_feature_threshold = 0.6
    hidden_layers = ast.literal_eval(f"({input('Hidden Layers: eg. "100, 100"')})")
    verticle_jobs = 1 #'-1' for max

if build_new_dataset or train_new_model:
    debugging = {'True': True, 'False': False}.get(input('Debug? (Bool)'))
else:
    debugging = False

## Data Preparation

In [171]:
symbols = []
if symbol_list == 'filtered':
    symbols = pd.read_csv('../data/filtered_tickers.csv')['Ticker'].tolist()
elif symbol_list == 'all':
    symbols = pd.read_csv('../data/tickers.csv')['Ticker'].tolist()
elif symbol_list.isdigit():
    all_symbols = pd.read_csv('../data/tickers.csv')['Ticker'].tolist()
    num_symbols = max(1, round(len(all_symbols) * (int(symbol_list) / 100)))  
    symbols = np.random.choice(all_symbols, num_symbols, replace=False).tolist()
    symbols = symbols + pd.read_csv("../data/filtered_tickers.csv")["Ticker"].tolist()

symbols = symbols + pd.read_csv('../data/simple_screener_results.csv')['Ticker'].tolist()

symbols = pd.Series(symbols).unique()
symbols

array(['A', 'AA', 'AAN', ..., 'IQ', 'SYDB.CO', 'UBER'], dtype=object)

#### Download annual financial data

In [172]:
if build_new_dataset:
    df = pd.DataFrame()
    filtered_pd = pd.read_csv('../data/filtered_tickers.csv')
    for symbol in tqdm(symbols, total=len(symbols)):
        ticker_df = pd.DataFrame()
        ticker = yf.Ticker(symbol)
        try:
            # Latest earning data
            latest_earning_date = ticker.quarterly_cash_flow.columns.tolist()[0]
            latest_data = {'Ticker': symbol, 'Date': latest_earning_date}
            latest_data['3M Future Change'] = np.nan
            latest_data['6M Future Change'] = np.nan
            latest_data['1Y Future Change'] = np.nan
            try:
                latest_data['Sector'] = ticker.info['sector']
            except:
                latest_data['Sector'] = 'Unknown'
            try:
                latest_data['Industry'] = ticker.info['industry']
            except:
                latest_data['Industry'] = 'Unknown'
            for column in ticker.quarterly_cash_flow[latest_earning_date].keys().tolist():
                latest_data[column] = ticker.quarterly_cash_flow[latest_earning_date][column]
            for column in ticker.quarterly_balance_sheet[latest_earning_date].keys().tolist():
                latest_data[column] = ticker.quarterly_balance_sheet[latest_earning_date][column]
            for column in ticker.quarterly_income_stmt[latest_earning_date].keys().tolist():
                latest_data[column] = ticker.quarterly_income_stmt[latest_earning_date][column]
            ticker_df = pd.concat([ticker_df, pd.DataFrame([latest_data])], ignore_index=True)

            # Annual data
            earning_dates = ticker.cash_flow.columns.tolist()
            price_data = yf.download(symbol, period='10y', rounding=False, progress=False)
            cash_flow = ticker.cash_flow
            cash_flow_columns = cash_flow[earning_dates[0]].keys().tolist()
            balance_sheet = ticker.balance_sheet
            balance_sheet_columns = balance_sheet[earning_dates[0]].keys().tolist()
            income_statement = ticker.income_stmt
            income_statement_columns = income_statement[earning_dates[0]].keys().tolist()
            for earning_date in earning_dates:
                earning_date_data = {'Ticker': symbol, 'Date': earning_date}
                got_price = False
                day_offset = 0
                while(got_price==False and day_offset > -6):
                    try:           
                        earning_date_data['3M Future Change'] = (
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset, weeks=13) , ('Close', symbol)]) / 
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset), ('Close', symbol)]) - 1
                        )
                        earning_date_data['6M Future Change'] = (
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset, weeks=26) , ('Close', symbol)]) / 
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset), ('Close', symbol)]) - 1
                        )
                        earning_date_data['1Y Future Change'] = (
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset, weeks=52) , ('Close', symbol)]) / 
                            float(price_data.loc[earning_date + pd.Timedelta(days=day_offset), ('Close', symbol)]) - 1
                        )
                        got_price = True
                    except:
                        day_offset += -1
                try:
                    earning_date_data['Sector'] = ticker.info['sector']
                except:
                    earning_date_data['Sector'] = 'Unknown'
                try:
                    earning_date_data['Industry'] = ticker.info['industry']
                except:
                    earning_date_data['Industry'] = 'Unknown'
                for column in cash_flow_columns:
                    earning_date_data[column] = cash_flow[earning_date][column]
                for column in balance_sheet_columns:
                    earning_date_data[column] = balance_sheet[earning_date][column]
                for column in income_statement_columns:
                    earning_date_data[column] = income_statement[earning_date][column]
                if got_price:
                    ticker_df = pd.concat([ticker_df, pd.DataFrame([earning_date_data])], ignore_index=True)

            if len(ticker_df) == 5 and ticker_df.isna().sum().sum() < round(352 * minimum_feature_threshold):
                if symbol not in filtered_pd['Ticker'].tolist():
                    filtered_pd = pd.concat([filtered_pd, pd.DataFrame([{'Ticker': symbol}])])
                imputer = SimpleImputer()
                for column in ticker_df.columns.drop(['Ticker', 'Date', '3M Future Change', '6M Future Change', '1Y Future Change', 'Sector', 'Industry']):
                    if not ticker_df[column].isna().all():
                        ticker_df[column] = imputer.fit_transform(ticker_df[[column]])
            else:
                if symbol in filtered_pd['Ticker'].tolist():
                    filtered_pd = filtered_pd[filtered_pd['Ticker'] != symbol]
                    if debugging:
                        print(f'Removed {symbol} from filtered tickers. Datapoints: {ticker_df.isna().sum().sum()}, Needed: {round(352 * minimum_feature_threshold)}')
                continue
            df = pd.concat([df, ticker_df], ignore_index=True)
        except Exception as error:
            if symbol in filtered_pd['Ticker'].tolist():
                filtered_pd = filtered_pd[filtered_pd['Ticker'] != symbol]
                if debugging:
                    print(f'Removed {symbol} from filtered tickers because an exception was raised')
            continue
    filtered_pd.to_csv('../data/filtered_tickers.csv', index=False)
    df.to_csv('../data/earnings_data.csv', index=False)
else:
    df = pd.read_csv('../data/earnings_data.csv')

  7%|▋         | 408/5810 [07:41<1:19:40,  1.13it/s]
1 Failed download:
['BWP']: ValueError('Length mismatch: Expected axis has 2 elements, new values have 1 elements')
 37%|███▋      | 2147/5810 [35:55<56:12,  1.09it/s]  
1 Failed download:
['NYX']: YFPricesMissingError('possibly delisted; no price data found  (period=10y)')
 38%|███▊      | 2190/5810 [36:43<1:42:06,  1.69s/it]
1 Failed download:
['OMG']: YFPricesMissingError('possibly delisted; no price data found  (period=10y)')
 68%|██████▊   | 3964/5810 [1:11:49<54:56,  1.79s/it]  
1 Failed download:
['FFBCW']: YFPricesMissingError('possibly delisted; no price data found  (period=10y) (Yahoo error = "No data found, symbol may be delisted")')
 82%|████████▏ | 4752/5810 [1:33:44<17:01,  1.04it/s]  
1 Failed download:
['NRCIB']: ValueError('Length mismatch: Expected axis has 2 elements, new values have 1 elements')
 82%|████████▏ | 4774/5810 [1:34:45<48:12,  2.79s/it]  
1 Failed download:
['NUTR']: YFPricesMissingError('possibly deli

### Short visualisation

In [173]:
if debugging:
    display(df)

### Impution and encoding

In [174]:
imputer = SimpleImputer()
scaler = StandardScaler()
for column in df.columns.drop(['Ticker', 'Date', '3M Future Change', '6M Future Change', '1Y Future Change', 'Sector', 'Industry']):
    df[column] = imputer.fit_transform(df[[column]])
    scaler.fit(df[[column]])
    df[column] = scaler.transform(df[[column]])

le = LabelEncoder()
for column in ['Ticker', 'Sector', 'Industry']:
    df[column] = df[column].astype(str)
    le.fit(df[column])
    df[column] = le.transform(df[column])

### Splitting

In [175]:
pred_data = pd.DataFrame()
for i in range(int(len(df) / 5)):
    pred_data = pd.concat([pred_data, df.iloc[[i*5]]]) 
if debugging:
    print('Prediction Data:')
    display(pred_data)

train_data = pd.DataFrame()
for i in range(int(len(df) / 5)):
    train_data = pd.concat([train_data, df.iloc[[1+i*5]]])
    train_data = pd.concat([train_data, df.iloc[[2+i*5]]])
    train_data = pd.concat([train_data, df.iloc[[3+i*5]]])
    train_data = pd.concat([train_data, df.iloc[[4+i*5]]])
if debugging:
    print('Train and Test Data:')
    display(train_data)

KeyboardInterrupt: 

### Labeling

In [None]:
X_pred = pred_data.drop(['Date', '3M Future Change', '6M Future Change', '1Y Future Change'], axis=1)
X_train = train_data.drop(['Date', '3M Future Change', '6M Future Change', '1Y Future Change'], axis=1)
y_train = train_data[['3M Future Change', '6M Future Change', '1Y Future Change']]

X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    shuffle=True,
)

if debugging:
    display(X_train)
    display(y_train)
    display(X_test)
    display(y_test)

Unnamed: 0,Ticker,Sector,Industry,Free Cash Flow,Repurchase Of Capital Stock,Repayment Of Debt,Issuance Of Debt,Issuance Of Capital Stock,Capital Expenditure,End Cash Position,...,Accrued Interest Receivable,Dividends Received Cfi,Change In Accrued Expense,Short Term Debt Payments,Unrealized Gain Loss On Investment Securities,Dueto Related Parties Non Current,Duefrom Related Parties Non Current,Non Current Note Receivables,Net Preferred Stock Issuance,Preferred Stock Issuance
9,6,4,0,-0.361794,0.0,0.570573,-0.527945,0.0,0.46118,-0.372268,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53,10,5,8,-0.385661,0.399922,0.469206,-0.341271,-0.110542,0.460732,-0.280606,...,0.0,-7.175517e-15,-0.96178,0.0,-1.902833,0.0,0.0,1.21106,5.244044,5.244044
32,1,2,6,0.290716,0.241704,0.306262,-0.595056,0.0,-0.261296,-0.177546,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38,9,1,1,-0.280268,0.0,-2.430322,0.681485,-0.487968,-0.197798,-0.143542,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,2,3,2,-0.287126,0.398074,0.634677,0.0,-0.487968,0.465362,-0.266964,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,6,4,0,-0.358527,0.0,0.544363,-0.392861,0.0,0.452198,-0.372503,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,4,3,3,-0.232771,0.355593,0.245336,-0.248609,-0.487968,0.317317,-0.333455,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39,9,1,1,-0.267025,0.0,-0.492724,-0.112239,6.372228,-0.059559,-0.153713,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,1,2,6,0.052434,0.289769,0.11316,0.604459,0.0,-0.162465,-0.229716,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,1,2,6,-0.014839,0.116289,-0.24615,-0.241036,0.0,-0.41637,-0.256266,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,3M Future Change,6M Future Change,1Y Future Change
9,-0.075638,-0.041881,-0.106959
53,-0.141903,-0.491057,-0.410208
32,-0.183009,-0.13389,-0.007779
38,0.026316,-0.354842,-0.650315
19,-0.189867,0.021889,-0.019675
6,0.768973,0.616452,1.071535
23,0.063977,0.184433,0.342553
39,-0.062239,-0.039324,0.516771
34,0.2065,0.312575,0.667386
31,-0.112342,-0.047081,-0.193228


Unnamed: 0,Ticker,Sector,Industry,Free Cash Flow,Repurchase Of Capital Stock,Repayment Of Debt,Issuance Of Debt,Issuance Of Capital Stock,Capital Expenditure,End Cash Position,...,Accrued Interest Receivable,Dividends Received Cfi,Change In Accrued Expense,Short Term Debt Payments,Unrealized Gain Loss On Investment Securities,Dueto Related Parties Non Current,Duefrom Related Parties Non Current,Non Current Note Receivables,Net Preferred Stock Issuance,Preferred Stock Issuance
11,7,4,0,-0.294209,0.415562,0.420154,-0.415423,-0.487968,0.184211,-0.355852,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47,3,0,4,-0.375619,0.0,-0.421318,0.515247,0.522527,0.463142,-0.279856,...,0.0,-7.175517e-15,-1.125649,-4.062116,0.141988,-1.885583,-2.22915,0.0,0.0,0.0
2,5,1,9,-0.356557,0.424533,0.61994,-0.595056,0.0,0.464487,-0.383586,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,9,1,1,-0.17879,0.0,-0.167414,-0.595056,-0.487968,-0.125937,-0.159715,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,8,5,7,-0.356517,0.402625,0.635884,-0.595056,0.0,0.472541,-0.374033,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21,4,3,3,-0.44574,0.383169,0.646466,-0.026195,-0.487968,-0.144722,-0.34671,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,1,9,-0.358974,0.420391,0.497332,-0.485336,0.0,0.470513,-0.383703,...,0.0,-7.175517e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,3M Future Change,6M Future Change,1Y Future Change
11,0.49819,0.678304,0.55913
47,0.373585,0.007547,-0.079245
2,0.014033,0.205079,0.459568
37,0.684202,1.125101,1.017211
26,0.022849,0.338933,0.189582
21,0.341695,0.570432,0.376127
3,-0.211179,-0.301726,-0.110513


## Model Training

In [None]:
if train_new_model:
    model = MLPRegressor(
        hidden_layer_sizes=hidden_layers,
        learning_rate="adaptive",
        early_stopping=True,
        verbose=True,
        n_iter_no_change=10
    )
    model.fit(X_train, y_train)

## Testing and benchmarking

In [None]:
if train_new_model:
    y_test_pred = model.predict(X_test)

    for i, target in enumerate(['3M Future Change', '6M Future Change', '1Y Future Change']):
        y_test_actual = y_test[target]
        y_test_pred_target = y_test_pred[:, i]

        plt.figure(figsize=(11, 6))
        plt.scatter(y_test_actual, y_test_pred_target, alpha=0.7, color='blue', label='Predictions')
        plt.plot([y_test_actual.min(), y_test_actual.max()], [y_test_actual.min(), y_test_actual.max()], 
            color='red', linestyle='--', label='Perfect Fit')
        plt.title(f'Predicted vs Actual Values ({target})')
        plt.xlabel('Actual Values')
        plt.ylabel('Predicted Values')
        plt.legend()
        plt.grid(True)
        plt.show()

        mae = mean_absolute_error(y_test_actual, y_test_pred_target)
        mse = mean_squared_error(y_test_actual, y_test_pred_target)
        r2 = r2_score(y_test_actual, y_test_pred_target)

        print(f'{target} - R²: {r2:.4f}')
        print(f'{target} - MSE: {mse:.4f}')
        print(f'{target} - MAE: {mae:.4f}')

    mae = mean_absolute_error(y_test, y_test_pred)
    mse = mean_squared_error(y_test, y_test_pred)
    r2 = r2_score(y_test, y_test_pred)

    print('\nOverall Scores:')
    print(f'Mean - R²: {r2:.4f}')
    print(f'Mean - MSE: {mse:.4f}')
    print(f'Mean - MAE: {mae:.4f}')

### Log test results

In [None]:
if train_new_model:
    test_results = pd.DataFrame({
        'R²': r2,
        'MSE': mse,
        'MAE': mae,
        'symbol_list': symbol_list,
        'hidden_layer_sizes': [model.hidden_layer_sizes],
        'max_iter': model.max_iter,
        'n_iter_no_change': model.n_iter_no_change,
        'learning_rate': model.learning_rate,
        'learning_rate_init': model.learning_rate_init,
        'batch_size': model.batch_size,
        'tol': model.tol,
        'alpha': model.alpha,
        'shuffle': model.shuffle,
    })
    test_results.to_csv('../data/test_results.csv', mode='a', index=False)

    # save model as new best if results are better than the current one
    best_r2 = pd.read_csv('../models/best_model_results.csv').loc[0, 'R²']
    if r2 > best_r2:
        print(f'Old best R²: {best_r2}')
        print(f'New best R²: {r2}')
        print('Saving new best model...')
        test_results.to_csv('../models/best_model_results.csv', mode='w', index=False)
        with open('../models/best_model.pkl','wb') as f:
            pickle.dump(model,f)
        

## Predictions on latest data

In [None]:
best_r2 = pd.read_csv('../models/best_model_results.csv').loc[0, 'R²']

with open('../models/best_model.pkl', 'rb') as f:
    model = pickle.load(f)
    print(f'Best model R²: {best_r2}')


Best model R²: -0.2386041650746659


In [None]:
df_raw = pd.read_csv('../data/earnings_data.csv')

results = []
for i in range(len(X_pred)):
    y_pred = model.predict(X_pred.iloc[[i]])[0]
    y_pred_3m, y_pred_6m, y_pred_1y = y_pred
    avg = (y_pred_3m + y_pred_6m + y_pred_1y) / 3
    results.append({
        'Ticker': df_raw.loc[i*5, 'Ticker'],
        'mean (%)': avg * 100,
        '3m (%)': y_pred_3m * 100,
        '6m (%)': y_pred_6m * 100,
        '1y (%)': y_pred_1y * 100
    })

results_df = pd.DataFrame(results)
results_df

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Amortization Of Securities
- Cash Cash Equivalents And Federal Funds Sold
- Cash Flow From Discontinued Operation
- Cash Flowsfromusedin Operating Activities Direct
- Cash From Discontinued Financing Activities
- ...


In [None]:
def predict_ticker(ticker_str):
    try:
        ticker_str = str(ticker_str).upper()
        ticker_str = ticker_str.replace("'", "")
        ticker_str = ticker_str.replace('"', "")
        row = results_df[results_df['Ticker'] == ticker_str]
        if row.empty:
            return (f"The author was stupid and forgot to cover this obvious, famous stock. Try another.", "", "", "")
        row = row.iloc[0]
        return (
            f"{row['3m (%)']:.2f}",
            f"{row['6m (%)']:.2f}",
            f"{row['1y (%)']:.2f}",
            f"{row['mean (%)']:.2f}",
        )
    except Exception as e:
        return (f"Error: {e}", "", "", "")

In [None]:
iface = gr.Interface(
    fn=predict_ticker,
    inputs=gr.Textbox(label="Ticker (e.g. 'TSLA' or 'MATAS.CO')"),
    outputs=[
        gr.Textbox(label="3 Month Change Prediction (%)"),
        gr.Textbox(label="6 Month Change Prediction (%)"),
        gr.Textbox(label="1 Year Change Prediction (%)"),
        gr.Textbox(label="Mean Change Prediction (%)"),
    ],
    title="Stock Price Prediction Model",
    description=" "
)

iface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://61172e97ecb196af91.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


