# Stock Prediction Model

## Dependencies

### Library Installation (if needed)

In [75]:
# Install libraries the first time
#! pip install -U yfinance pandas pathlib numpy 

### Importing Required Libraries

In [89]:
from datetime import datetime
import matplotlib.pyplot as plt
import yfinance as yf
import pandas as pd
import numpy as np
import pathlib as Path
from tqdm import tqdm
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

## Data Preparation

In [77]:
symbols = pd.read_csv('../data/simple_screener_results.csv')['Ticker'].tolist()
symbols = pd.Series(symbols).unique()
symbols

array(['MATAS.CO', 'TRIFOR.CO', 'QQ.L', 'RNMBY', 'SAABF', 'BCKIY',
       'BAESY', 'IVSO.ST', 'NSKFF', 'GMAB', 'GN.CO', 'NVDA', 'LLY',
       'DANSKE.CO', 'CARL-B.CO', 'MAERSK-B.CO', 'RBREW.CO', 'ISS.CO',
       'DSV.CO', 'SCHO.CO', 'NETC.CO', 'JYSK.CO', 'ABBN.SW', 'TER',
       'PARKEN.CO', 'NFLX', 'TRMD-A.CO', 'STG.CO', 'NOVO-B.CO', 'EQNR',
       'NKT.CO', 'NSIS-B.CO', 'KCC.OL'], dtype=object)

#### Download and Process Historical Data

In [78]:
#symbols = ['AAPL']

In [79]:
test = yf.download('AAPL',period='10y')
test

[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2015-04-06,28.440775,28.476508,27.766326,27.797592,148776000
2015-04-07,28.141520,28.612739,28.134820,28.505543,140049200
2015-04-08,28.049955,28.228618,27.909260,28.105787,149316800
2015-04-09,28.264345,28.268813,27.840024,28.105783,129936000
2015-04-10,28.384951,28.409517,27.974028,28.128123,160752000
...,...,...,...,...,...
2025-03-31,222.130005,225.619995,216.229996,217.009995,65299300
2025-04-01,223.190002,223.679993,218.899994,219.809998,36412700
2025-04-02,223.889999,225.190002,221.020004,221.320007,35905900
2025-04-03,203.190002,207.490005,201.250000,205.539993,103419000


In [80]:
float(test.loc['2025-03-18', ('Close', 'AAPL')])

212.69000244140625

In [81]:
test = yf.Ticker("AAPL").history()
test

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2025-03-05 00:00:00-05:00,235.419998,236.550003,229.229996,235.740005,47227600,0.0,0.0
2025-03-06 00:00:00-05:00,234.440002,237.860001,233.160004,235.330002,45170400,0.0,0.0
2025-03-07 00:00:00-05:00,235.110001,241.369995,234.759995,239.070007,46273600,0.0,0.0
2025-03-10 00:00:00-04:00,235.539993,236.160004,224.220001,227.479996,72071200,0.0,0.0
2025-03-11 00:00:00-04:00,223.809998,225.839996,217.449997,220.839996,76137400,0.0,0.0
2025-03-12 00:00:00-04:00,220.139999,221.75,214.910004,216.979996,62547500,0.0,0.0
2025-03-13 00:00:00-04:00,215.949997,216.839996,208.419998,209.679993,61368300,0.0,0.0
2025-03-14 00:00:00-04:00,211.25,213.949997,209.580002,213.490005,60107600,0.0,0.0
2025-03-17 00:00:00-04:00,213.309998,215.220001,209.970001,214.0,48073400,0.0,0.0
2025-03-18 00:00:00-04:00,214.160004,215.149994,211.490005,212.690002,42432400,0.0,0.0


In [82]:
'2024-03-31 00:00:00-04:00'

'2024-03-31 00:00:00-04:00'

In [None]:
print_errors=True

df = pd.DataFrame()
for symbol in symbols:
    ticker = yf.Ticker(symbol)
    earning_dates = ticker.cash_flow.columns.tolist()
    cash_flow = ticker.cash_flow
    cash_flow_columns = cash_flow[earning_dates[0]].keys().tolist()
    balance_sheet = ticker.balance_sheet
    balance_sheet_columns = balance_sheet[earning_dates[0]].keys().tolist()
    income_statement = ticker.income_stmt
    income_statement_columns = income_statement[earning_dates[0]].keys().tolist()

    for earning_date in earning_dates:
        try:
            current_ticker_data = {'Ticker': symbol, 'Date': earning_date}

            price_data = yf.download(symbol, period='5y', rounding=True, progress=False)
            current_ticker_data['Open Price'] = float(price_data.loc[earning_date, ('Open', symbol)])
            current_ticker_data['Close Price'] = float(price_data.loc[earning_date, ('Close', symbol)])

            for column in cash_flow_columns:
                current_ticker_data[column] = cash_flow[earning_date][column]

            for column in balance_sheet_columns:
                current_ticker_data[column] = balance_sheet[earning_date][column]

            for column in income_statement_columns:
                current_ticker_data[column] = income_statement[earning_date][column]

            df = pd.concat([df, pd.DataFrame([current_ticker_data])], ignore_index=True)
        except Exception as e:
            if print_errors:
                print(f'Error for {symbol}: {e}')

df.to_csv('../data/earnings_data.csv', index=False)

Error for MATAS.CO: Timestamp('2024-03-31 00:00:00')
Error for MATAS.CO: Timestamp('2022-03-31 00:00:00')
Error for MATAS.CO: Timestamp('2021-03-31 00:00:00')
Error for TRIFOR.CO: Timestamp('2024-12-31 00:00:00')
Error for TRIFOR.CO: Timestamp('2023-12-31 00:00:00')
Error for TRIFOR.CO: Timestamp('2022-12-31 00:00:00')
Error for TRIFOR.CO: Timestamp('2021-12-31 00:00:00')
Error for TRIFOR.CO: Timestamp('2020-12-31 00:00:00')
Error for QQ.L: Timestamp('2024-03-31 00:00:00')
Error for QQ.L: Timestamp('2022-03-31 00:00:00')
Error for QQ.L: Timestamp('2021-03-31 00:00:00')
Error for RNMBY: Timestamp('2023-12-31 00:00:00')
Error for RNMBY: Timestamp('2022-12-31 00:00:00')
Error for RNMBY: Timestamp('2021-12-31 00:00:00')
Error for RNMBY: Timestamp('2020-12-31 00:00:00')
Error for SAABF: Timestamp('2023-12-31 00:00:00')
Error for SAABF: Timestamp('2022-12-31 00:00:00')
Error for SAABF: Timestamp('2021-12-31 00:00:00')
Error for SAABF: Timestamp('2020-12-31 00:00:00')
Error for BCKIY: Timesta

### Short visualisation

In [87]:
df

Unnamed: 0,Ticker,Date,Open Price,Close Price,Free Cash Flow,Repurchase Of Capital Stock,Repayment Of Debt,Issuance Of Debt,Capital Expenditure,End Cash Position,...,Liabilities Heldfor Sale Non Current,Earnings From Equity Interest Net Of Tax,Gain On Sale Of Ppe,Depreciation Amortization Depletion Income Statement,Amortization Of Intangibles Income Statement,Short Term Debt Issuance,Unrealized Gain Loss,Commercial Paper,Dueto Related Parties Current,Duefrom Related Parties Current
0,MATAS.CO,2023-03-31,80.43,81.01,424000000.0,0.0,-126000000.0,0.0,-254000000.0,37000000.0,...,,,,,,,,,,
1,QQ.L,2023-03-31,324.66,324.66,97000000.0,-800000.0,-257900000.0,481100000.0,-109000000.0,151200000.0,...,,,,,,,,,,
2,RNMBY,2024-12-31,126.62,126.55,988000000.0,,-773000000.0,793000000.0,-732000000.0,1184000000.0,...,,,,,,,,,,
3,SAABF,2024-12-31,21.27,21.27,1893000000.0,,-678000000.0,331000000.0,-4839000000.0,2843000000.0,...,,,,,,,,,,
4,BCKIY,2023-03-31,3.65,3.65,136200000.0,0.0,-972800000.0,416600000.0,-125100000.0,429500000.0,...,,,,,,,,,,
5,BAESY,2024-12-31,58.0,57.16,2762000000.0,-555000000.0,-3794000000.0,6933000000.0,-1163000000.0,3378000000.0,...,,,,,,,,,,
6,NSKFF,2024-12-31,113.3,113.3,11498000000.0,,-500000000.0,0.0,-2246000000.0,14293000000.0,...,,,,,,,,,,
7,GMAB,2024-12-31,20.77,20.87,7467000000.0,-3879000000.0,-60000000.0,,-304000000.0,9858000000.0,...,,,,,,,,,,
8,NVDA,2025-01-31,123.77,120.06,60853000000.0,-33706000000.0,-1250000000.0,,-3236000000.0,8589000000.0,...,,,,,,,,,,
9,NVDA,2024-01-31,61.42,61.5,27021000000.0,-9533000000.0,-1250000000.0,0.0,-1069000000.0,7280000000.0,...,,,,,,,,,,


### Splitting

In [92]:
X = df.drop('Open Price', axis=1)
y = df['Open Price']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

## Model Training

In [None]:
if trainNewModel:
    learn = tabular_learner(dls, metrics=[rmse, mae])

    # Learning rate finder
    lr_find_results = learn.lr_find(suggest_funcs=(minimum, steep))

    # Debugging information
    print(f"Learning rate finder results: {lr_find_results}")

    # Extract learning rates
    lr_min, lr_steep = lr_find_results

    # Check if learning rates are valid
    if lr_min is None or lr_steep is None or lr_min == 0 or lr_steep == 0:
        raise ValueError("Learning rate finder did not return valid learning rates.")

    # Train
    print(f"Training for {epochs} epochs...")
    learn.fit_one_cycle(epochs, lr_max=lr_steep)
    print("Model training complete")

    learn.export(modelFolder / f'{modelName}.pkl')

## Testing and benchmarking

In [None]:
def logEvaluation(model_name, mae, rmse, r2, model_folder, test_tickers):
    """Log evaluation metrics to CSV file"""
    log_file = model_folder / "modelEvaluations.csv"
    
    new_entry_df = pd.DataFrame([{
        "Model Name": model_name,
        "Timestamp": datetime.now().strftime('%Y-%m-%d %H:%M'),
        "MAE": f'{mae:.3f}',
        "RMSE": f'{rmse:.3f}',
        "R2": f'{r2:.3f}',
        "Epochs": epochs,
        "Training Size": trainingSize,
        "Training Rows": trainingRowAmount,
        "Test Size": len(test_tickers),
        "Cat Names": catNames,
        "Cont Names": contNames,
    }])
    
    try:
        log_df = pd.read_csv(log_file)
        log_df = pd.concat([log_df, new_entry_df], ignore_index=True)
    except FileNotFoundError:
        log_df = new_entry_df
        
    log_df.to_csv(log_file, index=False)
    print(f"Logged evaluation results to {log_file}")

def plotResults(results_df, model_name, model_folder):
    """Create and save visualization plots using all data points."""
    plt.figure(figsize=(12, 8))
    
    plt.subplot(2, 1, 1)
    plt.scatter(results_df['Actual'], results_df['Predicted'], alpha=0.7, label='Predictions')
    min_val = min(results_df['Actual'].min(), results_df['Predicted'].min())
    max_val = max(results_df['Actual'].max(), results_df['Predicted'].max())
    plt.plot([min_val, max_val], [min_val, max_val], color='red', linestyle='--', label='Perfect Prediction')
    plt.title(f'Predicted vs. Actual Returns - {model_name}', fontsize=14)
    plt.xlabel('Actual Returns')
    plt.ylabel('Predicted Returns')
    plt.legend()
    plt.grid(alpha=0.5)
    
    plt.subplot(2, 1, 2)
    plt.scatter(results_df['Predicted'], results_df['Residual'], alpha=0.7)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.title('Residual Plot')
    plt.xlabel('Predicted Returns')
    plt.ylabel('Residual')
    plt.grid(alpha=0.5)
    
    plt.tight_layout()
    plt.show()

if trainNewModel:
    nonTrainingTickers = list(set(tickers) - set(trainingTickers))
    validTestData = []
    attempted_tickers = set()
    attempts = 0

    if testSize * 4 <= len(tickers):  
        max_attempts = testSize * 4 # Prevent infinite loops
    else:
        max_attempts = len(tickers)

    # Keep trying until we reach testSize or exhaust attempts
    while len(validTestData) < testSize and attempts < max_attempts:
        # Get a new ticker we haven't tried yet
        remaining_tickers = [t for t in nonTrainingTickers if t not in attempted_tickers]
        if not remaining_tickers:  # If all tried, reset attempted list
            attempted_tickers = set()
            remaining_tickers = nonTrainingTickers
            
        ticker = np.random.choice(remaining_tickers)
        attempted_tickers.add(ticker)
        attempts += 1

        # Fetch and validate data
        data = getTickerDataFrom1YrAgo(ticker)
        if not data.empty and not data[['EV/EBIT', 'ROIC']].isna().any().any():
            validTestData.append(data)

    if not validTestData:
        raise ValueError("No valid test data collected after multiple attempts")
        
    # Trim to exact testSize if we collected more
    validTestData = validTestData[:testSize]  
    combinedTestData = pd.concat(validTestData, ignore_index=True)

    # Clean data
    test_data_clean = combinedTestData.dropna(subset=['EV/EBIT', 'ROIC', 'Future Year Change'])
    
    if test_data_clean.empty:
        raise ValueError("No valid test data after cleaning NaN values")

    # Create test dataloader
    test_dl = learn.dls.test_dl(test_data_clean)
    preds, targs = learn.get_preds(dl=test_dl)
    
    # Create results DataFrame
    results_df = pd.DataFrame({
        'Predicted': preds.numpy().flatten(),
        'Actual': targs.numpy().flatten()
    })
    results_df['Residual'] = results_df['Actual'] - results_df['Predicted']

    # Calculate metrics
    mae = np.mean(np.abs(results_df['Residual']))
    rmse = np.sqrt(np.mean(results_df['Residual']**2))
    r2 = 1 - (np.sum(results_df['Residual']**2) / np.sum((results_df['Actual'] - results_df['Actual'].mean())**2))

    # Log and plot
    logEvaluation(modelName, mae, rmse, r2, modelFolder, test_data_clean['Ticker'].unique())
    plotResults(results_df, modelName, modelFolder)

    # Show collection stats
    print(f"Collected {len(validTestData)} valid test tickers (target: {testSize})")
    if attempts >= max_attempts:
        print(f"Warning: Reached max attempts ({max_attempts}). Some invalid tickers may remain.")  
    print(f"\nEvaluation Results:")
    print(f"MAE: {mae:.3f}")
    print(f"RMSE: {rmse:.3f}")
    print(f"R²: {r2:.3f}")

## Predictions

In [None]:
print('Model files in modelFolder:')
for file in modelFolder.glob('*.pkl'):
    print(file.name)

Model files in modelFolder:
stockScreenerV1.0.pkl
stockScreenerV1.1.pkl
stockScreenerV1.10.pkl
stockScreenerV1.2.pkl
stockScreenerV1.3.pkl
stockScreenerV1.4.pkl
stockScreenerV1.5.pkl
stockScreenerV1.6.pkl
stockScreenerV1.7.pkl
stockScreenerV1.8.pkl
stockScreenerV1.9.pkl
stockScreenerV2.0.pkl
stockScreenerV2.1.pkl
stockScreenerV2.2.pkl


In [None]:
evaluations = pd.read_csv(modelFolder / 'modelEvaluations.csv')
bestModel = evaluations.sort_values('MAE', ascending=True).iloc[0]
bestModel.head()

Model Name    stockScreenerV1.7
Timestamp      2025-01-27 08:45
MAE                       0.328
RMSE                      0.739
R2                        0.077
Name: 7, dtype: object

### Load model

In [None]:
if os.name == 'nt':
    temp = pathlib.PosixPath
    pathlib.PosixPath = pathlib.WindowsPath
else:
    pathlib.WindowsPath = pathlib.PosixPath

importedModel = Path(f"{bestModel['Model Name']}.pkl") # Change this if you want to try other models
learn = load_learner(modelFolder / importedModel)

In [None]:
#predictionTarget = '95%'

In [None]:
if predictionTarget != 'None':
    if predictionTarget == 'ALL':
        predictionTickers = tickers
    elif predictionTarget.endswith('%'):
        percentage = float(predictionTarget.strip('%')) / 100
        num_tickers = int(len(tickers) * percentage)
        predictionTickers = np.random.choice(tickers, size=num_tickers, replace=False).tolist()
    else:
        predictionTickers = [predictionTarget]

    # Fetch data for prediction tickers
    dfPrediction = pd.concat([getTickerData(ticker) for ticker in predictionTickers], ignore_index=True)

    # Ensure dfPrediction is a DataFrame
    if isinstance(dfPrediction, dict):
        dfPrediction = pd.DataFrame([dfPrediction])

    # Drop rows with NaN values in 'EV/EBIT' or 'ROIC'
    dfPrediction = dfPrediction.dropna(subset=['EV/EBIT', 'ROIC'])

    # Create test dataloader
    dl = learn.dls.test_dl(dfPrediction)
    dfPrediction.head()

    # Make predictions
    prediction = learn.get_preds(dl=dl)
    adr_df = pd.read_csv(testFolder / 'filteredTickers.csv')
    company_dict = dict(zip(adr_df['Ticker'], adr_df['Company']))

    if predictionTarget == 'ALL' or predictionTarget.endswith('%'):
        sorted_predictions = sorted(zip(predictionTickers, prediction[0]), key=lambda x: x[1], reverse=True)
        print(f"Got predictions for {len(sorted_predictions)} tickers, expected: {len(predictionTickers)}")
        print(f"Prediction for best performing tickers:")
        for symbol, pred in sorted_predictions:
            company_name = company_dict.get(symbol, 'Unknown')
            print(f"{symbol} ({company_name}): {pred[0].item() * 100:.2f}%")
    else:
        company_name = company_dict.get(predictionTarget, 'Unknown')
        print(f"Prediction for {predictionTarget} ({company_name}):")
        print(f"{prediction[0][0][0].item() * 100:.2f}%")
    print("Free money?!")

NameError: name 'getTickerData' is not defined