## Setup

### Library Installation (if needed)

In [13]:
# Install libraries the first time
#! pip install -U yfinance pandas pathlib numpy 

### Importing Required Libraries

In [14]:
from datetime import datetime
import matplotlib.pyplot as plt
import yfinance as yf
import pandas as pd
import numpy as np
import pathlib as Path

### Variables

## Data Preparation

In [15]:
symbols = pd.read_csv('../data/simple_screener_results.csv')['Ticker'].tolist()
symbols = pd.Series(symbols).unique()
symbols

array(['MATAS.CO', 'TRIFOR.CO', 'QQ.L', 'RNMBY', 'SAABF', 'BCKIY',
       'BAESY', 'IVSO.ST', 'NSKFF', 'GMAB', 'GN.CO', 'NVDA', 'LLY',
       'DANSKE.CO', 'CARL-B.CO', 'MAERSK-B.CO', 'RBREW.CO', 'ISS.CO',
       'DSV.CO', 'SCHO.CO', 'NETC.CO', 'JYSK.CO', 'ABBN.SW', 'TER',
       'PARKEN.CO', 'NFLX', 'TRMD-A.CO', 'STG.CO', 'NOVO-B.CO', 'EQNR',
       'NKT.CO', 'NSIS-B.CO', 'KCC.OL'], dtype=object)

#### Download and Process Historical Data

In [16]:
#symbols = ['AAPL']

In [17]:
df = pd.DataFrame()

for symbol in symbols:
    print(f'{symbol}...')
    ticker = yf.Ticker(symbol)

    earning_dates = ticker.cashflow.columns.tolist()
    cashflow = ticker.cashflow
    cashflow_columns = cashflow[earning_dates[0]].keys().tolist()
    balancesheet = ticker.balancesheet
    balancesheet_columns = balancesheet[earning_dates[0]].keys().tolist()
    incomestatement = ticker.incomestmt
    incomestatement_columns = incomestatement[earning_dates[0]].keys().tolist()

    for earning_date in earning_dates:
        row_data = {'Ticker': symbol, 'Date': earning_date}

        try:
            for column in cashflow_columns:
                row_data[column] = cashflow[earning_date][column]

            for column in balancesheet_columns:
                row_data[column] = balancesheet[earning_date][column]

            for column in incomestatement_columns:
                row_data[column] = incomestatement[earning_date][column]
        except Exception as e:
            print(f"Error for {symbol} on {earning_date}: {e}")

        try:
            start_date = (earning_date - pd.Timedelta(days=1)).strftime("%Y-%m-%d")
            end_date = (earning_date + pd.Timedelta(days=1)).strftime("%Y-%m-%d")

            price_data = ticker.history(start=start_date, end=end_date)
            if not price_data.empty:
                # Use the closest available price
                row_data['Close Price'] = price_data['Close'].iloc[-1]
            else:
                row_data['Close Price'] = None
        except Exception as e:
            print(f"Error fetching price for {symbol} on {earning_date}: {e}")
            row_data['Close Price'] = None

        df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)

df.to_csv('../data/earnings_data.csv', index=False)

MATAS.CO...


$MATAS.CO: possibly delisted; no price data found  (1d 2024-03-30 -> 2024-04-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


TRIFOR.CO...


$TRIFOR.CO: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)
$TRIFOR.CO: possibly delisted; no price data found  (1d 2020-12-30 -> 2021-01-01) (Yahoo error = "Data doesn't exist for startDate = 1609282800, endDate = 1609455600")
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


QQ.L...


$QQ.L: possibly delisted; no price data found  (1d 2024-03-30 -> 2024-04-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


RNMBY...


$RNMBY: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


SAABF...


$SAABF: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


BCKIY...


$BCKIY: possibly delisted; no price data found  (1d 2024-03-30 -> 2024-04-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


BAESY...


$BAESY: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


IVSO.ST...


$IVSO.ST: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


NSKFF...


$NSKFF: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


Error for NSKFF on 2020-12-31 00:00:00: Timestamp('2020-12-31 00:00:00')
GMAB...


$GMAB: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


GN.CO...


$GN.CO: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


Error for GN.CO on 2020-12-31 00:00:00: Timestamp('2020-12-31 00:00:00')
NVDA...


$NVDA: possibly delisted; no price data found  (1d 2021-01-30 -> 2021-02-01)


Error for NVDA on 2021-01-31 00:00:00: Timestamp('2021-01-31 00:00:00')
LLY...


  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)
$LLY: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


DANSKE.CO...


$DANSKE.CO: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


CARL-B.CO...


$CARL-B.CO: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


MAERSK-B.CO...


$MAERSK-B.CO: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


RBREW.CO...


$RBREW.CO: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


ISS.CO...


$ISS.CO: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


Error for ISS.CO on 2020-12-31 00:00:00: Timestamp('2020-12-31 00:00:00')
DSV.CO...


$DSV.CO: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


SCHO.CO...


$SCHO.CO: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


NETC.CO...


$NETC.CO: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


Error for NETC.CO on 2020-12-31 00:00:00: Timestamp('2020-12-31 00:00:00')
JYSK.CO...


$JYSK.CO: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


ABBN.SW...


$ABBN.SW: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


TER...


$TER: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


PARKEN.CO...


$PARKEN.CO: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


NFLX...


$NFLX: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


Error for NFLX on 2020-12-31 00:00:00: Timestamp('2020-12-31 00:00:00')
TRMD-A.CO...


$TRMD-A.CO: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


STG.CO...


$STG.CO: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


NOVO-B.CO...


$NOVO-B.CO: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


Error for NOVO-B.CO on 2020-12-31 00:00:00: Timestamp('2020-12-31 00:00:00')
EQNR...


$EQNR: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


NKT.CO...


$NKT.CO: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


Error for NKT.CO on 2020-12-31 00:00:00: Timestamp('2020-12-31 00:00:00')
NSIS-B.CO...


$NSIS-B.CO: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


KCC.OL...


$KCC.OL: possibly delisted; no price data found  (1d 2023-12-30 -> 2024-01-01)
  df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)


In [18]:
tickers = pd.read_csv(testFolder / 'filteredTickers.csv')['Ticker']
trainingTickers = np.random.choice(tickers, size=trainingSize, replace=False)
trainingRowAmount = len(pd.read_csv(testFolder / 'filteredTickers.csv'))

if getNewData:
    histData = pd.DataFrame()
    valid_tickers = []
    
    for ticker in trainingTickers:
        print(f"Processing {ticker}...")
        try:
            data = calculateFutureYearChange(ticker, timeFrame)
            if not data.empty:
                data['Ticker'] = ticker
                data['Industry'] = yf.Ticker(ticker).info.get('industry', 'Unknown')
                data['Date'] = pd.to_datetime(data['Date']).dt.tz_localize(None)
                
                # Enrich individual ticker data first
                ticker_data = enrichDataWithMetrics(data)
                histData = pd.concat([histData, ticker_data])
                
                # Check if metrics were added
                if 'ROIC' not in ticker_data.columns:
                    print(f"WARNING: Failed to add metrics for {ticker}")
                
                valid_tickers.append(ticker)
            else:
                print(f"Skipped {ticker} - insufficient data")
        except Exception as e:
            print(f"Error processing {ticker}: {e}")

    print(f"\nColumns in final dataset: {histData.columns.tolist()}")
    
    if not histData.empty:
        histData = enrichDataWithMetrics(histData)
        histData.to_csv(dataFolder / trainingData, index=True)
        # Verify no future targets leaked to past dates
        latest_date = pd.to_datetime(histData['Date']).max()
        if 'Future Year Change' in histData.columns:
            target_dates = histData[histData['Future Year Change'].notnull()]['Date']
            if any(pd.to_datetime(target_dates) > latest_date):
                raise ValueError("CRITICAL: Analyst targets contain future dates!")
        trainingRowAmount = len(histData)
        print(f"Saved training data with {trainingRowAmount} rows")
    else:
        print("Warning: No data collected - check your tickers list")

NameError: name 'testFolder' is not defined

### Short visualisation

In [None]:
if getNewData:
    histData.head(5)

## Model Training

In [None]:
if trainNewModel:
    TRAINING_CUTOFF = pd.to_datetime('2023-01-01')

    df = pd.read_csv(dataFolder / trainingData)
    dfCleaned = df.dropna(subset=['EV/EBIT', 'ROIC']).copy()

    # Convert 'Date' to datetime, parse UTC-aware dates, then make naive
    dfCleaned['Date'] = pd.to_datetime(dfCleaned['Date'], errors='coerce', utc=True).dt.tz_convert(None)
    dfCleaned = dfCleaned.dropna(subset=['Date'])

    # Clean 'EV/EBIT' and reset index
    dfCleaned['EV/EBIT'] = dfCleaned['EV/EBIT'].replace([np.inf, -np.inf], np.nan)
    dfCleaned = dfCleaned.dropna(subset=['EV/EBIT', 'ROIC'])
    dfCleaned = dfCleaned.reset_index(drop=True)
    dfCleaned.to_csv(dataFolder / trainingData, index=False)

    # Check for empty data
    if dfCleaned.empty:
        raise ValueError("The cleaned DataFrame is empty.")

    # Create splits with valid indices
    train_mask = dfCleaned['Date'] < TRAINING_CUTOFF
    valid_mask = ~train_mask
    splits = (list(dfCleaned[train_mask].index), list(dfCleaned[valid_mask].index))

    if not splits[0] or not splits[1]:
        raise ValueError("Empty training or validation split.")

    # Proceed with TabularPandas
    to = TabularPandas(
        dfCleaned, 
        procs=[Categorify, FillMissing, Normalize],
        y_names=yNames,
        cat_names=catNames, 
        cont_names=contNames,
        splits=splits
    )

    dls = to.dataloaders(bs=batchSize)

In [None]:
if trainNewModel:
    learn = tabular_learner(dls, metrics=[rmse, mae])

    # Learning rate finder
    lr_find_results = learn.lr_find(suggest_funcs=(minimum, steep))

    # Debugging information
    print(f"Learning rate finder results: {lr_find_results}")

    # Extract learning rates
    lr_min, lr_steep = lr_find_results

    # Check if learning rates are valid
    if lr_min is None or lr_steep is None or lr_min == 0 or lr_steep == 0:
        raise ValueError("Learning rate finder did not return valid learning rates.")

    # Train
    print(f"Training for {epochs} epochs...")
    learn.fit_one_cycle(epochs, lr_max=lr_steep)
    print("Model training complete")

    learn.export(modelFolder / f'{modelName}.pkl')

## Testing and benchmarking

In [None]:
def logEvaluation(model_name, mae, rmse, r2, model_folder, test_tickers):
    """Log evaluation metrics to CSV file"""
    log_file = model_folder / "modelEvaluations.csv"
    
    new_entry_df = pd.DataFrame([{
        "Model Name": model_name,
        "Timestamp": datetime.now().strftime('%Y-%m-%d %H:%M'),
        "MAE": f'{mae:.3f}',
        "RMSE": f'{rmse:.3f}',
        "R2": f'{r2:.3f}',
        "Epochs": epochs,
        "Training Size": trainingSize,
        "Training Rows": trainingRowAmount,
        "Test Size": len(test_tickers),
        "Cat Names": catNames,
        "Cont Names": contNames,
    }])
    
    try:
        log_df = pd.read_csv(log_file)
        log_df = pd.concat([log_df, new_entry_df], ignore_index=True)
    except FileNotFoundError:
        log_df = new_entry_df
        
    log_df.to_csv(log_file, index=False)
    print(f"Logged evaluation results to {log_file}")

def plotResults(results_df, model_name, model_folder):
    """Create and save visualization plots using all data points."""
    plt.figure(figsize=(12, 8))
    
    plt.subplot(2, 1, 1)
    plt.scatter(results_df['Actual'], results_df['Predicted'], alpha=0.7, label='Predictions')
    min_val = min(results_df['Actual'].min(), results_df['Predicted'].min())
    max_val = max(results_df['Actual'].max(), results_df['Predicted'].max())
    plt.plot([min_val, max_val], [min_val, max_val], color='red', linestyle='--', label='Perfect Prediction')
    plt.title(f'Predicted vs. Actual Returns - {model_name}', fontsize=14)
    plt.xlabel('Actual Returns')
    plt.ylabel('Predicted Returns')
    plt.legend()
    plt.grid(alpha=0.5)
    
    plt.subplot(2, 1, 2)
    plt.scatter(results_df['Predicted'], results_df['Residual'], alpha=0.7)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.title('Residual Plot')
    plt.xlabel('Predicted Returns')
    plt.ylabel('Residual')
    plt.grid(alpha=0.5)
    
    plt.tight_layout()
    plt.show()

if trainNewModel:
    nonTrainingTickers = list(set(tickers) - set(trainingTickers))
    validTestData = []
    attempted_tickers = set()
    attempts = 0

    if testSize * 4 <= len(tickers):  
        max_attempts = testSize * 4 # Prevent infinite loops
    else:
        max_attempts = len(tickers)

    # Keep trying until we reach testSize or exhaust attempts
    while len(validTestData) < testSize and attempts < max_attempts:
        # Get a new ticker we haven't tried yet
        remaining_tickers = [t for t in nonTrainingTickers if t not in attempted_tickers]
        if not remaining_tickers:  # If all tried, reset attempted list
            attempted_tickers = set()
            remaining_tickers = nonTrainingTickers
            
        ticker = np.random.choice(remaining_tickers)
        attempted_tickers.add(ticker)
        attempts += 1

        # Fetch and validate data
        data = getTickerDataFrom1YrAgo(ticker)
        if not data.empty and not data[['EV/EBIT', 'ROIC']].isna().any().any():
            validTestData.append(data)

    if not validTestData:
        raise ValueError("No valid test data collected after multiple attempts")
        
    # Trim to exact testSize if we collected more
    validTestData = validTestData[:testSize]  
    combinedTestData = pd.concat(validTestData, ignore_index=True)

    # Clean data
    test_data_clean = combinedTestData.dropna(subset=['EV/EBIT', 'ROIC', 'Future Year Change'])
    
    if test_data_clean.empty:
        raise ValueError("No valid test data after cleaning NaN values")

    # Create test dataloader
    test_dl = learn.dls.test_dl(test_data_clean)
    preds, targs = learn.get_preds(dl=test_dl)
    
    # Create results DataFrame
    results_df = pd.DataFrame({
        'Predicted': preds.numpy().flatten(),
        'Actual': targs.numpy().flatten()
    })
    results_df['Residual'] = results_df['Actual'] - results_df['Predicted']

    # Calculate metrics
    mae = np.mean(np.abs(results_df['Residual']))
    rmse = np.sqrt(np.mean(results_df['Residual']**2))
    r2 = 1 - (np.sum(results_df['Residual']**2) / np.sum((results_df['Actual'] - results_df['Actual'].mean())**2))

    # Log and plot
    logEvaluation(modelName, mae, rmse, r2, modelFolder, test_data_clean['Ticker'].unique())
    plotResults(results_df, modelName, modelFolder)

    # Show collection stats
    print(f"Collected {len(validTestData)} valid test tickers (target: {testSize})")
    if attempts >= max_attempts:
        print(f"Warning: Reached max attempts ({max_attempts}). Some invalid tickers may remain.")  
    print(f"\nEvaluation Results:")
    print(f"MAE: {mae:.3f}")
    print(f"RMSE: {rmse:.3f}")
    print(f"R²: {r2:.3f}")

## Predictions

In [None]:
print('Model files in modelFolder:')
for file in modelFolder.glob('*.pkl'):
    print(file.name)

Model files in modelFolder:
stockScreenerV1.0.pkl
stockScreenerV1.1.pkl
stockScreenerV1.10.pkl
stockScreenerV1.2.pkl
stockScreenerV1.3.pkl
stockScreenerV1.4.pkl
stockScreenerV1.5.pkl
stockScreenerV1.6.pkl
stockScreenerV1.7.pkl
stockScreenerV1.8.pkl
stockScreenerV1.9.pkl
stockScreenerV2.0.pkl
stockScreenerV2.1.pkl
stockScreenerV2.2.pkl


In [None]:
evaluations = pd.read_csv(modelFolder / 'modelEvaluations.csv')
bestModel = evaluations.sort_values('MAE', ascending=True).iloc[0]
bestModel.head()

Model Name    stockScreenerV1.7
Timestamp      2025-01-27 08:45
MAE                       0.328
RMSE                      0.739
R2                        0.077
Name: 7, dtype: object

### Load model

In [None]:
if os.name == 'nt':
    temp = pathlib.PosixPath
    pathlib.PosixPath = pathlib.WindowsPath
else:
    pathlib.WindowsPath = pathlib.PosixPath

importedModel = Path(f"{bestModel['Model Name']}.pkl") # Change this if you want to try other models
learn = load_learner(modelFolder / importedModel)

In [None]:
#predictionTarget = '95%'

In [None]:
if predictionTarget != 'None':
    if predictionTarget == 'ALL':
        predictionTickers = tickers
    elif predictionTarget.endswith('%'):
        percentage = float(predictionTarget.strip('%')) / 100
        num_tickers = int(len(tickers) * percentage)
        predictionTickers = np.random.choice(tickers, size=num_tickers, replace=False).tolist()
    else:
        predictionTickers = [predictionTarget]

    # Fetch data for prediction tickers
    dfPrediction = pd.concat([getTickerData(ticker) for ticker in predictionTickers], ignore_index=True)

    # Ensure dfPrediction is a DataFrame
    if isinstance(dfPrediction, dict):
        dfPrediction = pd.DataFrame([dfPrediction])

    # Drop rows with NaN values in 'EV/EBIT' or 'ROIC'
    dfPrediction = dfPrediction.dropna(subset=['EV/EBIT', 'ROIC'])

    # Create test dataloader
    dl = learn.dls.test_dl(dfPrediction)
    dfPrediction.head()

    # Make predictions
    prediction = learn.get_preds(dl=dl)
    adr_df = pd.read_csv(testFolder / 'filteredTickers.csv')
    company_dict = dict(zip(adr_df['Ticker'], adr_df['Company']))

    if predictionTarget == 'ALL' or predictionTarget.endswith('%'):
        sorted_predictions = sorted(zip(predictionTickers, prediction[0]), key=lambda x: x[1], reverse=True)
        print(f"Got predictions for {len(sorted_predictions)} tickers, expected: {len(predictionTickers)}")
        print(f"Prediction for best performing tickers:")
        for symbol, pred in sorted_predictions:
            company_name = company_dict.get(symbol, 'Unknown')
            print(f"{symbol} ({company_name}): {pred[0].item() * 100:.2f}%")
    else:
        company_name = company_dict.get(predictionTarget, 'Unknown')
        print(f"Prediction for {predictionTarget} ({company_name}):")
        print(f"{prediction[0][0][0].item() * 100:.2f}%")
    print("Free money?!")

NameError: name 'getTickerData' is not defined