# Stock Prediction Model

## Dependencies

### Library Installation (if needed)

In [30]:
# Install libraries the first time
#! pip install -U yfinance pandas pathlib numpy 

### Importing Required Libraries

In [None]:
from datetime import datetime
import matplotlib.pyplot as plt
import yfinance as yf
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

## Data Preparation

In [32]:
symbols = pd.read_csv('../data/simple_screener_results.csv')['Ticker'].tolist()
symbols = pd.Series(symbols).unique()
symbols

array(['MATAS.CO', 'TRIFOR.CO', 'QQ.L', 'RNMBY', 'SAABF', 'BCKIY',
       'BAESY', 'IVSO.ST', 'NSKFF', 'GMAB', 'GN.CO', 'NVDA', 'LLY',
       'DANSKE.CO', 'CARL-B.CO', 'MAERSK-B.CO', 'RBREW.CO', 'ISS.CO',
       'DSV.CO', 'SCHO.CO', 'NETC.CO', 'JYSK.CO', 'ABBN.SW', 'TER',
       'PARKEN.CO', 'NFLX', 'TRMD-A.CO', 'STG.CO', 'NOVO-B.CO', 'EQNR',
       'NKT.CO', 'NSIS-B.CO', 'KCC.OL'], dtype=object)

#### Download and Process Historical Data

In [None]:
print_errors=True

df = pd.DataFrame()
for symbol in tqdm(symbols):
    ticker_df = pd.DataFrame()
    ticker = yf.Ticker(symbol)
    earning_dates = ticker.cash_flow.columns.tolist()
    price_data = yf.download(symbol, period='10y', rounding=False, progress=False)
    cash_flow = ticker.cash_flow
    cash_flow_columns = cash_flow[earning_dates[0]].keys().tolist()
    balance_sheet = ticker.balance_sheet
    balance_sheet_columns = balance_sheet[earning_dates[0]].keys().tolist()
    income_statement = ticker.income_stmt
    income_statement_columns = income_statement[earning_dates[0]].keys().tolist()

    for earning_date in earning_dates:
        try:
            earning_date_data = {'Ticker': symbol, 'Date': earning_date}
        
            got_price = False
            day_offset = 0
            while(got_price==False and day_offset > -6):
                try:
                    earning_date_data['1 Year Future Change'] = float(price_data.loc[earning_date + pd.Timedelta(days=day_offset+365,), ('Close', symbol)]) / float(price_data.loc[earning_date + pd.Timedelta(days=day_offset), ('Close', symbol)]) - 1
                    got_price = True
                except Exception:
                    day_offset += -1

            for column in cash_flow_columns:
                earning_date_data[column] = cash_flow[earning_date][column]

            for column in balance_sheet_columns:
                earning_date_data[column] = balance_sheet[earning_date][column]

            for column in income_statement_columns:
                earning_date_data[column] = income_statement[earning_date][column]
                
        except Exception as error:
            if print_errors:
                print(f'Error for {symbol}: {error}')
        
        if got_price:
            ticker_df = pd.concat([ticker_df, pd.DataFrame([earning_date_data])], ignore_index=True)

    if ticker_df.columns[ticker_df.isna().all()].tolist():
        continue
    imputer = SimpleImputer()
    wanted_columns = ticker_df.columns.drop(['Ticker', 'Date', '1 Year Future Change'])
    for column in wanted_columns:
        ticker_df[column] = imputer.fit_transform(ticker_df[[column]])

    df = pd.concat([df, ticker_df], ignore_index=True)

df.to_csv('../data/earnings_data.csv', index=False)

  0%|          | 0/33 [00:00<?, ?it/s]

 24%|██▍       | 8/33 [00:11<00:39,  1.59s/it]

Error for NSKFF: Timestamp('2020-12-31 00:00:00')


 30%|███       | 10/33 [00:14<00:35,  1.52s/it]

Error for GN.CO: Timestamp('2020-12-31 00:00:00')


 33%|███▎      | 11/33 [00:16<00:35,  1.59s/it]

Error for NVDA: Timestamp('2021-01-31 00:00:00')


 52%|█████▏    | 17/33 [00:26<00:25,  1.59s/it]

Error for ISS.CO: Timestamp('2020-12-31 00:00:00')


 61%|██████    | 20/33 [00:30<00:21,  1.64s/it]

Error for NETC.CO: Timestamp('2020-12-31 00:00:00')


 76%|███████▌  | 25/33 [00:38<00:12,  1.60s/it]

Error for NFLX: Timestamp('2020-12-31 00:00:00')


 85%|████████▍ | 28/33 [00:43<00:08,  1.67s/it]

Error for NOVO-B.CO: Timestamp('2020-12-31 00:00:00')


 91%|█████████ | 30/33 [00:47<00:04,  1.66s/it]

Error for NKT.CO: Timestamp('2020-12-31 00:00:00')


100%|██████████| 33/33 [00:52<00:00,  1.58s/it]


### Short visualisation

In [34]:
df

Unnamed: 0,Ticker,Date,1 Year Future Change,Free Cash Flow,Repurchase Of Capital Stock,Repayment Of Debt,Issuance Of Debt,Capital Expenditure,End Cash Position,Beginning Cash Position,...,Non Current Accrued Expenses,Earnings From Equity Interest Net Of Tax,Depreciation Amortization Depletion Income Statement,Amortization Of Intangibles Income Statement,Preferred Stock,Non Current Note Receivables,Line Of Credit,Commercial Paper,Current Notes Payable,Preferred Stock Dividends
0,MATAS.CO,2024-03-31,0.153992,2.390000e+08,-2.100000e+07,-1.890000e+08,1.121000e+09,-4.060000e+08,1.310000e+08,3.700000e+07,...,,,,,,,,,,
1,MATAS.CO,2023-03-31,0.470333,4.240000e+08,0.000000e+00,-1.260000e+08,0.000000e+00,-2.540000e+08,3.700000e+07,2.800000e+07,...,,,,,,,,,,
2,MATAS.CO,2022-03-31,-0.103056,3.271000e+08,-7.510000e+07,-7.084000e+08,7.635000e+08,-1.834000e+08,2.820000e+07,4.070000e+07,...,,,,,,,,,,
3,MATAS.CO,2021-03-31,0.179800,8.036000e+08,0.000000e+00,-7.250000e+08,0.000000e+00,-1.484000e+08,4.070000e+07,1.066000e+08,...,,,,,,,,,,
4,QQ.L,2024-03-31,0.075779,1.468000e+08,-1.710000e+07,0.000000e+00,0.000000e+00,-9.630000e+07,2.310000e+08,1.512000e+08,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,NOVO-B.CO,2020-12-31,0.756239,6.058700e+10,-2.448567e+10,-7.259667e+09,1.112500e+10,-2.034467e+10,1.258800e+10,1.186600e+10,...,,,,,,,4.316667e+08,0.0,0.0,
76,NKT.CO,2023-12-31,0.117090,3.040000e+08,-7.000000e+06,-4.903333e+07,3.085000e+07,-2.380000e+08,8.880000e+08,2.620000e+08,...,,,,,,,,,,0.0
77,NKT.CO,2022-12-31,0.257814,1.071000e+08,-2.500000e+06,-8.250000e+07,6.170000e+07,-1.911000e+08,2.585000e+08,2.005000e+08,...,,,,,,,,,,0.0
78,NKT.CO,2021-12-31,0.240177,-4.600000e+06,0.000000e+00,-2.330000e+07,0.000000e+00,-2.134000e+08,2.005000e+08,2.392000e+08,...,,,,,,,,,,0.0


### Splitting and normalization

In [35]:
test_data = df.loc[df.groupby('Ticker')['Date'].idxmax()]
training_data = df[~df.index.isin(test_data.index)]

test_data.to_csv('../data/test_data.csv', index=False)
training_data.to_csv('../data/training_data.csv', index=False)

display(test_data)
display(training_data)

Unnamed: 0,Ticker,Date,1 Year Future Change,Free Cash Flow,Repurchase Of Capital Stock,Repayment Of Debt,Issuance Of Debt,Capital Expenditure,End Cash Position,Beginning Cash Position,...,Non Current Accrued Expenses,Earnings From Equity Interest Net Of Tax,Depreciation Amortization Depletion Income Statement,Amortization Of Intangibles Income Statement,Preferred Stock,Non Current Note Receivables,Line Of Credit,Commercial Paper,Current Notes Payable,Preferred Stock Dividends
52,ABBN.SW,2023-12-31,0.364849,3520000000.0,-1258000000.0,-1567000000.0,2586000000.0,-770000000.0,3909000000.0,4174000000.0,...,,,,,,,,,,
12,BCKIY,2024-03-31,0.424874,172300000.0,-12500000.0,-13100000.0,0.0,-142400000.0,552600000.0,429500000.0,...,,,,,,,,,,
28,DANSKE.CO,2023-12-31,0.270911,140119000000.0,0.0,-23696000000.0,22425000000.0,-1381000000.0,365609000000.0,232531000000.0,...,,,,,,,,,,
40,DSV.CO,2023-12-31,0.314907,14083000000.0,-13997000000.0,-327000000.0,212000000.0,-2375000000.0,6452000000.0,10160000000.0,...,,,,,,,,,,
20,GMAB,2023-12-31,-0.342097,7004000000.0,-564000000.0,-91000000.0,,-376000000.0,14867000000.0,9893000000.0,...,,,,,,,,,,
36,ISS.CO,2023-12-31,0.051002,2673000000.0,0.0,0.0,0.0,-719000000.0,6093000000.0,5214000000.0,...,,,,,,,,,,
24,LLY,2023-12-31,0.357117,-3152000000.0,-750000000.0,0.0,3958500000.0,-7392100000.0,2818600000.0,2067000000.0,...,,,,,,,,,,
32,MAERSK-B.CO,2023-12-31,0.064355,5997000000.0,-3120000000.0,-660000000.0,845000000.0,-3646000000.0,6683000000.0,10038000000.0,...,,,,,,,,,,
0,MATAS.CO,2024-03-31,0.153992,239000000.0,-21000000.0,-189000000.0,1121000000.0,-406000000.0,131000000.0,37000000.0,...,,,,,,,,,,
48,NETC.CO,2023-12-31,0.496028,552200000.0,-32000000.0,-314300000.0,127500000.0,-206400000.0,448100000.0,336000000.0,...,,,,,,,,,,


Unnamed: 0,Ticker,Date,1 Year Future Change,Free Cash Flow,Repurchase Of Capital Stock,Repayment Of Debt,Issuance Of Debt,Capital Expenditure,End Cash Position,Beginning Cash Position,...,Non Current Accrued Expenses,Earnings From Equity Interest Net Of Tax,Depreciation Amortization Depletion Income Statement,Amortization Of Intangibles Income Statement,Preferred Stock,Non Current Note Receivables,Line Of Credit,Commercial Paper,Current Notes Payable,Preferred Stock Dividends
1,MATAS.CO,2023-03-31,0.470333,424000000.0,0.0,-126000000.0,0.0,-254000000.0,37000000.0,28000000.0,...,,,,,,,,,,
2,MATAS.CO,2022-03-31,-0.103056,327100000.0,-75100000.0,-708400000.0,763500000.0,-183400000.0,28200000.0,40700000.0,...,,,,,,,,,,
3,MATAS.CO,2021-03-31,0.1798,803600000.0,0.0,-725000000.0,0.0,-148400000.0,40700000.0,106600000.0,...,,,,,,,,,,
5,QQ.L,2023-03-31,0.124649,97000000.0,-800000.0,-257900000.0,481100000.0,-109000000.0,151200000.0,248100000.0,...,,,,,,,,,,
6,QQ.L,2022-03-31,0.067229,104400000.0,-800000.0,0.0,0.0,-84300000.0,248100000.0,190100000.0,...,,,,,,,,,,
7,QQ.L,2021-03-31,-0.036492,102100000.0,-9000000.0,-85966670.0,160366700.0,-75900000.0,190100000.0,105800000.0,...,,,,,,,,,,
9,SAABF,2022-12-31,0.628164,2344000000.0,0.0,-1386000000.0,1394000000.0,-2310000000.0,2869000000.0,1701000000.0,...,,,,,,,,,,
10,SAABF,2021-12-31,0.897003,3193000000.0,-246000000.0,-2538000000.0,2248000000.0,-2520000000.0,1701000000.0,2273000000.0,...,,,,,,,,,,
11,SAABF,2020-12-31,0.041659,2820333000.0,-242000000.0,-1666333000.0,1630667000.0,-2789333000.0,2233000000.0,2281000000.0,...,,,,,,,,,,
13,BCKIY,2023-03-31,0.846253,136200000.0,0.0,-972800000.0,416600000.0,-125100000.0,429500000.0,756500000.0,...,,,,,,,,,,


In [36]:
X_train = training_data.drop(columns=['Ticker', 'Date', '1 Year Future Change'])
y_train = training_data['1 Year Future Change']
X_test = test_data.drop(columns=['Ticker', 'Date', '1 Year Future Change'])
y_test = test_data['1 Year Future Change']

display(X_train)
display(y_train)

Unnamed: 0,Free Cash Flow,Repurchase Of Capital Stock,Repayment Of Debt,Issuance Of Debt,Capital Expenditure,End Cash Position,Beginning Cash Position,Effect Of Exchange Rate Changes,Changes In Cash,Financing Cash Flow,...,Non Current Accrued Expenses,Earnings From Equity Interest Net Of Tax,Depreciation Amortization Depletion Income Statement,Amortization Of Intangibles Income Statement,Preferred Stock,Non Current Note Receivables,Line Of Credit,Commercial Paper,Current Notes Payable,Preferred Stock Dividends
1,424000000.0,0.0,-126000000.0,0.0,-254000000.0,37000000.0,28000000.0,0.0,9000000.0,-413000000.0,...,,,,,,,,,,
2,327100000.0,-75100000.0,-708400000.0,763500000.0,-183400000.0,28200000.0,40700000.0,2000000.0,-12500000.0,-291400000.0,...,,,,,,,,,,
3,803600000.0,0.0,-725000000.0,0.0,-148400000.0,40700000.0,106600000.0,2000000.0,-65900000.0,-840000000.0,...,,,,,,,,,,
5,97000000.0,-800000.0,-257900000.0,481100000.0,-109000000.0,151200000.0,248100000.0,1800000.0,-98700000.0,159700000.0,...,,,,,,,,,,
6,104400000.0,-800000.0,0.0,0.0,-84300000.0,248100000.0,190100000.0,1300000.0,56700000.0,-50400000.0,...,,,,,,,,,,
7,102100000.0,-9000000.0,-85966670.0,160366700.0,-75900000.0,190100000.0,105800000.0,11500000.0,72800000.0,-55600000.0,...,,,,,,,,,,
9,2344000000.0,0.0,-1386000000.0,1394000000.0,-2310000000.0,2869000000.0,1701000000.0,130000000.0,1038000000.0,-1178000000.0,...,,,,,,,,,,
10,3193000000.0,-246000000.0,-2538000000.0,2248000000.0,-2520000000.0,1701000000.0,2273000000.0,57000000.0,-629000000.0,-1639000000.0,...,,,,,,,,,,
11,2820333000.0,-242000000.0,-1666333000.0,1630667000.0,-2789333000.0,2233000000.0,2281000000.0,43333330.0,-91333330.0,-1349000000.0,...,,,,,,,,,,
13,136200000.0,0.0,-972800000.0,416600000.0,-125100000.0,429500000.0,756500000.0,-5700000.0,-321300000.0,-666100000.0,...,,,,,,,,,,


1     0.470333
2    -0.103056
3     0.179800
5     0.124649
6     0.067229
7    -0.036492
9     0.628164
10    0.897003
11    0.041659
13    0.846253
14   -0.075000
15   -0.487180
17    0.453156
18    0.552653
19    0.959440
21   -0.251880
22    0.063221
23   -0.027054
25    0.603917
26    0.337565
27    0.660782
29    0.374533
30    0.237116
31    0.140855
33   -0.100496
34   -0.253455
35    0.767016
37   -0.115871
38    0.178400
39    0.185958
41    0.071625
42   -0.279177
43    0.508148
45    0.081298
46   -0.055432
47   -0.056061
49   -0.238352
50   -0.582399
51    0.133788
53    0.348640
54   -0.138713
55    0.497111
57    0.245785
58   -0.464103
59    0.368076
61    0.672438
62   -0.518241
63    0.114124
65    0.048279
66    2.879940
67    0.148889
69    0.014749
70   -0.061772
71    0.390665
73    0.502751
74    0.295327
75    0.756239
77    0.257814
78    0.240177
79    0.163717
Name: 1 Year Future Change, dtype: float64

## Model Training

In [37]:
model = MLPClassifier(
    solver='adam',
    alpha=1e-4,
    hidden_layer_sizes=(50, 50, 50),
    shuffle=False,
    random_state=42,
    tol=0.0001,
    verbose=True,
    max_iter=50,
    batch_size=1024,
    learning_rate='adaptive',
    n_iter_no_change=4
)
model.fit(X_train, y_train)

ValueError: Input X contains NaN.
MLPClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

## Testing and benchmarking

## Predictions