# Stock Prediction Model

## Dependencies

### Library Installation (if needed)

In [463]:
# Install libraries the first time
#! pip install -U yfinance pandas pathlib numpy 

### Importing Required Libraries

In [464]:
from datetime import datetime
import matplotlib.pyplot as plt
import yfinance as yf
import pandas as pd
import numpy as np
import pathlib as Path
from tqdm import tqdm
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

## Data Preparation

In [465]:
symbols = pd.read_csv('../data/simple_screener_results.csv')['Ticker'].tolist()
symbols = pd.Series(symbols).unique()
symbols

array(['MATAS.CO', 'TRIFOR.CO', 'QQ.L', 'RNMBY', 'SAABF', 'BCKIY',
       'BAESY', 'IVSO.ST', 'NSKFF', 'GMAB', 'GN.CO', 'NVDA', 'LLY',
       'DANSKE.CO', 'CARL-B.CO', 'MAERSK-B.CO', 'RBREW.CO', 'ISS.CO',
       'DSV.CO', 'SCHO.CO', 'NETC.CO', 'JYSK.CO', 'ABBN.SW', 'TER',
       'PARKEN.CO', 'NFLX', 'TRMD-A.CO', 'STG.CO', 'NOVO-B.CO', 'EQNR',
       'NKT.CO', 'NSIS-B.CO', 'KCC.OL'], dtype=object)

#### Download and Process Historical Data

In [466]:
#symbols = ['AAPL']

In [None]:
print_errors=True

df = pd.DataFrame()
for symbol in tqdm(symbols):
    ticker_df = pd.DataFrame()
    ticker = yf.Ticker(symbol)
    earning_dates = ticker.cash_flow.columns.tolist()
    price_data = yf.download(symbol, period='10y', rounding=False, progress=False)
    cash_flow = ticker.cash_flow
    cash_flow_columns = cash_flow[earning_dates[0]].keys().tolist()
    balance_sheet = ticker.balance_sheet
    balance_sheet_columns = balance_sheet[earning_dates[0]].keys().tolist()
    income_statement = ticker.income_stmt
    income_statement_columns = income_statement[earning_dates[0]].keys().tolist()

    for earning_date in earning_dates:
        try:
            current_ticker_data = {'Ticker': symbol, 'Date': earning_date}
        
            got_price = False
            day_offset = 0
            while(got_price==False and day_offset > -6):
                try:
                    current_ticker_data['1 Year Future Change'] = float(price_data.loc[earning_date + pd.Timedelta(days=day_offset+365,), ('Close', symbol)]) / float(price_data.loc[earning_date + pd.Timedelta(days=day_offset), ('Close', symbol)])
                    got_price = True
                except Exception:
                    day_offset += -1

            for column in cash_flow_columns:
                current_ticker_data[column] = cash_flow[earning_date][column]

            for column in balance_sheet_columns:
                current_ticker_data[column] = balance_sheet[earning_date][column]

            for column in income_statement_columns:
                current_ticker_data[column] = income_statement[earning_date][column]
                
        except Exception as error:
            if print_errors:
                print(f'Error for {symbol}: {error}')
        
        if got_price:
            ticker_df = pd.concat([ticker_df, pd.DataFrame([current_ticker_data])], ignore_index=True)

    for column in ticker_df.columns.drop(['Ticker', 'Date', '1 Year Future Change']):
        if ticker_df[column].isna().any():
            continue
        imputer = SimpleImputer(strategy='mean')
        ticker_df[column] = pd.Series(
            imputer.fit_transform(ticker_df[[column]]).flatten(),
            index=ticker_df.index
        )

    df = pd.concat([df, ticker_df], ignore_index=True)

df.to_csv('../data/earnings_data.csv', index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ticker_df[column].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ticker_df[column].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always be

KeyboardInterrupt: 

### Short visualisation

In [None]:
df

Unnamed: 0,Ticker,Date,1 Year Future Change,Free Cash Flow,Repurchase Of Capital Stock,Repayment Of Debt,Issuance Of Debt,Capital Expenditure,End Cash Position,Beginning Cash Position,...,Depreciation Amortization Depletion Income Statement,Amortization Of Intangibles Income Statement,Non Current Note Receivables,Line Of Credit,Commercial Paper,Current Notes Payable,Unrealized Gain Loss,Dueto Related Parties Current,Duefrom Related Parties Current,Preferred Stock Dividends
0,MATAS.CO,2024-03-31,1.153992,,,,,,,,...,,,,,,,,,,
1,MATAS.CO,2023-03-31,1.470333,,,,,,,,...,,,,,,,,,,
2,MATAS.CO,2022-03-31,0.896944,,,,,,,,...,,,,,,,,,,
3,MATAS.CO,2021-03-31,1.179800,,,,,,,,...,,,,,,,,,,
4,TRIFOR.CO,2023-12-31,0.714643,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,NSIS-B.CO,2020-12-31,1.557316,,,,,,,,...,,,,,,,,,,
126,KCC.OL,2023-12-31,0.871689,136156000.0,,-219511000.0,142112000.0,-12843000.0,68071000.0,64685000.0,...,,,,,,,,,,
127,KCC.OL,2022-12-31,1.573113,95645000.0,,-24049000.0,0.0,-10238000.0,64685000.0,51529000.0,...,,,,,,,,,,
128,KCC.OL,2021-12-31,1.437101,-73255000.0,0.0,-123041000.0,169000000.0,-119105000.0,51529000.0,65685000.0,...,,,,,,,,,,


### Splitting and normalization

In [None]:
test_data = df.loc[df.groupby('Ticker')['Date'].idxmax()]
training_data = df[~df.index.isin(test_data.index)]

test_data.to_csv('../data/test_data.csv', index=False)
training_data.to_csv('../data/training_data.csv', index=False)

display(test_data)
display(training_data)

Unnamed: 0,Ticker,Date,1 Year Future Change,Free Cash Flow,Repurchase Of Capital Stock,Repayment Of Debt,Issuance Of Debt,Capital Expenditure,End Cash Position,Beginning Cash Position,...,Depreciation Amortization Depletion Income Statement,Amortization Of Intangibles Income Statement,Non Current Note Receivables,Line Of Credit,Commercial Paper,Current Notes Payable,Unrealized Gain Loss,Dueto Related Parties Current,Duefrom Related Parties Current,Preferred Stock Dividends
86,ABBN.SW,2023-12-31,1.364849,3520000000.0,-1258000000.0,-1567000000.0,2586000000.0,-770000000.0,3909000000.0,4174000000.0,...,,,,,,,,,,
23,BAESY,2023-12-31,1.042759,2803000000.0,-561000000.0,0.0,162000000.0,-957000000.0,4067000000.0,3107000000.0,...,,,,,,,,,,
19,BCKIY,2024-03-31,1.424874,,,,,,,,...,,,,,,,,,,
55,CARL-B.CO,2023-12-31,0.845952,7364000000.0,-3200000000.0,-3725000000.0,15272000000.0,-4243000000.0,13382000000.0,8163000000.0,...,,,,,,,,,,
51,DANSKE.CO,2023-12-31,1.270911,140119000000.0,0.0,-23696000000.0,22425000000.0,-1381000000.0,365609000000.0,232531000000.0,...,,,,,,,,,,
71,DSV.CO,2023-12-31,1.314907,14083000000.0,-13997000000.0,-327000000.0,212000000.0,-2375000000.0,6452000000.0,10160000000.0,...,,,,,,,,,,
114,EQNR,2023-12-31,0.819928,14126000000.0,-5589000000.0,-4240000000.0,,-10575000000.0,9641000000.0,15579000000.0,...,,,,,,,310000000.0,3525000000.0,1527000000.0,
35,GMAB,2023-12-31,0.657903,7004000000.0,-564000000.0,-91000000.0,,-376000000.0,14867000000.0,9893000000.0,...,,,,,,,,,,
39,GN.CO,2023-12-31,0.781844,1206000000.0,,-3273000000.0,254000000.0,-1432000000.0,2162000000.0,990000000.0,...,,,,,,,,,,
67,ISS.CO,2023-12-31,1.051002,2673000000.0,0.0,0.0,0.0,-719000000.0,6093000000.0,5214000000.0,...,,,,,,,,,,


Unnamed: 0,Ticker,Date,1 Year Future Change,Free Cash Flow,Repurchase Of Capital Stock,Repayment Of Debt,Issuance Of Debt,Capital Expenditure,End Cash Position,Beginning Cash Position,...,Depreciation Amortization Depletion Income Statement,Amortization Of Intangibles Income Statement,Non Current Note Receivables,Line Of Credit,Commercial Paper,Current Notes Payable,Unrealized Gain Loss,Dueto Related Parties Current,Duefrom Related Parties Current,Preferred Stock Dividends
1,MATAS.CO,2023-03-31,1.470333,,,,,,,,...,,,,,,,,,,
2,MATAS.CO,2022-03-31,0.896944,,,,,,,,...,,,,,,,,,,
3,MATAS.CO,2021-03-31,1.179800,,,,,,,,...,,,,,,,,,,
5,TRIFOR.CO,2022-12-31,0.736962,,,,,,,,...,,,,,,,,,,
6,TRIFOR.CO,2021-12-31,0.475323,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,NSIS-B.CO,2021-12-31,0.663732,3.794401e+08,,-1.725094e+08,2.882776e+08,-1.667277e+08,1.294829e+08,1.587947e+08,...,,,,,,,,,,
125,NSIS-B.CO,2020-12-31,1.557316,,,,,,,,...,,,,,,,,,,
127,KCC.OL,2022-12-31,1.573113,9.564500e+07,,-2.404900e+07,0.000000e+00,-1.023800e+07,6.468500e+07,5.152900e+07,...,,,,,,,,,,
128,KCC.OL,2021-12-31,1.437101,-7.325500e+07,0.0,-1.230410e+08,1.690000e+08,-1.191050e+08,5.152900e+07,6.568500e+07,...,,,,,,,,,,


In [None]:
X_train = training_data.drop(columns=['Ticker', 'Date', '1 Year Future Change'])
y_train = training_data['1 Year Future Change']
X_test = test_data.drop(columns=['Ticker', 'Date', '1 Year Future Change'])
y_test = test_data['1 Year Future Change']

display(X_train)
display(y_train)

Unnamed: 0,Free Cash Flow,Repurchase Of Capital Stock,Repayment Of Debt,Issuance Of Debt,Capital Expenditure,End Cash Position,Beginning Cash Position,Effect Of Exchange Rate Changes,Changes In Cash,Financing Cash Flow,...,Depreciation Amortization Depletion Income Statement,Amortization Of Intangibles Income Statement,Non Current Note Receivables,Line Of Credit,Commercial Paper,Current Notes Payable,Unrealized Gain Loss,Dueto Related Parties Current,Duefrom Related Parties Current,Preferred Stock Dividends
1,,,,,,,,0.000000e+00,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,3.794401e+08,,-1.725094e+08,2.882776e+08,-1.667277e+08,1.294829e+08,1.587947e+08,7.126266e+06,-3.643808e+07,-2.132501e+08,...,,,,,,,,,,
125,,,,,,,,,,,...,,,,,,,,,,
127,9.564500e+07,,-2.404900e+07,0.000000e+00,-1.023800e+07,6.468500e+07,5.152900e+07,,1.315600e+07,-8.248900e+07,...,,,,,,,,,,
128,-7.325500e+07,0.0,-1.230410e+08,1.690000e+08,-1.191050e+08,5.152900e+07,6.568500e+07,-7.420000e+05,-1.341300e+07,4.625400e+07,...,,,,,,,,,,


1      1.470333
2      0.896944
3      1.179800
5      0.736962
6      0.475323
         ...   
124    0.663732
125    1.557316
127    1.573113
128    1.437101
129    1.409652
Name: 1 Year Future Change, Length: 97, dtype: float64

## Model Training

In [None]:
model = MLPClassifier(
    solver='adam',
    alpha=1e-4,
    hidden_layer_sizes=(50, 50, 50),
    shuffle=False,
    random_state=42,
    tol=0.0001,
    verbose=True,
    max_iter=50,
    batch_size=1024,
    learning_rate='adaptive',
    n_iter_no_change=4
)
model.fit(X_train, y_train)

ValueError: Input X contains NaN.
MLPClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

## Testing and benchmarking

## Predictions