# Econophysics and Sociophysics
Authors:

- Rofhiwa (Ralph) Matumba

## Model training

In [36]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import plotly.graph_objects as go
import yfinance as yf

from datetime import timedelta, datetime

### Data loading and visualisation

In this example, we are reading in stocks from an example company.

In [37]:
stock = 'SBK.JO'
data = yf.download(stock)

[*********************100%%**********************]  1 of 1 completed


Before we start using this data to make predictions from it, we will just display the last five data entries with column names so that we understand the structure of the data. As of downloading this dataset, the latest close was on the 1st of December 2023.

In [38]:
data.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-12-06,19899.0,19984.0,19676.0,19832.0,19832.0,1497648
2023-12-07,19656.0,19814.0,19296.0,19390.0,19390.0,2168879
2023-12-08,19390.0,19548.0,19121.0,19352.0,19352.0,2183120
2023-12-11,19401.0,19539.0,19168.0,19517.0,19517.0,1934383
2023-12-12,19755.0,19832.0,19474.0,19488.0,19488.0,294398


To be able to train models, we will need to look at trading models based on the day before. So we will add lag columns that contain relevant stock prices from the day before. We will also drop the 'Adj. Close' column because it is the same as the 'Close' column for the most part.

In [39]:
stonks = go.Figure(go.Candlestick(x=data.index,
                                   open=data['Open'],
                                   high=data['High'],
                                   low=data['Low'],
                                   close=data['Close'],
                                   
                                   name=f'{stock} Stock Chart'))

stonks.update_layout(title=f'{stock} Candlestick Chart',
                     xaxis_title='Date',
                     yaxis_title='Stock Price',
                     xaxis_rangeslider_visible=False)

stonks.show()

### Data preprocessing

In [40]:
data["Open_L"] = data["Open"].shift(1)
data["High_L"] = data["High"].shift(1)
data["Low_L"] = data["Low"].shift(1)
data["Close_L"] = data["Close"].shift(1)
data["Volume_L"] = data["Volume"].shift(1)

# Drop 'Adj Close' column and NaN columns
data = data.drop("Adj Close", axis=1)
data = data.dropna()

In [41]:
data.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Open_L,High_L,Low_L,Close_L,Volume_L
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-12-06,19899.0,19984.0,19676.0,19832.0,1497648,20000.0,19997.0,19638.0,19818.0,1351272.0
2023-12-07,19656.0,19814.0,19296.0,19390.0,2168879,19899.0,19984.0,19676.0,19832.0,1497648.0
2023-12-08,19390.0,19548.0,19121.0,19352.0,2183120,19656.0,19814.0,19296.0,19390.0,2168879.0
2023-12-11,19401.0,19539.0,19168.0,19517.0,1934383,19390.0,19548.0,19121.0,19352.0,2183120.0
2023-12-12,19755.0,19832.0,19474.0,19488.0,294398,19401.0,19539.0,19168.0,19517.0,1934383.0


### Data split

In [42]:
features = ["Open_L", "High_L", "Low_L", "Close_L", "Volume_L"]
targets = ["Open", "High", "Low", "Close", "Volume"]

X = data[features]
y = data[targets]

### Train-test split

In [43]:
def train_test_split(X, y, test_size):
    ind = int(len(y) - test_size * len(y))

    X_train, y_train = X[:ind], y[:ind]
    X_test, y_test = X[ind:], y[ind:]

    return X_train, y_train, X_test, y_test

In [44]:
test_size = 0.2
X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=test_size)

### Model fitting

In [45]:
# Initialize a dictionary to store models and MSE
models = {}
mse_results = {}
pred = []

# Train separate models for each target variable in a for loop
for column in y.columns:

    model = LinearRegression()
    # Train the model
    model.fit(X_train, y_train[column])
    
    # Make predictions on the test set
    predictions = model.predict(X_test)
    pred.append(predictions)
    
    # Store the model and predictions
    models[column] = model
    mse_results[column] = mean_squared_error(y_test[column], predictions)

# Print the MSE for each target variable
for column, mse in mse_results.items():
    print(f'Mean Squared Error - {column}: {mse}')

pred = pd.DataFrame(np.array(pred).T)
pred.columns = targets
pred = pred.set_index(pd.Index(data.index[int(len(y) - test_size * len(y)):] + timedelta(days=1)))

Mean Squared Error - Open: 22405.91745268078
Mean Squared Error - High: 337437.21933616523
Mean Squared Error - Low: 53540.20312337732
Mean Squared Error - Close: 94361.20184017175
Mean Squared Error - Volume: 3789915622952.9976


## Model testing

In [46]:
stonks = go.Figure(go.Line(x=pred.index,
                                   y=pred['Close'],
                                   name=f'{stock} Predicted Stock Chart',
                                   line=dict(dash='dot'),
                                   line_color='black'))

stonks.add_trace(go.Candlestick(x=data.index[int(len(y) - test_size * len(y)):],
                                   open=y_test['Open'],
                                   high=y_test['High'],
                                   low=y_test['Low'],
                                   close=y_test['Close'],
                                   
                                   name=f'{stock} Actual Stock Chart'))

stonks.show()


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.


