In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


# Data processing

In [None]:
# Download META price history from Yahoo Finance via yfinance
ticker = 'META'
data = yf.download(ticker, period='5y')

# A failed download (unknown ticker, no network) typically comes back as an
# empty frame rather than an exception, so stop here with a clear message.
if data.empty:
    raise ValueError(f"No data returned for ticker {ticker!r}")

# Recent yfinance releases return (field, ticker) MultiIndex columns even for a
# single ticker. Keep only the field level so the columns are plain names and
# the CSV written later has a single header row.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

# 'Price' is the row-wise mean of the four OHLC quotes
ohlc_columns = ['Open', 'Close', 'High', 'Low']
data = data.assign(Price=data[ohlc_columns].mean(axis=1))

# Keep only the columns used downstream; .copy() makes the result an
# independent frame so later modifications cannot raise SettingWithCopyWarning
data = data[['Open', 'High', 'Low', 'Close', 'Volume', 'Price']].copy()
data.head()

In [None]:
# Report the number of missing values per column
print("Number of lost data:")
print(data.isna().sum())

# Drop rows with missing values. Rebinding (instead of inplace=True) keeps the
# cell safe to re-run and never mutates a frame that was derived by slicing.
data = data.dropna()

# Save the cleaned data to a CSV file
csv_file = f'{ticker}_historical_data.csv'
data.to_csv(csv_file)
print(f"Data saved in {csv_file}")

# Read the CSV back (optional round-trip check); parse_dates=True restores the
# index as datetimes instead of leaving it as plain strings
data_from_csv = pd.read_csv(csv_file, index_col=0, parse_dates=True)
print(data_from_csv.head())

# Prepare data for the model
prices = data_from_csv['Price'].values
volumes = data_from_csv['Volume'].values

# Positional 80/20 split: the first 80% of rows train, the remaining 20% test
split_index = int(0.8 * len(prices))
train_prices, test_prices = prices[:split_index], prices[split_index:]
train_volumes, test_volumes = volumes[:split_index], volumes[split_index:]
train_data, test_data = data.iloc[:split_index], data.iloc[split_index:]

# The arrays come from the CSV copy while the frames come from `data`; they
# must cover the same rows for the later plots to line up.
assert len(train_data) == len(train_prices)
assert len(test_data) == len(test_prices)




In [None]:
# Align the data so the price in row t is paired with the volume in row t+1.
# Every array is rebuilt from the unsliced `prices` / `volumes`, so re-running
# this cell gives the same result instead of trimming one more element each time.
train_prices = prices[:split_index][:-1]    # drop the last price (no following volume in the split)
train_volumes = volumes[:split_index][1:]   # drop the first volume (no preceding price in the split)
test_prices = prices[split_index:][:-1]     # same pairing inside the test split
test_volumes = volumes[split_index:][1:]

# Gradient Descent

In [None]:
import random

def predict(prices_for_predict, volumes_for_predict, alpha=0.02, epochs=100000, seed=None):
    """Fit ``volume = intercept + slope * price`` by batch gradient descent.

    Prices are standardised (zero mean, unit variance) internally so that a
    single learning rate works for both parameters. The fitted coefficients
    are converted back to raw price units before being printed and returned,
    which makes them directly comparable with coefficients fitted on
    unscaled prices.

    Parameters
    ----------
    prices_for_predict : 1-D array-like of float
        Explanatory variable.
    volumes_for_predict : 1-D array-like of float
        Target variable, same length as ``prices_for_predict``.
    alpha : float, default 0.02
        Learning rate.
    epochs : int, default 100000
        Maximum number of gradient-descent iterations.
    seed : int or None, default None
        Seed for the random initial parameters. ``None`` draws from the
        global ``random`` state, as before.

    Returns
    -------
    tuple
        ``(volumes_pred, intercept, slope)`` where ``volumes_pred`` holds the
        fitted values for the supplied prices and ``intercept`` / ``slope``
        are expressed in raw price units.

    Raises
    ------
    ValueError
        If the inputs are empty, not 1-D, differ in shape, or the prices are
        constant (the slope is then undefined).
    """
    prices = np.asarray(prices_for_predict, dtype=float)
    volumes = np.asarray(volumes_for_predict, dtype=float)
    if prices.ndim != 1 or prices.shape != volumes.shape:
        raise ValueError("prices and volumes must be 1-D arrays of equal length")
    if prices.size == 0:
        raise ValueError("cannot fit a model on empty input")

    # Standardise the prices
    mean_price = prices.mean()
    std_price = prices.std()
    if std_price == 0:
        raise ValueError("prices are constant; the slope is undefined")
    scaled_prices = (prices - mean_price) / std_price

    # Random initial parameters (intercept first, then slope)
    rng = random if seed is None else random.Random(seed)
    b = rng.random()
    a = rng.random()
    m = len(volumes)

    # Gradient descent on the mean squared error
    for epoch in range(epochs):
        error = (b + a * scaled_prices) - volumes
        db = (2 / m) * np.sum(error)
        da = (2 / m) * np.sum(error * scaled_prices)

        # Stop on NaN or infinity: the iteration has diverged and further
        # updates would only propagate the bad values.
        if not (np.isfinite(db) and np.isfinite(da)):
            print(f"Encountered non-finite gradient at epoch {epoch}")
            break
        b -= alpha * db
        a -= alpha * da

    # Fitted values for the supplied prices
    volumes_pred = b + a * scaled_prices

    # Undo the standardisation:
    #   b + a * (p - mean) / std  ==  (b - a * mean / std) + (a / std) * p
    slope = a / std_price
    intercept = b - slope * mean_price

    print(f"Intercept: {intercept}\nSlope: {slope}")
    return (volumes_pred, intercept, slope)



# Calculate coefficients and metrics based on the training data

In [None]:
# Fit the custom gradient-descent model on the training split, then unpack
# its fitted values and the two coefficients it returns
train_fit = predict(train_prices, train_volumes)
train_volumes_pred, train_incepter, train_slope = train_fit


## Calculate metric

In [None]:
# MSE multiplied by the sample count is the sum of squared residuals
# (observed minus predicted) of the custom model on the training split
n_train = len(train_volumes)
train_mse = mean_squared_error(train_volumes, train_volumes_pred)
ssr = train_mse * n_train
print(f"Sum of squares regression: {ssr}")

# Using sklearn for comparison

In [None]:
# sklearn estimators expect a 2-D feature matrix, so turn each price vector
# into a single-column array (-1 lets numpy infer the row count)
train_prices_reshaped = np.reshape(train_prices, (-1, 1))
test_prices_reshaped = np.reshape(test_prices, (-1, 1))

In [None]:

# Create sklearn's LinearRegression and fit it on the training split
# (fit() returns the estimator itself, so construction and fitting chain)
model = LinearRegression().fit(train_prices_reshaped, train_volumes)

# In-sample predictions from the sklearn model
train_volumes_pred_sklearn = model.predict(train_prices_reshaped)

# Report the fitted coefficients for comparison with the custom model
sklearn_intercept = model.intercept_
sklearn_slope = model.coef_[0]
print(f"Intercept by sklearn: {sklearn_intercept}")
print(f"Slope by sklearn: {sklearn_slope}")

# MSE multiplied by the sample count is the sum of squared residuals
train_mse_sklearn = mean_squared_error(train_volumes, train_volumes_pred_sklearn)
ssr_sklearn = train_mse_sklearn * len(train_volumes)
print(f"Sum of square regression by sklearn: {ssr_sklearn}")



# The difference between the SSR value of the sklearn model and the Custom model

In [None]:
# Gap between the two training-set sums of squared residuals (sklearn minus custom);
# the bare name on the last line makes the notebook display the value
ssr_gap = ssr_sklearn - ssr
ssr_gap

# Comparison chart

In [None]:
# Overlay the actual training volumes with both models' fitted values
fig, ax = plt.subplots(figsize=(10, 6))

# The first row is skipped so the dates match the aligned volume arrays
train_dates = train_data.index[1:]

ax.plot(train_dates, train_volumes, label='Actual')
ax.plot(train_dates, train_volumes_pred, label='Predicted (Custom model)', color='y')
ax.plot(train_dates, train_volumes_pred_sklearn, label='Predicted (sklearn)', color='r', linestyle='--')

ax.set_xlabel('Day')
ax.set_ylabel('Volume')
ax.set_title(f'Predicted volume of {ticker}')
ax.legend()
plt.show()

# Evaluation

In [None]:

# Calculating evaluation metrics
def adjusted_r2(r2, n, p):
    """Return R-squared adjusted for the number of predictors.

    Parameters
    ----------
    r2 : float
        Ordinary coefficient of determination.
    n : int
        Number of observations.
    p : int
        Number of predictors (excluding the intercept).
    """
    unexplained = 1 - r2
    dof_ratio = (n - 1) / (n - p - 1)
    return 1 - unexplained * dof_ratio

# Accumulator for the evaluation table: one list per column, one entry
# appended per evaluated model
metrics = {
    "Model": [],
    "R-Squared": [],
    "Multiple R": [],
    "Standard Error": [],
    "MSE": []
}


def evaluate_model(y_true, y_pred, model_name):
    """Score one set of predictions and append the results to ``metrics``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, same length as ``y_true``.
    model_name : str
        Label stored in the "Model" column.

    Returns
    -------
    None
        The module-level ``metrics`` dict is extended in place.
    """
    r2 = r2_score(y_true, y_pred)             # proportion of variance explained
    mse = mean_squared_error(y_true, y_pred)  # mean squared prediction error
    std_error = np.sqrt(mse)                  # root of the MSE

    # R-squared is negative when the predictions are worse than always
    # predicting the mean. Its square root is then undefined, so record NaN
    # explicitly instead of letting np.sqrt emit a RuntimeWarning.
    multiple_r = np.sqrt(r2) if r2 >= 0 else np.nan

    metrics["Model"].append(model_name)
    metrics["R-Squared"].append(r2)
    metrics["Multiple R"].append(multiple_r)
    metrics["Standard Error"].append(std_error)
    metrics["MSE"].append(mse)

# Held-out evaluation: the models fitted on the training split are scored on
# the test split. Refitting on the test data here would make the "test"
# predictions in-sample and defeat the purpose of the split.

# sklearn model: `model` is the estimator fitted on the training split earlier
test_volumes_pred_sklearn = model.predict(test_prices_reshaped)

# Custom model: its training predictions are an exact linear function of the
# training prices, so a degree-1 fit through them recovers the fitted line in
# raw price units regardless of any scaling done inside predict().
custom_slope, custom_intercept = np.polyfit(train_prices, train_volumes_pred, 1)
test_volumes_pred = custom_intercept + custom_slope * test_prices

# Score both models on both splits so in-sample and held-out quality can be compared
evaluate_model(train_volumes, train_volumes_pred, "Custom Least Squares (train)")
evaluate_model(train_volumes, train_volumes_pred_sklearn, "Sklearn LinearRegression (train)")
evaluate_model(test_volumes, test_volumes_pred, "Custom Least Squares (test)")
evaluate_model(test_volumes, test_volumes_pred_sklearn, "Sklearn LinearRegression (test)")

# Display evaluation metrics
evaluation_df = pd.DataFrame(metrics)
print(evaluation_df)

# Plot actual versus predicted volume on the test split
plt.figure(figsize=(10, 6))
test_dates = test_data.index[1:]  # first row skipped to match the aligned volume arrays
plt.plot(test_dates, test_volumes, label='Actual')
plt.plot(test_dates, test_volumes_pred, label='Predicted (Custom model)', color='r')
plt.plot(test_dates, test_volumes_pred_sklearn, label='Predicted (sklearn)', color='g', linestyle='--')
plt.xlabel('Day')
plt.ylabel('Volume')
plt.title(f'Predicted volume of {ticker}')
plt.legend()
plt.show()