In [19]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

RANDOM_STATE = 42

def normalize_data(file_path):
    """Load a price-history CSV and return it as a rescaled, date-free DataFrame.

    The file at ``file_path`` must contain the columns ``Date``, ``Volume``,
    ``Open``, ``High``, ``Low``, ``Close`` and ``Adj Close``.

    Steps applied:
      * ``Date`` is parsed and replaced by numeric ``Year``, ``Month`` and
        ``Day`` columns (appended after the existing columns).
      * ``Volume_log`` (``log1p`` of ``Volume``) is appended; the raw
        ``Volume`` column is kept as well.
      * The five price columns are min-max scaled, each column on its own,
        using ``MinMaxScaler`` with its default settings.

    Note: the scaler is fitted on every row of the file, so the per-column
    minimum and maximum are taken over the whole dataset.

    Returns:
        The transformed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(file_path)

    # Calendar parts stand in for the timestamp so each column can be used
    # as a plain number by a model.
    timestamps = pd.to_datetime(frame['Date'])
    frame = frame.assign(
        Year=timestamps.dt.year,
        Month=timestamps.dt.month,
        Day=timestamps.dt.day,
        # log1p(0) == 0, so zero-volume rows do not produce -inf.
        Volume_log=np.log1p(frame['Volume']),
    ).drop(columns='Date')

    # Rescale the stock prices (Open, High, Low, Close, Adj Close).
    price_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close']
    frame[price_columns] = MinMaxScaler().fit_transform(frame[price_columns])

    return frame

def compute_cost(X, y, params):
    """Return the halved mean squared error of the linear hypothesis ``X.dot(params)``.

    Computes ``sum((X.dot(params) - y) ** 2) / (2 * len(y))``, i.e. one half
    of the MSE (the 1/2 factor is the form commonly used as a
    gradient-descent cost).

    Args:
        X: Feature matrix; must provide a ``.dot`` method.
        y: Target values; ``len(y)`` is used as the sample count.
        params: Coefficients passed to ``X.dot``.

    Returns:
        The scalar cost.

    Raises:
        ZeroDivisionError: If ``y`` is empty.
    """
    sample_count = len(y)
    residuals = X.dot(params) - y
    return (1 / (2 * sample_count)) * np.sum(np.square(residuals))

def split_data(df):
    """Separate features from the target and make a seeded 80/20 split.

    The target is the ``Adj Close`` column. ``Close`` and ``Adj Close`` are
    both removed from the feature matrix (``Close`` presumably because it
    tracks the target too closely); every other column of ``df`` is kept as
    a feature.

    Args:
        df: DataFrame containing at least ``Close`` and ``Adj Close``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` with 20% of the rows held out
        for testing. The split is seeded with the module-level
        ``RANDOM_STATE``.
    """
    features = df.drop(['Close', 'Adj Close'], axis=1)
    target = df['Adj Close']

    # train_test_split hands back a list; callers get a 4-tuple.
    return tuple(
        train_test_split(
            features,
            target,
            test_size=0.2,
            random_state=RANDOM_STATE,
        )
    )

def linear_regression_testing(X_train, X_test, y_train, y_test):
    """Fit a linear regression on the training split and print test-set metrics.

    A ``LinearRegression`` model is trained on ``(X_train, y_train)`` and
    used to predict ``X_test``. The mean squared error and R² score of those
    predictions against ``y_test`` are printed to stdout.

    Returns:
        None. The results are only printed.
    """
    regressor = LinearRegression().fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    # Score the held-out predictions.
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    report = f"Linear Regressions Results:\n\tMean Squared Error (MSE): {mse}\n\tR² Score: {r2}"
    print(report)


# Run the pipeline end to end: load and normalize the dataset, split it,
# then fit and report on a linear regression model.
# The CSV path is relative to the current working directory.
file_path = './berkshire_hathaway_data.csv'
# Parse dates into Year/Month/Day, add Volume_log, and min-max scale prices.
df = normalize_data(file_path)
# Seeded 80/20 train/test split with 'Adj Close' as the target.
X_train, X_test, y_train, y_test = split_data(df)
# Prints the test-set MSE and R² score; returns nothing.
linear_regression_testing(X_train, X_test, y_train, y_test)




Linear Regressions Results:
	Mean Squared Error (MSE): 8.421714955934368e-06
	R² Score: 0.9998465921344927
