# **Modelling Linear Regression**

**Objective:**
Build and evaluate a Linear Regression model to predict stock prices based on engineered features.

**Inputs:**
- Features from `2.0-Data-Features-Engineering.ipynb`
- Stock price targets

**Outputs:**
- Trained Linear Regression model
- Evaluation metrics and performance analysis

---

In [1]:
import os

# Move the process working directory up to the parent of the current one.
# NOTE(review): presumably so the relative 'data/...' paths used later resolve
# from the project root - confirm against where this notebook is stored.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)
print("You set a new current directory")

You set a new current directory


# Imports

In [2]:
import pandas as pd
import numpy as np
import re
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, html, dcc, Input, Output
import plotly.io as pio
import dash_bootstrap_components as dbc
from plotly.subplots import make_subplots
import random
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# from sklearn.metrics import mean_squared_error# deprecated
from sklearn.metrics import root_mean_squared_error,make_scorer,mean_absolute_percentage_error,mean_absolute_error# alternative


from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import xgboost as xgb
from datetime import timedelta

def extract_date_features(df):
    """Return a date-sorted copy of ``df`` with calendar feature columns added.

    The input frame is not modified. The ``Date`` column is converted with
    ``pd.to_datetime`` and kept in the result; the rows are ordered by
    ``Date`` (oldest first) and the original index labels are preserved.

    Added columns:
        year, month, day -- calendar components of ``Date``.
        day_of_week      -- ``Date.dt.dayofweek`` (Monday=0 ... Sunday=6).
        is_weekend       -- 1 when ``day_of_week`` >= 5, otherwise 0.

    Parameters:
        df: DataFrame with a ``Date`` column that ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame with the columns above appended.
    """
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    # sort_values returns a new frame; the result has to be rebound or the
    # sort is silently lost. mergesort is stable, so rows that share a date
    # keep their original relative order.
    df = df.sort_values(by='Date', kind='mergesort')
    date_parts = df['Date'].dt
    df['year'] = date_parts.year
    df['month'] = date_parts.month
    df['day'] = date_parts.day
    df['day_of_week'] = date_parts.dayofweek
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    # 'Date' is deliberately kept in the returned frame.
    return df


# Reading CSVs

In [3]:
# Location of the processed stock-price CSV, relative to the working directory.
stock_prices_path = os.path.join(
    *['data', 'processed', 'stock_prices', 'processed_stock_prices.csv']
)
# Load the CSV and convert the 'Date' column to datetime in one chained step.
df_stock_prices = pd.read_csv(stock_prices_path).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)


# Stocks Predictions using Linear Regression

In [4]:
# Separate the prediction logic from the Dash app.
# The function below contains the core logic for predicting stock prices, as well as the building blocks of the plots.

def _select_symbol_window(df_stock_prices, symbol, date_start, date_end):
    """Return one symbol's rows inside an inclusive date window, oldest first.

    The rows are passed through ``extract_date_features`` (adds the calendar
    columns) and then sorted by ``Date``.
    """
    in_window = (
        (df_stock_prices['SYMBOL'] == symbol) &
        (df_stock_prices['Date'] >= date_start) &
        (df_stock_prices['Date'] <= date_end)
    )
    data = extract_date_features(df_stock_prices[in_window].copy())
    # Sort BEFORE any features/targets are sliced out of the frame: the
    # train/test split below is positional, so it is only chronological if
    # the rows are already in date order. mergesort keeps ties stable.
    return data.sort_values(by='Date', kind='mergesort')


def _build_linear_pipeline(numerical_features, categorical_features):
    """Build the preprocessing + ``LinearRegression`` pipeline shared by all targets."""
    numeric_steps = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    categorical_steps = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_steps, numerical_features),
        ('cat', categorical_steps, categorical_features),
    ])
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ])


def _fit_and_predict(data, target, numerical_features, categorical_features, test_fraction):
    """Fit on the oldest rows of ``data`` and predict ``target`` on the newest ones.

    ``data`` must already be sorted by date; the first ``1 - test_fraction``
    share of rows is used for training and the remainder for testing.

    Returns:
        (pipeline, df_results, y_test, predictions) where ``df_results`` has
        the columns Date / Actual / Predicted / Exists in Insiders for the
        test rows, sorted by Date.

    Raises:
        ValueError: if the window holds too few rows to leave any training data.
    """
    split_index = int(len(data) * (1 - test_fraction))
    if split_index < 1:
        raise ValueError(
            f"Not enough rows ({len(data)}) in the selected window to train a model for {target!r}"
        )

    X = data.drop(target, axis=1)
    y = data[target]
    X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
    y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

    pipeline = _build_linear_pipeline(numerical_features, categorical_features)
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)

    # The date is rebuilt from its year/month/day parts (as the original code
    # did), which drops any time-of-day component.
    df_results = pd.DataFrame({
        'Date': X_test['year'].astype(str) + '-' +
                X_test['month'].astype(str).str.zfill(2) + '-' +
                X_test['day'].astype(str).str.zfill(2),
        'Actual': y_test,
        'Predicted': predictions,
        'Exists in Insiders': X_test['Exists in Insiders']
    }).reset_index(drop=True)
    df_results['Date'] = pd.to_datetime(df_results['Date'])
    df_results.sort_values('Date', inplace=True)
    return pipeline, df_results, y_test, predictions


def predict_stock_prices(df_stock_prices, symbol, date_start, date_end, y_predicted_target_desired='Low'):
    """Fit linear-regression models for one symbol and plot their predictions.

    For the rows of ``symbol`` with ``date_start <= Date <= date_end`` a
    separate model is trained for each of Low / High / Open / Close and for
    Volume. Each model uses the remaining price/volume columns plus the
    calendar columns as numeric features, and ``SYMBOL`` and
    ``Exists in Insiders`` as categorical features. The oldest 80% of rows
    are used for training and the newest 20% for evaluation.

    Parameters:
        df_stock_prices: DataFrame with at least the columns SYMBOL, Date,
            Open, High, Low, Close, Volume and 'Exists in Insiders'.
        symbol: value of the SYMBOL column to model.
        date_start, date_end: inclusive bounds compared against ``Date``.
        y_predicted_target_desired: which price series ('Low', 'High',
            'Open' or 'Close') is drawn on the y axis of the second subplot.

    Returns:
        A plotly figure with three rows: actual vs predicted prices, the
        chosen predicted price sized by the absolute Volume prediction error
        and coloured by insider status, and the Volume model's coefficients.

    Raises:
        ValueError: if the selected window has too few rows to train on.
        KeyError: if ``y_predicted_target_desired`` is not a price target.
    """
    colors_dark = dict(zip(['Open', 'High', 'Low', 'Close'], ['gray', 'magenta', 'darkblue', 'green']))
    colors_light = {
        'Open': 'lightgray',
        'High': 'lavenderblush',
        'Low': 'lightblue',
        'Close': 'lightgreen'
    }
    # Every model predicts one column from all the others, in this fixed order.
    all_numeric_columns = ['Open', 'High', 'Low', 'Close', 'Volume',
                           'year', 'month', 'day', 'day_of_week', 'is_weekend']
    valid_targets = {
        name: [column for column in all_numeric_columns if column != name]
        for name in ['Low', 'High', 'Open', 'Close', 'Volume']
    }
    categorical_features = ['SYMBOL', 'Exists in Insiders']
    price_targets = ['Low', 'High', 'Open', 'Close']
    test_percentage_split = 0.2
    metrics = {}

    # Create subplots layout
    # NOTE(review): shared_xaxes=True also links row 3's categorical feature
    # axis to the date axes of rows 1-2 - confirm this renders as intended.
    fig = make_subplots(rows=3, cols=1, shared_xaxes=True, subplot_titles=('Stock Prices Predictions', y_predicted_target_desired+' vs Delta Volume Predictions/Actual','Features Importance for Volume Prediction'))
    fig.update_xaxes(showticklabels=True, row=1, col=1)

    # The filtered, date-sorted window is identical for every target, so it is
    # built once; each model derives its own X/y from it without mutating it.
    data = _select_symbol_window(df_stock_prices, symbol, date_start, date_end)

    # Row 1: actual vs predicted line for each price target.
    price_results = {}
    for target in price_targets:
        _, df_results, y_test, predictions = _fit_and_predict(
            data, target, valid_targets[target], categorical_features, test_percentage_split
        )
        metrics[target] = {
            'R²': r2_score(y_test, predictions),
            'RMSE': root_mean_squared_error(y_test, predictions),
        }

        fig.add_trace(
            go.Scatter(
                x=df_results['Date'],
                y=df_results['Actual'],
                mode='lines',
                name=f'Actual {target}',
                line=dict(color=colors_light[target], width=2),
                hovertemplate=f'Date: %{{x|%Y-%m-%d}}<br>Actual {target}: %{{y:.2f}}<extra></extra>'
            ),
            row=1, col=1
        )
        fig.add_trace(
            go.Scatter(
                x=df_results['Date'],
                y=df_results['Predicted'],
                mode='lines',
                name=f'Predicted {target}',
                line=dict(color=colors_dark[target], width=2),
                hovertemplate=f'Date: %{{x|%Y-%m-%d}}<br>Predicted {target}: %{{y:.2f}}<extra></extra>'
            ),
            row=1, col=1
        )
        price_results[target] = df_results

    annotation_text = " | ".join([f"{target} - R²: {metrics[target]['R²']:.2f}, RMSE: {metrics[target]['RMSE']:.2e}" for target in price_targets])

    # Row 2: Volume model. Its absolute error ('Delta') sizes the markers.
    target = 'Volume'
    numerical_features = valid_targets[target]
    pipeline, df_results, y_test, predictions = _fit_and_predict(
        data, target, numerical_features, categorical_features, test_percentage_split
    )
    metrics[target] = {
        'R²': r2_score(y_test, predictions),
        'RMSE': root_mean_squared_error(y_test, predictions),
        'MAPE': mean_absolute_percentage_error(y_test, predictions),
    }
    df_results['Delta'] = abs(df_results['Actual'] - df_results['Predicted'])

    scatter_fig = px.scatter(
        df_results,
        x='Date',
        y=price_results[y_predicted_target_desired]['Predicted'],
        size='Delta',
        hover_data={'Date': '|%Y-%m-%d', 'Actual': ':.2e', 'Predicted': ':.2e', 'Delta': ':.2e'},
        color='Exists in Insiders',
        color_discrete_map={True: 'green', False: 'red'},
        labels={'color': 'Insider Status'},
        category_orders={'Exists in Insiders': [True, False]},
    )
    scatter_fig.for_each_trace(lambda t: t.update(name=t.name.replace("True", "Exists in Insiders").replace("False", "Does not exist in Insiders")))
    # Move the express traces into the shared figure with a custom hover text.
    for trace in scatter_fig.data:
        trace.hovertemplate = (
            f'Date: %{{x|%Y-%m-%d}}<br>'
            f'predicted_{y_predicted_target_desired} = %{{y:.2f}}<br>'
            f'Delta: %{{marker.size:.2e}}<extra></extra>'  # Ensure Delta is included in the hovertemplate
        )
        fig.add_trace(trace, row=2, col=1)

    # Row 3: bar chart of the Volume model's regression coefficients.
    feature_importance = pipeline.named_steps['regressor'].coef_
    # Output column order of the preprocessor: numeric features first, then
    # the one-hot encoded categorical columns.
    onehot_names = pipeline.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot'].get_feature_names_out(categorical_features)
    feature_names = np.concatenate([numerical_features, onehot_names])
    df_feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    df_feature_importance = df_feature_importance.sort_values(by='Importance', ascending=False)
    fig.add_trace(
        go.Bar(
            x=df_feature_importance['Feature'],
            y=df_feature_importance['Importance'],
            marker_color='blue',
            name='Feature Importance'  # Set the legend name here
        ),
        row=3, col=1
    )

    annotation_text_volume_only = f"{y_predicted_target_desired} vs Volume - R²: {metrics['Volume']['R²']:.2e}, RMSE: {metrics['Volume']['RMSE']:.2e}, MAPE: {metrics['Volume']['MAPE']:.2e}"
    # Average absolute Volume error, split by insider status and overall.
    average_delta_true = df_results[df_results['Exists in Insiders'] == True]['Delta'].mean()
    average_delta_false = df_results[df_results['Exists in Insiders'] == False]['Delta'].mean()
    average_delta = df_results['Delta'].mean()
    # The averages go on their own line of the annotation (<br> = line break).
    annotation_text_volume_only += "<br>"
    annotation_text_volume_only += f" Avg. Δ (With Insid.): {average_delta_true:.2e}"
    annotation_text_volume_only += f" | Avg. Δ (No Insid.): {average_delta_false:.2e}"
    annotation_text_volume_only += f" | Avg. Δ: {average_delta:.2e}"

    # Metric summaries, positioned in paper coordinates.
    fig.add_annotation(
        text=annotation_text,
        xref="paper", yref="paper",
        x=0.5, y=1.08,  # Position near the title of the first subplot
        xanchor='center',
        showarrow=False,
        font=dict(size=12),
    )

    fig.add_annotation(
        text=annotation_text_volume_only,
        xref="paper", yref="paper",
        x=0.5, y=0.01,  # Placed at the bottom of the figure (paper y=0.01)
        xanchor='center',
        showarrow=False,
        font=dict(size=12)
    )

    fig.update_layout(height=800)

    return fig

# The function below builds the Dash app, which displays the figure returned by predict_stock_prices.

def launch_predictions_app():
    """Build the Dash app around ``predict_stock_prices`` and start its server.

    The layout offers a symbol dropdown, a price-column dropdown, a date-range
    picker (defaulting to the first and last 2014 dates in the module-level
    ``df_stock_prices``) and a dark-mode checkbox. A single callback rebuilds
    the figure and restyles the page whenever any of those inputs changes.

    The call runs ``app.run_server(debug=True, port=32337)`` and therefore
    does not return until the server stops.
    """
    # Local import: the file-level import line does not bring this name in.
    from dash.exceptions import PreventUpdate

    # Initialize the Dash app with Bootstrap
    app = Dash('Stocks & Insiders Predictions App', external_stylesheets=[dbc.themes.BOOTSTRAP])

    # Default date range: the span of 2014 dates present in the data.
    dates_2014 = df_stock_prices.loc[df_stock_prices['Date'].dt.year == 2014, 'Date']

    # Define the app layout
    app.layout = html.Div([
        dbc.Container([
            html.H1("Stocks & Insiders Predictions", className='text-center mb-4'),
            dbc.Row([
                # Left Column: Stocks Layout
                dbc.Col([
                    html.H4("Stock Prices Controls"),
                    dbc.Row([
                        dbc.Col([
                            html.Label("Choose a symbol:"),
                            dcc.Dropdown(
                                id='symbol',
                                options=[{'label': i, 'value': i} for i in df_stock_prices['SYMBOL'].unique()],
                                value='AAPL',
                                clearable=False,
                                style={'backgroundColor': '#ffffff', 'color': 'black'}
                            ),
                        ], width=6),
                        dbc.Col([
                            html.Label("Choose a column:"),
                            dcc.Dropdown(
                                id='column',
                                options=[
                                    {'label': 'Low', 'value': 'Low'},
                                    {'label': 'High', 'value': 'High'},
                                    {'label': 'Close', 'value': 'Close'},
                                    {'label': 'Open', 'value': 'Open'}
                                ],
                                value='Low',
                                clearable=False,
                                style={'backgroundColor': '#ffffff', 'color': 'black'}
                            ),
                        ], width=6),
                    ]),
                    dbc.Row([
                        dbc.Col([
                            html.Label("Choose a date range:"),
                        ], width=2),
                        dbc.Col([
                            dcc.DatePickerRange(
                                id='date_range',
                                start_date=dates_2014.min(),
                                end_date=dates_2014.max(),
                                style={'backgroundColor': '#ffffff', 'color': 'black'}
                            ),
                        ], width=4),
                        dbc.Col([
                            dcc.Checklist(
                                id='theme-toggle',
                                options=[
                                    {'label': 'Dark Mode', 'value': 'dark'}
                                ],
                                value=[]
                            ),
                        ], width=3)
                    ]),
                    dcc.Graph(id='stock_prices', config={'responsive': True})
                ], width=12, lg=6,style={'padding': '0px', 'width': '100%'}),
            ],justify='center',style={'padding': '0px', 'width': '100%'})
        ], fluid=True)
    ], id='main-div', style={'padding': '0px', 'width': '100%','backgroundColor': '#f8f9fa'})  # Light mode default

    # Style shared by the dropdowns and the date picker.
    def get_component_style(theme):
        # NOTE(review): the dark branch is hard-disabled (condition is the
        # literal False), so the controls always get the light style even
        # when dark mode is ticked - confirm whether that is intended.
        if False:#'dark' in theme:
            return {
                'backgroundColor': '#2c2c2c',  # Dark background for dropdowns and date pickers
                'color': 'red',
                'border': '1px solid #444444',
            }
        else:
            return {
                'backgroundColor': '#E7E0E0',  # Light background for dropdowns and date pickers
                'color': 'black',
                'border': '1px solid #cccccc',
            }

    @app.callback(
        Output('stock_prices', 'figure'),
        Output('main-div', 'style'),
        Output('symbol', 'style'),
        Output('column', 'style'),
        Output('date_range', 'style'),
        [Input('symbol', 'value'),
        Input('column', 'value'),
        Input('date_range', 'start_date'),
        Input('date_range', 'end_date'),
        Input('theme-toggle', 'value')]
    )
    def update_figure(symbol, column, start_date1, end_date1, theme):
        """Rebuild the prediction figure and the theme styles for the current inputs."""
        # Either bound may be None (e.g. while a new range is being picked);
        # comparing None with a date string raises TypeError, so leave the
        # page untouched until both ends of the range are available.
        if start_date1 is None or end_date1 is None:
            raise PreventUpdate

        dropdown_style = get_component_style(theme)

        # Determine theme styles
        if 'dark' in theme:
            main_div_style = {'backgroundColor': '#2c2c2c', 'color': 'white'}
        else:
            main_div_style = {'backgroundColor': '#f8f9fa', 'color': 'black'}

        # If the range arrives reversed, swap the bounds rather than failing.
        if start_date1 > end_date1:
            start_date1, end_date1 = end_date1, start_date1

        fig_stock_prices = predict_stock_prices(df_stock_prices, symbol, start_date1, end_date1, column)
        fig_stock_prices.update_layout(
            plot_bgcolor=main_div_style['backgroundColor'],
            paper_bgcolor=main_div_style['backgroundColor'],
            font=dict(color=main_div_style['color'])
        )
        # The same style dict is used for both dropdowns and the date picker.
        return fig_stock_prices, main_div_style, dropdown_style, dropdown_style, dropdown_style

    # Run the app
    # NOTE(review): newer Dash releases favour `app.run(...)` over
    # `app.run_server(...)` - confirm against the installed Dash version.
    app.run_server(debug=True, port=32337)


# Build the Dash app and start its server as soon as this cell/script runs.
launch_predictions_app()