# Imports

In [1]:
import pandas as pd
import os
import numpy as np
import re
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, html, dcc, Input, Output
import plotly.io as pio
import dash_bootstrap_components as dbc
from plotly.subplots import make_subplots
import random
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# from sklearn.metrics import mean_squared_error# deprecated
from sklearn.metrics import root_mean_squared_error,make_scorer# alternative


from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import xgboost as xgb
from datetime import timedelta



# Data Overview


- **Insider Trading Data:**
    - Total Rows: 1,322,820
    - Unique Symbols: 7,877

- **Stock Prices Data:**
    - Total Rows: 5,442,556
    - Unique Symbols: 7,163

- **Merged Data:**
    - Total Rows: 978,647
    - Unique Symbols: 4,450

- **Working Business Days (2014-2017):** 1,043




## Columns Description

- **Insider Trading Data Columns:**
    - `TRANS_DATE`, `TRANS_SHARES`, `TRANS_PRICEPERSHARE`, `SHRS_OWND_FOLWNG_TRANS`
    - `EQUITY_SWAP_INVOLVED`, `TRANS_TIMELINESS`, `TRANS_ACQUIRED_DISP_CD`, `DIRECT_INDIRECT_OWNERSHIP`
    - `FILING_DATE`, `PERIOD_OF_REPORT`
    - `ISSUERTRADINGSYMBOL` (same as `SYMBOL` in stock prices data)
    - `RPTOWNER_RELATIONSHIP` (e.g., ten percent owner, director, officer, etc.)

- **Stock Prices Data Columns:**
    - `Date`, `Open`, `High`, `Low`, `Close`, `Volume`, `SYMBOL`

## Scratchpad

- The columns such as `TRANS_DATE`, `TRANS_SHARES`, `TRANS_PRICEPERSHARE`, and `SHRS_OWND_FOLWNG_TRANS` can be used to relate insider trading transactions to stock prices.
- Flags like `EQUITY_SWAP_INVOLVED`, `TRANS_TIMELINESS`, `TRANS_ACQUIRED_DISP_CD`, and `DIRECT_INDIRECT_OWNERSHIP` provide additional context for each transaction.
- `FILING_DATE` and `PERIOD_OF_REPORT` can help in processing and predicting stock prices in relation to insider trading data.
- `RPTOWNER_RELATIONSHIP` can be used to analyze the effect of a person's role on insider trading transactions and their impact on stock prices.
- The insider trading dataset is naturally smaller than the stock prices dataset, since not every company has insider trading records.
- The merged data could be useful for predicting stock prices based on insider trading data, showing a direct daily relationship between insider trading data and stock prices.
- There will be many more data points in the stock prices that have no corresponding insider trading data, indicating an indirect relationship between insider trading data and stock prices.
- In our plot, we can first plot all stock prices and then color-code the points that have insider trading data versus those that don't.
- Some insider transactions have a stock price of zero! We need to handle that somehow.
- Predicting `Low`, `High`, `Close`, and `Open` already works very well with regression, even without using insider trading data.
- Predicting `Volume` is trickier and might be improved with insider trading data.

# Feature Engineering / Processing

In [None]:
# Define the paths of the three interim input CSV files.
interim_dir = os.path.join('..', 'data', 'interim')
insider_transactions_path = os.path.join(interim_dir, 'insider_transactions', 'interim_insider_transactions.csv')
stock_prices_path = os.path.join(interim_dir, 'stock_prices', 'interim_stock_prices.csv')
merged_path = os.path.join(interim_dir, 'merged_insider_transactions_stock_prices', 'interim_merged_insider_transactions_stock_prices.csv')
# NOTE: the CSV files are deliberately NOT loaded here. Every processing cell
# further down calls pd.read_csv on its own path before using the dataframe,
# so loading them in this cell as well only parsed each file twice and threw
# the first copy away.
# --------------------------------------------
def clean_data(df):
    """Collapse combined ``RPTOWNER_RELATIONSHIP`` labels to a single role.

    A value that contains several roles is reduced to the first role found,
    checked in this priority order: ``TenPercentOwner``, ``Director``,
    ``Officer``. Values containing none of these roles are left unchanged.
    Non-string cells (e.g. NaN from an empty CSV field) are passed through
    untouched instead of raising ``TypeError`` on the substring test.

    Args:
        df: DataFrame with a ``RPTOWNER_RELATIONSHIP`` column. The column is
            overwritten in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    role_priority = ('TenPercentOwner', 'Director', 'Officer')

    def _collapse(value):
        # Substring matching only makes sense for strings; keep NaN etc. as is.
        if not isinstance(value, str):
            return value
        for role in role_priority:
            if role in value:
                return role
        return value

    # One pass over the column instead of one pass per role.
    df['RPTOWNER_RELATIONSHIP'] = df['RPTOWNER_RELATIONSHIP'].map(_collapse)
    return df
# ------------------------------------------------------------------------------


In [3]:
# Read the merged insider/stock dataset and collapse the owner-relationship labels
df_merged = clean_data(pd.read_csv(merged_path))
# Parse the transaction date so it can be sorted and formatted
df_merged['TRANS_DATE'] = pd.to_datetime(df_merged['TRANS_DATE'])
# Chronological order (needed later, when this frame is the right side of merge_asof)
df_merged = df_merged.sort_values(by='TRANS_DATE')
# Monetary size of each transaction: price per share times number of shares
df_merged['TransactionValue'] = df_merged['TRANS_PRICEPERSHARE'] * df_merged['TRANS_SHARES']
# Build a 'YYYY-MM-DD_SYMBOL' string key per row and keep the distinct keys in a
# set, so membership checks from the other dataframes are fast
merged_dates = df_merged['TRANS_DATE'].dt.strftime('%Y-%m-%d')
df_merged['Date_Symbol'] = merged_dates + '_' + df_merged['SYMBOL']
date_symbol_set = set(df_merged['Date_Symbol'])
# ------------------------------------------------------------------------------

In [4]:
# Read the insider transactions and collapse the owner-relationship labels
df_insider_transactions = clean_data(pd.read_csv(insider_transactions_path))
df_insider_transactions['TRANS_DATE'] = pd.to_datetime(df_insider_transactions['TRANS_DATE'])
# Chronological order, so that per-symbol shifts and rolling windows look backwards in time
df_insider_transactions = df_insider_transactions.sort_values(by='TRANS_DATE')
# Monetary size of each transaction: price per share times number of shares
df_insider_transactions['TransactionValue'] = (
    df_insider_transactions['TRANS_PRICEPERSHARE'] * df_insider_transactions['TRANS_SHARES']
)
# Vectorized membership test: does the merged dataset contain this (date, symbol) pair?
insider_dates = df_insider_transactions['TRANS_DATE'].dt.strftime('%Y-%m-%d')
df_insider_transactions['Date_Symbol'] = insider_dates + '_' + df_insider_transactions['ISSUERTRADINGSYMBOL']
df_insider_transactions['Exists in Stock Prices'] = df_insider_transactions['Date_Symbol'].isin(date_symbol_set)
# Use the same symbol column name as the stock prices data
df_insider_transactions = df_insider_transactions.rename(columns={'ISSUERTRADINGSYMBOL': 'SYMBOL'})
by_symbol = df_insider_transactions.groupby('SYMBOL')
# Lag features for TRANS_SHARES and TRANS_PRICEPERSHARE per SYMBOL.
# The lag counts rows, i.e. the value 7 / 21 recorded transactions earlier for
# the same symbol, not the value 7 / 21 calendar days earlier.
lags = [7, 21]
for lag in lags:
    for source_column in ('TRANS_SHARES', 'TRANS_PRICEPERSHARE'):
        df_insider_transactions[f'{source_column}_Lag{lag}'] = by_symbol[source_column].shift(lag)
# Moving averages of TransactionValue per SYMBOL over the last 7 / 21 rows
# (min_periods=1: shorter windows at the start of a symbol's history still yield a value)
moving_averages = [7, 21]
for window in moving_averages:
    df_insider_transactions[f'TransactionValue_MA{window}'] = by_symbol['TransactionValue'].transform(
        lambda values: values.rolling(window=window, min_periods=1).mean()
    )
# ------------------------------------------------------------------------------



In [None]:
# Load and clean the stock prices dataframe
df_stock_prices = pd.read_csv(stock_prices_path)
# A negative 'Low' is replaced by the 'Open' of the same row
negative_low = df_stock_prices['Low'] < 0
df_stock_prices.loc[negative_low, 'Low'] = df_stock_prices.loc[negative_low, 'Open']
df_stock_prices['Date'] = pd.to_datetime(df_stock_prices['Date'])
# merge_asof below requires the left frame to be sorted on its 'on' key
df_stock_prices = df_stock_prices.sort_values(['Date'])
# Days elapsed since the earliest date in the dataset
df_stock_prices['DateNumeric'] = (df_stock_prices['Date'] - df_stock_prices['Date'].min()).dt.days
# Volume times the row-wise mean of the four price columns
df_stock_prices['MeanTotalValue'] = df_stock_prices['Volume'] * df_stock_prices[['Low', 'High', 'Open', 'Close']].mean(axis=1)

# Lag features per symbol for Open, Close, High, Low and Volume.
# shift(lag) moves by rows, i.e. `lag` earlier records of the same symbol.
price_columns = ['Open', 'Close', 'High', 'Low', 'Volume']
lags = [1, 3, 7]
grouped = df_stock_prices.groupby('SYMBOL')
for lag in lags:
    for column in price_columns:
        df_stock_prices[f'{column}_Lag{lag}'] = grouped[column].shift(lag)

# Moving-average features per symbol for Open, Close, High, Low, Volume and
# MeanTotalValue. The rolling window ends at (and includes) the current row.
# NOTE(review): if these columns are later used to predict the same row's
# value, the current value is part of the feature - confirm this is intended.
moving_averages = [3, 7]
for window in moving_averages:
    for column in price_columns + ['MeanTotalValue']:
        # rolling() on a groupby prepends the group key to the index; dropping
        # that level restores the original row index so the assignment aligns.
        df_stock_prices[f'{column}_MA{window}'] = (
            grouped[column]
            .rolling(window=window, min_periods=1)
            .mean()
            .reset_index(level=0, drop=True)
        )

# Vectorized check for 'Exists in Insiders' (much faster than using .apply()).
# date_symbol_set holds the 'YYYY-MM-DD_SYMBOL' keys present in df_merged.
df_stock_prices['Date_Symbol'] = df_stock_prices['Date'].dt.strftime('%Y-%m-%d') + '_' + df_stock_prices['SYMBOL']
df_stock_prices['Exists in Insiders'] = df_stock_prices['Date_Symbol'].isin(date_symbol_set)

# Flag rows that have a merged insider transaction for the same symbol within
# the previous 7 / 21 days. A single backward merge_asof finds the most recent
# transaction on or before each date; that transaction lies inside an N-day
# window exactly when any transaction does, so one merge serves both flags
# (same result as two merges with tolerance='7D' and tolerance='21D').
df_stock_prices = pd.merge_asof(
    df_stock_prices,
    df_merged[['TRANS_DATE', 'SYMBOL']],
    by='SYMBOL',
    left_on='Date',
    right_on='TRANS_DATE',
    direction='backward'
)
time_since_last_transaction = df_stock_prices['Date'] - df_stock_prices['TRANS_DATE']
for horizon_days in (7, 21):
    # Rows without any prior transaction have a NaT gap, which compares as False
    df_stock_prices[f'InsiderTransactionInLast{horizon_days}Days'] = (
        time_since_last_transaction <= pd.Timedelta(days=horizon_days)
    )
# The helper date column is no longer needed
df_stock_prices = df_stock_prices.drop(columns=['TRANS_DATE'])

# For every stock row, attach the lag / moving-average features of the most
# recent insider transaction (same symbol, on or before the stock date, no
# time limit) taken from df_insider_transactions.
insider_cols = [
    'SYMBOL',
    'TRANS_DATE',
    'TransactionValue_MA7', 'TRANS_PRICEPERSHARE_Lag7', 'TRANS_SHARES_Lag7',
    'TransactionValue_MA21', 'TRANS_PRICEPERSHARE_Lag21', 'TRANS_SHARES_Lag21'
]
df_stock_prices = pd.merge_asof(
    df_stock_prices,
    df_insider_transactions[insider_cols],
    left_on='Date',
    right_on='TRANS_DATE',
    by='SYMBOL',
    direction='backward'
)
# Prefix the insider feature columns with 'insider_'
insider_columns = [
    'TransactionValue_MA7', 'TRANS_PRICEPERSHARE_Lag7', 'TRANS_SHARES_Lag7',
    'TransactionValue_MA21', 'TRANS_PRICEPERSHARE_Lag21', 'TRANS_SHARES_Lag21'
]
df_stock_prices = df_stock_prices.rename(columns={name: 'insider_' + name for name in insider_columns})
# Drop the helper 'TRANS_DATE' column brought in by the merge
df_stock_prices = df_stock_prices.drop(columns='TRANS_DATE')
# ------------------------------------------------------------------------------


## Save in CSV 'processed'

In [7]:
# Write the three processed dataframes to ../data/processed, each in its own subfolder:
#   insider_transactions                      <- df_insider_transactions
#   stock_prices                              <- df_stock_prices
#   merged_insider_transactions_stock_prices  <- df_merged
processed_dir = os.path.join('..', 'data', 'processed')
insider_transactions_folder = os.path.join(processed_dir, 'insider_transactions')
stock_prices_folder = os.path.join(processed_dir, 'stock_prices')
merged_folder = os.path.join(processed_dir, 'merged_insider_transactions_stock_prices')
processed_outputs = [
    (insider_transactions_folder, 'processed_insider_transactions.csv', df_insider_transactions),
    (stock_prices_folder, 'processed_stock_prices.csv', df_stock_prices),
    (merged_folder, 'processed_merged_insider_transactions_stock_prices.csv', df_merged),
]
# Make sure every target folder exists before anything is written
for target_folder, _, _ in processed_outputs:
    os.makedirs(target_folder, exist_ok=True)
# Save each dataframe without its index column
for target_folder, file_name, frame in processed_outputs:
    frame.to_csv(os.path.join(target_folder, file_name), index=False)
# ------------------------------------------------------------------------------
