# **Data Collection Notebook**

**Objective:**
Collect and preprocess raw data related to insider transactions and stock prices.
Fetch data from Kaggle and save it as raw data, inspect and save it.

**Inputs:**
- Raw TSV files: `NONDERIV_TRANS.tsv`, `SUBMISSION.tsv`, `REPORTINGOWNER.tsv`
- Stock price data files in `../data/raw/stock_prices/` directory
- Kaggle credentials (`KAGGLE_USERNAME` and `KAGGLE_KEY`), read from a `.env` file or from the environment.

**Outputs:**
- Interim CSV files:
  - `interim_insider_transactions.csv`
  - `interim_stock_prices.csv`
  - `interim_merged_insider_transactions_stock_prices.csv`

---


# Step 1: Imports & Kaggle Endpoint 

In [None]:
import sys
import json
from pathlib import Path
import os
import pandas as pd
import numpy as np
import re
from dotenv import load_dotenv
import zipfile
import shutil
import subprocess

# Change working directory

* Access current directory and change to parent directory

In [None]:
# Remember where the notebook was started from
current_dir = os.getcwd()
current_dir

# Move one level up so the relative 'data/...' paths used below resolve
# against the parent directory.
# NOTE(review): this is not idempotent - every re-run of this cell climbs
# one more directory level; check the confirmation cell before continuing.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

Confirm Current Directory

In [None]:
# Re-read the working directory after the chdir above; as the last expression
# of the cell its value is what the notebook displays.
current_dir = os.getcwd()
current_dir

---

# Fetch data from Kaggle

### Set Up Credentials

In [None]:
# The .env file is expected in the current working directory (MarketPulseAnalytics)
env_path = os.path.join(os.getcwd(), '.env')

# Use the local .env file when there is one; otherwise rely on whatever the
# hosting environment already exports
if not os.path.exists(env_path):
    print("No .env file found in the current directory. Ensure environment variables are set in the hosting environment.")
else:
    load_dotenv(env_path)
    print(".env file loaded from:", env_path)

# Read the Kaggle credentials from the environment
kaggle_username = os.getenv('KAGGLE_USERNAME')
kaggle_key = os.getenv('KAGGLE_KEY')

# Both values must be non-empty for the credentials to count as configured
if kaggle_username and kaggle_key:
    print("Environment variables loaded successfully.")
else:
    print("Warning: KAGGLE_USERNAME and/or KAGGLE_KEY environment variables are not set. Make sure they are configured in the production environment.")


Set the download paths

In [None]:
# Shared roots: zip archives are downloaded under one, extracted files live under the other
downloaded_root = 'data/downloaded/'
raw_root = 'data/raw/'

# Stock prices: where the archive is downloaded, its file name, and where it is unpacked
stock_prices_download_path = downloaded_root + 'zip_stock_prices/'
stock_prices_filename = "price-volume-data-for-all-us-stocks-etfs.zip"
stock_prices_unzip_path = raw_root + 'stock_prices/'

# Insider transactions: same three settings
insider_transactions_download_path = downloaded_root + 'zip_insider_transactions/'
insider_transactions_filename = 'sec-insider-transactions.zip'
insider_transactions_unzip_path = raw_root + 'insider_transactions/'

Define the Kaggle datasets

In [None]:
# Kaggle dataset identifiers; each one is handed to the Kaggle CLI's "-d"
# option when the corresponding archive has to be downloaded
stock_prices_dataset = "borismarjanovic/price-volume-data-for-all-us-stocks-etfs"
insider_transactions_dataset = "osawani/sec-insider-transactions"

Download the dataset

In [None]:
# Function to download the dataset using Kaggle CLI
def download_dataset(dataset_name, download_path):
    """Download a Kaggle dataset into ``download_path`` using the Kaggle CLI.

    Args:
        dataset_name: Dataset identifier given to the CLI's ``-d`` option.
        download_path: Directory given to the CLI's ``-p`` option.

    Returns:
        True when the CLI exited with status 0, False when it could not be
        started or exited with a non-zero status.
    """
    # Build the command as an argument list and run it without a shell, so
    # spaces or shell metacharacters in the arguments cannot alter the command.
    command = ["kaggle", "datasets", "download", "-d", str(dataset_name), "-p", str(download_path)]

    # Print the command for debugging purposes
    print(f"Running command: {' '.join(command)}")

    try:
        result = subprocess.run(command, check=False)
    except OSError as error:
        # Raised when the executable is missing or cannot be launched
        print(f"Error: could not run the Kaggle CLI ({error}). Is the 'kaggle' package installed and on PATH?")
        return False

    # Only report success when the CLI itself reported success
    if result.returncode != 0:
        print(f"Error: download of {dataset_name} failed with exit code {result.returncode}")
        return False

    print(f"Dataset {dataset_name} downloaded successfully to {download_path}")
    return True

# Function that downloads a dataset only when its archive is not on disk yet
def check_and_download_file(folder_path, filename, dataset_name):
    """Make sure ``filename`` is present in ``folder_path``, downloading if needed.

    Returns True when the file was already there. Returns False when it was
    missing and ``download_dataset`` was called for ``dataset_name``.
    """
    target = os.path.join(folder_path, filename)

    if not os.path.exists(target):
        # Missing: fetch the whole dataset into the same folder
        print(f"File {filename} does NOT exist in {folder_path}. Downloading now...")
        download_dataset(dataset_name, folder_path)
        return False

    print(f"File {filename} already exists in {folder_path}")
    return True

Define the unzip helpers

In [None]:
# Function to unzip the Stock Prices dataset (only the 'Stocks' folder)
def unzip_stock_prices(zip_path, unzip_path, specific_folder='Stocks'):
    """Extract one folder of the stock price archive and move its contents.

    The members under ``specific_folder`` are first extracted next to the zip
    file, then every item inside that folder is moved into ``unzip_path`` and
    the temporary folder is removed.

    Args:
        zip_path: Path of the downloaded zip archive.
        unzip_path: Directory that receives the folder's contents; it is
            created when it does not exist yet.
        specific_folder: Top-level folder inside the archive to extract.

    Returns:
        None. Progress and error messages are printed.
    """
    if not os.path.exists(zip_path):
        print(f"Error: {zip_path} does not exist. Please download the zip file first.")
        return

    # The folder is extracted into the directory that holds the zip file
    zip_dir = os.path.dirname(zip_path)

    print(f"Unzipping {zip_path} to {zip_dir}...")

    # Zip member names always use '/' as separator. Requiring the separator
    # keeps sibling folders that merely share the prefix (e.g. 'Stocks_old/')
    # out of the extraction.
    member_prefix = specific_folder.rstrip('/') + '/'
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        members = [name for name in zip_ref.namelist() if name.startswith(member_prefix)]
        zip_ref.extractall(zip_dir, members=members)

    # Path to the extracted folder
    stocks_folder_path = os.path.join(zip_dir, specific_folder)

    if not os.path.exists(stocks_folder_path):
        # Nothing was extracted, so do not report a successful unzip
        print(f"Error: The folder '{specific_folder}' was not found in the zip file.")
        return

    # shutil.move needs the destination directory to exist
    os.makedirs(unzip_path, exist_ok=True)

    # Files and sub-directories are moved the same way
    for item in os.listdir(stocks_folder_path):
        shutil.move(os.path.join(stocks_folder_path, item), os.path.join(unzip_path, item))

    # Remove the now-empty temporary folder
    shutil.rmtree(stocks_folder_path)

    print(f"Moved contents from '{stocks_folder_path}' to '{unzip_path}' and deleted the folder.")
    print(f"Unzipping complete. Files extracted and moved to {unzip_path}.")

# Function that unpacks the complete Insider Transactions archive
def unzip_insider_transactions(zip_path, unzip_path):
    """Extract every member of the archive at ``zip_path`` into ``unzip_path``.

    Prints an error and does nothing when the archive is missing.
    """
    if os.path.exists(zip_path):
        print(f"Unzipping {zip_path} to {unzip_path}...")

        # No member filtering here: the whole archive is wanted
        with zipfile.ZipFile(zip_path, 'r') as archive:
            archive.extractall(unzip_path)

        print(f"Unzipping complete. Files extracted to {unzip_path}.")
    else:
        print(f"Error: {zip_path} does not exist. Please download the zip file first.")

In [None]:
# 1. Check and download the Stock Prices dataset if the file doesn't exist
#    (the call returns True when the zip was already present, False when a download was triggered)
check_and_download_file(stock_prices_download_path, stock_prices_filename, stock_prices_dataset)


In [None]:

# 2. Check and download the Insider Transactions dataset if the file doesn't exist
#    (the call returns True when the zip was already present, False when a download was triggered)
check_and_download_file(insider_transactions_download_path, insider_transactions_filename, insider_transactions_dataset)


In [None]:

# 3. Unzip the Stock Prices dataset (only the 'Stocks' folder)
# The archive path is the download folder joined with the archive file name
zip_stock_prices_path = os.path.join(stock_prices_download_path, stock_prices_filename)
unzip_stock_prices(zip_stock_prices_path, stock_prices_unzip_path)


In [None]:

# 4. Unzip the Insider Transactions dataset (all contents)
# The archive path is the download folder joined with the archive file name
zip_insider_transactions_path = os.path.join(insider_transactions_download_path, insider_transactions_filename)
unzip_insider_transactions(zip_insider_transactions_path, insider_transactions_unzip_path)

# NOTE(review): this message is printed unconditionally; the helpers above
# report problems by printing, so it does not prove that every step succeeded.
print("Download and extraction complete!")

---

# Step 2: Insider Trading ETL

## 2a: NONDERIV_TRANS.tsv (from different quarters and years)

### read all files and concat

In [None]:
# Build the path of every quarterly NONDERIV_TRANS.tsv file for 2014 through 2017
files = []
for year in range(2014, 2018):
    for quarter in range(1, 5):
        files.append(os.path.join('data', 'raw', 'insider_transactions',
                                  f'{year}q{quarter}_form345', 'NONDERIV_TRANS.tsv'))

# Load each file that is present; missing or unreadable files are reported and skipped
dataframes = []
for file in files:
    if not os.path.exists(file):
        print(f'File {file} does not exist')
        continue
    try:
        temp = pd.read_csv(file, sep='\t', low_memory=False)
    except Exception as e:
        print(f'Error reading {file}: {e}')
    else:
        dataframes.append(temp)

# Stack the quarterly frames into one DataFrame with a fresh index
# (pd.concat raises ValueError when the list is empty, i.e. no file was read)
df = pd.concat(dataframes, ignore_index=True)

### remove unwanted columns

In [None]:
# Columns removed from the transactions table: they either contain too many
# missing values or are not needed downstream
columns_to_drop = [
    'DIRECT_INDIRECT_OWNERSHIP_FN',
    'NATURE_OF_OWNERSHIP',
    'NATURE_OF_OWNERSHIP_FN',
    'VALU_OWND_FOLWNG_TRANS',
    'VALU_OWND_FOLWNG_TRANS_FN',
    'SHRS_OWND_FOLWNG_TRANS_FN',
    'TRANS_ACQUIRED_DISP_CD_FN',
    'TRANS_PRICEPERSHARE_FN',
    'TRANS_SHARES_FN',
    'TRANS_TIMELINESS_FN',
    'EQUITY_SWAP_TRANS_CD_FN',
    'TRANS_CODE',
    'TRANS_FORM_TYPE',
    'DEEMED_EXECUTION_DATE_FN',
    'DEEMED_EXECUTION_DATE',
    'TRANS_DATE_FN',
    'SECURITY_TITLE_FN',
    'SECURITY_TITLE',
]
# errors='ignore' drops only the listed columns that actually exist in df
df.drop(columns=columns_to_drop, errors='ignore', inplace=True)

# Function to correct the year format
def correct_year_format(date_str):
    """Rewrite a ``DD-MON-00YY`` date string as ``DD-MON-20YY``.

    Args:
        date_str: A date string such as ``'13-MAR-0014'``. Any other value
            (including non-strings such as NaN or None) is returned unchanged.

    Returns:
        The string with the leading ``00`` of the year replaced by ``20``,
        or the input itself when it does not start with that pattern.
    """
    # Missing values are not strings and would make the regex call raise TypeError
    if not isinstance(date_str, str):
        return date_str
    # Anchor the substitution on the year field so a '00' elsewhere in the
    # string is never the one that gets rewritten
    return re.sub(r'^(\d{2}-\w{3}-)00(?=\d{2})', r'\g<1>20', date_str)

# Apply the function to the TRANS_DATE column
# (element by element; each value is replaced by what correct_year_format returns for it)
df['TRANS_DATE'] = df['TRANS_DATE'].apply(correct_year_format)



### adjust column values mapping

In [None]:

# EQUITY_SWAP_INVOLVED: 0 = false, 1 = true
# Compare on the text form so numeric, boolean and textual spellings line up
df['EQUITY_SWAP_INVOLVED'] = df['EQUITY_SWAP_INVOLVED'].astype(str)
print(df['EQUITY_SWAP_INVOLVED'].unique())

# One lookup from every recognised spelling to a real boolean; any value that
# is not listed (for example 'nan') becomes NaN.
# NOTE(review): float-style text such as '0.0' / '1.0' is not listed and would
# also become NaN - confirm that it does not occur in the data.
equity_swap_lookup = {
    'false': False,
    '0': False,
    'False': False,
    '1': True,
    'true': True,
    'True': True,
}
df['EQUITY_SWAP_INVOLVED'] = df['EQUITY_SWAP_INVOLVED'].map(equity_swap_lookup)

# Show the distinct values after the conversion
print(df['EQUITY_SWAP_INVOLVED'].unique())



In [None]:

# TRANS_TIMELINESS legend: E = early, L = late, O = on time
timeliness = df['TRANS_TIMELINESS']
print(timeliness.unique())

# Missing values are stored as 'O'
df['TRANS_TIMELINESS'] = timeliness.replace(np.nan, 'O')
print(df['TRANS_TIMELINESS'].unique())


### remove rows where SHRS_OWND_FOLWNG_TRANS is NaN or TRANS_PRICEPERSHARE is NaN

In [None]:
# A row is kept only when both of these columns hold a value
required_columns = ['SHRS_OWND_FOLWNG_TRANS', 'TRANS_PRICEPERSHARE']
df = df.dropna(subset=required_columns)
print(df)

### print the dataframe summary

In [None]:
# Column code legend:
#   TRANS_ACQUIRED_DISP_CD: A = acquired, D = disposed
#   DIRECT_INDIRECT_OWNERSHIP: D = direct, I = indirect
#   EQUITY_SWAP_INVOLVED: 0 = false, 1 = true

# DataFrame.info() prints the summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()

## 2b: SUBMISSION.tsv (from different quarters and years)

### read all files and concat

In [None]:
# Build the path of every quarterly SUBMISSION.tsv file for 2014 through 2017
files = []
for year in range(2014, 2018):
    for quarter in range(1, 5):
        files.append(os.path.join('data', 'raw', 'insider_transactions',
                                  f'{year}q{quarter}_form345', 'SUBMISSION.tsv'))

# Load each file that is present; missing or unreadable files are reported and skipped
dataframes = []
for file in files:
    if not os.path.exists(file):
        print(f'File {file} does not exist')
        continue
    try:
        temp = pd.read_csv(file, sep='\t', low_memory=False)
    except Exception as e:
        print(f'Error reading {file}: {e}')
    else:
        dataframes.append(temp)

# Stack the quarterly frames into one DataFrame with a fresh index
# (pd.concat raises ValueError when the list is empty, i.e. no file was read)
df2 = pd.concat(dataframes, ignore_index=True)


### columns to keep

In [None]:
# We only keep columns: ACCESSION_NUMBER, FILING_DATE, PERIOD_OF_REPORT, ISSUERNAME, ISSUERTRADINGSYMBOL
columns_to_keep = ['ACCESSION_NUMBER', 'FILING_DATE', 'PERIOD_OF_REPORT', 'ISSUERNAME', 'ISSUERTRADINGSYMBOL']
# Drop every column that is not in the keep list
df2.drop(columns=[col for col in df2.columns if col not in columns_to_keep], inplace=True)
# DataFrame.info() prints the summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df2.info()


In [None]:
# The same company name should have the same trading symbol.
# When ISSUERTRADINGSYMBOL is NaN but ISSUERNAME is known, the symbol is taken
# from another row of the same ISSUERNAME that does have a symbol.
# Rows without an ISSUERNAME cannot be repaired; rows whose symbol is still
# NaN afterwards are dropped.

# Lookup ISSUERNAME -> ISSUERTRADINGSYMBOL, built only from rows where both are
# present (a NaN name must never act as a lookup key)
issuer_symbol_map = (
    df2.dropna(subset=['ISSUERTRADINGSYMBOL', 'ISSUERNAME'])
    .set_index('ISSUERNAME')['ISSUERTRADINGSYMBOL']
    .to_dict()
)

# Vectorised fill: only NaN symbols are touched, and names without an entry in
# the lookup map to NaN, so those rows stay NaN. This replaces a row-by-row
# DataFrame.apply(axis=1) that did the same work one Python call per row.
df2['ISSUERTRADINGSYMBOL'] = df2['ISSUERTRADINGSYMBOL'].fillna(
    df2['ISSUERNAME'].map(issuer_symbol_map)
)

# Drop rows where ISSUERTRADINGSYMBOL is still NaN
df2.dropna(subset=['ISSUERTRADINGSYMBOL'], inplace=True)

# Notes on the date columns:
# - FILING_DATE is when the form was filed with the commission.
# - TRANS_DATE is when the transaction was executed.
# - PERIOD_OF_REPORT can be before or on the same date as TRANS_DATE.
# - FILING_DATE may not be needed for predictions, but the report period can
#   be useful, e.g. the delta between PERIOD_OF_REPORT and TRANS_DATE as a feature.

# DataFrame.info() prints the summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df2.info()



## 2c: REPORTINGOWNER.tsv (from different quarters and years)

### read all files and concat

In [None]:
# Build the path of every quarterly REPORTINGOWNER.tsv file for 2014 through 2017
files = []
for year in range(2014, 2018):
    for quarter in range(1, 5):
        files.append(os.path.join('data', 'raw', 'insider_transactions',
                                  f'{year}q{quarter}_form345', 'REPORTINGOWNER.tsv'))

# Load each file that is present; missing or unreadable files are reported and skipped
dataframes = []
for file in files:
    if not os.path.exists(file):
        print(f'File {file} does not exist')
        continue
    try:
        temp = pd.read_csv(file, sep='\t', low_memory=False)
    except Exception as e:
        print(f'Error reading {file}: {e}')
    else:
        dataframes.append(temp)

# Stack the quarterly frames into one DataFrame with a fresh index
# (pd.concat raises ValueError when the list is empty, i.e. no file was read)
df3 = pd.concat(dataframes, ignore_index=True)



### remove unwanted columns

In [None]:
# Only keep RPTOWNER_RELATIONSHIP and ACCESSION_NUMBER
columns_to_keep = ['RPTOWNER_RELATIONSHIP', 'ACCESSION_NUMBER']
# Drop every column that is not in the keep list
df3.drop(columns=[col for col in df3.columns if col not in columns_to_keep], inplace=True)
# Drop rows where RPTOWNER_RELATIONSHIP is NaN
df3.dropna(subset=['RPTOWNER_RELATIONSHIP'], inplace=True)
# DataFrame.info() prints the summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df3.info()

## 2d: Joined NONDERIV_TRANS.tsv, SUBMISSION.tsv, REPORTINGOWNER.tsv

In [None]:
# df4 = df, df2 and df3 joined on ACCESSION_NUMBER (DataFrame.merge defaults to an inner join)
# NOTE(review): if df3 holds more than one row per ACCESSION_NUMBER, every
# matching transaction row is repeated once per such row - confirm this
# fan-out is intended.
df4 = df.merge(df2, on='ACCESSION_NUMBER').merge(df3, on='ACCESSION_NUMBER')
# DataFrame.info() prints the summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df4.info()
# Column code legend:
#   TRANS_ACQUIRED_DISP_CD: A = acquired, D = disposed
#   DIRECT_INDIRECT_OWNERSHIP: D = direct, I = indirect
#   EQUITY_SWAP_INVOLVED: 0 = false, 1 = true
#   TRANS_TIMELINESS: E = early, L = late, O = on time



# Step 3: Stock Prices ETL

## Read the files

In [None]:
# Each stock price file has the columns Date,Open,High,Low,Close,Volume,OpenInt.
# OpenInt is not needed and is removed.
# The files sit in data/raw/stock_prices and are named xxx.us.txt: the text
# before the first '.' is the stock symbol and the part after it is a country
# code (us). Only the symbol is used, stored in a new SYMBOL column, so that it
# can be matched against ISSUERTRADINGSYMBOL of the insider trading data.

# Collect every .txt file of the raw stock price folder
raw_prices_dir = os.path.join('data', 'raw', 'stock_prices')
files = []
for filename in os.listdir(raw_prices_dir):
    if filename.endswith('.txt'):
        files.append(os.path.join(raw_prices_dir, filename))

# Read the files one by one; unreadable files are reported and skipped
dataframes = []
for file in files:
    # The symbol is everything before the first '.' of the file name
    symbol = os.path.basename(file).partition('.')[0]
    if not os.path.exists(file):
        print(f'File {file} does not exist')
        continue
    try:
        temp = pd.read_csv(file, sep=',', low_memory=False)
        # Tag every row with the symbol taken from the file name
        temp['SYMBOL'] = symbol
        # Remove the OpenInt column
        temp.drop(columns=['OpenInt'], inplace=True)
        # Keep the rows whose Date starts with 2014, 2015, 2016 or 2017
        in_range = temp['Date'].str.startswith('2014')
        for year_prefix in ('2015', '2016', '2017'):
            in_range = in_range | temp['Date'].str.startswith(year_prefix)
        temp = temp[in_range]
        dataframes.append(temp)
    except Exception as e:
        # Example: an empty data file fails with "No columns to parse from file"
        print(f'Error reading {file}: {e}')

# Stack all per-symbol frames into one DataFrame with a fresh index
df5 = pd.concat(dataframes, ignore_index=True)




# Step 4: Merging Insider Trading and Stock Prices

In [None]:
# The stock prices data (df5) has the columns SYMBOL and Date; Date has the form 2014-01-23.
# The insider trading data (df4) has ISSUERTRADINGSYMBOL and TRANS_DATE; TRANS_DATE has the form 13-MAR-2014.
# Both tables are therefore joined on the stock symbol and the transaction date.

# Ensure both dataframes have symbol columns in the same case (uppercase)
df4['ISSUERTRADINGSYMBOL'] = df4['ISSUERTRADINGSYMBOL'].str.upper()
df5['SYMBOL'] = df5['SYMBOL'].str.upper()

# Convert TRANS_DATE to the same text format as Date in df5 (YYYY-MM-DD)
df4['TRANS_DATE'] = pd.to_datetime(df4['TRANS_DATE'], format='%d-%b-%Y').dt.strftime('%Y-%m-%d')

# Inner join: only transactions with a price row for the same symbol and day are kept
merged_df = pd.merge(df4, df5, left_on=['ISSUERTRADINGSYMBOL', 'TRANS_DATE'], right_on=['SYMBOL', 'Date'], how='inner')

# DataFrame.info() prints the summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
merged_df.info()


# Step 5: Saving Interim Data

In [None]:
# Figures noted by the author from a previous run:
# - insider trading data: 1,322,820 rows (7,877 unique symbols)
# - stock price data: 5,442,556 rows (7,163 unique symbols)
# - merged on symbol and transaction date: 978,647 rows (4,450 unique symbols)
# - 2014 to 2017 cover 1,043 business days
#
# The insider trading data is smaller than the stock price data because not all
# companies have insider trading data. The merged data gives the direct daily
# relationship between insider trades and prices, while many more price points
# have no corresponding insider trade. A plot could show all prices and colour
# the points that do have insider trading data.
#
# df4, df5 and merged_df are each saved into their own folder below
# data/interim. Paths are built with os.path so they work on every operating
# system, and missing folders are created.

# Target folders
insider_transactions_path = os.path.join('data', 'interim', 'insider_transactions')
stock_prices_path = os.path.join('data', 'interim', 'stock_prices')
merged_path = os.path.join('data', 'interim', 'merged_insider_transactions_stock_prices')

# Create the folders that do not exist yet
for folder in (insider_transactions_path, stock_prices_path, merged_path):
    os.makedirs(folder, exist_ok=True)

# Write each DataFrame as a CSV (without the index) into its folder
outputs = (
    (df4, insider_transactions_path, 'interim_insider_transactions.csv'),
    (df5, stock_prices_path, 'interim_stock_prices.csv'),
    (merged_df, merged_path, 'interim_merged_insider_transactions_stock_prices.csv'),
)
for frame, folder, csv_name in outputs:
    frame.to_csv(os.path.join(folder, csv_name), index=False)