#Kerry Back, Rice University, Original Source

##MBA candidate Ray Lee adds extension based on professor Kerry's handout

This code performs data processing and manipulation tasks on short interest data, which is essential for understanding market sentiment and trends. Short interest data reflects the number of shares of a security that have been sold short by investors but have not yet been covered or closed out.

Purpose:
The main purpose of this code is to retrieve, clean, and process short interest data for analysis. It involves downloading daily short interest data , cleaning the data, resampling it to a weekly frequency, and merging it with additional data for further analysis.

# Web Sraping from Finra

1. Installs required package pandas_market_calendars.
2. Imports necessary libraries including pandas, requests, and datetime.
3. Mounts Google Drive and sets the folder path.
4. Defines functions to download files for specific dates and convert them to CSV.
5. Retrieves NYSE trading calendar, sets start and end dates, and obtains trading days.
6. Downloads files for each trading day from a specific URL and saves them to Google Drive.
7. Converts downloaded text files to CSV format.

In [1]:
!pip install pandas_market_calendars

Collecting pandas_market_calendars
  Downloading pandas_market_calendars-4.4.0-py3-none-any.whl (106 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m61.4/106.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.6/106.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting exchange-calendars>=3.3 (from pandas_market_calendars)
  Downloading exchange_calendars-4.5.4-py3-none-any.whl (192 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.6/192.6 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyluach (from exchange-calendars>=3.3->pandas_market_calendars)
  Downloading pyluach-2.2.0-py3-none-any.whl (25 kB)
Collecting korean-lunar-calendar (from exchange-calendars>=3.3->pandas_market_calendars)
  Downloading korean_lunar_calendar-0.3.1-py3-no

In [2]:
from google.colab import drive
import requests
from pandas_market_calendars import get_calendar
from datetime import datetime, timedelta
import sys
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Set your Google Drive folder path
folder_path = '/content/drive/MyDrive/MGMT767/Short_File'

# Add the folder_path to sys.path
sys.path.append(folder_path)

# Function to download file for a specific date
def download_file_for_date(date_str, folder_path):
    # First attempt with the original URL
    url = f"https://cdn.finra.org/equity/regsho/daily/CNMSshvol{date_str}.txt"
    response = requests.get(url)

    if response.status_code == 200:
        # Save the file to the Google Drive folder
        file_path = f"{folder_path}/shrt{date_str}.txt"
        with open(file_path, "wb") as file:
            file.write(response.content)
        print(f"File saved successfully for date: {date_str}")
        return file_path
    else:
        print(f"Failed to download file for date {date_str}. Status code: {response.status_code}")
        # Retry with the modified URL


# Convert text files to CSV
def convert_to_csv(file_path):
    df = pd.read_csv(file_path, delimiter='|')  # Adjust delimiter if needed
    csv_file_path = file_path.replace('.txt', '.csv')
    df.to_csv(csv_file_path, index=False)
    print(f"File converted to CSV: {csv_file_path}")

# Get the NYSE trading calendar
nyse = get_calendar("XNYS")

# Set start and end dates
start_date = datetime(2024, 5, 15)
end_date = datetime(2024, 5, 17)

# Get trading days between start and end dates
trading_days = nyse.valid_days(start_date, end_date)

# Download files for each trading day and convert them to CSV
for date in trading_days:
    formatted_date = date.strftime("%Y%m%d")
    file_path = download_file_for_date(formatted_date, folder_path)
    if file_path:
        convert_to_csv(file_path)

Mounted at /content/drive
File saved successfully for date: 20240515
File converted to CSV: /content/drive/MyDrive/MGMT767/Short_File/shrt20240515.csv
File saved successfully for date: 20240516
File converted to CSV: /content/drive/MyDrive/MGMT767/Short_File/shrt20240516.csv
File saved successfully for date: 20240517
File converted to CSV: /content/drive/MyDrive/MGMT767/Short_File/shrt20240517.csv


1. Imports necessary libraries and mounts Google Drive.
2. Retrieves NYSE trading calendar, sets start and end dates.
3. Constructs a list of trading days and corresponding file names.
4. Combines specified CSV files into one DataFrame.
5. Saves the combined data to a new CSV file.
6. Prints a confirmation message and displays the last few rows of the combined DataFrame.

In [3]:
import pandas as pd
from google.colab import drive
import os
from pandas_market_calendars import get_calendar
from datetime import datetime, timedelta


# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Set your Google Drive folder path
folder_path = '/content/drive/MyDrive/MGMT767/Short_File'

nyse = get_calendar("XNYS")

start_date = datetime(2024, 5, 15)
end_date = datetime(2024, 5, 17)

trading_days = nyse.valid_days(start_date, end_date)
trading_days = trading_days.strftime("%Y%m%d")
date_list = list(trading_days)

# List of filenames to be combined

file_names = ["total_short_April.csv"] + [f"shrt{date}.csv" for date in date_list]

#file_names = [f"shrt{date}.csv" for date in date_list]


# Initialize an empty DataFrame to store the combined data
total_short = pd.DataFrame()

# Read and combine the specified files
for file_name in file_names:
    file_path = os.path.join(folder_path, file_name)
    if os.path.exists(file_path): # Check if file exists
      try:
        df = pd.read_csv(file_path)
        total_short = pd.concat([total_short, df], ignore_index=True)
      except pd.errors.ParserError:
        print(f"Error reading file: {file_name}")
    else:
      print(f"File not found {file_name}")

# Write the combined data to a new CSV file
total_short.to_csv(folder_path + "/total_short_mid_May.csv", index=False)

print("Files combined and saved successfully.")

Mounted at /content/drive
Files combined and saved successfully.


In [4]:
total_short.tail()

Unnamed: 0,Date,Symbol,ShortVolume,ShortExemptVolume,TotalVolume,Market
24571281,20240517,ZWS,50270.0,31.0,70773.0,"Q,N"
24571282,20240517,ZYME,65805.0,22.0,90726.0,"Q,N"
24571283,20240517,ZYXI,10367.0,815.0,16894.0,"Q,N"
24571284,20240517,ZZZ,340.0,0.0,340.0,Q
24571285,9849,,,,,


In [None]:
from google.colab import drive
import requests
from pandas_market_calendars import get_calendar
from datetime import datetime, timedelta
import sys
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Set your Google Drive folder path
folder_path = '/content/drive/MyDrive/MGMT767/Short_Interest'

# Add the folder_path to sys.path
sys.path.append(folder_path)

# Function to download file for a specific date
def download_file_for_date(date_str):
    url = f"https://cdn.finra.org/equity/otcmarket/biweekly/shrt{date_str}.csv"
    response = requests.get(url)
    if response.status_code == 200:
        # Save the file to the Google Drive folder
        file_path = f"{folder_path}/shrt{date_str}.csv"
        with open(file_path, "wb") as file:
            file.write(response.content)
        print(f"File saved successfully for date: {date_str}")
        return file_path
    else:
        return None


def convert_to_csv(file_path):
    try:
        df = pd.read_csv(file_path, delimiter='|')  # Adjust delimiter if needed
        csv_file_path = file_path.replace('.csv', '_converted.csv')
        df.to_csv(csv_file_path, index=False)
        print(f"File converted to CSV: {csv_file_path}")
    except pd.errors.ParserError as e:
        print(f"Error converting file: {file_path}. ParserError: {e}")
    except Exception as e:
        print(f"Error converting file: {file_path}. Error: {e}")

# Get the NYSE trading calendar
nyse = get_calendar("XNYS")

# Set start and end dates
start_date = datetime(2024, 4, 20)
end_date = datetime(2024, 5, 15)


# Get trading days between start and end dates
trading_days = nyse.valid_days(start_date, end_date)

# Format dates and download files
file_paths = []
for date in trading_days:
    formatted_date = date.strftime("%Y%m%d")
    file_path = download_file_for_date(formatted_date)
    if file_path:
        file_paths.append(file_path)

# Convert text files to CSV
for file_path in file_paths:
    convert_to_csv(file_path)

Mounted at /content/drive
File saved successfully for date: 20240430
File converted to CSV: /content/drive/MyDrive/MGMT767/Short_Interest/shrt20240430_converted.csv


In [None]:
import pandas as pd
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Set your Google Drive folder path
folder_path = '/content/drive/MyDrive/MGMT767/Short_Interest/'

# List all files in the specified folder_path
file_list = [file for file in os.listdir(folder_path) if file.endswith("converted.csv") and file.startswith("shrt")]


# Initialize an empty DataFrame to store the combined data
total_biweekly_short = pd.DataFrame()

# Read each CSV file and concatenate into the combined_data DataFrame
problematic_files = []
for file in file_list:
    file_path = os.path.join(folder_path, file)
    try:
        df = pd.read_csv(file_path)
        total_biweekly_short = pd.concat([total_biweekly_short, df], ignore_index=True)
    except pd.errors.ParserError:
        print(f"Error reading file: {file}")
        problematic_files.append(file)

print("Files combined and saved successfully.")

# Print filenames of problematic files
if problematic_files:
    print("Problematic files:")
    for file in problematic_files:
        print(file)

Mounted at /content/drive
Files combined and saved successfully.


In [None]:
total_biweekly_short = total_biweekly_short.fillna(" ")
total_biweekly_short.head()

Unnamed: 0,accountingYearMonthNumber,symbolCode,issueName,issuerServicesGroupExchangeCode,marketClassCode,currentShortPositionQuantity,previousShortPositionQuantity,stockSplitFlag,averageDailyVolumeQuantity,daysToCoverQuantity,revisionFlag,changePercent,changePreviousNumber,settlementDate
0,20180112,A,Agilent Technologies,A,NYSE,5137321,4197300,,1867541,2.75,,22.4,940021,2018-01-12
1,20180112,AA,Alcoa Corporation,A,NYSE,14308061,12689077,,3836979,3.73,,12.76,1618984,2018-01-12
2,20180112,AAALF,Aareal Bank AG AKT,S,OTC,16788,13823,,0,999.99,,21.45,2965,2018-01-12
3,20180112,AAAP,Advanced Accelerator Applicati,R,NNM,47097,108433,,251903,1.0,,-56.57,-61336,2018-01-12
4,20180112,AABA,"Altaba, Inc. Common Stock",R,NNM,23750001,23472187,,7991624,2.97,,1.18,277814,2018-01-12


In [None]:
total_biweekly_short.tail()

Unnamed: 0,accountingYearMonthNumber,symbolCode,issueName,issuerServicesGroupExchangeCode,marketClassCode,currentShortPositionQuantity,previousShortPositionQuantity,stockSplitFlag,averageDailyVolumeQuantity,daysToCoverQuantity,revisionFlag,changePercent,changePreviousNumber,settlementDate
2718657,20240430,ZYXI,"Zynex, Inc. Common Stock",R,NNM,5944879,5729744,,144773,41.06,,3.75,215135,2024-04-30
2718658,20240430,ZZCMF,ZHENGZHOU COAL MNG MACHY GROUP,S,OTC,20800,27800,,0,999.99,,-25.18,-7000,2024-04-30
2718659,20240430,ZZHGF,Zhongan Online PC Ins Co Ltd.,S,OTC,2172402,1641902,,11745,184.96,,32.31,530500,2024-04-30
2718660,20240430,ZZHGY,ZhongAn Online P & C Insurance,S,OTC,2,2,,5,1.0,,0.0,0,2024-04-30
2718661,20240430,ZZZ,ONEFUND LLC Cyber Hornet S&P 5,R,NNM,206,204,,853,1.0,,0.98,2,2024-04-30


In [None]:
# Save the combined data to a new CSV file
# We may not use this file in our model
folder_path = '/content/drive/MyDrive/MGMT767/Short_Interest'

total_biweekly_short.to_csv(os.path.join(folder_path, "total_biweekly_short_0430.csv"), index=False)

# SQL Database

Database tables

- tickers has one row for each ticker, with general company information
- indicators has one row for each variable in the other tables with definitions
- sf1 has annual and quarterly reports for all NYSE/Nasdaq stocks back to 2000
- sep has daily open, high, low, close and adjusted close for same stocks
- daily has marketcap, pb, pe, ps, ev, evebit, evebitda for same stocks
- sep_weekly is a weekly version of sep
- weekly is a weekly version of daily

Basic SQL

- select [] from [] join [] on [] where [] order by []
- select * means select all columns
- select top 3 * means select all columns for top 3 rows
- join [] on [] where [] order by [] are all optional
- a table that always exists in information_schema.tables.  It lists the other tables.

In [None]:
!pip install pymssql

Collecting pymssql
  Downloading pymssql-2.3.0-cp310-cp310-manylinux_2_28_x86_64.whl (4.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymssql
Successfully installed pymssql-2.3.0


In [None]:
import pandas as pd

from sqlalchemy import create_engine
import pymssql
server = "*******"
database = "******"
username = "******"
password = "******"
string = "mssql+pymssql://" + username + ":" + password + "@" + server + "/" + database
conn = create_engine(string).connect()

In [None]:
sf1 = pd.read_sql(
    """
    select ticker, datekey, lastupdated, netinc, ncfo, equity, assets
    from sf1
    where dimension='ARQ' and datekey>='2009-01-01' and equity>0 and assets>0
    order by ticker, datekey
    """,
    conn,
    parse_dates=["datekey"]
)
sf1 = sf1.groupby(["ticker", "datekey", "lastupdated"]).last()
sf1 = sf1.droplevel("lastupdated")
sf1 = sf1.reset_index()

In [None]:
sep_weekly = pd.read_sql(
    """
    select ticker, date, volume, closeadj, closeunadj, lastupdated
    from sep_weekly
    where date>='2010-01-01'
    order by ticker, date, lastupdated
    """,
    conn,
    parse_dates=["date"]
)
sep_weekly = sep_weekly.groupby(["ticker", "date", "lastupdated"]).last()
sep_weekly = sep_weekly.droplevel("lastupdated")

In [None]:
weekly = pd.read_sql(
    """
    select ticker, date, marketcap, pb, lastupdated
    from weekly
    where date>='2010-01-01' and marketcap>0 and pb>0
    order by ticker, date, lastupdated
    """,
    conn,
    parse_dates=["date"]
)
weekly = weekly.groupby(["ticker", "date", "lastupdated"]).last()
weekly = weekly.droplevel("lastupdated")
weekly = weekly.reset_index()

In [None]:
tickers = pd.read_sql(
    """
    select ticker, sector from tickers
    """,
    conn
)
tickers

In [None]:
from google.colab import drive
import sys
from joblib import load
import pandas as pd

drive.mount('/content/drive', force_remount=True)
folder_path = '/content/drive/MyDrive/MGMT767/Data/'
sys.path.append(folder_path)

In [None]:
sf1.to_csv(folder_path + "SF05172024.csv" )
sep_weekly.to_csv(folder_path + "SEP05172024.csv")
weekly.to_csv(folder_path + "WEEKLY05172024.csv")
tickers.to_csv(folder_path + "SECTOR05172024.csv")

# Data Organize

Calculate weekly returns and momentum

- Compute weekly return as closeadj.pct_change()
- Compute annual returns (through end of prior week)
- Compute monthly returns (through end of prior week)
- Momentum $= (1+\text{annual}) / (1+\text{monthly}) - 1$
- Momentum is through end of prior week so can be used to predict this week's returns
- Also, shift closeunadj by one week because we want to use it to filter out penny stocks.

In [5]:
from google.colab import drive
import sys
from joblib import load
import pandas as pd

drive.mount('/content/drive', force_remount=True)
folder_path = '/content/drive/MyDrive/MGMT767/Data/'
sys.path.append(folder_path)

Mounted at /content/drive


In [6]:
file_path_SF = folder_path + "SF05172024.csv"
file_path_SEP = folder_path + "SEP05172024.csv"
file_path_WEEKLY = folder_path + "WEEKLY05172024.csv"
file_path_SECTOR = folder_path + "SECTOR05172024.csv"
file_path_DATABASE = folder_path + 'database05172024.csv'

In [7]:
sf1 = pd.read_csv(file_path_SF)
sf1 = sf1.groupby(["ticker", "datekey", "lastupdated"]).last()
sf1 = sf1.droplevel("lastupdated")
sf1 = sf1.reset_index()

In [8]:
for col in ["netinc", "ncfo"]:
    sf1[col] = sf1.groupby("ticker", group_keys=False)[col].apply(
        lambda x: x.rolling(4).sum()
    )
for col in ["equity", "assets"]:
    sf1[col] = sf1.groupby("ticker", group_keys=False)[col].apply(
        lambda x: x.rolling(4).mean()
    )
sf1["roe"] = sf1.netinc / sf1.equity
sf1["accruals"] = (sf1.netinc - sf1.ncfo) / sf1.equity
sf1["agr"] = sf1.groupby("ticker", group_keys=False)["assets"].pct_change()
sf1 = sf1[["ticker", "datekey", "roe", "accruals", "agr"]].dropna()

In [9]:
sep_weekly = pd.read_csv(file_path_SEP)
sep_weekly = sep_weekly.groupby(["ticker", "date", "lastupdated"]).last()
sep_weekly = sep_weekly.droplevel("lastupdated")

In [10]:
sep_weekly["ret"] = sep_weekly.groupby("ticker", group_keys=False).closeadj.pct_change()
sep_weekly["annual"] = sep_weekly.groupby("ticker", group_keys=False).closeadj.pct_change(52)
sep_weekly["monthly"] = sep_weekly.groupby("ticker", group_keys=False).closeadj.pct_change(4)
sep_weekly["mom"] = sep_weekly.groupby("ticker", group_keys=False).apply(
    lambda d: (1+d.annual)/(1+d.monthly) - 1
)
sep_weekly["volatility"] = sep_weekly.groupby("ticker", group_keys=False).ret.apply(
    lambda x: x.rolling(26).std()
)
sep_weekly = sep_weekly[["ret", "mom", "volume", "volatility", "closeadj", "closeunadj"]]
sep_weekly = sep_weekly.reset_index()

In [11]:
weekly = pd.read_csv(file_path_WEEKLY)
weekly = weekly.groupby(["ticker", "date", "lastupdated"]).last()
weekly = weekly.droplevel("lastupdated")
weekly = weekly.reset_index()

In [12]:
df = weekly.merge(sep_weekly, on=["ticker", "date"], how="inner")
df['date'] = pd.to_datetime(df['date'])
sf1['datekey'] = pd.to_datetime(sf1['datekey'])

df["year"] = df.date.apply(lambda x: x.isocalendar()[0])
df["week"] = df.date.apply(lambda x: x.isocalendar()[1])
sf1["year"] = sf1.datekey.apply(lambda x: x.isocalendar()[0])
sf1["week"] = sf1.datekey.apply(lambda x: x.isocalendar()[1])
df = df.merge(sf1, on=["ticker", "year", "week"], how="left")
df = df.drop(columns=["year", "week", "datekey"])

In [13]:
for col in ["roe", "accruals", "agr"]:
    df[col] = df.groupby("ticker", group_keys=False)[col].apply(
        lambda x: x.ffill()
    )

In [14]:
for col in ["pb", "mom", "volume", "volatility", "marketcap", "closeadj", "closeunadj"]:
    df[col] = df.groupby("ticker", group_keys=False)[col].shift()

In [15]:
tickers = pd.read_csv(file_path_SECTOR)

df = df.merge(tickers, on="ticker")

Compute market volatility
- Get daily market returns from French's data library
- Compute trailing 21 day standard deviation
- Downsample to weekly and merge with other data

In [16]:
#Market Volatility
import yfinance as yf
import numpy as np

price = yf.download("SPY", start="2010-01-01")["Adj Close"]
ret = price.pct_change()
vol = np.sqrt(252)*ret.rolling(21).std()
vol.name = "mktvol"
vol.index.name = "date"
vol = pd.DataFrame(vol).reset_index()
vol["year"] = vol.date.apply(lambda x: x.isocalendar()[0])
vol["week"] = vol.date.apply(lambda x: x.isocalendar()[1])
vol = vol.groupby(["year", "week"]).last()
vol = vol[["date", "mktvol"]].set_index("date")
vol["mktvol"] = vol.mktvol.shift()
vol = vol.dropna()
vol.head(3)

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,mktvol
date,Unnamed: 1_level_1
2010-02-12,0.192778
2010-02-19,0.198035
2010-02-26,0.199578


In [17]:
df = df.merge(vol, on="date", how="left")

In [18]:
df = df.ffill()
df.head()

Unnamed: 0,ticker,date,marketcap,pb,ret,mom,volume,volatility,closeadj,closeunadj,roe,accruals,agr,sector,mktvol
0,A,2010-01-01,,,,,,,,,,,,Healthcare,
1,A,2010-01-08,10604.5,4.2,-0.008716,,3129675.0,,20.192,31.07,,,,Healthcare,
2,A,2010-01-15,10918.4,4.4,-0.01199,,2585760.0,,20.016,30.8,,,,Healthcare,
3,A,2010-01-22,10751.0,4.3,-0.041414,,3127620.0,,19.776,30.43,,,,Healthcare,
4,A,2010-01-29,10744.0,4.3,-0.039088,,3307800.0,,18.957,29.17,,,,Healthcare,


In [19]:
df.to_csv(file_path_DATABASE, index=False)

In [20]:
import pandas as pd

folder_path = '/content/drive/MyDrive/MGMT767/Short_File/'
file_path = folder_path + "total_short_mid_May.csv"

total_short = pd.read_csv(file_path)
total_short = total_short.drop('Market', axis=1)

total_short.dropna()

# Rename the 'old_column_name' to 'new_column_name'
total_short.rename(columns={'Symbol': 'ticker'}, inplace=True)
total_short.rename(columns={'Date': 'date'}, inplace=True)

# Assuming you have already read the 'total_short' DataFrame from the CSV file
total_short['date'] = pd.to_datetime(total_short['date'], format='%Y%m%d', errors='coerce')


# Set the 'date' column as the index
total_short.set_index('date', inplace=True)

# Group by 'ticker' and resample on a weekly frequency ending on Fridays, calculating the mean
weekly_data = total_short.groupby('ticker').resample('W-Fri').mean()

# Reset the index to make 'ticker' and the date level columns again
weekly_data.reset_index(inplace=True)

In [21]:
weekly_data.tail()

Unnamed: 0,ticker,date,ShortVolume,ShortExemptVolume,TotalVolume
3144509,ZZZ,2024-04-19,28.8,0.0,35.0
3144510,ZZZ,2024-04-26,43.5,0.0,47.0
3144511,ZZZ,2024-05-03,,,
3144512,ZZZ,2024-05-10,,,
3144513,ZZZ,2024-05-17,176.0,0.0,178.5


In [22]:
weekly_data_1 =weekly_data.copy()
weekly_data_1.head()

Unnamed: 0,ticker,date,ShortVolume,ShortExemptVolume,TotalVolume
0,A,2018-08-03,111753.0,204.666667,554732.666667
1,A,2018-08-10,146847.4,232.8,477696.4
2,A,2018-08-17,303841.4,127.4,988129.0
3,A,2018-08-24,156824.0,33.6,666870.8
4,A,2018-08-31,125026.0,2.4,425225.8


In [23]:
df.index
df = df.reset_index()
df['date'] = pd.to_datetime(df['date'])

In [24]:
merged_df = pd.merge(df, weekly_data_1, on=['ticker', 'date'])
merged_df.tail()

Unnamed: 0,index,ticker,date,marketcap,pb,ret,mom,volume,volatility,closeadj,closeunadj,roe,accruals,agr,sector,mktvol,ShortVolume,ShortExemptVolume,TotalVolume
1132906,3169760,ZYXI,2024-04-19,383.1,8.3,-0.0445,0.051089,117266.8,0.065897,11.91,11.91,0.168182,-0.138665,0.038452,Healthcare,0.109365,26955.0,1076.8,44156.0
1132907,3169761,ZYXI,2024-04-26,366.1,7.9,-0.014938,0.067726,133455.2,0.064197,11.38,11.38,0.168182,-0.138665,0.038452,Healthcare,0.106969,28044.4,735.4,40218.6
1132908,3169762,ZYXI,2024-05-03,360.6,7.8,-0.018733,0.081294,136114.4,0.059843,11.21,11.21,0.163396,-0.194264,0.027482,Healthcare,0.123978,,,
1132909,3169763,ZYXI,2024-05-10,349.5,10.6,-0.046364,0.319104,166005.0,0.05757,11.0,11.0,0.163396,-0.194264,0.027482,Healthcare,0.141056,,,
1132910,3169764,ZYXI,2024-05-17,333.3,10.1,0.037178,0.249738,110905.4,0.058251,10.49,10.49,0.163396,-0.194264,0.027482,Healthcare,0.135276,11647.666667,1211.666667,23311.0


In [25]:
#Calculates the relative short volume by dividing the 'ShortVolume' column by the 'TotalVolume' column and assigns the result to a new column called 'relss'.
merged_df = merged_df.fillna(0)
merged_df['relss'] = merged_df['ShortVolume']/merged_df['TotalVolume']
merged_df = merged_df.fillna(0)
merged_df.head()

Unnamed: 0,index,ticker,date,marketcap,pb,ret,mom,volume,volatility,closeadj,closeunadj,roe,accruals,agr,sector,mktvol,ShortVolume,ShortExemptVolume,TotalVolume,relss
0,448,A,2018-08-03,20387.3,4.4,-0.000301,0.036068,2404016.2,0.039497,63.068,65.75,0.051039,-0.171638,0.022993,Healthcare,0.080746,111753.0,204.666667,554732.666667,0.201454
1,449,A,2018-08-10,20678.5,4.5,0.008073,0.045275,2188012.4,0.038481,63.049,65.73,0.051039,-0.171638,0.022993,Healthcare,0.081379,146847.4,232.8,477696.4,0.307407
2,450,A,2018-08-17,21280.0,4.6,-0.024151,0.077906,2255599.2,0.036668,63.558,66.26,0.051039,-0.171638,0.022993,Healthcare,0.066923,303841.4,127.4,988129.0,0.307492
3,451,A,2018-08-24,21097.6,4.6,0.020412,0.051867,3553459.0,0.033677,62.023,64.66,0.051039,-0.171638,0.022993,Healthcare,0.079415,156824.0,33.6,666870.8,0.235164
4,452,A,2018-08-31,20627.3,4.5,0.023638,0.054737,2272034.6,0.033859,63.289,65.98,0.064328,-0.152186,0.002575,Healthcare,0.074956,125026.0,2.4,425225.8,0.294023


In [27]:
folder_path = '/content/drive/MyDrive/MGMT767/merged_database_05172024.csv'

merged_df.to_csv(folder_path, index=False)