# Download Data from WRDS

## Set Up Env

In [12]:
import sys
import os

import sagemaker
from sagemaker import get_execution_role

# Add the parent directory to the sys.path
sys.path.insert(0, os.path.abspath('..'))

# Define IAM role
role = get_execution_role()
role

# Establish S3 bucket connection
import boto3
s3 = boto3.client('s3')
bucket = 'capstone-bucket-4-friends'

# Take a look at current dir
print(os.getcwd())

from file_utilities import s3_download

/home/sagemaker-user/capstone-2024-summer/src/jenna


In [1]:
!pip install wrds

Collecting wrds
  Using cached wrds-3.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting pandas<2.3,>=2.2 (from wrds)
  Using cached pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting psycopg2-binary<2.10,>=2.9 (from wrds)
  Using cached psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting scipy<1.13,>=1.12 (from wrds)
  Using cached scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting sqlalchemy<2.1,>=2 (from wrds)
  Using cached SQLAlchemy-2.0.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Using cached wrds-3.2.0-py3-none-any.whl (13 kB)
Using cached pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
Using cached psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
Using cached scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux20

In [2]:
# !pip install pyarrow

In [2]:
import wrds
import pandas as pd
from datetime import datetime

import pyarrow as pa
import pyarrow.parquet as pq

In [3]:
sp500_df = pd.read_csv("/home/sagemaker-user/capstone-2024-summer/data/security_master.csv")

sp500_df.head()

Unnamed: 0,cusip,PERMNO,PERMCO,HSHRCD,DLSTCD,HTICK,HCOMNAM,HTSYMBOL,HNAICS,HPRIMEXC,...,NUMDEL,NUMNDI,BEGDAT,ENDDAT,BEGPRC,ENDPRC,BEGRET,ENDRET,BEGVOL,ENDVOL
0,00130H10,76712,10996,11,100,AES,A E S CORP,AES,221118,N,...,1,68,06/26/1991,12/29/2023,06/26/1991,12/29/2023,06/26/1991,12/29/2023,06/26/1991,12/29/2023
1,00206R10,66093,21645,11,100,T,A T & T INC,T,517312,N,...,1,0,02/16/1984,12/29/2023,02/16/1984,12/29/2023,02/16/1984,12/29/2023,02/16/1984,12/29/2023
2,00507V10,79678,12499,11,233,,ACTIVISION BLIZZARD INC,ATVI,513210,Q,...,1,1968,10/22/1993,10/12/2023,10/22/1993,10/12/2023,10/22/1993,10/12/2023,10/22/1993,10/12/2023
3,00724F10,75510,8476,11,100,ADBE,ADOBE INC,ADBE,511210,Q,...,1,2185,08/13/1986,12/29/2023,08/13/1986,12/29/2023,08/13/1986,12/29/2023,08/13/1986,12/29/2023
4,00971T10,87299,17300,11,100,AKAM,AKAMAI TECHNOLOGIES INC,AKAM,511210,Q,...,1,1722,10/29/1999,12/29/2023,10/29/1999,12/29/2023,10/29/1999,12/29/2023,10/29/1999,12/29/2023


In [4]:
sp500_permnos = tuple(sp500_df['PERMNO'].unique())
print(sp500_permnos[:5])

(76712, 66093, 79678, 75510, 87299)


## Download Joined CRSP and Compustat Data

In [5]:
# Connect to WRDS
db = wrds.Connection()

# Set date range
start_date = '2018-01-01'
end_date = '2023-12-31'

# Main query
main_query = f"""
SELECT a.*,
       c.*
FROM crsp.dsf a
LEFT JOIN crsp.ccmxpf_linktable b
ON a.permno = b.lpermno
AND b.linktype IN ('LC', 'LU')
AND b.linkprim IN ('P', 'C')
AND a.date BETWEEN b.linkdt AND COALESCE(b.linkenddt, '{end_date}')
LEFT JOIN LATERAL (
    SELECT *
    FROM comp.fundq c2
    WHERE c2.gvkey = b.gvkey
      AND c2.datadate <= a.date
      AND c2.datadate BETWEEN '{start_date}' AND '{end_date}'
    ORDER BY c2.datadate DESC
    LIMIT 1
) c ON TRUE
WHERE a.date BETWEEN '{start_date}' AND '{end_date}'
AND a.permno IN {sp500_permnos}
ORDER BY a.permno, a.date
"""

print("Executing main query...")
data = db.raw_sql(main_query)

print("Query executed. Saving to CSV...")

# Save to CSV
data.to_csv('sp500_crsp_compustat_merged_2018_2023.csv', index=False)

# Close the connection
db.close()

print("Data extraction and merging complete. File saved as CSV.")

# Print column names for reference
print("\nColumns in the merged dataset:")
print(data.columns)

Enter your WRDS username [sagemaker-user]: jennasparks
Enter your password: ········


WRDS recommends setting up a .pgpass file.


Create .pgpass file now [y/n]?:  n


You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done
Executing main query...


  full_df = pd.concat([full_df, chunk])


Query executed. Saving to CSV...
Data extraction and merging complete. File saved as CSV.

Columns in the merged dataset:
Index(['cusip', 'permno', 'permco', 'issuno', 'hexcd', 'hsiccd', 'date',
       'bidlo', 'askhi', 'prc',
       ...
       'costat', 'fic', 'cshtrq', 'dvpspq', 'dvpsxq', 'mkvaltq', 'prccq',
       'prchq', 'prclq', 'adjex'],
      dtype='object', length=667)


## Check the downloaded data

In [6]:
file_path = "/home/sagemaker-user/capstone-2024-summer/src/jenna/sp500_crsp_compustat_merged_2018_2023.csv"
# Read the CSV file with specified data types
df = pd.read_csv(file_path, low_memory=False)

In [7]:
df.head(10)

Unnamed: 0,cusip,permno,permco,issuno,hexcd,hsiccd,date,bidlo,askhi,prc,...,costat,fic,cshtrq,dvpspq,dvpsxq,mkvaltq,prccq,prchq,prclq,adjex
0,68389X10,10104,8045,10536,1,7379,2018-01-02,46.17,47.8011,46.63,...,,,,,,,,,,
1,68389X10,10104,8045,10536,1,7379,2018-01-03,47.44,48.07,47.71,...,,,,,,,,,,
2,68389X10,10104,8045,10536,1,7379,2018-01-04,47.715,48.19,48.18,...,,,,,,,,,,
3,68389X10,10104,8045,10536,1,7379,2018-01-05,48.28,48.63,48.47,...,,,,,,,,,,
4,68389X10,10104,8045,10536,1,7379,2018-01-08,47.94,49.07,48.98,...,,,,,,,,,,
5,68389X10,10104,8045,10536,1,7379,2018-01-09,48.92,49.36,49.06,...,,,,,,,,,,
6,68389X10,10104,8045,10536,1,7379,2018-01-10,48.58,49.27,48.8,...,,,,,,,,,,
7,68389X10,10104,8045,10536,1,7379,2018-01-11,48.44,49.02,48.95,...,,,,,,,,,,
8,68389X10,10104,8045,10536,1,7379,2018-01-12,49.06,49.825,49.51,...,,,,,,,,,,
9,68389X10,10104,8045,10536,1,7379,2018-01-16,49.44,50.06,49.59,...,,,,,,,,,,


## Upload Raw Data to S3

In [9]:
s3.upload_file("/home/sagemaker-user/capstone-2024-summer/src/jenna/sp500_crsp_compustat_merged_2018_2023.csv", bucket, "CRSP/sp500_crsp_compustat_merged_2018_2023.csv")

In [13]:
reload_path = s3_download("CRSP/sp500_crsp_compustat_merged_2018_2023.csv")

In [16]:
reload = pd.read_csv("/home/sagemaker-user/capstone-2024-summer/src/jenna/sp500_crsp_compustat_merged_2018_2023.csv")
assert df.equals(reload)

reload.head()

  reload = pd.read_csv("/home/sagemaker-user/capstone-2024-summer/src/jenna/sp500_crsp_compustat_merged_2018_2023.csv")


AssertionError: 