# Download Data from WRDS

In [1]:
!pip install wrds



In [2]:
import wrds
import pandas as pd
from datetime import datetime

In [None]:
# Pull CRSP daily stock data and Compustat quarterly fundamentals from WRDS,
# attach the most recent quarterly record to every daily observation, and
# save the combined table as a CSV.

# Set date range (both ends inclusive)
start_date = '2018-01-01'
end_date = '2023-12-31'

# CRSP Daily Stock Data (all columns), with gvkey taken from the CCM link table.
# Links with no end date are treated as valid through end_date, so the link
# window follows the date range above instead of a separately hard-coded date.
crsp_query = f"""
    SELECT a.*, 
           b.gvkey, b.linktype, b.linkprim
    FROM crsp.dsf a
    LEFT JOIN crsp.ccmxpf_linktable b
    ON a.permno = b.lpermno
    AND b.linktype IN ('LC', 'LU')
    AND b.linkprim IN ('P', 'C')
    AND a.date BETWEEN b.linkdt AND COALESCE(b.linkenddt, '{end_date}')
    WHERE a.date BETWEEN '{start_date}' AND '{end_date}'
"""

# Compustat Fundamentals Quarterly (all columns).
# The indfmt/datafmt/popsrc/consol filters are the standard Compustat screen;
# they keep a single record format per gvkey/datadate so the merge below does
# not pick between duplicate rows.
compustat_query = f"""
    SELECT *
    FROM comp.fundq
    WHERE datadate BETWEEN '{start_date}' AND '{end_date}'
    AND indfmt = 'INDL'
    AND datafmt = 'STD'
    AND popsrc = 'D'
    AND consol = 'C'
"""

# Connect to WRDS (prompts for credentials interactively)
db = wrds.Connection()
try:
    crsp_data = db.raw_sql(crsp_query)
    compustat_data = db.raw_sql(compustat_query)
finally:
    # Close the connection even if one of the queries fails
    db.close()

# Convert date columns to datetime
crsp_data['date'] = pd.to_datetime(crsp_data['date'])
compustat_data['datadate'] = pd.to_datetime(compustat_data['datadate'])

# Merge datasets.
# An exact match on date == datadate loses every quarter whose period end is
# not a trading day, and a suffix-based forward fill only touches columns
# whose names collide between the two tables. A backward as-of merge instead
# gives each daily row the latest quarterly record with datadate <= date for
# the same gvkey, which covers both the match and the forward fill.
# NOTE(review): datadate is the fiscal period end, not the announcement date;
# consider keying on rdq if look-ahead bias matters for the analysis - confirm.
has_link = crsp_data['gvkey'].notna()
crsp_linked = crsp_data[has_link].sort_values('date')
crsp_unlinked = crsp_data[~has_link]

linked_merged = pd.merge_asof(
    crsp_linked,
    compustat_data.sort_values('datadate'),
    left_on='date',
    right_on='datadate',
    by='gvkey',
    direction='backward',
    suffixes=('_crsp', '_compustat'),
)

# Rows without a gvkey cannot receive fundamentals; rename their colliding
# columns the same way the merge did so both parts share one set of names.
shared_columns = crsp_data.columns.intersection(compustat_data.columns).difference(['gvkey'])
crsp_unlinked = crsp_unlinked.rename(columns={col: f'{col}_crsp' for col in shared_columns})

merged_data = (
    pd.concat([linked_merged, crsp_unlinked], ignore_index=True)
    .sort_values(['permno', 'date'])
)

# Save to CSV
merged_data.to_csv('crsp_compustat_merged_2018_2023.csv', index=False)

print("Data extraction and merging complete. File saved as CSV.")

# Print column names for reference
print("\nCRSP Columns:")
print(crsp_data.columns.tolist())
print("\nCompustat Columns:")
print(compustat_data.columns.tolist())

Enter your WRDS username [sagemaker-user]: jennasparks
Enter your password: ········


WRDS recommends setting up a .pgpass file.


Create .pgpass file now [y/n]?:  n


You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


## Check the downloaded data

In [None]:
# Read the merged file from the working directory, which is where the
# extraction cell writes it (it saves to a relative path).
file_path = "crsp_compustat_merged_2018_2023.csv"
# Read the CSV file with specified data types:
# - identifier columns are kept as strings so zero-padded codes such as gvkey
#   are not converted to numbers (dtype entries for absent columns are ignored)
# - the two date columns are parsed back into datetimes
df = pd.read_csv(
    file_path,
    low_memory=False,
    dtype={'gvkey': str, 'cusip_crsp': str, 'cusip_compustat': str},
    parse_dates=['date', 'datadate'],
)

In [None]:
# Preview the first rows of the loaded file as a quick sanity check
preview_rows = 10
df.head(preview_rows)

In [None]:
# Show every column name; the default Index repr elides the middle of long
# indexes, which hides most of the columns in a table this wide.
df.columns.tolist()