# Joining the data together for Stata analysis

1. Load Graph data

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import json

pd.set_option("display.max_columns", None)

Graph data is problematic - fix this as there are issues with sparseness, use a different graph format later

Load the data from the 1.5 notebook

In [29]:
boardex_graph_stats_df = pd.read_csv("final_data/boardex_graph_stats_gvkey.csv", index_col= 0)

In [32]:
boardex_graph_stats_df.head()

Unnamed: 0,permco,gvkey,companyid,score,preferred,duplicate,boardid,boardname,year,local_clustering_coef,degree_centrality,betweenness_centrality,graph_density
0,216,1602,2068.0,1.0,1.0,0.0,2068.0,AMGEN INC,2022,0.428352,0.001668,0.0009543488,0.003216
1,216,1602,2068.0,1.0,1.0,0.0,2068.0,AMGEN INC,2023,0.15665,0.002396,0.001115499,0.004495
2,682,18738,28834.0,1.0,1.0,0.0,28834.0,DOMINARI HOLDINGS INC (AIkido Pharma Inc prior...,2022,0.0,0.000852,6.180685e-07,0.003216
3,682,18738,28834.0,1.0,1.0,0.0,28834.0,DOMINARI HOLDINGS INC (AIkido Pharma Inc prior...,2023,0.0,0.000945,0.0,0.004495
4,887,12713,2179515.0,6.0,1.0,1.0,2179515.0,ABEONA THERAPEUTICS INC (PlasmaTech Biopharmac...,2022,0.0,0.000591,0.0,0.003216


In [33]:
boardex_graph_stats_df = boardex_graph_stats_df[[
    "gvkey", "year", "local_clustering_coef", "degree_centrality", "betweenness_centrality", "graph_density"
]]

In [34]:
boardex_graph_stats_df.head()

Unnamed: 0,gvkey,year,local_clustering_coef,degree_centrality,betweenness_centrality,graph_density
0,1602,2022,0.428352,0.001668,0.0009543488,0.003216
1,1602,2023,0.15665,0.002396,0.001115499,0.004495
2,18738,2022,0.0,0.000852,6.180685e-07,0.003216
3,18738,2023,0.0,0.000945,0.0,0.004495
4,12713,2022,0.0,0.000591,0.0,0.003216


2. Load the Text topic and sentiment data

In [2]:
# load the sentiment data
sentiment_10k_results_df = pd.read_json("../2.Initial_Graph_Building/annual_report_data/all_companies_report_annual_report_text_2022_2024_with_sentiments.json")


In [3]:
# load the topic data

topic_model_10k_results_df = pd.read_json("all_companies_metadata_2022_2024_text_1A_cleaned_w_topic_dist.json")

In [4]:
topic_model_10k_results_df.columns

Index(['id', 'accessionNo', 'cik', 'ticker', 'companyName', 'companyNameLong',
       'formType', 'description', 'filedAt', 'linkToTxt', 'linkToHtml',
       'linkToXbrl', 'linkToFilingDetails', 'entities', 'documentFormatFiles',
       'dataFiles', 'seriesAndClassesContractsInformation', 'periodOfReport',
       'Text_1', 'Text_1A', 'effectivenessDate', 'Text_1A_data_cleaned',
       'topic_distribution'],
      dtype='object')

In [5]:
sentiment_10k_results_df.columns

Index(['id', 'accessionNo', 'cik', 'ticker', 'companyName', 'companyNameLong',
       'formType', 'description', 'filedAt', 'linkToTxt', 'linkToHtml',
       'linkToXbrl', 'linkToFilingDetails', 'entities', 'documentFormatFiles',
       'dataFiles', 'seriesAndClassesContractsInformation', 'periodOfReport',
       'Text_1', 'Text_1A', 'effectivenessDate', 'risk_sentiment',
       'business_overview_sentiment'],
      dtype='object')

In [6]:
sentiment_10k_results_df.head()

Unnamed: 0,id,accessionNo,cik,ticker,companyName,companyNameLong,formType,description,filedAt,linkToTxt,linkToHtml,linkToXbrl,linkToFilingDetails,entities,documentFormatFiles,dataFiles,seriesAndClassesContractsInformation,periodOfReport,Text_1,Text_1A,effectivenessDate,risk_sentiment,business_overview_sentiment
0,c192751c88f50d8ca603ab72c5840583,0000318154-24-000011,318154,AMGN,AMGEN INC,AMGEN INC (Filer),10-K,Form 10-K - Annual report [Section 13 and 15(d...,2024-02-14T16:23:32-05:00,https://www.sec.gov/Archives/edgar/data/318154...,https://www.sec.gov/Archives/edgar/data/318154...,,https://www.sec.gov/Archives/edgar/data/318154...,"[{'companyName': 'AMGEN INC (Filer)', 'cik': '...","[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '17', 'description': 'XBRL TAXON...",[],2023-12-31,Item 1. BUSINESS ##TABLE_END\n\nAmgen Inc. (i...,Item 1A. RISK FACTORS ##TABLE_END\n\nThis rep...,,0,-1
1,3f8f2f776c43180b8573405b00605c15,0000318154-23-000017,318154,AMGN,AMGEN INC,AMGEN INC (Filer),10-K,Form 10-K - Annual report [Section 13 and 15(d...,2023-02-09T16:26:31-05:00,https://www.sec.gov/Archives/edgar/data/318154...,https://www.sec.gov/Archives/edgar/data/318154...,,https://www.sec.gov/Archives/edgar/data/318154...,"[{'companyName': 'AMGEN INC (Filer)', 'cik': '...","[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '11', 'description': 'XBRL TAXON...",[],2022-12-31,Item 1. BUSINESS ##TABLE_END\n\nAmgen Inc. (i...,Item 1A. RISK FACTORS ##TABLE_END\n\nThis rep...,,-1,-1
2,1d205d8c366bf6e51746f33967a1141d,0000318154-22-000010,318154,AMGN,AMGEN INC,AMGEN INC (Filer),10-K,Form 10-K - Annual report [Section 13 and 15(d...,2022-02-16T16:39:53-05:00,https://www.sec.gov/Archives/edgar/data/318154...,https://www.sec.gov/Archives/edgar/data/318154...,,https://www.sec.gov/Archives/edgar/data/318154...,"[{'companyName': 'AMGEN INC (Filer)', 'cik': '...","[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '13', 'description': 'XBRL TAXON...",[],2021-12-31,Item 1. BUSINESS ##TABLE_END\n\nAmgen Inc. (i...,Item 1A. RISK FACTORS ##TABLE_END\n\nThis rep...,,0,1
3,358cf2de3309caed349f7d4505bf6d49,0001628280-24-005397,9326,BCPC,BALCHEM CORP,BALCHEM CORP (Filer),10-K,Form 10-K - Annual report [Section 13 and 15(d...,2024-02-16T17:26:27-05:00,https://www.sec.gov/Archives/edgar/data/9326/0...,https://www.sec.gov/Archives/edgar/data/9326/0...,,https://www.sec.gov/Archives/edgar/data/9326/0...,"[{'companyName': 'BALCHEM CORP (Filer)', 'cik'...","[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '14', 'description': 'XBRL TAXON...",[],2023-12-31,"Item 1. Business (All amounts in thousands, e...",Item 1A. Risk Factors \n\nWe discuss our expe...,,-1,-1
4,1941cf293846a69b64943f8b2b3ff87f,0001628280-23-005074,9326,BCPC,BALCHEM CORP,BALCHEM CORP (Filer),10-K,Form 10-K - Annual report [Section 13 and 15(d...,2023-02-24T16:43:02-05:00,https://www.sec.gov/Archives/edgar/data/9326/0...,https://www.sec.gov/Archives/edgar/data/9326/0...,,https://www.sec.gov/Archives/edgar/data/9326/0...,"[{'companyName': 'BALCHEM CORP (Filer)', 'cik'...","[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '12', 'description': 'XBRL TAXON...",[],2022-12-31,"Item 1. Business (All amounts in thousands, e...",Item 1A. Risk Factors \n\nWe discuss our expe...,,-1,-1


In [7]:
# sentiment_10k_results_df.head()

# change to period of report later - that is the key field after looking at this

In [8]:
sentiment_10k_results_df.head()["description"][0]

'Form 10-K - Annual report [Section 13 and 15(d), not S-K Item 405]'

In [9]:
pd.to_datetime(sentiment_10k_results_df["periodOfReport"]).apply(lambda x : x.year)

0       2023
1       2022
2       2021
3       2023
4       2022
        ... 
2368    2023
2369    2022
2370    2021
2371    2023
2372    2022
Name: periodOfReport, Length: 2373, dtype: int64

In [10]:
annual_report_features_df = sentiment_10k_results_df.copy()

In [11]:
annual_report_features_df.drop(columns=['Text_1', 'Text_1A', ])

Unnamed: 0,id,accessionNo,cik,ticker,companyName,companyNameLong,formType,description,filedAt,linkToTxt,linkToHtml,linkToXbrl,linkToFilingDetails,entities,documentFormatFiles,dataFiles,seriesAndClassesContractsInformation,periodOfReport,effectivenessDate,risk_sentiment,business_overview_sentiment
0,c192751c88f50d8ca603ab72c5840583,0000318154-24-000011,318154,AMGN,AMGEN INC,AMGEN INC (Filer),10-K,Form 10-K - Annual report [Section 13 and 15(d...,2024-02-14T16:23:32-05:00,https://www.sec.gov/Archives/edgar/data/318154...,https://www.sec.gov/Archives/edgar/data/318154...,,https://www.sec.gov/Archives/edgar/data/318154...,"[{'companyName': 'AMGEN INC (Filer)', 'cik': '...","[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '17', 'description': 'XBRL TAXON...",[],2023-12-31,,0,-1
1,3f8f2f776c43180b8573405b00605c15,0000318154-23-000017,318154,AMGN,AMGEN INC,AMGEN INC (Filer),10-K,Form 10-K - Annual report [Section 13 and 15(d...,2023-02-09T16:26:31-05:00,https://www.sec.gov/Archives/edgar/data/318154...,https://www.sec.gov/Archives/edgar/data/318154...,,https://www.sec.gov/Archives/edgar/data/318154...,"[{'companyName': 'AMGEN INC (Filer)', 'cik': '...","[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '11', 'description': 'XBRL TAXON...",[],2022-12-31,,-1,-1
2,1d205d8c366bf6e51746f33967a1141d,0000318154-22-000010,318154,AMGN,AMGEN INC,AMGEN INC (Filer),10-K,Form 10-K - Annual report [Section 13 and 15(d...,2022-02-16T16:39:53-05:00,https://www.sec.gov/Archives/edgar/data/318154...,https://www.sec.gov/Archives/edgar/data/318154...,,https://www.sec.gov/Archives/edgar/data/318154...,"[{'companyName': 'AMGEN INC (Filer)', 'cik': '...","[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '13', 'description': 'XBRL TAXON...",[],2021-12-31,,0,1
3,358cf2de3309caed349f7d4505bf6d49,0001628280-24-005397,9326,BCPC,BALCHEM CORP,BALCHEM CORP (Filer),10-K,Form 10-K - Annual report [Section 13 and 15(d...,2024-02-16T17:26:27-05:00,https://www.sec.gov/Archives/edgar/data/9326/0...,https://www.sec.gov/Archives/edgar/data/9326/0...,,https://www.sec.gov/Archives/edgar/data/9326/0...,"[{'companyName': 'BALCHEM CORP (Filer)', 'cik'...","[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '14', 'description': 'XBRL TAXON...",[],2023-12-31,,-1,-1
4,1941cf293846a69b64943f8b2b3ff87f,0001628280-23-005074,9326,BCPC,BALCHEM CORP,BALCHEM CORP (Filer),10-K,Form 10-K - Annual report [Section 13 and 15(d...,2023-02-24T16:43:02-05:00,https://www.sec.gov/Archives/edgar/data/9326/0...,https://www.sec.gov/Archives/edgar/data/9326/0...,,https://www.sec.gov/Archives/edgar/data/9326/0...,"[{'companyName': 'BALCHEM CORP (Filer)', 'cik'...","[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '12', 'description': 'XBRL TAXON...",[],2022-12-31,,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2368,437a263932014ad06bcdbde870c7ae88,0001493152-24-009769,1857044,INDP,"Indaptus Therapeutics, Inc.","Indaptus Therapeutics, Inc. (Filer)",10-K,Form 10-K - Annual report [Section 13 and 15(d...,2024-03-13T08:02:27-04:00,https://www.sec.gov/Archives/edgar/data/185704...,https://www.sec.gov/Archives/edgar/data/185704...,,https://www.sec.gov/Archives/edgar/data/185704...,"[{'companyName': 'Indaptus Therapeutics, Inc. ...","[{'sequence': '1', 'documentUrl': 'https://www...","[{'sequence': '16', 'description': 'XBRL SCHEM...",[],2023-12-31,,0,-1
2369,8751e66576529b1cda4f7faf672dbdc1,0001493152-23-008010,1857044,INDP,"Indaptus Therapeutics, Inc.","Indaptus Therapeutics, Inc. (Filer)",10-K,Form 10-K - Annual report [Section 13 and 15(d...,2023-03-17T08:05:56-04:00,https://www.sec.gov/Archives/edgar/data/185704...,https://www.sec.gov/Archives/edgar/data/185704...,,https://www.sec.gov/Archives/edgar/data/185704...,"[{'companyName': 'Indaptus Therapeutics, Inc. ...","[{'sequence': '1', 'documentUrl': 'https://www...","[{'sequence': '18', 'description': 'XBRL SCHEM...",[],2022-12-31,,0,-1
2370,71661625315f74d4e79bb06e38c3d86b,0001493152-22-007319,1857044,INDP,"Indaptus Therapeutics, Inc.","Indaptus Therapeutics, Inc. (Filer)",10-K,Form 10-K - Annual report [Section 13 and 15(d...,2022-03-21T07:05:43-04:00,https://www.sec.gov/Archives/edgar/data/185704...,https://www.sec.gov/Archives/edgar/data/185704...,,https://www.sec.gov/Archives/edgar/data/185704...,"[{'companyName': 'Indaptus Therapeutics, Inc. ...","[{'sequence': '1', 'documentUrl': 'https://www...","[{'sequence': '16', 'description': 'INLINE XBR...",[],2021-12-31,,0,-1
2371,b7522c86bfe011a281523375ad563f1d,0001410578-24-000198,1671502,QNRX,"Quoin Pharmaceuticals, Ltd.","Quoin Pharmaceuticals, Ltd. (Filer)",10-K,Form 10-K - Annual report [Section 13 and 15(d...,2024-03-14T16:42:23-04:00,https://www.sec.gov/Archives/edgar/data/167150...,https://www.sec.gov/Archives/edgar/data/167150...,,https://www.sec.gov/Archives/edgar/data/167150...,"[{'companyName': 'Quoin Pharmaceuticals, Ltd. ...","[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '9', 'description': 'EX-101.SCH'...",[],2023-12-31,,0,-1


In [12]:
annual_report_features_df["report_year"] = pd.to_datetime(annual_report_features_df["periodOfReport"]).apply(lambda x : x.year)


In [14]:
# get topics
annual_report_features_df["risk_topic_0_allocation"] = topic_model_10k_results_df["topic_distribution"].apply(lambda x: x[0][-1])
annual_report_features_df["risk_topic_1_allocation"] = topic_model_10k_results_df["topic_distribution"].apply(lambda x: x[1][-1])
annual_report_features_df["risk_topic_2_allocation"] = topic_model_10k_results_df["topic_distribution"].apply(lambda x: x[2][-1])
annual_report_features_df["risk_topic_3_allocation"] = topic_model_10k_results_df["topic_distribution"].apply(lambda x: x[3][-1])

In [16]:
annual_report_features_df.head() # example for distribution approach - check result

Unnamed: 0,id,accessionNo,cik,ticker,companyName,companyNameLong,formType,description,filedAt,linkToTxt,linkToHtml,linkToXbrl,linkToFilingDetails,entities,documentFormatFiles,dataFiles,seriesAndClassesContractsInformation,periodOfReport,Text_1,Text_1A,effectivenessDate,risk_sentiment,business_overview_sentiment,report_year,risk_topic_0_allocation,risk_topic_1_allocation,risk_topic_2_allocation,risk_topic_3_allocation
0,c192751c88f50d8ca603ab72c5840583,0000318154-24-000011,318154,AMGN,AMGEN INC,AMGEN INC (Filer),10-K,Form 10-K - Annual report [Section 13 and 15(d...,2024-02-14T16:23:32-05:00,https://www.sec.gov/Archives/edgar/data/318154...,https://www.sec.gov/Archives/edgar/data/318154...,,https://www.sec.gov/Archives/edgar/data/318154...,"[{'companyName': 'AMGEN INC (Filer)', 'cik': '...","[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '17', 'description': 'XBRL TAXON...",[],2023-12-31,Item 1. BUSINESS ##TABLE_END\n\nAmgen Inc. (i...,Item 1A. RISK FACTORS ##TABLE_END\n\nThis rep...,,0,-1,2023,0.087753,0.905426,0.002893,0.003929
1,3f8f2f776c43180b8573405b00605c15,0000318154-23-000017,318154,AMGN,AMGEN INC,AMGEN INC (Filer),10-K,Form 10-K - Annual report [Section 13 and 15(d...,2023-02-09T16:26:31-05:00,https://www.sec.gov/Archives/edgar/data/318154...,https://www.sec.gov/Archives/edgar/data/318154...,,https://www.sec.gov/Archives/edgar/data/318154...,"[{'companyName': 'AMGEN INC (Filer)', 'cik': '...","[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '11', 'description': 'XBRL TAXON...",[],2022-12-31,Item 1. BUSINESS ##TABLE_END\n\nAmgen Inc. (i...,Item 1A. RISK FACTORS ##TABLE_END\n\nThis rep...,,-1,-1,2022,0.064385,0.926407,0.004403,0.004806
2,1d205d8c366bf6e51746f33967a1141d,0000318154-22-000010,318154,AMGN,AMGEN INC,AMGEN INC (Filer),10-K,Form 10-K - Annual report [Section 13 and 15(d...,2022-02-16T16:39:53-05:00,https://www.sec.gov/Archives/edgar/data/318154...,https://www.sec.gov/Archives/edgar/data/318154...,,https://www.sec.gov/Archives/edgar/data/318154...,"[{'companyName': 'AMGEN INC (Filer)', 'cik': '...","[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '13', 'description': 'XBRL TAXON...",[],2021-12-31,Item 1. BUSINESS ##TABLE_END\n\nAmgen Inc. (i...,Item 1A. RISK FACTORS ##TABLE_END\n\nThis rep...,,0,1,2021,0.085579,0.904845,0.005602,0.003974
3,358cf2de3309caed349f7d4505bf6d49,0001628280-24-005397,9326,BCPC,BALCHEM CORP,BALCHEM CORP (Filer),10-K,Form 10-K - Annual report [Section 13 and 15(d...,2024-02-16T17:26:27-05:00,https://www.sec.gov/Archives/edgar/data/9326/0...,https://www.sec.gov/Archives/edgar/data/9326/0...,,https://www.sec.gov/Archives/edgar/data/9326/0...,"[{'companyName': 'BALCHEM CORP (Filer)', 'cik'...","[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '14', 'description': 'XBRL TAXON...",[],2023-12-31,"Item 1. Business (All amounts in thousands, e...",Item 1A. Risk Factors \n\nWe discuss our expe...,,-1,-1,2023,0.6162,0.383668,6.6e-05,6.6e-05
4,1941cf293846a69b64943f8b2b3ff87f,0001628280-23-005074,9326,BCPC,BALCHEM CORP,BALCHEM CORP (Filer),10-K,Form 10-K - Annual report [Section 13 and 15(d...,2023-02-24T16:43:02-05:00,https://www.sec.gov/Archives/edgar/data/9326/0...,https://www.sec.gov/Archives/edgar/data/9326/0...,,https://www.sec.gov/Archives/edgar/data/9326/0...,"[{'companyName': 'BALCHEM CORP (Filer)', 'cik'...","[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '12', 'description': 'XBRL TAXON...",[],2022-12-31,"Item 1. Business (All amounts in thousands, e...",Item 1A. Risk Factors \n\nWe discuss our expe...,,-1,-1,2022,0.626234,0.373639,6.4e-05,6.4e-05


In [19]:
annual_report_key_features = annual_report_features_df[["cik", "report_year", 
                           "risk_sentiment", "business_overview_sentiment",
                           "risk_topic_0_allocation", "risk_topic_1_allocation", 
                           "risk_topic_2_allocation", "risk_topic_3_allocation"]]

In [21]:
annual_report_key_features.to_json("final_data/annual_reports_combined_features_2021_2024.json")

In [20]:
annual_report_key_features.head()

Unnamed: 0,cik,report_year,risk_sentiment,business_overview_sentiment,risk_topic_0_allocation,risk_topic_1_allocation,risk_topic_2_allocation,risk_topic_3_allocation
0,318154,2023,0,-1,0.087753,0.905426,0.002893,0.003929
1,318154,2022,-1,-1,0.064385,0.926407,0.004403,0.004806
2,318154,2021,0,1,0.085579,0.904845,0.005602,0.003974
3,9326,2023,-1,-1,0.6162,0.383668,6.6e-05,6.6e-05
4,9326,2022,-1,-1,0.626234,0.373639,6.4e-05,6.4e-05


#### Link GVKEY using the data dictionary

Same as boardex principles, but with another table, and then combine these together!!

2.Initial_Graph_Building/sec_gvkey_cik_us_pharma_2022_2024.csv

Match compustat with boardex and annual report date here.

In [23]:
# get year company details, 

ar_key_df = pd.read_csv("../2.Initial_Graph_Building/sec_gvkey_cik_us_pharma_2022_2024.csv", index_col = 0)

ar_key_df.head()

Unnamed: 0,gvkey,conm,datadate1,datadate2,cik,source,coname,fndate,lndate,n10k,n10k_nt,n10k_a,n10q,n10q_nt,n10q_a,ndef,n8k,ntot,flag
0,1602,AMGEN INC,1983-03-31,2023-12-31,318154.0,COMPN,AMGEN INC,1994-03-28,2024-06-04,32.0,0.0,3.0,95.0,0.0,4.0,92.0,308.0,2936.0,3.0
1,1979,BALCHEM CORP -CL B,1974-12-31,2023-12-31,9326.0,COMPN,BALCHEM CORP,1995-08-11,2024-05-24,31.0,0.0,2.0,91.0,0.0,5.0,38.0,194.0,1148.0,3.0
2,2086,BAXTER INTERNATIONAL INC,1950-12-31,2023-12-31,10456.0,COMPN,BAXTER INTERNATIONAL INC,1994-03-21,2024-06-05,34.0,1.0,3.0,97.0,1.0,5.0,52.0,276.0,2321.0,3.0
3,2403,BRISTOL-MYERS SQUIBB CO,1950-12-31,2023-12-31,14272.0,COMPN,BRISTOL MYERS SQUIBB CO,1994-03-15,2024-05-09,33.0,0.0,3.0,98.0,1.0,6.0,64.0,373.0,3304.0,3.0
4,2990,IMUNON INC,1983-09-30,2023-12-31,749647.0,COMPN,"IMUNON, INC.",1996-08-07,2024-05-22,38.0,1.0,9.0,96.0,3.0,9.0,41.0,300.0,1263.0,3.0


In [25]:
annual_report_key_features_final_df = pd.merge(ar_key_df[["gvkey", "cik"]], annual_report_key_features, how = "inner", on = "cik")

In [26]:
annual_report_key_features_final_df.to_csv("../4. Feature_Generation/final_data/sec_gvkey_ar_report_features.csv")

### Finally link with the compustat data

First review graph and company level data

In [35]:
boardex_graph_stats_df.head()

Unnamed: 0,gvkey,year,local_clustering_coef,degree_centrality,betweenness_centrality,graph_density
0,1602,2022,0.428352,0.001668,0.0009543488,0.003216
1,1602,2023,0.15665,0.002396,0.001115499,0.004495
2,18738,2022,0.0,0.000852,6.180685e-07,0.003216
3,18738,2023,0.0,0.000945,0.0,0.004495
4,12713,2022,0.0,0.000591,0.0,0.003216


In [36]:
annual_report_key_features_final_df.head()

Unnamed: 0,gvkey,cik,report_year,risk_sentiment,business_overview_sentiment,risk_topic_0_allocation,risk_topic_1_allocation,risk_topic_2_allocation,risk_topic_3_allocation
0,1602,318154.0,2023,0,-1,0.087753,0.905426,0.002893,0.003929
1,1602,318154.0,2022,-1,-1,0.064385,0.926407,0.004403,0.004806
2,1602,318154.0,2021,0,1,0.085579,0.904845,0.005602,0.003974
3,1979,9326.0,2023,-1,-1,0.6162,0.383668,6.6e-05,6.6e-05
4,1979,9326.0,2022,-1,-1,0.626234,0.373639,6.4e-05,6.4e-05


Start merging the results with compustat, just merge on year and company id (gvkey - compustat id)

In [38]:
graph_ar_features = pd.merge(boardex_graph_stats_df, annual_report_key_features_final_df, 
                             how = "inner", left_on= ["gvkey", "year"], right_on=["gvkey", "report_year"])
graph_ar_features.head()

Unnamed: 0,gvkey,year,local_clustering_coef,degree_centrality,betweenness_centrality,graph_density,cik,report_year,risk_sentiment,business_overview_sentiment,risk_topic_0_allocation,risk_topic_1_allocation,risk_topic_2_allocation,risk_topic_3_allocation
0,1602,2022,0.428352,0.001668,0.0009543488,0.003216,318154.0,2022,-1,-1,0.064385,0.926407,0.004403,0.004806
1,1602,2023,0.15665,0.002396,0.001115499,0.004495,318154.0,2023,0,-1,0.087753,0.905426,0.002893,0.003929
2,18738,2022,0.0,0.000852,6.180685e-07,0.003216,12239.0,2022,-1,-1,0.97147,0.008822,0.019662,4.6e-05
3,18738,2023,0.0,0.000945,0.0,0.004495,12239.0,2023,-1,-1,0.959382,0.009551,0.031023,4.4e-05
4,12713,2022,0.0,0.000591,0.0,0.003216,318306.0,2022,-1,-1,0.25,0.25,0.25,0.25


In [50]:
graph_ar_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1177 entries, 0 to 1176
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gvkey                        1177 non-null   int64  
 1   year                         1177 non-null   int64  
 2   local_clustering_coef        1177 non-null   float64
 3   degree_centrality            1177 non-null   float64
 4   betweenness_centrality       1177 non-null   float64
 5   graph_density                1177 non-null   float64
 6   cik                          1177 non-null   float64
 7   report_year                  1177 non-null   int64  
 8   risk_sentiment               1177 non-null   int64  
 9   business_overview_sentiment  1177 non-null   int64  
 10  risk_topic_0_allocation      1177 non-null   float64
 11  risk_topic_1_allocation      1177 non-null   float64
 12  risk_topic_2_allocation      1177 non-null   float64
 13  risk_topic_3_alloc

In [57]:
# get compustat data

compustat_workflow_df = pd.read_csv("../2.Initial_Graph_Building/compustat_pharma_2021_2024_manual.csv", low_memory=False)

In [58]:
compustat_workflow_df.columns[:20]

Index(['gvkey', 'datadate', 'fyear', 'indfmt', 'consol', 'popsrc', 'datafmt',
       'tic', 'cusip', 'conm', 'acctchg', 'acctstd', 'acqmeth', 'adrr', 'ajex',
       'ajp', 'bspr', 'compst', 'curcd', 'curncd'],
      dtype='object')

In [59]:
compustat_workflow_df["gvkey"]

0         1602
1         1602
2         1602
3         2403
4         2403
         ...  
3046    351038
3047    351038
3048    353444
3049    353444
3050    353444
Name: gvkey, Length: 3051, dtype: int64

In [60]:
compustat_workflow_df["fyear"] = compustat_workflow_df["fyear"].astype(int)

In [65]:
final_merged_data_df = pd.merge(compustat_workflow_df, graph_ar_features, 
         how = "inner", left_on=["gvkey","fyear"], 
         right_on=["gvkey","year"])

In [66]:
final_merged_data_df.to_csv("final_data/computat_boardex_10k_all_merged.csv")

In [67]:
final_merged_data_df.head()

Unnamed: 0,gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,cusip,conm,acctchg,acctstd,acqmeth,adrr,ajex,ajp,bspr,compst,curcd,curncd,currtr,curuscn,final,fyr,ismod,ltcm,ogm,pddur,scf,src,stalt,udpl,upd,apdedate,fdate,pdate,acchg,acco,accrt,acdo,aco,acodo,acominc,acox,acoxar,acqao,acqcshi,acqgdwl,acqic,acqintan,acqinvt,acqlntal,acqniintc,acqppe,acqsc,act,adpac,aedi,afudcc,afudci,aldo,am,amc,amdc,amgw,ano,ao,aocidergl,aociother,aocipen,aocisecgl,aodo,aol2,aoloch,aox,ap,apalch,apb,apc,apofs,aqa,aqc,aqd,aqeps,aqi,aqp,aqpl1,aqs,arb,arc,arce,arced,arceeps,artfs,at,aul3,autxr,balr,banlr,bast,bastr,batr,bcef,bclr,bcltbl,bcnlr,bcrbl,bct,bctbl,bctr,billexce,bkvlps,bltbl,ca,capr1,capr2,capr3,caps,capsft,capx,capxv,cb,cbi,cdpac,cdvc,ceiexbill,ceq,ceql,ceqt,cfbd,cfere,cfo,cfpdo,cga,cgri,cgti,cgui,ch,che,chech,chs,ci,cibegni,cicurr,cidergl,cimii,ciother,cipen,cisecgl,citotal,cld2,cld3,cld4,cld5,clfc,clfx,clg,clis,cll,cllc,clo,clrll,clt,cmp,cnltbl,cogs,cpcbl,cpdoi,cpnli,cppbl,cprei,crv,crvnli,cshfd,cshi,csho,cshpri,cshr,cshrc,cshrp,cshrso,cshrt,cshrw,cstk,cstkcv,cstke,dbi,dc,dclo,dcom,dcpstk,dcs,dcvsr,dcvsub,dcvt,dd,dd1,dd2,dd3,dd4,dd5,depc,derac,deralt,derhedgl,derlc,derllt,dfpac,dfs,dfxa,diladj,dilavx,dlc,dlcch,dltis,dlto,dltp,dltr,dltsub,dltt,dm,dn,do,donr,dp,dpacb,dpacc,dpacli,dpacls,dpacme,dpacnr,dpaco,dpacre,dpact,dpc,dpdc,dpltb,dpret,dpsc,dpstb,dptb,dptc,dptic,dpvieb,dpvio,dpvir,drc,drci,drlt,ds,dt,dtea,dted,dteeps,dtep,dudd,dv,dvc,dvdnp,dvintf,dvp,dvpa,dvpd,dvpdp,dvpibb,dvrpiv,dvrre,dvsco,dvt,dxd2,dxd3,dxd4,dxd5,ea,ebit,ebitda,eiea,emol,emp,epsfi,epsfx,epspi,epspx,esopct,esopdlt,esopnr,esopr,esopt,esub,esubc,excadj,exre,fatb,fatc,fatd,fate,fatl,fatn,fato,fatp,fca,fdfr,fea,fel,ffo,ffs,fiao,finaco,finao,fincf,finch,findlc,findlt,finivst,finlco,finlto,finnp,finrecc,finreclt,finrev,finxint,finxopr,fopo,fopox,fopt,fsrco,fsrct,fuseo,fuset,gbbl,gdwl,gdwlam,gdwlia,gdwlid,gdwlieps,gdwlip,geqrv,gla,glcea,glced,glceeps,glcep,gld,gleps,gliv,glp,govgr,govtown,gp,gphbl,gplbl,gpobl,gprbl,gptbl,gwo,hedgegl,iaeq,iaeqci,iaeqmi,iafici,iafxi,iafxmi,iali,ialoi,ialti,iamli,iaoi,iapli,iarei,iasci,iasmi,iassi,iasti,iatci,iati,iatmi,iaui,ib,ibadj,ibbl,ibc,ibcom,ibki,ibmii,icapt,idiis,idilb,idilc,idis,idist,idit,idits,iire,initb,intan,intano,intc,intpn,invch,invfg,invo,invofs,invreh,invrei,invres,invrm,invt,invwip,iobd,ioi,iore,ip,ipabl,ipc,iphbl,iplbl,ipobl,iptbl,ipti,ipv,irei,irent,irii,irli,irnli,irsi,iseq,iseqc,iseqm,isfi,isfxc,isfxm,isgr,isgt,isgu,islg,islgc,islgm,islt,isng,isngc,isngm,isotc,isoth,isotm,issc,issm,issu,ist,istc,istm,isut,itcb,itcc,itci,ivaco,ivaeq,ivao,ivch,ivgod,ivi,ivncf,ivpt,ivst,ivstch,lcabg,lcacl,lcacr,lcag,lcal,lcalt,lcam,lcao,lcast,lcat,lco,lcox,lcoxar,lcoxdr,lct,lcuacu,li,lif,lifr,lifrp,lloml,lloo,llot,llrci,llrcr,llwoci,llwocr,lno,lo,lol2,loxdr,lqpl1,lrv,ls,lse,lst,lt,lul3,mib,mibn,mibt,mii,mrc1,mrc2,mrc3,mrc4,mrc5,mrct,mrcta,msa,msvrv,mtl,nat,nco,nfsr,ni,niadj,nieci,niint,niintpfc,niintpfp,niit,nim,nio,nipfc,nipfp,nit,nits,nopi,nopio,np,npanl,npaore,nparl,npat,nrtxt,nrtxtd,nrtxteps,oancf,ob,oiadp,oibdp,opeps,opili,opincar,opini,opioi,opiri,opiti,oprepsx,optca,optdr,optex,optexd,optfvgr,optgr,optlife,optosby,optosey,optprcby,optprcca,optprcex,optprcey,optprcgr,optprcwa,optrfr,optvol,palr,panlr,patr,pcl,pclr,pcnlr,pctr,pdvc,pi,pidom,pifo,pll,pltbl,pnca,pncad,pncaeps,pncia,pncid,pncieps,pncip,pncwia,pncwid,pncwieps,pncwip,pnlbl,pnli,pnrsho,pobl,ppcbl,ppegt,ppenb,ppenc,ppenli,ppenls,ppenme,ppennr,ppeno,ppent,ppevbb,ppeveb,ppevo,ppevr,pppabl,ppphbl,pppobl,ppptbl,prc,prca,prcad,prcaeps,prebl,pri,prodv,prsho,prstkc,prstkcc,prstkpc,prvt,pstk,pstkc,pstkl,pstkn,pstkr,pstkrv,ptbl,ptran,pvcl,pvo,pvon,pvpl,pvt,pwoi,radp,ragr,rari,rati,rca,rcd,rceps,rcl,rcp,rdip,rdipa,rdipd,rdipeps,rdp,re,rea,reajo,recch,recco,recd,rect,recta,rectr,recub,ret,reuna,reunr,revt,ris,rll,rlo,rlp,rlri,rlt,rmum,rpag,rra,rrd,rreps,rrp,rstche,rstchelt,rvbci,rvbpi,rvbti,rvdo,rvdt,rveqt,rvlrv,rvno,rvnt,rvri,rvsi,rvti,rvtxr,rvupi,rvutx,saa,sal,sale,salepfc,salepfp,sbdc,sc,sco,scstkc,secu,seq,seqo,seta,setd,seteps,setp,siv,spce,spced,spceeps,spi,spid,spieps,spioa,spiop,sppe,sppiv,spstkc,sret,srt,ssnp,sstk,stbo,stio,stkco,stkcpa,tdc,tdscd,tdsce,tdsg,tdslg,tdsmm,tdsng,tdso,tdss,tdst,teq,tf,tfva,tfvce,tfvl,tie,tii,tlcf,transa,tsa,tsafc,tso,tstk,tstkc,tstkme,tstkn,tstkp,txach,txbco,txbcof,txc,txdb,txdba,txdbca,txdbcl,txdc,txdfed,txdfo,txdi,txditc,txds,txeqa,txeqii,txfed,txfo,txndb,txndba,txndbl,txndbr,txo,txp,txpd,txr,txs,txt,txtubadjust,txtubbegin,txtubend,txtubmax,txtubmin,txtubposdec,txtubposinc,txtubpospdec,txtubpospinc,txtubsettle,txtubsoflimit,txtubtxtr,txtubxintbs,txtubxintis,txva,txw,uaoloch,uaox,uapt,ucaps,uccons,uceq,ucustad,udcopres,udd,udfcc,udmb,udolt,udpco,udpfa,udvp,ufretsd,ugi,ui,uinvt,ulcm,ulco,uniami,unl,unnp,unnpl,unopinc,unwcc,uois,uopi,uopres,updvp,upmcstk,upmpf,upmpfs,upmsubp,upstk,upstkc,upstksf,urect,urectr,urevub,uspi,ustdnc,usubdvp,usubpstk,utfdoc,utfosc,utme,utxfed,uwkcapc,uxinst,uxintd,vpac,vpo,wcap,wcapc,wcapch,wda,wdd,wdeps,wdp,xacc,xad,xago,xagt,xcom,xcomi,xdepl,xdp,xdvre,xeqo,xi,xido,xidoc,xindb,xindc,xins,xinst,xint,xintd,xintopt,xivi,xivre,xlr,xnbi,xnf,xnins,xnitb,xobd,xoi,xopr,xoprar,xoptd,xopteps,xore,xpp,xpr,xrd,xrdp,xrent,xs,xsga,xstf,xstfo,xstfws,xt,xuw,xuwli,xuwnli,xuwoi,xuwrei,xuwti,exchg,cik_x,costat,fic,naicsh,sich,cshtr_c,dvpsp_c,dvpsx_c,prcc_c,prch_c,prcl_c,adjex_c,cshtr_f,dvpsp_f,dvpsx_f,mkvalt,prcc_f,prch_f,prcl_f,adjex_f,rank,au,auop,auopic,ceoso,cfoso,add1,add2,add3,add4,addzip,busdesc,city,conml,county,dlrsn,ein,fax,fyrc,ggroup,gind,gsector,gsubind,idbflag,incorp,loc,naics,phone,prican,prirow,priusa,sic,spcindcd,spcseccd,spcsrc,state,stko,weburl,dldte,ipodate,year,local_clustering_coef,degree_centrality,betweenness_centrality,graph_density,cik_y,report_year,risk_sentiment,business_overview_sentiment,risk_topic_0_allocation,risk_topic_1_allocation,risk_topic_2_allocation,risk_topic_3_allocation
0,1602,12/31/2022,2022,INDL,C,D,STD,AMGN,31162100,AMGEN INC,,DS,AP,,1.0,1.0,,AA,USD,USD,1.0,,Y,12.0,1.0,9.0,,12.0,7.0,53.0,,,3.0,12/31/2022,02/10/2023,01/31/2023,0.0,,,0.0,1559.0,355.0,-231.0,355.0,,,,667.0,,3497.0,,,,,,22186.0,,,,,0.0,2600.0,,,,0.0,1823.0,128.0,-11.0,0.0,0.0,1823.0,471.0,601.0,1823.0,1572.0,154.0,,,,42.25,3839.0,0.08,0.08,,65.0,4815.0,,,,,,,,65121.0,335.0,,,,,,,,,,,,,,,,6.8558,,,,,,32513.947,,936.0,936.0,,,,,,3661.0,3661.0,-27948.0,,,,,,,,,7629.0,9305.0,-360.0,,7117.0,6552.0,496.0,67.0,0.0,2.0,0.0,0.0,7117.0,,,,,,,,,,,,,,,,3145.0,,,,,,,,541.0,534.0,534.0,538.0,4.838,,,,,,0.053,0.0001,0.0,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,757.0,1747.0,1400.0,1500.0,2627.0,2724.0,,,,-233.0,,,,,,0.0,6552.0,1747.0,,6919.0,695.0,6650.0,,,37893.0,0.0,39859.0,0.0,0.0,3261.0,,,,,,,,,9283.0,,,,,,,,,,9283.0,,,0.0,,0.0,0.0,39640.0,50.7,0.09,0.09,78.0,-1671.0,4196.0,4264.0,,,0.0,0.0,,,,,,,4264.0,1400.0,1500.0,2627.0,2724.0,,10060.0,13321.0,,,25.2,12.11,12.11,12.18,12.18,0.0,,0.0,0.0,0.0,,891.0,,0.0,4201.0,1213.0,,6860.0,579.0,0.0,2144.0,292.0,,,,,,,-400.0,,,-4037.0,,,,,,,,,,,,,4209.0,4209.0,,,,,,,15529.0,,,,,,,,,,,,,,165.0,,,,23178.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6552.0,6552.0,,6552.0,6552.0,,6552.0,41554.0,,,,,,,,,,31609.0,16080.0,,1200.0,-742.0,1004.0,0.0,,,,,828.0,4930.0,3098.0,,,,,,,,,,,,,,32.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,118.0,2784.0,713.0,2605.0,,,-6044.0,,1676.0,,,,,,,,,,,,11173.0,1137.0,,1137.0,15687.0,,,,0.0,0.0,,,,,,,,0.0,7880.0,1398.0,7880.0,0.0,,,65121.0,,61460.0,270.0,0.0,0.0,0.0,0.0,172.0,109.0,80.0,70.0,64.0,495.0,286.0,0.0,,,,,,6552.0,6552.0,,,,,,,,,,,,-892.0,-892.0,0.0,,,,,,,,9721.0,,10060.0,13321.0,12.79,,,,,,,12.72,0.2,3.3,2.2,0.7,42.43,1.1,5.7,5.1,5.3,197.27,226.35,167.44,207.29,230.92,177.48,2.8,24.5,,,,,,,,,7346.0,3026.0,4320.0,,,,,,,,,,,,,,,,0.0,,,15289.0,,,,,,,,6006.0,,15289.0,,,,,,,,,,,,,,0.0,6360.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,5.2,0.01,0.01,,8.0,,,,,,-28853.0,0.0,117.0,-746.0,700.0,22.0,6392.0,-348.0,5563.0,,,-28622.0,,26323.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,26323.0,,,,,,,,3661.0,0.0,,,,,1218.0,,,,-416.0,,,,,,0.0,,,,,0.0,,,401.0,315.0,,,,,,,,,,,3661.0,,5621.0,,1668.0,,,3736.0,,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,2069.0,,,0.0,0.0,-1198.0,-1185.0,-63.0,-1275.0,,-27.0,,,1721.0,304.0,943.0,2819.0,1876.0,0.0,0.0,1195.0,2400.0,129.0,44.0,794.0,0.0,3546.0,3770.0,,,0.0,151.0,14.0,90.0,0.0,3.0,3770.0,1100.0,189.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6499.0,,,-368.55,-0.68,-0.69,-567.0,10036.0,841.0,,,,,,,,,0.0,0.0,0.0,,,,,1406.0,,,,,,,,,,,,13002.0,,,,,1204.0,243.0,4434.0,6324.0,218.0,,9857.0,,,,,,,,,,,14,318154.0,A,USA,325414.0,2836.0,747402000.0,7.76,7.76,262.64,296.67,214.3915,1.0,747402000.0,7.76,7.76,140249.76,262.64,296.67,214.3915,1.0,1.0,4.0,1.0,1.0,Y,Y,One Amgen Center Drive,,,,91320-1799,"Amgen Inc. discovers, develops, manufactures, ...",Thousand Oaks,Amgen Inc,,,95-3540776,805 447 1010,12,3520.0,352010.0,35.0,35201010.0,B,DE,USA,325414.0,805 447 1000,,,1.0,2836,147.0,905.0,A-,CA,0,www.amgen.com,,,2022,0.428352,0.001668,0.000954,0.003216,318154.0,2022,-1,-1,0.064385,0.926407,0.004403,0.004806
1,1602,12/31/2023,2023,INDL,C,D,STD,AMGN,31162100,AMGEN INC,,DS,AP,,1.0,1.0,,AA,USD,USD,1.0,,Y,12.0,1.0,9.0,,12.0,7.0,53.0,,,3.0,12/31/2023,02/19/2024,02/06/2024,0.0,,,0.0,1928.0,281.0,-289.0,281.0,,,,3111.0,-1200.0,20650.0,5025.0,,,318.0,955.0,30332.0,,,,,0.0,3200.0,,,,0.0,3903.0,-22.0,31.0,0.0,0.0,3903.0,283.0,-493.0,3903.0,1590.0,533.0,,,,-273.0,26989.0,-0.51,-0.51,-4171.0,-420.0,14780.0,3641.0,,,,,,,97154.0,0.0,,,,,,,,,,,,,,,,11.6399,,,,,,33069.946,,1112.0,1112.0,,,,,,6232.0,6232.0,-45038.0,,,,,,,,,10944.0,10944.0,3315.0,,6659.0,6717.0,50.0,-150.0,0.0,42.0,0.0,0.0,6659.0,,,,,,,,,,,,,,,,4530.0,,,,,,,,538.0,535.4,535.4,535.0,4.614,,,,,,0.054,0.0001,0.0,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,1562.0,5500.0,6183.0,2724.0,4984.0,,,,42.0,,,,,,0.0,6717.0,1562.0,,27777.0,4691.0,6650.0,1454.0,,63861.0,0.0,60887.0,0.0,0.0,3885.0,,,,,,,,,9808.0,,,,,,,,,,9808.0,,,0.0,,0.0,0.0,65423.0,146.25,0.27,0.27,225.0,-1717.0,4556.0,4644.0,,,0.0,0.0,,,,,,,4644.0,5500.0,6183.0,2724.0,4984.0,,9269.0,13154.0,,,26.7,12.49,12.49,12.56,12.56,0.0,,0.0,0.0,0.0,,11.0,,0.0,4507.0,1550.0,,7092.0,651.0,0.0,2261.0,339.0,,,,,,,-719.0,,,21048.0,,,,,,,,,,,,,3500.0,3500.0,,,,,,,18629.0,,,,,,,780.0,,,,,1.45,1.46,1298.0,1200.0,,,23660.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6717.0,6717.0,,6717.0,6717.0,,6717.0,70093.0,,,,,,,,,,51270.0,32641.0,,2400.0,491.0,2778.0,0.0,,,,,993.0,9518.0,5747.0,,,,,,,,,,,,,,28.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,225.0,854.0,4203.0,1.0,,,-26204.0,,0.0,,,,,,,,,,,,13576.0,1205.0,,1205.0,18392.0,,,,0.0,0.0,,,,,,,,0.0,6315.0,1092.0,6315.0,0.0,,,97154.0,,90922.0,96.0,0.0,0.0,0.0,0.0,138.0,121.0,109.0,95.0,74.0,537.0,440.0,0.0,,,,,,6717.0,6717.0,,,,,,,,5383.0,2381.0,,,1506.0,1506.0,0.0,,,,,,,,8471.0,,9269.0,13154.0,12.62,,,,,,,12.55,0.1,3.5,2.8,0.4,41.86,1.1,5.7,5.3,5.9,207.29,234.1,182.33,213.9,235.97,190.59,3.4,23.3,,,,,,,,,7855.0,4047.0,3808.0,,,,,,,,,,,,,,,,0.0,,,16400.0,,,,,,,,6592.0,,16400.0,,,,,,,,,,,,,,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,-173.55,-0.32,-0.32,,-267.0,-783.0,-508.95,-0.95,-0.95,,-26838.0,0.0,9.0,-1015.0,502.0,28.0,7942.0,-298.0,7268.0,,,-26549.0,,28190.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,28190.0,30969.0,29964.0,,,,,,6232.0,0.0,,,,,1673.0,,,,-45.0,,,,,0.0,,,,,,0.0,,,431.0,371.0,,,,,,,,,,,6232.0,,15063.0,,1188.0,,,3041.0,,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,2353.0,2354.0,2800.0,0.0,0.0,-1273.0,-1124.0,-66.0,-1215.0,2354.0,-25.0,,,1524.0,786.0,446.0,4554.0,4108.0,0.0,0.0,1664.0,3400.0,172.0,43.0,1138.0,0.0,3770.0,4012.0,,,0.0,196.0,0.0,56.0,6.0,4.0,4012.0,1400.0,287.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11940.0,,,,,,,12371.0,647.0,,,,,,,,,0.0,0.0,0.0,,,,,2875.0,,,,,,,,,,,,15036.0,,,,,1647.0,311.0,5567.0,4434.0,208.0,,10506.0,,,,,,,,,,,14,318154.0,A,USA,325414.0,2836.0,614271100.0,8.52,8.52,288.02,289.04,211.71,1.0,614271100.0,8.52,8.52,154205.908,288.02,289.04,211.71,1.0,1.0,4.0,1.0,1.0,Y,Y,One Amgen Center Drive,,,,91320-1799,"Amgen Inc. discovers, develops, manufactures, ...",Thousand Oaks,Amgen Inc,,,95-3540776,805 447 1010,12,3520.0,352010.0,35.0,35201010.0,B,DE,USA,325414.0,805 447 1000,,,1.0,2836,147.0,905.0,A-,CA,0,www.amgen.com,,,2023,0.15665,0.002396,0.001115,0.004495,318154.0,2023,0,-1,0.087753,0.905426,0.002893,0.003929
2,2403,12/31/2022,2022,INDL,C,D,STD,BMY,110122108,BRISTOL-MYERS SQUIBB CO,,DS,,,1.0,1.0,,,USD,USD,1.0,,Y,12.0,1.0,9.0,,12.0,7.0,5.0,,,3.0,12/31/2022,02/17/2023,,0.0,,,0.0,1596.0,1596.0,-1281.0,1596.0,,10.0,0.0,695.0,,2971.0,,,,,,27273.0,,,,,0.0,9700.0,,,,0.0,2877.0,232.0,0.0,-623.0,0.0,2877.0,8885.0,-183.0,2877.0,3040.0,109.0,,,,-555.1,,-0.26,-0.26,,-854.0,424.0,,,,,,,,96820.0,0.0,,,,,,,,,,,,,,,,14.8051,,,,,,45165.0,499.0,1118.0,1118.0,,,,,504.0,31061.0,31061.0,-25947.0,,,,,,,,,9123.0,9401.0,-4991.0,,6332.0,6345.0,-210.0,54.0,18.0,0.0,145.0,-2.0,6314.0,,,,,,,,,,,,,,,,9094.0,,,,,,,,2146.0,2923.0,2098.0,2130.0,32.895,,,,,,292.0,0.1,0.0,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,4033.0,2873.0,1842.0,2006.0,2046.0,,,,148.0,,,,,,0.0,6327.0,4400.0,194.0,5926.0,1412.0,0.0,11431.0,,36317.0,0.0,38219.0,0.0,0.0,10282.0,,,,,,,,,4164.0,10276.0,,,,,,,,,4164.0,,,,,283.0,0.0,40350.0,-172.9,-0.08,-0.08,-266.0,719.0,4634.0,4644.0,,,0.0,0.0,,,,,,,4644.0,2873.0,1842.0,2006.0,2046.0,,9694.0,19976.0,,,34.3,2.95,2.95,2.97,2.97,0.0,,0.0,0.0,0.0,-108.0,,,-33.0,5920.0,1053.0,,3284.0,1220.0,0.0,0.0,162.0,,,,,,,984.0,,,-16962.0,,,,,,,,,,,,,1412.0,1412.0,,,,,,,21149.0,,,,,,,,,,,,,,-693.0,,,,37065.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6327.0,6327.0,,6345.0,6327.0,,6345.0,67435.0,,,,,,,,,,57008.0,35859.0,,1400.0,-69.0,,,,,,,,2339.0,,,,,,,,,,,,,,,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,142.0,-2981.0,546.0,1641.0,3592.0,,,-1062.0,,278.0,,,,,,,,,,,,13508.0,3270.0,,3270.0,21890.0,,,,0.0,0.0,,,,,,,,0.0,5329.0,213.0,5046.0,5.0,,,96820.0,,65702.0,24.0,0.0,57.0,57.0,18.0,187.0,191.0,169.0,149.0,145.0,841.0,933.0,0.0,,,,,,6327.0,6327.0,,,,,,,,,,,,1606.0,1606.0,367.0,,,,,,,,13066.0,,9694.0,19976.0,3.84,,,,,,,3.82,0.8,,21.9,24.3,,0.0,,47.0,21.9,53.0,58.7,50.79,55.25,,55.25,,,,,,,,,,,7713.0,-140.0,7853.0,,,,,,,,,,,,,,,,0.003,,,11639.0,,,,,,,,7475.0,,11639.0,,,,,,,,,,,,,,0.0,8001.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,-93.6,-0.04,-0.04,,-144.0,-913.0,-593.45,-0.28,-0.28,,24222.0,0.0,-391.0,-663.0,1735.0,22.0,13937.0,-890.0,8655.0,504.0,,25503.0,,46159.0,,,,,,,,,,,,,148.0,54.0,,,,,,,,,,,,,,,,,,46159.0,,,,,,,,31061.0,0.0,-115.7,-0.05,-0.05,-178.0,6629.0,,,,-2355.0,,,,,,,,,,,,,,457.0,366.0,,,,,,,,,,,31118.0,,9309.0,,242.0,,,,,,,,38618.0,38618.0,,825.0,0.0,-1423.0,0.0,0.0,,2166.0,1344.0,0.0,0.0,-2738.0,,151.0,-2738.0,2166.0,,,,,1089.0,-822.0,4145.0,4967.0,0.0,0.0,942.0,5400.0,3547.0,,1368.0,6.0,2042.0,1766.0,-170.0,-120.0,0.0,53.0,381.0,137.0,8.0,83.0,1736.0,357.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5383.0,,,,,,,10238.0,1300.0,,,,,,,,,0.0,0.0,0.0,,,,,1232.0,,,,,,,,,,,,26183.0,,,,,0.0,27.0,10324.0,11354.0,299.0,,17089.0,,,,,,,,,,,11,14272.0,A,USA,325412.0,2834.0,3017324000.0,2.16,2.16,71.95,81.435,60.86,1.0,3017324000.0,2.16,2.16,150951.1,71.95,81.435,60.86,1.0,1.0,5.0,1.0,1.0,Y,Y,Route 206 & Province Line Road,,,,8543,"Bristol-Myers Squibb Company discovers, develo...",Princeton,Bristol-Myers Squibb Co,,,22-0790350,,12,3520.0,352020.0,35.0,35202010.0,B,DE,USA,325412.0,609 252 4621,,,1.0,2834,280.0,905.0,B,NJ,0,www.bms.com,,,2022,0.06087,0.001618,3e-06,0.003216,14272.0,2022,-1,-1,0.3002,0.692749,0.005157,0.001895
3,2403,12/31/2023,2023,INDL,C,D,STD,BMY,110122108,BRISTOL-MYERS SQUIBB CO,,DS,,,1.0,1.0,,,USD,USD,1.0,,Y,12.0,1.0,9.0,,12.0,7.0,53.0,,,3.0,12/31/2023,02/13/2024,,0.0,,,0.0,1509.0,1509.0,-1546.0,1509.0,,,,,,,,,,,,31770.0,,,,,0.0,9200.0,,,,0.0,4613.0,2.0,0.0,-738.0,0.0,4613.0,10029.0,237.0,4613.0,3259.0,198.0,,,,-206.7,,-0.1,-0.1,,-318.0,318.0,,,,,,,,95159.0,0.0,,,,,,,,,,,,,,,,14.5621,,,,,,45684.0,470.0,1209.0,1209.0,,,,,416.0,29430.0,29430.0,-18811.0,,,,,,,,,11464.0,12335.0,2194.0,,7775.0,8040.0,78.0,-230.0,15.0,0.0,-115.0,2.0,7760.0,,,,,,,,,,,,,,,,9796.0,,,,,,,,2078.0,2923.0,2021.0,2069.0,31.207,,,,,,292.0,0.1,0.0,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,3035.0,1865.0,2006.0,2046.0,1500.0,,,,160.0,,,,,,0.0,8025.0,3281.0,-120.0,4455.0,1692.0,0.0,3879.0,,38183.0,0.0,38886.0,0.0,0.0,9769.0,,,,,,,,,4803.0,9760.0,,,,,,,,,4803.0,,,,,300.0,0.0,41218.0,,,,,640.0,4744.0,4762.0,,,0.0,0.0,,,,,,,4762.0,1865.0,2006.0,2046.0,1500.0,,8556.0,18325.0,,,34.1,3.86,3.86,3.88,3.88,0.0,,0.0,0.0,0.0,-38.0,,,45.0,6495.0,1075.0,,3717.0,1390.0,0.0,0.0,162.0,,,,,,,27.0,,,-9416.0,,,,,,,,,,,,,1262.0,1262.0,,,,,,,21169.0,,,,,,,,,,,,,,-122.0,,,,35210.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8025.0,8025.0,,8040.0,8025.0,,8040.0,67668.0,,,,,,,,,,48241.0,27072.0,,1200.0,-751.0,,,,,,,,2662.0,,,,,,,,,,,,,,,11.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,243.0,-260.0,542.0,1957.0,1774.0,,,-2295.0,,871.0,,,,,,,,,,,,14351.0,3309.0,,3309.0,22262.0,,,,0.0,0.0,,,,,,,,0.0,4891.0,160.0,4591.0,4.0,,,95159.0,,65674.0,8.0,0.0,55.0,55.0,15.0,225.0,236.0,211.0,205.0,192.0,1069.0,1061.0,2.0,,,,,,8025.0,8025.0,,,,,,,,,,,,2533.0,2533.0,246.0,,,,,656.0,0.32,0.32,13860.0,,8556.0,18325.0,4.13,,,,,,,4.11,0.9,,16.2,4.8,,0.0,,21.9,16.2,55.25,63.49,46.79,57.34,,57.34,,,,,,,,,,,8440.0,2624.0,5816.0,,,,,,,,,,,,,,,,0.003,,,12839.0,,,,,,,,8036.0,,12839.0,,,,,,,,,,,,,,0.0,5155.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,-347.75,-0.17,-0.17,,-535.0,-993.0,-645.45,-0.31,-0.31,,27220.0,0.0,-736.0,-995.0,2039.0,23.0,15264.0,-812.0,9298.0,416.0,,28766.0,,45006.0,,,,,,,,,,,,,55.0,0.0,,,,,,,,,,,,,,,,,,45006.0,,,,,,,,29430.0,0.0,253.5,0.12,0.12,390.0,948.0,,,,-1483.0,,,,,,,,,,,,,,518.0,413.0,,,,,,,,,,,29485.0,,10347.0,,172.0,,,,,,,,43766.0,43766.0,,902.0,0.0,-603.0,0.0,0.0,,338.0,2768.0,0.0,0.0,-3288.0,,-949.0,-3288.0,338.0,,,,,943.0,2430.0,7334.0,4904.0,0.0,0.0,1371.0,4300.0,3927.0,,400.0,4.0,1766.0,1914.0,-140.0,-100.0,0.0,38.0,5.0,145.0,30.0,4.0,1872.0,457.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9508.0,,,-17.55,-0.01,-0.01,-27.0,11042.0,1400.0,,,,,,,,,0.0,0.0,0.0,,,,,1166.0,,,,,,,,,,,,26681.0,,,,,0.0,11.0,10212.0,10324.0,416.0,,16885.0,,,,,,,,,,,11,14272.0,A,USA,325412.0,2834.0,2573558000.0,2.28,2.28,51.31,75.18,48.252,1.0,2573558000.0,2.28,2.28,103697.51,51.31,75.18,48.252,1.0,1.0,5.0,1.0,1.0,Y,Y,Route 206 & Province Line Road,,,,8543,"Bristol-Myers Squibb Company discovers, develo...",Princeton,Bristol-Myers Squibb Co,,,22-0790350,,12,3520.0,352020.0,35.0,35202010.0,B,DE,USA,325412.0,609 252 4621,,,1.0,2834,280.0,905.0,B,NJ,0,www.bms.com,,,2023,0.25641,0.003035,7e-06,0.004495,14272.0,2023,-1,-1,0.261239,0.731864,0.00472,0.002177
4,2990,12/31/2022,2022,INDL,C,D,STD,IMNN,15117N602,IMUNON INC,,DS,,,1.0,1.0,,,USD,USD,1.0,,Y,12.0,3.0,9.0,,12.0,7.0,5.0,,,3.0,12/31/2022,04/19/2023,04/17/2023,0.0,,,0.0,2.778,2.778,0.027,2.778,,,,,,,,,,,,37.155,,,,,0.0,0.0,,,,0.0,6.117,0.0,0.0,0.0,0.0,6.117,0.0,-0.881,6.117,3.587,2.112,,,,3.507,0.0,0.49,0.49,,5.396,0.0,,,,,,,,43.976,21.254,,,,,,,,,,,,,,,0.0,3.9441,,,,,,397.98,,0.268,0.268,,,,,,29.328,29.328,29.328,,,,,,,,,11.493,34.248,-8.093,,-35.872,-35.898,0.0,0.0,0.0,0.0,0.0,0.026,-35.872,,,,,,,,,,,,,,,,25.225,,,,,,,,7.143,7.436,7.436,7.143,28.0,,,,,,0.074,0.01,0.0,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,1.656,3.0,,,,,,,,,,,,,0.0,-35.898,1.656,0.0,0.0,0.0,4.611,0.0,,4.611,4.611,4.611,0.0,0.0,0.197,,,,,,,,,2.644,0.913,,,,,,,,,2.644,,,0.0,,0.0,0.0,6.267,-2.959,-0.41,-0.41,-4.552,0.0,0.0,0.0,,,0.0,0.0,,,,,,,0.0,3.0,,,,,-24.922,-24.725,,,0.031,-5.03,-5.03,-5.03,-5.03,0.0,,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,2.819,0.529,0.0,0.0,0.0,,,,,,,0.0,,,6.723,,,,,,,,,,,,,10.643,10.643,,,,,,,0.0,,,,,,,,,,,,,,-0.049,,,,-24.725,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-35.898,-35.898,,-35.898,-35.898,,-35.898,33.939,,,,,,,,,,0.0,0.0,0.0,4.847,0.0,0.0,0.0,,,,,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,,,8.315,,22.755,8.583,,,,,,,,,,,4.794,0.0,,0.0,10.037,,,,0.0,0.0,,,,,,,,0.0,0.0,0.0,0.0,0.0,,,43.976,,14.648,0.0,0.0,0.0,0.0,0.0,0.239,0.0,0.0,0.0,0.0,0.239,0.0,0.027,,,,,,-35.898,-35.898,,,,,,,,,,,,0.456,0.456,0.0,,,,,,,,-23.131,,-24.922,-24.725,-3.64,,,,,,,-3.64,0.397,0.0,0.202,0.0,,0.716,8.75,0.441,0.76,38.5,39.06,,4.55,2.72,8.07,2.855,106.95,,,,,,,,,-37.465,,,,,,,,,,,,,,,,,,0.0,,,3.348,,,,,,,,0.704,,3.348,,,,,,,,,,,,,,0.0,28.5,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,-13.366,-8.688,-1.22,-1.22,,-368.641,0.0,0.0,-0.02,0.129,,0.129,0.0,0.0,0.0,,-368.668,,0.5,,,,,,,,,,,,,1.5,4.5,,,,,,,,,,,,,,,,,,0.5,,,,,,,,29.328,0.0,,,,,0.0,,,,-12.522,,,,,0.0,0.0,,,,,35.223,,,2.673,,,,,,,,,,,,29.328,,21.254,0.0,0.0,,,330.0,,,,,0.085,0.085,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.567,0.0,0.0,0.0,0.0,0.0,-1.567,0.0,-1.567,,,0.0,0.0,1.567,1.567,0.0,0.0,0.0,0.0,0.002,0.0,0.0,-1.567,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,27.118,,,,,,,4.794,,,,,,,,,,0.0,0.0,0.0,,,,,0.477,0.5,,,,,,,,,,,25.225,,,,,0.0,0.117,25.1,10.619,0.588,,,,,,,,,,,,,14,749647.0,A,USA,325414.0,2836.0,29857720.0,0.0,0.0,1.35,9.3,1.26,1.0,29857720.0,0.0,0.0,10.0386,1.35,9.3,1.26,1.0,1.0,9.0,1.0,0.0,Y,Y,"997 Lenox Drive, Suite 100",,,,8648,"Imunon, Inc., a clinical-stage biotechnology c...",Lawrenceville,Imunon Inc,,,52-1256615,609 896 2200,12,3520.0,352010.0,35.0,35201010.0,D,DE,USA,325414.0,609 896 9100,,,1.0,2836,300.0,905.0,C,NJ,0,imunon.com,,03/05/1985,2022,1.0,0.011223,0.0,0.003216,749647.0,2022,-1,-1,0.144905,0.110536,0.558879,0.185679


In [68]:
final_merged_data_df.shape

(1136, 994)

Around 1000 rows so it works fine.

In [69]:
final_merged_data_df.fyear.value_counts()

fyear
2022    653
2023    483
Name: count, dtype: int64

In [None]:
# 2 years of data which is fine