# Research Question

How has NIH’s Cure Sickle Cell Initiative impacted research funding distribution and research focus?

## Dependencies

In [3]:
# Install package for natural language processing
%pip install nltk

# data manipulation
import pandas as pd
import numpy as np
import os

# text analysis tools
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn import preprocessing
from nltk import SnowballStemmer
import string

# visualization tools
import matplotlib as mplib
import matplotlib.pyplot as plt 
import seaborn as sns  

%autosave 60

Note: you may need to restart the kernel to use updated packages.


Autosaving every 60 seconds


In [4]:
# Convert scientific notation to a full float
pd.set_option('display.float_format', '{:.2f}'.format)

# Get Path

In [5]:
# Convert scientific notation to a full float
pd.set_option('display.float_format', '{:.2f}'.format)

In [6]:
# Set the path to the "Data" folder by replacing "Notebooks" in the current directory path
path = str(os.getcwd()).replace("Notebooks", "Data")

# If the data has been moved, manually set the path below (not recommended for reproducibility)
# path = '...'  # Uncomment and update if needed

# Print the path to verify correctness (varies by computer)
print(path)

C:\Users\NAjani\Jupyter\Wagner\SickleCellProject


In [11]:
# read in a csv file of grants using pd.read_csv() function
grants_2021 = pd.read_csv(path + '/WagnerData/Data/Projects/RePORTER_PRJ_C_FY2021.csv', encoding='latin-1')

# Get an idea of what data looks like by only using 2012 data

In [12]:
# See first 5 rows with head() function
grants_2021.head(5)

Unnamed: 0,APPLICATION_ID,ACTIVITY,ADMINISTERING_IC,APPLICATION_TYPE,ARRA_FUNDED,AWARD_NOTICE_DATE,BUDGET_START,BUDGET_END,CFDA_CODE,CORE_PROJECT_NUM,...,SERIAL_NUMBER,STUDY_SECTION,STUDY_SECTION_NAME,SUBPROJECT_ID,SUFFIX,SUPPORT_YEAR,DIRECT_COST_AMT,INDIRECT_COST_AMT,TOTAL_COST,TOTAL_COST_SUB_PROJECT
0,10595864,U54,DK,6.0,N,06/13/2022,04/01/2022,07/31/2022,,U54DK106829,...,106829.0,ZDK1,Special Emphasis Panel,7612.0,,7.0,42060.0,31955.0,,74015.0
1,10101643,R01,DA,5.0,N,02/22/2021,03/01/2021,02/28/2022,279.0,R01DA046197,...,46197.0,ZRG1,Special Emphasis Panel,,,4.0,451257.0,167187.0,618444.0,
2,10189622,U18,FD,5.0,N,06/08/2021,06/01/2021,05/31/2022,103.0,U18FD006442,...,6442.0,ZFD1,Special Emphasis Panel,,,4.0,,,74000.0,
3,10189608,U18,FD,5.0,N,06/01/2021,06/01/2021,05/31/2022,103.0,U18FD006164,...,6164.0,ZFD1,Special Emphasis Panel,,,5.0,,,52000.0,
4,10076833,R01,EY,5.0,N,01/11/2021,01/01/2021,12/31/2021,867.0,R01EY015240,...,15240.0,BVS,Biology of the Visual System Study Section,,,16.0,335775.0,204822.0,540597.0,


# Filter data from NHLBI from all years

In [23]:
# Initialize an empty DataFrame to hold the combined CRISPR data
crispr_db = pd.DataFrame()

# Iterating through data from each year to combine all CRISPR projects into a single dataframe
years = ["2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021"]

for year in years:
    proj = pd.DataFrame(pd.read_csv(path + '/WagnerData/Data/Projects/RePORTER_PRJ_C_FY' + year + '.csv', encoding='latin-1'))
    ab = pd.DataFrame(pd.read_csv(path + '/WagnerData/Data/Abstracts/RePORTER_PRJABS_C_FY' + year + '.csv', encoding='latin-1'))
    
    # Merge project and abstract data on APPLICATION_ID for each year
    db = pd.merge(proj, ab, on="APPLICATION_ID", how="inner")
    
    # Filter to only include rows where IC_NAME is 'NATIONAL HEART, LUNG, AND BLOOD INSTITUTE'
    db = db[db['IC_NAME'] == 'NATIONAL HEART, LUNG, AND BLOOD INSTITUTE']
    
    # Add a YEAR column for the respective year
    db["YEAR"] = int(year)
    
    # Concatenate the current year's CRISPR data with the accumulated data
    NHLBI_db = pd.concat([crispr_db, db])

# Display the combined NHLBI data
NHLBI_db

  proj = pd.DataFrame(pd.read_csv(path + '/WagnerData/Data/Projects/RePORTER_PRJ_C_FY' + year + '.csv', encoding='latin-1'))
  proj = pd.DataFrame(pd.read_csv(path + '/WagnerData/Data/Projects/RePORTER_PRJ_C_FY' + year + '.csv', encoding='latin-1'))
  proj = pd.DataFrame(pd.read_csv(path + '/WagnerData/Data/Projects/RePORTER_PRJ_C_FY' + year + '.csv', encoding='latin-1'))
  proj = pd.DataFrame(pd.read_csv(path + '/WagnerData/Data/Projects/RePORTER_PRJ_C_FY' + year + '.csv', encoding='latin-1'))
  proj = pd.DataFrame(pd.read_csv(path + '/WagnerData/Data/Projects/RePORTER_PRJ_C_FY' + year + '.csv', encoding='latin-1'))


Unnamed: 0,APPLICATION_ID,ACTIVITY,ADMINISTERING_IC,APPLICATION_TYPE,ARRA_FUNDED,AWARD_NOTICE_DATE,BUDGET_START,BUDGET_END,CFDA_CODE,CORE_PROJECT_NUM,...,STUDY_SECTION_NAME,SUBPROJECT_ID,SUFFIX,SUPPORT_YEAR,DIRECT_COST_AMT,INDIRECT_COST_AMT,TOTAL_COST,TOTAL_COST_SUB_PROJECT,ABSTRACT_TEXT,YEAR
12,10184066,R01,HL,1.00,N,04/19/2021,04/20/2021,03/30/2022,837.00,R01HL157378,...,Clinical and Integrative Cardiovascular Scienc...,,,1.00,293844.00,214506.00,508350.00,,The overall goal of this work is to address cl...,2021
22,10282172,R01,HL,3.00,N,02/04/2021,02/04/2021,01/31/2022,837.00,R01HL147811,...,Special Emphasis Panel,,S1,2.00,55704.00,38714.00,94418.00,,PROJECT SUMMARY / ABSTRACT This application is...,2021
27,10210292,R01,HL,5.00,N,07/03/2021,07/01/2021,06/30/2022,838.00,R01HL142578,...,Respiratory Integrative Biology and Translatio...,,,3.00,256698.00,123429.00,380127.00,,Project Summary: Angiogenesis ? the formation ...,2021
33,10128204,UM1,HL,5.00,N,02/10/2021,03/01/2021,02/28/2022,837.00,UM1HL147371,...,Special Emphasis Panel,,,3.00,571028.00,182571.00,559654.00,,PROJECT SUMMARY The Cardiothoracic Surgical Tr...,2021
42,10250453,P01,HL,5.00,N,07/12/2021,07/01/2021,06/30/2022,,P01HL108800,...,Special Emphasis Panel,8770.00,,11.00,398400.00,228433.00,,626833.00,Project Summary The strongest established risk...,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81345,10226254,K23,HL,5.00,N,07/22/2021,08/01/2021,07/31/2022,838.00,K23HL153584,...,NHLBI Mentored Patient-Oriented Research Revie...,,,2.00,170707.00,13504.00,184211.00,,PROJECT SUMMARY Candidate: Dr. Jonathan Casey ...,2021
81348,10280668,R01,HL,1.00,N,09/20/2021,09/20/2021,07/31/2022,837.00,R01HL159374,...,"Cancer, Heart, and Sleep Epidemiology B Study ...",,,1.00,588366.00,101009.00,689375.00,,Abstract African Americans (AAs) have a high p...,2021
81355,10218250,P01,HL,5.00,N,08/22/2021,08/01/2021,07/31/2022,,P01HL114470,...,Special Emphasis Panel,5099.00,,10.00,208600.00,101171.00,,309771.00,ABSTRACT The overall objective of the Clinica...,2021
81383,10200130,R00,HL,5.00,N,06/25/2021,07/01/2021,06/30/2022,837.00,R00HL141143,...,Special Emphasis Panel,,,4.00,163694.00,85121.00,248815.00,,PROJECT SUMMARY/ABSTRACT Obesity and type-2 di...,2021


## This sections focuses on identifying projects under the Sickle Cell Initiative

##### Not all projects have abstract text: https://reporter.nih.gov/search/1vBr_35MHU2c4LPTdlek0A/project-details/10700477

In [24]:
def sickle_cell_cat_NHLBI(NHLBI_db):
    # Extract CRISPR projects based on key words, making the search case-insensitive
    df_x = NHLBI_db[NHLBI_db["PROJECT_TERMS"].str.contains("Sickle Cell", case=False) | 
                    NHLBI_db["PROJECT_TERMS"].str.contains("gene therapy", case=False) | 
                    NHLBI_db["PROJECT_TERMS"].str.contains("Sickle Cell Anemia", case=False)]
    
    df_y = df_x[df_x["NIH_SPENDING_CATS"].str.contains("Gene Therapy", case=False) | 
                df_x["NIH_SPENDING_CATS"].str.contains("Gene Therapy Clinical Trials", case=False) | 
                df_x["NIH_SPENDING_CATS"].str.contains("Sickle Cell Disease", case=False)]
    
    return df_y

## Resulting dataframe 

In [26]:
sickle_cell_cat_NHLBI(NHLBI_db)

Unnamed: 0,APPLICATION_ID,ACTIVITY,ADMINISTERING_IC,APPLICATION_TYPE,ARRA_FUNDED,AWARD_NOTICE_DATE,BUDGET_START,BUDGET_END,CFDA_CODE,CORE_PROJECT_NUM,...,STUDY_SECTION_NAME,SUBPROJECT_ID,SUFFIX,SUPPORT_YEAR,DIRECT_COST_AMT,INDIRECT_COST_AMT,TOTAL_COST,TOTAL_COST_SUB_PROJECT,ABSTRACT_TEXT,YEAR
129,10252928,R61,HL,5.00,N,09/01/2021,09/01/2021,08/31/2022,839.00,R61HL154254,...,Special Emphasis Panel,,,2.00,644659.00,120054.00,628041.00,,Project Summary The role of brain endothelial ...,2021
448,10311624,U01,HL,1.00,N,09/01/2021,09/01/2021,08/31/2022,839.00,U01HL159850,...,Special Emphasis Panel,,,1.00,462169.00,85837.00,548006.00,,PROJECT SUMMARY Sickle cell disease (SCD) is a...,2021
629,10146451,P01,HL,5.00,N,04/14/2021,04/01/2021,03/31/2022,,P01HL032262,...,"Heart, Lung, and Blood Initial Review Group",7751.00,,39.00,325253.00,108167.00,,433420.00,Abstract Hematopoiesis is regulated by transcr...,2021
660,10274831,UH3,HL,4.00,N,09/02/2021,09/01/2021,08/31/2022,310.00,UH3HL147366,...,Special Emphasis Panel,,,4.00,668796.00,248562.00,917358.00,,Project Summary/Abstract Recent advances make ...,2021
761,10154363,U01,HL,1.00,N,03/12/2021,03/15/2021,02/28/2022,839.00,U01HL156620,...,Special Emphasis Panel,,,1.00,411859.00,279256.00,691115.00,,Summary Our primary objective is initiating a ...,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79983,10287682,R21,HL,1.00,N,08/31/2021,09/01/2021,09/02/2021,839.00,R21HL159561,...,Gene and Drug Delivery Systems Study Section,,,1.00,1.00,0.00,1.00,,Gene therapy has been explored for cure of hem...,2021
80086,10211085,R03,HL,1.00,N,05/07/2021,05/15/2021,05/14/2022,839.00,R03HL157878,...,Special Emphasis Panel,,,1.00,50000.00,28250.00,78250.00,,Sickle Cell Disease (SCD) is the most prevalen...,2021
80253,10078619,R01,HL,5.00,N,12/18/2020,12/01/2020,11/30/2021,837.00,R01HL114541,...,Vascular Cell and Molecular Biology Study Sect...,,,7.00,463302.00,257133.00,720435.00,,PROJECT SUMMARY Atherosclerosis is a disease o...,2021
80462,10490151,ZIA,HL,1.00,N,,,,,ZIAHL006266,...,,,,1.00,,,1444860.00,,Sickle cell disease (SCD) is a multisystem dis...,2021
