<a href="https://colab.research.google.com/github/Tar-ive/grants_recsys/blob/master/raw_data_sources/nih_grants.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fetching active grant data from the NIH RePORTER API

In [None]:
import requests
import pandas as pd
from datetime import datetime
import time
import json

class NIHGrantsAPI:
    """
    Small client for the NIH RePORTER v2 project-search endpoint.

    Fetches active projects for the current fiscal year, flattens each result
    into a fixed set of "opportunity"-style columns, and can page through the
    results and write them to CSV.
    """

    def __init__(self, timeout=30):
        """
        Args:
            timeout: Seconds to wait for each HTTP request before giving up.
                Without a timeout a stalled connection would hang the notebook.
        """
        self.base_url = "https://api.reporter.nih.gov/v2/projects/search"
        self.headers = {
            "Content-Type": "application/json",
            "Accept": "application/json"
        }
        self.timeout = timeout

    @staticmethod
    def _to_record(item):
        """Flatten one API result dict into the notebook's grant schema."""
        # The organization name is looked up under the flat key first and then
        # under the nested `organization.org_name` key: the flat key alone came
        # back empty for every row in this notebook's recorded run.
        org_name = item.get('organization_name')
        organization = item.get('organization')
        if not org_name and isinstance(organization, dict):
            org_name = organization.get('org_name')

        return {
            # Prefer the funding opportunity number; fall back to the project number.
            'opportunity_id': item.get('opportunity_number') or item.get('project_num'),
            'opportunity_title': item.get('project_title'),
            'agency': 'NIH',
            'agency_name': item.get('agency_ic_admin'),
            'award_amount': item.get('award_amount'),
            'post_date': item.get('project_start_date'),
            'close_date': item.get('project_end_date'),
            'summary_description': item.get('abstract_text'),
            'applicant_types': item.get('organization_type'),
            'organization': org_name
        }

    def get_active_grants(self, offset=0, limit=500):
        """
        Fetches active NIH grants with their opportunity numbers and details

        Args:
            offset: Index of the first record to request.
            limit: Maximum number of records to request in this call.

        Returns:
            A DataFrame with one row per result. An empty DataFrame is returned
            when the request fails, the body is not valid JSON, or the response
            has no 'results' key (a message is printed in each case).
        """
        payload = {
            "criteria": {
                "include_active_projects": True,
                "fiscal_years": [datetime.now().year],
                "exclude_subprojects": True  # Exclude subprojects for cleaner data
            },
            "offset": offset,
            "limit": limit,
            "include_fields": [
                "ProjectNum",           # For opportunity_id
                "ProjectTitle",         # For opportunity_title
                "AgencyIcAdmin",        # For agency_name
                "AwardAmount",          # For award_amount
                "ProjectStartDate",     # For post_date
                "ProjectEndDate",       # For close_date
                "AbstractText",         # For summary_description
                "OrganizationType",     # For applicant_types
                "OrganizationName",     # For organization (kept for compatibility)
                "Organization",         # For organization (nested org_name)
                "OpportunityNumber"     # Additional opportunity ID field
            ]
        }

        try:
            response = requests.post(
                self.base_url,
                json=payload,
                headers=self.headers,
                timeout=getattr(self, "timeout", 30),
            )
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException subclass.
            print(f"Error fetching data from NIH API: {e}")
            return pd.DataFrame()

        if 'results' not in data:
            print(f"No results found in response: {data}")
            return pd.DataFrame()

        # Transform the data into the desired format
        results = data['results'] or []
        return pd.DataFrame([self._to_record(item) for item in results])

    def get_all_grants(self, max_records=1000, request_delay=1.0):
        """
        Fetches grants page by page until `max_records` rows were requested,
        a page comes back short, or a page comes back empty.

        Args:
            max_records: Upper bound on the number of rows to fetch. The last
                page is shrunk so the total never exceeds this value.
            request_delay: Seconds to sleep between consecutive requests.

        Returns:
            One DataFrame with all fetched rows (fresh 0..n-1 index), or an
            empty DataFrame when nothing was fetched.
        """
        all_grants = []
        offset = 0
        page_limit = 500  # Max allowed by API

        while offset < max_records:
            # Never ask for more rows than are still needed.
            page_size = min(page_limit, max_records - offset)
            print(f"Fetching grants {offset} to {offset + page_size}...")

            df = self.get_active_grants(offset=offset, limit=page_size)

            # An empty page means either "no more data" or "request failed";
            # get_active_grants has already printed the reason in the latter case.
            if df.empty:
                break

            all_grants.append(df)

            if len(df) < page_size:  # Less results than requested means we've reached the end
                break

            offset += page_size
            # Respect the rate limit, but only when another request will follow.
            if offset < max_records and request_delay > 0:
                time.sleep(request_delay)

        if all_grants:
            return pd.concat(all_grants, ignore_index=True)
        return pd.DataFrame()

    def save_to_csv(self, df, filename="nih_grants.csv"):
        """
        Saves the grants data to a CSV file

        Nothing is written (and "No data to save" is printed) when `df` is
        None or empty.
        """
        if df is not None and not df.empty:
            df.to_csv(filename, index=False)
            print(f"Data saved to {filename}")
        else:
            print("No data to save")

# Demo run: fetch up to 5000 grants, preview them, and write them to CSV.
# `all_grants_df` is left in the notebook namespace for the cells that follow.
if __name__ == "__main__":
    api = NIHGrantsAPI()
    all_grants_df = api.get_all_grants(max_records=5000)

    if all_grants_df.empty:
        print("No grants data retrieved")
    else:
        print(f"Retrieved {len(all_grants_df)} grants")
        # Preview the head of the frame before persisting it
        print("\nFirst few rows of data:")
        print(all_grants_df.head())

        api.save_to_csv(all_grants_df)

Fetching grants 0 to 500...
Fetching grants 500 to 1000...
Fetching grants 1000 to 1500...
Fetching grants 1500 to 2000...
Fetching grants 2000 to 2500...
Fetching grants 2500 to 3000...
Fetching grants 3000 to 3500...
Fetching grants 3500 to 4000...
Fetching grants 4000 to 4500...
Fetching grants 4500 to 5000...
Retrieved 5000 grants

First few rows of data:
  opportunity_id                                  opportunity_title agency  \
0      PA-20-190  The mechanistic study on irisin-mediated immun...    NIH   
1      PA-20-195  Pyroptotic Macrophages Traps Against Shigella ...    NIH   
2     PAR-21-178  DDT-IST-000014: Progressing towards the Qualif...    NIH   
3      PA-21-268       Novel Treatments for Ocular Surface Diseases    NIH   
4      PA-20-272  Mobile Three-Dimensional Screening for Cranial...    NIH   

                                         agency_name  award_amount  \
0  {'code': 'DK', 'abbreviation': 'NIDDK', 'name'...      152454.0   
1  {'code': 'AI', 'abbreviati

In [None]:
# Bind the fetched frame to the name used by the later cells.
# This is an alias, not a copy: both names refer to the same DataFrame.
grants_df = all_grants_df

In [None]:
# Count missing values per column to see which fields need cleaning.
grants_df.isna().sum()

Unnamed: 0,0
opportunity_id,0
opportunity_title,0
agency,0
agency_name,0
award_amount,95
post_date,159
close_date,159
summary_description,7
applicant_types,0
organization,5000


# Cleaning the data, extracting keywords, and generating embeddings


In [None]:
# Display the raw grants frame as the cell output before cleaning.
grants_df

Unnamed: 0,opportunity_id,opportunity_title,agency,agency_name,award_amount,post_date,close_date,summary_description,applicant_types,organization
0,PA-20-190,The mechanistic study on irisin-mediated immun...,NIH,"{'code': 'DK', 'abbreviation': 'NIDDK', 'name'...",152454.0,2025-01-01T12:01:00Z,2027-11-30T12:11:00Z,PROJECT SUMMARY\nObesity and its associated ty...,"{'name': 'Independent Hospitals', 'code': '30'...",
1,PA-20-195,Pyroptotic Macrophages Traps Against Shigella ...,NIH,"{'code': 'AI', 'abbreviation': 'NIAID', 'name'...",191250.0,2023-08-25T12:08:00Z,2025-07-31T12:07:00Z,"Shigella spp. are major enteric pathogens, cau...","{'name': 'SCHOOLS OF MEDICINE', 'code': '10', ...",
2,PAR-21-178,DDT-IST-000014: Progressing towards the Qualif...,NIH,"{'code': 'FD', 'abbreviation': 'FDA', 'name': ...",249366.0,2024-02-20T12:02:00Z,2026-01-19T12:01:00Z,The broad long-term objective of this project ...,"{'name': 'Domestic For-Profits', 'code': 'FP',...",
3,PA-21-268,Novel Treatments for Ocular Surface Diseases,NIH,"{'code': 'EY', 'abbreviation': 'NEI', 'name': ...",388089.0,2019-04-01T12:04:00Z,2025-03-31T12:03:00Z,PROJECT DESCRIPTION/ABSTRACT\nDry eye disease ...,"{'name': 'SCHOOLS OF MEDICINE', 'code': '10', ...",
4,PA-20-272,Mobile Three-Dimensional Screening for Cranial...,NIH,"{'code': 'DE', 'abbreviation': 'NIDCR', 'name'...",50000.0,2022-09-01T12:09:00Z,2026-02-28T12:02:00Z,Modified Project Summary/Abstract Section\n\nD...,"{'name': 'Domestic For-Profits', 'code': 'FP',...",
...,...,...,...,...,...,...,...,...,...,...
4995,PAR-19-212,Improving Outcomes of Adolescents in Residenti...,NIH,"{'code': 'DA', 'abbreviation': 'NIDA', 'name':...",611907.0,2021-08-01T12:08:00Z,2026-05-31T12:05:00Z,Project Description\nAdolescents in residentia...,"{'name': 'SCHOOLS OF MEDICINE', 'code': '10', ...",
4996,PA-21-071,Improving Outcomes of Adolescents in Residenti...,NIH,"{'code': 'DA', 'abbreviation': 'NIDA', 'name':...",187360.0,2021-08-01T12:08:00Z,2026-05-31T12:05:00Z,Project Description\nAdolescents in residentia...,"{'name': 'SCHOOLS OF MEDICINE', 'code': '10', ...",
4997,PA-19-056,Longitudinal Examination of Sluggish Cognitive...,NIH,"{'code': 'MH', 'abbreviation': 'NIMH', 'name':...",636720.0,2020-12-01T12:12:00Z,2025-10-31T12:10:00Z,PROJECT SUMMARY/ABSTRACT\nSluggish cognitive t...,"{'name': 'Independent Hospitals', 'code': '30'...",
4998,PAR-20-125,Elevating NW Tribal Health Sciences: NW NARCH ...,NIH,"{'code': 'GM', 'abbreviation': 'NIGMS', 'name'...",1279506.0,2021-09-10T12:09:00Z,2025-07-31T12:07:00Z,Project Summary: Overall\nThree premises under...,"{'name': 'Other Domestic Non-Profits', 'code':...",


In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import re
import nltk
import torch
from nltk.corpus import stopwords
from tqdm.notebook import tqdm

# Download required NLTK data
# NOTE(review): none of the functions defined in this cell reference
# `stopwords` (the CountVectorizer is configured with stop_words='english'
# instead) — confirm whether this download is still needed.
nltk.download('stopwords', quiet=True)

def clean_grants_df(grants_df):
    """
    Return a cleaned copy of the grants frame and print before/after stats.

    The 'organization' column is removed when present, and every row missing
    a post date, close date, award amount or summary is discarded. The input
    frame itself is left untouched.
    """
    required_columns = ['post_date', 'close_date', 'award_amount', 'summary_description']

    print("Initial data shape:", grants_df.shape)
    print("\nInitial null values:")
    print(grants_df.isna().sum())

    # errors='ignore' makes the drop a no-op when the column is absent
    without_org = grants_df.copy().drop(columns=['organization'], errors='ignore')

    # Keep only rows where all critical fields are populated
    cleaned_df = without_org.dropna(subset=required_columns)

    print("\nShape after cleaning:", cleaned_df.shape)
    print("\nRemaining null values:")
    print(cleaned_df.isna().sum())

    return cleaned_df

def preprocess_text(text):
    """
    Normalize raw text ahead of keyword extraction.

    Missing values become an empty string. Otherwise punctuation and digit
    runs are replaced by spaces, the text is lower-cased and trimmed, and
    every run of whitespace is collapsed to a single space.
    """
    if pd.isna(text):
        return ""

    # Anything that is neither a word character nor whitespace becomes a space
    without_punctuation = re.sub(r'[^\w\s]', ' ', str(text))
    without_digits = re.sub(r'\d+', ' ', without_punctuation)
    normalized = without_digits.lower().strip()

    return re.sub(r'\s+', ' ', normalized)

# Single-entry cache holding the embeddings of the most recently used
# candidate vocabulary, so they are not re-encoded for every text.
_CANDIDATE_EMBEDDING_CACHE = {}


def extract_keywords(text, model, vectorizer, n_keywords=5):
    """
    Picks the vocabulary terms whose embeddings score highest against the text.

    The candidates are the feature names of the (already fitted) vectorizer.
    Each candidate is scored by the dot product of its embedding with the
    embedding of the preprocessed text.

    Args:
        text: Raw text to extract keywords for.
        model: Object exposing `encode(list_of_str, convert_to_tensor=True)`
            that returns a 2-D tensor, one row per input string.
        vectorizer: Fitted object exposing `get_feature_names_out()`.
        n_keywords: Number of keywords to return.

    Returns:
        Up to `n_keywords` candidate strings, best match first. An empty list
        is returned for missing/empty text, for `n_keywords <= 0`, when there
        are no candidates, or when scoring fails (the error is printed).
    """
    if not text or pd.isna(text):
        return []

    text = preprocess_text(text)
    # Nothing meaningful to score; also guards the slice below, where
    # [-0:] would otherwise select every candidate.
    if not text or n_keywords <= 0:
        return []

    try:
        # Plain Python strings keep the result free of numpy scalar types.
        candidates = [str(c) for c in vectorizer.get_feature_names_out()]
        if not candidates:
            return []

        # Re-encode the vocabulary only when the model or vocabulary changed.
        cache = _CANDIDATE_EMBEDDING_CACHE
        if cache.get("model") is not model or cache.get("candidates") != candidates:
            fresh = model.encode(candidates, convert_to_tensor=True)
            cache.update(model=model, candidates=candidates, embeddings=fresh)
        candidate_embeddings = cache["embeddings"]

        # No explicit CUDA device context here: forcing device 0 breaks on
        # CPU-only hosts, and encode() is left to choose where it runs.
        text_embedding = model.encode([text], convert_to_tensor=True)

        # Cached embeddings may live on another device if the model was moved.
        if candidate_embeddings.device != text_embedding.device:
            candidate_embeddings = candidate_embeddings.to(text_embedding.device)

        # Move to CPU for numpy operations
        similarities = torch.mm(candidate_embeddings, text_embedding.T).flatten().cpu().numpy()

        # argsort is ascending: take the last n and reverse for best-first order
        top_idx = similarities.argsort()[-n_keywords:][::-1]
        return [candidates[i] for i in top_idx]
    except Exception as e:
        print(f"Error processing text: {e}")
        return []

def generate_embeddings(texts, model, batch_size=32):
    """
    Generate embeddings for a list of texts using batching

    Args:
        texts: Sequence of strings to embed.
        model: Object exposing `encode(list_of_str, convert_to_tensor=True)`
            that returns a 2-D tensor, one row per input string.
        batch_size: Number of texts passed to `model.encode` per call.

    Returns:
        A numpy array with one row per input text, in input order. For an
        empty `texts` the result is an empty 1-D array.
    """
    embeddings = []

    for start in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch_texts = texts[start:start + batch_size]
        # No explicit CUDA device context: forcing device 0 fails on CPU-only
        # hosts, and encode() is left to choose where it runs.
        batch_embeddings = model.encode(batch_texts, convert_to_tensor=True)
        # Move to CPU and convert to numpy
        embeddings.extend(batch_embeddings.cpu().numpy())

    return np.array(embeddings)

def process_grants_data(grants_df, model_name='all-MiniLM-L6-v2', batch_size=32):
    """
    Run the whole pipeline: clean the grants, extract keywords, embed both.

    Args:
        grants_df: Raw grants frame; cleaned via `clean_grants_df`.
        model_name: SentenceTransformer model to load.
        batch_size: Batch size for keyword-extraction progress and embedding.

    Returns:
        The cleaned frame with three extra columns: 'keywords',
        'summary_embedding' and 'keyword_embedding'.
    """
    # 1) Cleaning
    print("Cleaning data...")
    cleaned_df = clean_grants_df(grants_df)

    # 2) Embedding model (GPU when available) and n-gram vocabulary
    print("\nLoading BERT model...")
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    print(f"Using device: {device}")

    model = SentenceTransformer(model_name).to(device)

    print("Initializing vectorizer...")
    vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english', max_features=1000)

    # The vocabulary is learned from every summary at once
    summaries = cleaned_df['summary_description']
    all_texts = summaries.astype(str).tolist()
    vectorizer.fit(all_texts)

    # 3) Keywords, walked in batches so the progress bar advances per batch
    print("\nGenerating keywords...")
    keywords_list = []
    for start in tqdm(range(0, len(cleaned_df), batch_size), desc="Extracting keywords"):
        for summary in summaries.iloc[start:start + batch_size]:
            keywords_list.append(extract_keywords(summary, model, vectorizer))

    cleaned_df['keywords'] = keywords_list

    # 4) Embeddings for the summaries and for each row's space-joined keywords
    print("\nGenerating embeddings for summaries...")
    summary_embeddings = generate_embeddings(all_texts, model, batch_size)

    print("Generating embeddings for keywords...")
    keyword_texts = [' '.join(row_keywords) for row_keywords in keywords_list]
    keyword_embeddings = generate_embeddings(keyword_texts, model, batch_size)

    # One embedding vector per row
    cleaned_df['summary_embedding'] = list(summary_embeddings)
    cleaned_df['keyword_embedding'] = list(keyword_embeddings)

    print("\nProcessing complete!")
    print(f"Final shape: {cleaned_df.shape}")

    # Report the embedding matrix shapes
    print("\nEmbedding dimensions:")
    print(f"Summary embeddings: {summary_embeddings.shape}")
    print(f"Keyword embeddings: {keyword_embeddings.shape}")

    return cleaned_df

# Run the pipeline and persist its outputs. `processed_df` and `save_df`
# remain in the notebook namespace for later cells.
if __name__ == "__main__":
    try:
        processed_df = process_grants_data(grants_df)

        # The CSV omits the embedding columns (they're too large)
        save_df = processed_df.drop(columns=['summary_embedding', 'keyword_embedding'])
        save_df.to_csv('processed_grants.csv', index=False)

        # Each embedding column is stacked into one 2-D array and saved on its own
        np.save('summary_embeddings.npy', np.stack(processed_df['summary_embedding'].to_numpy()))
        np.save('keyword_embeddings.npy', np.stack(processed_df['keyword_embedding'].to_numpy()))

        print(
            "\nResults saved to:",
            "- processed_grants.csv (without embeddings)",
            "- summary_embeddings.npy",
            "- keyword_embeddings.npy",
            sep="\n",
        )

    except Exception as e:
        print(f"Error processing data: {e}")

Cleaning data...
Initial data shape: (5000, 10)

Initial null values:
opportunity_id            0
opportunity_title         0
agency                    0
agency_name               0
award_amount             95
post_date               159
close_date              159
summary_description       7
applicant_types           0
organization           5000
dtype: int64

Shape after cleaning: (4739, 9)

Remaining null values:
opportunity_id         0
opportunity_title      0
agency                 0
agency_name            0
award_amount           0
post_date              0
close_date             0
summary_description    0
applicant_types        0
dtype: int64

Loading BERT model...
Using device: cuda


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Initializing vectorizer...

Generating keywords...


Extracting keywords:   0%|          | 0/149 [00:00<?, ?it/s]


Generating embeddings for summaries...


Generating embeddings:   0%|          | 0/149 [00:00<?, ?it/s]

Generating embeddings for keywords...


Generating embeddings:   0%|          | 0/149 [00:00<?, ?it/s]


Processing complete!
Final shape: (4739, 12)

Embedding dimensions:
Summary embeddings: (4739, 384)
Keyword embeddings: (4739, 384)

Results saved to:
- processed_grants.csv (without embeddings)
- summary_embeddings.npy
- keyword_embeddings.npy


# Uploading grants and embeddings to Supabase

In [None]:
import psycopg2
from sqlalchemy import create_engine, text
import numpy as np
from tqdm.notebook import tqdm

def create_nih_grants_table(conn):
    """
    Ensure the vector extension, the nih_grants table and its indexes exist.

    All statements run on one cursor of `conn` and are committed together.
    If any statement fails, the error is printed and the transaction is
    rolled back; nothing is raised to the caller.
    """
    create_table_query = """
    CREATE TABLE IF NOT EXISTS nih_grants (
        id SERIAL PRIMARY KEY,
        opportunity_id TEXT,
        opportunity_title TEXT,
        agency TEXT,
        agency_name TEXT,
        award_amount NUMERIC,
        post_date DATE,
        close_date DATE,
        summary_description TEXT,
        applicant_types TEXT,
        keywords TEXT[],
        summary_embedding vector(384),
        keyword_embedding vector(384)
    );
    """

    # Order matters: the extension provides the `vector` type used by the
    # table, and the indexes need the table to exist.
    ddl_statements = (
        "CREATE EXTENSION IF NOT EXISTS vector;",
        create_table_query,
        "CREATE INDEX IF NOT EXISTS idx_nih_grants_opportunity_id ON nih_grants(opportunity_id);",
        "CREATE INDEX IF NOT EXISTS idx_nih_grants_post_date ON nih_grants(post_date);",
    )

    try:
        with conn.cursor() as cur:
            for statement in ddl_statements:
                cur.execute(statement)

            conn.commit()
            print("Table created successfully!")

    except Exception as e:
        print(f"Error creating table: {e}")
        conn.rollback()

def upload_data_to_supabase(processed_df, summary_embeddings, keyword_embeddings, conn):
    """
    Uploads the processed data to Supabase

    Inserts one row into `nih_grants` per row of `processed_df`, pairing row
    i with `summary_embeddings[i]` and `keyword_embeddings[i]`.

    Args:
        processed_df: Frame with the grant columns plus a 'keywords' column.
        summary_embeddings: Array-like of per-row vectors (each needs `.tolist()`).
        keyword_embeddings: Array-like of per-row vectors (each needs `.tolist()`).
        conn: DB-API connection exposing `cursor()`, `commit()`, `rollback()`.

    Notes:
        Row and embedding counts are validated before anything is sent, so a
        mismatch never leaves a partial upload behind. Rows are committed
        every 100 inserts, so a failure later in the upload rolls back only
        the uncommitted tail. Errors are printed, not raised.
    """
    try:
        n_rows = len(processed_df)
        if len(summary_embeddings) != n_rows or len(keyword_embeddings) != n_rows:
            raise ValueError(
                f"row/embedding count mismatch: {n_rows} rows, "
                f"{len(summary_embeddings)} summary embeddings, "
                f"{len(keyword_embeddings)} keyword embeddings"
            )

        with conn.cursor() as cur:
            print("Uploading data to Supabase...")

            # Built once: the statement text is identical for every row.
            insert_query = """
                INSERT INTO nih_grants (
                    opportunity_id, opportunity_title, agency, agency_name,
                    award_amount, post_date, close_date, summary_description,
                    applicant_types, keywords, summary_embedding, keyword_embedding
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """

            for i in tqdm(range(n_rows)):
                row = processed_df.iloc[i]

                # Convert keywords to a plain list so the driver can adapt it
                keywords = list(row['keywords'])  # Ensure it's a list

                # Convert embeddings to plain Python lists of floats
                summary_emb = summary_embeddings[i].tolist()
                keyword_emb = keyword_embeddings[i].tolist()

                cur.execute(insert_query, (
                    str(row['opportunity_id']),
                    str(row['opportunity_title']),
                    str(row['agency']),
                    str(row['agency_name']),
                    float(row['award_amount']),
                    row['post_date'],
                    row['close_date'],
                    str(row['summary_description']),
                    str(row['applicant_types']),
                    keywords,
                    summary_emb,
                    keyword_emb
                ))

                # Commit every 100 rows
                if (i + 1) % 100 == 0:
                    conn.commit()

            # Final commit
            conn.commit()
            print("Data uploaded successfully!")

    except Exception as e:
        print(f"Error uploading data: {e}")
        conn.rollback()

def main():
    """
    Connect to the database, ensure the table exists, and upload the grants.

    Reads the module-level names `DB_URL` (connection string) and
    `processed_df` (frame produced by the processing cell), plus the two
    .npy embedding files written earlier. Errors are printed, not raised.
    The connection is always closed once it has been opened.
    """
    # Connect to Supabase
    try:
        conn = psycopg2.connect(DB_URL)
    except Exception as e:
        # Also reached when DB_URL has not been defined yet (NameError).
        print(f"Error connecting to database: {e}")
        return

    try:
        print("Connected to Supabase!")

        # Create table
        create_nih_grants_table(conn)

        # Load embeddings
        summary_embeddings = np.load('summary_embeddings.npy')
        keyword_embeddings = np.load('keyword_embeddings.npy')

        # Upload data
        upload_data_to_supabase(processed_df, summary_embeddings, keyword_embeddings, conn)

        # NOTE(review): the two helpers above print their own errors instead
        # of raising, so this line does not by itself prove the upload worked.
        print("Process completed successfully!")

    except Exception as e:
        print(f"Error during upload: {e}")

    finally:
        # Close connection even when a step above failed
        conn.close()

# DB_URL must already be defined when this runs; otherwise main() only
# prints a connection error.
if __name__ == "__main__":
    main()

Error connecting to database: name 'DB_URL' is not defined


In [None]:
from google.colab import userdata  # For secrets

# Read the connection string from Colab's user secrets (key 'DATABASE_URL')
# so it is not hard-coded in the notebook. main() looks up DB_URL as a
# global, so it must be assigned before the call below.
DB_URL = userdata.get('DATABASE_URL')

# Run the upload process
main()

Connected to Supabase!
Table created successfully!
Uploading data to Supabase...


  0%|          | 0/4739 [00:00<?, ?it/s]

Data uploaded successfully!
Process completed successfully!
