<a href="https://colab.research.google.com/github/Tar-ive/grants_recsys/blob/master/recommendation_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install psycopg2-binary pandas

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m91.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.10


In [2]:
import psycopg2
import pandas as pd
from google.colab import userdata  # For secrets

# Get Supabase credentials
DB_URL = userdata.get('DATABASE_URL')  # Store in Colab secrets

# Connect
conn = psycopg2.connect(DB_URL)

In [3]:
# Load grants
grants_df = pd.read_sql("SELECT * FROM grants_data", conn)

# Load researchers
researchers_df = pd.read_sql("SELECT * FROM researchers", conn)

  grants_df = pd.read_sql("SELECT * FROM grants_data", conn)
  researchers_df = pd.read_sql("SELECT * FROM researchers", conn)


In [5]:
researchers_df.isnull().sum()

Unnamed: 0,0
id,0
researcher_id,0
researcher_name,0
total_citations,0
total_works,0
h_index,0
i10_index,0
concept_1,0
concept_1_score,0
concept_2,2


In [6]:
researchers_df.dtypes

Unnamed: 0,0
id,int64
researcher_id,object
researcher_name,object
total_citations,int64
total_works,int64
h_index,int64
i10_index,int64
concept_1,object
concept_1_score,float64
concept_2,object


In [7]:
def clean_researchers(researchers_df):
    # Handle concept missingness (only 2 missing values)
    researchers_df['concept_2'] = researchers_df['concept_2'].fillna(researchers_df['concept_2'].mode()[0])
    researchers_df['concept_2_score'] = researchers_df['concept_2_score'].fillna(researchers_df['concept_2_score'].mean())

    # Handle top works (preserve structure while marking missingness)
    for i in range(2,6):
        prefix = f'top_work_{i}_'
        researchers_df[f'{prefix}id'] = researchers_df[f'{prefix}id'].fillna('')
        researchers_df[f'{prefix}type'] = researchers_df[f'{prefix}type'].fillna('unknown')
        researchers_df[f'{prefix}is_oa'] = researchers_df[f'{prefix}is_oa'].fillna(False)
        researchers_df[f'{prefix}keywords'] = researchers_df[f'{prefix}keywords'].fillna('')
        researchers_df[f'{prefix}source'] = researchers_df[f'{prefix}source'].fillna('unknown')

    # Handle collaborators - encode missingness explicitly
    for i in range(1,6):
        col = f'top_collaborator_{i}'
        researchers_df[col] = researchers_df[col].fillna('No Collaborator')

    return researchers_df

In [8]:
cleaned_researchers = clean_researchers(researchers_df)


  researchers_df[f'{prefix}is_oa'] = researchers_df[f'{prefix}is_oa'].fillna(False)


In [9]:
cleaned_researchers.isnull().sum()

Unnamed: 0,0
id,0
researcher_id,0
researcher_name,0
total_citations,0
total_works,0
h_index,0
i10_index,0
concept_1,0
concept_1_score,0
concept_2,0


In [10]:
def normalize_data(researchers_df):
    # Features needing normalization
    numeric_cols = [
        'total_citations', 'total_works', 'h_index',
        'i10_index', 'recent_works_count', 'recent_citations',
        'unique_venues', 'avg_coauthors', 'open_access_ratio'
    ]

    from sklearn.preprocessing import RobustScaler
    scaler = RobustScaler()
    researchers_df[numeric_cols] = scaler.fit_transform(researchers_df[numeric_cols])

    return researchers_df, scaler

In [11]:
def parameterize_data(researchers_df):
    # 1. Create composite features
    researchers_df['productivity_score'] = researchers_df['recent_works_count'] * researchers_df['avg_coauthors']

    # 2. Temporal decay for citations
    researchers_df['decayed_citations'] = researchers_df['total_citations'] * (0.95 ** researchers_df['years_active'])

    # 3. Combine text features for embeddings
    researchers_df['research_text'] = (
        researchers_df['concept_1'] + " " +
        researchers_df['concept_2'] + " " +
        researchers_df[[f'top_work_{i}_keywords' for i in range(1,6)]]
        .fillna('').agg(' '.join, axis=1)
    )

    return researchers_df

In [12]:
cleaned_researchers = clean_researchers(researchers_df)
normalized_researchers, scaler = normalize_data(cleaned_researchers)
parameterized_researchers = parameterize_data(normalized_researchers)

In [13]:
def create_researcher_text(row):
    # Core expertise
    concepts = f"{row['concept_1']} (score: {row['concept_1_score']}) {row['concept_2']} (score: {row['concept_2_score']})"

    # Top works keywords
    works_keywords = " ".join(
        [str(row[f"top_work_{i}_keywords"]) for i in range(1,6)
         if pd.notna(row[f"top_work_{i}_keywords"])] # Move the conditional inside the list comprehension
    )

    # Collaborators
    collaborators = " ".join(
        [str(row[f"top_collaborator_{i}"]) for i in range(1,6)
         if pd.notna(row[f"top_collaborator_{i}"])] # Move the conditional inside the list comprehension
    )

    # Venue diversity
    venues = f"Published in {row['unique_venues']} unique venues"

    return f"{concepts}. Works: {works_keywords}. Collaborators: {collaborators}. {venues}"

researchers_df["research_text"] = researchers_df.apply(create_researcher_text, axis=1)

# grants data


In [2]:
import psycopg2
import pandas as pd
from google.colab import userdata  # For secrets

# Get Supabase credentials
DB_URL = userdata.get('DATABASE_URL')  # Store in Colab secrets

# Connect
conn = psycopg2.connect(DB_URL)
grants_df = pd.read_sql("SELECT * FROM grants_data", conn)


  grants_df = pd.read_sql("SELECT * FROM grants_data", conn)


In [3]:
def clean_grants(grants_df):
    # Drop less important columns with high null counts
    grants_df = grants_df.drop(columns=['category_explanation'])

    # Handle critical missing dates - drop rows with missing close_date
    grants_df = grants_df.dropna(subset=['close_date'])

    # Fill categorical missing values
    grants_df['category'] = grants_df['category'].fillna('Other')

    # Handle eligibility_description (critical for embeddings)
    grants_df = grants_df.dropna(subset=['eligibility_description'])

    # Convert dates to datetime
    grants_df['post_date'] = pd.to_datetime(grants_df['post_date'])
    grants_df['close_date'] = pd.to_datetime(grants_df['close_date'])

    return grants_df

cleaned_grants = clean_grants(grants_df.copy())
print(f"Remaining grants: {len(cleaned_grants)}/{len(grants_df)}")

Remaining grants: 3138/5000


In [4]:
grants_df = cleaned_grants

In [5]:
grants_df["grant_text"] = grants_df["opportunity_title"] + " " + grants_df["summary_description"] + " " + grants_df["eligibility_description"]

In [6]:
grants_df.isnull().sum()

Unnamed: 0,0
opportunity_id,0
opportunity_number,0
opportunity_title,0
opportunity_status,0
agency,0
agency_code,0
agency_name,0
category,0
award_ceiling,0
award_floor,0


In [7]:
grants_df.isnull().sum()

Unnamed: 0,0
opportunity_id,0
opportunity_number,0
opportunity_title,0
opportunity_status,0
agency,0
agency_code,0
agency_name,0
category,0
award_ceiling,0
award_floor,0


In [8]:
!pip install sentence-transformers[gpu] psycopg2-binary supabase

import torch
import psycopg2
from sqlalchemy import create_engine, text
from sentence_transformers import SentenceTransformer
import numpy as np

# Initialize GPU model
model = SentenceTransformer('all-mpnet-base-v2', device='cuda')

Collecting supabase
  Downloading supabase-2.12.0-py3-none-any.whl.metadata (10 kB)
Collecting gotrue<3.0.0,>=2.11.0 (from supabase)
  Downloading gotrue-2.11.2-py3-none-any.whl.metadata (6.0 kB)
Collecting postgrest<0.20,>=0.19 (from supabase)
  Downloading postgrest-0.19.3-py3-none-any.whl.metadata (3.5 kB)
Collecting realtime<3.0.0,>=2.0.0 (from supabase)
  Downloading realtime-2.2.0-py3-none-any.whl.metadata (6.7 kB)
Collecting storage3<0.12,>=0.10 (from supabase)
  Downloading storage3-0.11.1-py3-none-any.whl.metadata (1.8 kB)
Collecting supafunc<0.10,>=0.9 (from supabase)
  Downloading supafunc-0.9.2-py3-none-any.whl.metadata (1.2 kB)
Collecting deprecation<3.0.0,>=2.1.0 (from postgrest<0.20,>=0.19->supabase)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting websockets<14,>=11 (from realtime<3.0.0,>=2.0.0->supabase)
  Downloading websockets-13.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metada

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# researchers embeddings


In [21]:
DB_URL = userdata.get('DATABASE_URL')
engine = create_engine(DB_URL)

# --- Generate Researcher Embeddings ---
def create_researcher_text(row):
    concepts = f"{row['concept_1']} ({row['concept_1_score']}) {row['concept_2']} ({row['concept_2_score']})"
    works = ' '.join([str(row[f'top_work_{i}_keywords']) for i in range(1,6)])
    collabs = ' '.join([str(row[f'top_collaborator_{i}']) for i in range(1,6)])
    return f"{concepts}. Works: {works}. Collaborators: {collabs}."

with engine.connect() as conn:
    researchers = pd.read_sql("SELECT * FROM researchers", conn)
    researchers['text'] = researchers.apply(create_researcher_text, axis=1)

    # Batch processing with GPU
    researcher_embeddings = model.encode(
        researchers['text'].tolist(),
        batch_size=128,
        device='cuda',
        convert_to_numpy=True
    )

In [22]:
researchers_df

Unnamed: 0,id,researcher_id,researcher_name,total_citations,total_works,h_index,i10_index,concept_1,concept_1_score,concept_2,...,top_work_5_keywords,top_work_5_source,top_collaborator_1,top_collaborator_2,top_collaborator_3,top_collaborator_4,top_collaborator_5,productivity_score,decayed_citations,research_text
0,1,https://openalex.org/A5046299069,Larry R. Price,64.401231,6.00000,6.714286,11.555556,Computer science,73.1,Mathematics,...,"Psychology,Exposure therapy,Virtual reality,Ps...",Journal of Consulting and Clinical Psychology,https://openalex.org/A5019375929,https://openalex.org/A5009004096,https://openalex.org/A5101870896,https://openalex.org/A5085670077,https://openalex.org/A5107483970,2.214036,2.966952,Computer science (score: 73.1) Mathematics (sc...
1,2,https://openalex.org/A5039371296,Michael A. Huston,52.467692,2.40625,5.428571,6.333333,Biology,93.5,Ecology,...,"Microsite,Facilitation,Competition (biology),C...",Ecology,https://openalex.org/A5024934767,https://openalex.org/A5110483019,https://openalex.org/A5038398105,https://openalex.org/A5008653559,https://openalex.org/A5103521151,0.000000,6.085320,Biology (score: 93.5) Ecology (score: 86.0). W...
2,3,https://openalex.org/A5070163403,Marcus Felson,52.002462,4.53125,5.000000,7.333333,Sociology,72.7,Psychology,...,"Consumption (sociology),Sociology,Computer sci...",American Behavioral Scientist,https://openalex.org/A5046958460,https://openalex.org/A5102936013,https://openalex.org/A5037722191,https://openalex.org/A5109563801,https://openalex.org/A5072647442,-1.236093,3.096151,Sociology (score: 72.7) Psychology (score: 65....
3,4,https://openalex.org/A5017593645,Togay Ozbakkaloglu,46.363077,10.31250,10.142857,22.888889,Materials science,95.7,Composite material,...,"Materials science,Shrinkage,Flexural strength,...",Journal of Materials Science,https://openalex.org/A5082132566,https://openalex.org/A5043329482,https://openalex.org/A5040407357,https://openalex.org/A5011064916,https://openalex.org/A5019907987,18.327822,15.789485,Materials science (score: 95.7) Composite mate...
4,5,https://openalex.org/A5048215687,Eric Kirby,30.225231,8.21875,6.285714,9.333333,Geology,92.8,Biology,...,"Thermochronology,Geology,Denudation,Fission tr...",Tectonics,https://openalex.org/A5038349665,https://openalex.org/A5090086069,https://openalex.org/A5100685756,https://openalex.org/A5038670501,https://openalex.org/A5009378840,5.087122,5.284114,Geology (score: 92.8) Biology (score: 87.5). W...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,https://openalex.org/A5086995405,Theron Stimmel,-0.307692,-0.31250,-0.285714,-0.222222,Psychology,66.7,Computer science,...,Computer science,,https://openalex.org/A5084095035,https://openalex.org/A5044359247,https://openalex.org/A5044085867,https://openalex.org/A5111670397,https://openalex.org/A5006419471,0.060203,-0.081083,Psychology (score: 66.7) Computer science (sco...
1996,1997,https://openalex.org/A5036852314,Charles F. Bridges,-0.307692,-0.34375,-0.428571,-0.222222,Psychology,100.0,Medicine,...,"Psychopathology,Mood,Psychology,Cognition,Perc...",,https://openalex.org/A5015451611,https://openalex.org/A5057970321,No Collaborator,No Collaborator,No Collaborator,0.077483,-0.066043,Psychology (score: 100.0) Medicine (score: 100...
1997,1998,https://openalex.org/A5048816917,Brooke Pfeiffer,-0.307692,-0.37500,-0.428571,-0.333333,Psychology,75.0,Political science,...,,unknown,https://openalex.org/A5112296295,https://openalex.org/A5039882388,https://openalex.org/A5022399715,https://openalex.org/A5025081768,https://openalex.org/A5047735008,-0.054992,-0.250617,Psychology (score: 75.0) Political science (sc...
1998,1999,https://openalex.org/A5091269429,Paula Jones,-0.307692,-0.37500,-0.571429,-0.333333,Economic geography,100.0,Economics,...,,unknown,https://openalex.org/A5110025403,https://openalex.org/A5026433800,https://openalex.org/A5003097760,https://openalex.org/A5006771536,https://openalex.org/A5091856906,0.074603,-0.053792,Economic geography (score: 100.0) Economics (s...


In [23]:
!pip install huggingface_hub



In [24]:
import torch
from sentence_transformers import SentenceTransformer
from huggingface_hub import HfApi, create_repo, notebook_login
import pandas as pd
from sqlalchemy import create_engine, text
import numpy as np
from tqdm import tqdm

# Enable GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [10]:
grants_df

Unnamed: 0,opportunity_id,opportunity_number,opportunity_title,opportunity_status,agency,agency_code,agency_name,category,award_ceiling,award_floor,estimated_total_funding,expected_awards,post_date,close_date,funding_categories,applicant_types,funding_instruments,summary_description,eligibility_description,grant_text
0,15936,07HQPA0028,"Cooperative Ecosystem Studies Unit, Rocky Moun...",archived,DOI-USGS1,DOI-USGS1,Geological Survey,discretionary,24957.0,24957.0,24957.0,1.0,2007-07-09,2007-07-19,science_technology_and_other_research_and_deve...,other,cooperative_agreement,The U.S. Geological Surveys is offering a coo...,This financial assistance opportunity is being...,"Cooperative Ecosystem Studies Unit, Rocky Moun..."
1,15937,ED-GRANTS-070907-001,Youth violence and related issues in persisten...,archived,ED,ED,Department of Education,discretionary,0.0,0.0,8594000.0,13.0,2007-07-09,2007-08-08,education,other,grant,Note: Each funding opportunity description i...,Eligible Applicants: LEAs in which at least on...,Youth violence and related issues in persisten...
2,15939,ED-GRANTS-070907-002,State data collection; technical assistance CF...,archived,ED,ED,Department of Education,discretionary,400000.0,0.0,13500000.0,0.0,2007-07-09,2007-08-23,education,other,grant,Note: Each funding opportunity description i...,Note: Eligible entities must submit separate a...,State data collection; technical assistance CF...
3,15941,NNH07ZEA001N-EDL1,"ROA 2007: A.6 Entry, Descent, and Landing 1",archived,NASA-HQ,NASA-HQ,NASA Headquarters,discretionary,0.0,0.0,0.0,0.0,2007-07-09,2007-08-20,science_technology_and_other_research_and_deve...,unrestricted,"cooperative_agreement,grant",The National Aeronautics and Space Administrat...,Unrestricted,"ROA 2007: A.6 Entry, Descent, and Landing 1 Th..."
4,17675,CNCS-GRANTS-071307-001,Volunteer Management,archived,CNCS,CNCS,Corporation for National and Community Service,discretionary,200000.0,50000.0,800000.0,6.0,2007-07-13,2007-09-06,"disaster_prevention_and_relief,regional_develo...","county_governments,nonprofits_non_higher_educa...",grant,The Corporation for National and Community Ser...,The Corporation wants to ensure that all eligi...,Volunteer Management The Corporation for Natio...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4982,48923,DHS-09-ST-062-001,DHS Scientific Leadership Bridge Awards for Mi...,archived,DHS-OPO,DHS-OPO,Office of Procurement Operations - Grants Divi...,discretionary,250000.0,0.0,1000000.0,10.0,2009-08-05,2009-11-02,education,other,grant,The Department of Homeland Security (DHS) Scie...,Minority Serving Community Colleges - Minority...,DHS Scientific Leadership Bridge Awards for Mi...
4986,48928,A6310090048,Isle Royale Institue Support,archived,DOI-NPS,DOI-NPS,National Park Service,discretionary,0.0,0.0,125000.0,1.0,2009-08-05,2009-08-07,other,other,cooperative_agreement,Strategic partnership between Isle Royale Nati...,This is a single source award to Michigan Tech...,Isle Royale Institue Support Strategic partner...
4988,48935,PAR-09-245,Initiative to Maximize Research Education in G...,archived,HHS-NIH11,HHS-NIH11,National Institutes of Health,discretionary,50000.0,0.0,0.0,0.0,2009-08-06,2012-09-25,health,"other,federally_recognized_native_american_tri...",grant,Purpose. This funding opportunity supports two...,Other Eligible Applicants include the followin...,Initiative to Maximize Research Education in G...
4989,48936,2009-CIP-20,National Academy of Sciences Printed Electroni...,archived,DOC-NIST,DOC-NIST,National Institute of Standards and Technology,earmark,0.0,0.0,475000.0,1.0,2009-08-06,2009-09-06,science_technology_and_other_research_and_deve...,other,grant,This funding opportunity is not open to compet...,National Academy of Sciences Printed Electroni...,National Academy of Sciences Printed Electroni...


In [33]:
model = SentenceTransformer('all-mpnet-base-v2').to(device)

# Generate grant embeddings
def generate_grant_embeddings(texts, batch_size=256):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Processing grants"):
        batch = texts[i:i+batch_size]
        with torch.no_grad(), torch.amp.autocast(device_type='cuda', dtype=torch.float16):

            emb = model.encode(batch, convert_to_tensor=True, device=device)
            embeddings.append(emb.cpu().numpy())
    return np.concatenate(embeddings)

grant_texts = grants_df['grant_text'].tolist()
grant_embeddings = generate_grant_embeddings(grant_texts)

# Add to DataFrame
grants_df['embedding'] = [emb.tolist() for emb in grant_embeddings]

Processing grants: 100%|██████████| 13/13 [00:04<00:00,  2.98it/s]


In [29]:
# Run this in Colab to see grants table columns
with engine.connect() as conn:
    result = conn.execute(text("""
        SELECT column_name
        FROM information_schema.columns
        WHERE table_name = 'grants'
    """))
    print("Grants table columns:", [row[0] for row in result])

Grants table columns: ['portal_id', 'grant_id', 'status', 'last_updated', 'change_notes', 'opportunity_title', 'opportunity_number', 'agency', 'award_amount']


In [30]:
# If column is "grant_id":
update_db_embeddings('grants', grants_df, 'grant_id')


Updating grants:   0%|          | 0/3138 [00:00<?, ?it/s]


KeyError: 'grant_id'

In [34]:
# First verify grants_df structure
print("Grants DF columns:", grants_df.columns.tolist())

# If missing grant_id, rename from opportunity_id
if 'opportunity_id' in grants_df.columns and 'grant_id' not in grants_df.columns:
    grants_df = grants_df.rename(columns={'opportunity_id': 'grant_id'})

# Generate embeddings (same as before)
grant_texts = grants_df['opportunity_title'] + " " + grants_df['change_notes']  # Adjust text generation as needed
grant_embeddings = generate_grant_embeddings(grant_texts.tolist())

# Update database
def update_grants():
    with engine.connect() as conn:
        # First add embedding column if missing
        conn.execute(text("""
            ALTER TABLE cleaned_data
            ADD COLUMN IF NOT EXISTS embedding vector(768)
        """))

        # Batch update
        for idx, row in tqdm(grants_df.iterrows(), total=len(grants_df)):
            conn.execute(
                text("""
                    UPDATE cleaned_data
                    SET embedding = :emb
                    WHERE grant_id = :id
                """),
                {"emb": grant_embeddings[idx].tolist(), "id": row['grant_id']}
            )
        conn.commit()

update_grants()

Grants DF columns: ['opportunity_id', 'opportunity_number', 'opportunity_title', 'opportunity_status', 'agency', 'agency_code', 'agency_name', 'category', 'award_ceiling', 'award_floor', 'estimated_total_funding', 'expected_awards', 'post_date', 'close_date', 'funding_categories', 'applicant_types', 'funding_instruments', 'summary_description', 'eligibility_description', 'grant_text', 'embedding']


KeyError: 'change_notes'

In [43]:
grants_df

Unnamed: 0,portal_id,grant_id,status,last_updated,change_notes,opportunity_title,opportunity_number,agency,award_amount,full_text,embedding
0,PID-001,G-001,active,2025-01-22 22:30:53.364705,Initial creation,Grant 1,OP-1,EPA,50000,Grant 1 Initial creation 50000,"[0.0012575762812048197, 0.015082490630447865, ..."
1,PID-002,G-002,closed,2025-01-22 22:30:53.364750,Updated,Grant 2,OP-2,NASA,75000,Grant 2 Updated 75000,"[-0.01558194775134325, 0.0303270872682333, -0...."


In [35]:
import os
from huggingface_hub import notebook_login
from google.colab import userdata


# Method 1: Environment variable
HF_TOKEN= userdata.get('HF_TOKEN')


# Method 2: Notebook login (recommended)
notebook_login()  # Will prompt for token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [55]:
grants_df.columns

Index(['opportunity_id', 'opportunity_number', 'opportunity_title',
       'opportunity_status', 'agency', 'agency_code', 'agency_name',
       'category', 'award_ceiling', 'award_floor', 'estimated_total_funding',
       'expected_awards', 'post_date', 'close_date', 'funding_categories',
       'applicant_types', 'funding_instruments', 'summary_description',
       'eligibility_description', 'grant_text'],
      dtype='object')

In [60]:
import pandas as pd
import torch
from sqlalchemy import create_engine, text
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from huggingface_hub import HfApi, create_repo
import os

# Initialize configuration
MODEL_REPO = "Tarive/findandfund"
DATABASE_URL = userdata.get('DATABASE_URL')
HF_TOKEN = userdata.get('HF_TOKEN')
engine = create_engine(DATABASE_URL)

# Step 1: Create Cleaned Grants Table
with engine.connect() as conn:
    conn.execute(text("DROP TABLE IF EXISTS cleaned_grants"))
    conn.execute(text("""
        CREATE TABLE cleaned_grants (
            opportunity_id TEXT PRIMARY KEY,
            opportunity_title TEXT,
            agency TEXT,
            agency_name TEXT,
            award_ceiling NUMERIC,
            award_floor NUMERIC,
            estimated_total_funding NUMERIC,
            post_date DATE,
            close_date DATE,
            funding_categories TEXT,
            applicant_types TEXT,
            funding_instruments TEXT,
            summary_description TEXT,
            eligibility_description TEXT,
            embedding vector(768)
        )
    """))
    conn.commit()

# Step 2: Generate Embeddings
model = SentenceTransformer('all-mpnet-base-v2', device='cuda')
grants_df['full_text'] = (
    grants_df['opportunity_title'].fillna('') + " | " +
    grants_df['agency_name'].fillna('') + " | " +
    grants_df['summary_description'].fillna('') + " | " +
    grants_df['eligibility_description'].fillna('')
).str.strip()

batch_size = 128
embeddings = []

for i in tqdm(range(0, len(grants_df), batch_size), desc="Generating embeddings"):
    batch = grants_df['full_text'].iloc[i:i+batch_size].tolist()
    batch_emb = model.encode(
        batch,
        batch_size=batch_size,
        device='cuda',
        show_progress_bar=False
    )
    embeddings.extend(batch_emb.tolist())

# Step 3: Insert Data into Database
with engine.connect() as conn:
    data = []
    for idx, row in tqdm(grants_df.iterrows(), total=len(grants_df), desc="Preparing data"):
        if idx >= len(embeddings):
            print(f"Skipping row {idx} - no embedding generated")
            continue

        data.append({
            "id": row['opportunity_id'],
            "title": row['opportunity_title'],
            "agency": row['agency'],
            "agency_name": row['agency_name'],
            "ceiling": row['award_ceiling'],
            "floor": row['award_floor'],
            "funding": row['estimated_total_funding'],
            "post_date": pd.to_datetime(row['post_date']),
            "close_date": pd.to_datetime(row['close_date']),
            "categories": row['funding_categories'],
            "types": row['applicant_types'],
            "instruments": row['funding_instruments'],
            "summary": row['summary_description'],
            "eligibility": row['eligibility_description'],
            "embedding": [float(x) for x in embeddings[idx]]
        })

    # Batch insert with chunks
    chunk_size = 1000
    for i in range(0, len(data), chunk_size):
        conn.execute(
            text("""
                INSERT INTO cleaned_grants VALUES (
                    :id, :title, :agency, :agency_name, :ceiling, :floor, :funding,
                    :post_date, :close_date, :categories, :types, :instruments,
                    :summary, :eligibility, :embedding
                )
            """),
            data[i:i+chunk_size]
        )
        conn.commit()

print("✅ Grants embeddings stored in database")

# Step 4: Save Model to Hugging Face Hub
api = HfApi(token=HF_TOKEN)
create_repo(
    repo_id=MODEL_REPO,
    repo_type="model",
    exist_ok=True
)

model.save_pretrained("grants-embedding-model")

api.upload_folder(
    repo_id=MODEL_REPO,
    folder_path="grants-embedding-model",
    commit_message="Initial commit with grants embedding model"
)

print("✅ Model uploaded to Hugging Face Hub")

Generating embeddings: 100%|██████████| 25/25 [00:16<00:00,  1.50it/s]
Preparing data:  74%|███████▍  | 2332/3138 [00:00<00:00, 6901.31it/s]

Skipping row 3148 - no embedding generated
Skipping row 3152 - no embedding generated
Skipping row 3153 - no embedding generated
Skipping row 3155 - no embedding generated
Skipping row 3157 - no embedding generated
Skipping row 3158 - no embedding generated
Skipping row 3160 - no embedding generated
Skipping row 3161 - no embedding generated
Skipping row 3164 - no embedding generated
Skipping row 3166 - no embedding generated
Skipping row 3167 - no embedding generated
Skipping row 3168 - no embedding generated
Skipping row 3170 - no embedding generated
Skipping row 3171 - no embedding generated
Skipping row 3172 - no embedding generated
Skipping row 3174 - no embedding generated
Skipping row 3175 - no embedding generated
Skipping row 3176 - no embedding generated
Skipping row 3183 - no embedding generated
Skipping row 3184 - no embedding generated
Skipping row 3186 - no embedding generated
Skipping row 3189 - no embedding generated
Skipping row 3190 - no embedding generated
Skipping ro

Preparing data: 100%|██████████| 3138/3138 [00:00<00:00, 5067.99it/s]


Skipping row 4096 - no embedding generated
Skipping row 4097 - no embedding generated
Skipping row 4098 - no embedding generated
Skipping row 4099 - no embedding generated
Skipping row 4100 - no embedding generated
Skipping row 4103 - no embedding generated
Skipping row 4104 - no embedding generated
Skipping row 4105 - no embedding generated
Skipping row 4109 - no embedding generated
Skipping row 4111 - no embedding generated
Skipping row 4112 - no embedding generated
Skipping row 4113 - no embedding generated
Skipping row 4114 - no embedding generated
Skipping row 4115 - no embedding generated
Skipping row 4116 - no embedding generated
Skipping row 4117 - no embedding generated
Skipping row 4119 - no embedding generated
Skipping row 4120 - no embedding generated
Skipping row 4122 - no embedding generated
Skipping row 4124 - no embedding generated
Skipping row 4125 - no embedding generated
Skipping row 4131 - no embedding generated
Skipping row 4132 - no embedding generated
Skipping r




✅ Grants embeddings stored in database


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

✅ Model uploaded to Hugging Face Hub


# generating embeddings safely for grants


In [10]:
# Generate full_text
grants_df['full_text'] = (
    grants_df['opportunity_title'].fillna('') + " | " +
    grants_df['agency_name'].fillna('') + " | " +
    grants_df['summary_description'].fillna('') + " | " +
    grants_df['eligibility_description'].fillna('')
).str.strip()

# Filter empty texts BEFORE generating embeddings
valid_mask = grants_df['full_text'].str.len() > 10  # At least 10 characters
valid_grants = grants_df[valid_mask].copy()
invalid_grants = grants_df[~valid_mask].copy()

print(f"Valid grants: {len(valid_grants)}, Invalid: {len(invalid_grants)}")

Valid grants: 3138, Invalid: 0


In [11]:
import pandas as pd
import torch
from sqlalchemy import create_engine, text
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from huggingface_hub import HfApi, create_repo
import os

# Initialize configuration
MODEL_REPO = "Tarive/findandfund"
DATABASE_URL = userdata.get('DATABASE_URL')
HF_TOKEN = userdata.get('HF_TOKEN')
engine = create_engine(DATABASE_URL)

embeddings = []
batch_size = 128

for i in tqdm(range(0, len(valid_grants), batch_size), desc="Generating embeddings"):
    batch = valid_grants['full_text'].iloc[i:i+batch_size].tolist()
    try:
        with torch.no_grad(), torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            batch_emb = model.encode(
                batch,
                batch_size=batch_size,
                device='cuda',
                convert_to_tensor=True
            )
            embeddings.extend(batch_emb.cpu().numpy().tolist())
    except Exception as e:
        print(f"Failed batch {i//batch_size}: {str(e)}")
        embeddings.extend([None] * len(batch))  # Mark failed entries

Generating embeddings: 100%|██████████| 25/25 [00:21<00:00,  1.18it/s]


In [12]:
# Add embeddings only to valid grants
valid_grants = valid_grants.reset_index(drop=True)
valid_grants['embedding'] = embeddings

# Combine valid/invalid grants (invalid will have NULL embeddings)
final_df = pd.concat([valid_grants, invalid_grants], ignore_index=True)
final_df = final_df.sort_index()  # Preserve original order

In [13]:
final_df

Unnamed: 0,opportunity_id,opportunity_number,opportunity_title,opportunity_status,agency,agency_code,agency_name,category,award_ceiling,award_floor,...,post_date,close_date,funding_categories,applicant_types,funding_instruments,summary_description,eligibility_description,grant_text,full_text,embedding
0,15936,07HQPA0028,"Cooperative Ecosystem Studies Unit, Rocky Moun...",archived,DOI-USGS1,DOI-USGS1,Geological Survey,discretionary,24957.0,24957.0,...,2007-07-09,2007-07-19,science_technology_and_other_research_and_deve...,other,cooperative_agreement,The U.S. Geological Surveys is offering a coo...,This financial assistance opportunity is being...,"Cooperative Ecosystem Studies Unit, Rocky Moun...","Cooperative Ecosystem Studies Unit, Rocky Moun...","[-0.009195965714752674, 0.045355189591646194, ..."
1,15937,ED-GRANTS-070907-001,Youth violence and related issues in persisten...,archived,ED,ED,Department of Education,discretionary,0.0,0.0,...,2007-07-09,2007-08-08,education,other,grant,Note: Each funding opportunity description i...,Eligible Applicants: LEAs in which at least on...,Youth violence and related issues in persisten...,Youth violence and related issues in persisten...,"[0.029821811243891716, 0.006594268139451742, 0..."
2,15939,ED-GRANTS-070907-002,State data collection; technical assistance CF...,archived,ED,ED,Department of Education,discretionary,400000.0,0.0,...,2007-07-09,2007-08-23,education,other,grant,Note: Each funding opportunity description i...,Note: Eligible entities must submit separate a...,State data collection; technical assistance CF...,State data collection; technical assistance CF...,"[-0.019351482391357422, 0.02426753006875515, -..."
3,15941,NNH07ZEA001N-EDL1,"ROA 2007: A.6 Entry, Descent, and Landing 1",archived,NASA-HQ,NASA-HQ,NASA Headquarters,discretionary,0.0,0.0,...,2007-07-09,2007-08-20,science_technology_and_other_research_and_deve...,unrestricted,"cooperative_agreement,grant",The National Aeronautics and Space Administrat...,Unrestricted,"ROA 2007: A.6 Entry, Descent, and Landing 1 Th...","ROA 2007: A.6 Entry, Descent, and Landing 1 | ...","[0.036517173051834106, -0.020543977618217468, ..."
4,17675,CNCS-GRANTS-071307-001,Volunteer Management,archived,CNCS,CNCS,Corporation for National and Community Service,discretionary,200000.0,50000.0,...,2007-07-13,2007-09-06,"disaster_prevention_and_relief,regional_develo...","county_governments,nonprofits_non_higher_educa...",grant,The Corporation for National and Community Ser...,The Corporation wants to ensure that all eligi...,Volunteer Management The Corporation for Natio...,Volunteer Management | Corporation for Nationa...,"[0.03755396232008934, 0.035538941621780396, 0...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3133,48923,DHS-09-ST-062-001,DHS Scientific Leadership Bridge Awards for Mi...,archived,DHS-OPO,DHS-OPO,Office of Procurement Operations - Grants Divi...,discretionary,250000.0,0.0,...,2009-08-05,2009-11-02,education,other,grant,The Department of Homeland Security (DHS) Scie...,Minority Serving Community Colleges - Minority...,DHS Scientific Leadership Bridge Awards for Mi...,DHS Scientific Leadership Bridge Awards for Mi...,"[-0.012140106409788132, 0.04871319234371185, 0..."
3134,48928,A6310090048,Isle Royale Institue Support,archived,DOI-NPS,DOI-NPS,National Park Service,discretionary,0.0,0.0,...,2009-08-05,2009-08-07,other,other,cooperative_agreement,Strategic partnership between Isle Royale Nati...,This is a single source award to Michigan Tech...,Isle Royale Institue Support Strategic partner...,Isle Royale Institue Support | National Park S...,"[0.007246552035212517, 0.0414390005171299, -0...."
3135,48935,PAR-09-245,Initiative to Maximize Research Education in G...,archived,HHS-NIH11,HHS-NIH11,National Institutes of Health,discretionary,50000.0,0.0,...,2009-08-06,2012-09-25,health,"other,federally_recognized_native_american_tri...",grant,Purpose. This funding opportunity supports two...,Other Eligible Applicants include the followin...,Initiative to Maximize Research Education in G...,Initiative to Maximize Research Education in G...,"[-0.008407034911215305, 0.04766222462058067, -..."
3136,48936,2009-CIP-20,National Academy of Sciences Printed Electroni...,archived,DOC-NIST,DOC-NIST,National Institute of Standards and Technology,earmark,0.0,0.0,...,2009-08-06,2009-09-06,science_technology_and_other_research_and_deve...,other,grant,This funding opportunity is not open to compet...,National Academy of Sciences Printed Electroni...,National Academy of Sciences Printed Electroni...,National Academy of Sciences Printed Electroni...,"[-0.004488837439566851, -0.109736368060112, -0..."


In [14]:
# Check for missing required fields
required_fields = [
    'opportunity_id', 'opportunity_title', 'agency', 'agency_name',
    'award_ceiling', 'award_floor', 'estimated_total_funding',
    'post_date', 'close_date', 'funding_categories', 'applicant_types',
    'funding_instruments', 'summary_description', 'eligibility_description'
]

# Filter rows with missing required fields
valid_rows = []
for idx, row in valid_grants.iterrows():
    if all(row[field] is not None for field in required_fields):
        valid_rows.append({
            "id": row['opportunity_id'],
            "title": row['opportunity_title'],
            "agency": row['agency'],
            "agency_name": row['agency_name'],
            "ceiling": row['award_ceiling'],
            "floor": row['award_floor'],
            "funding": row['estimated_total_funding'],
            "post_date": row['post_date'],
            "close_date": row['close_date'],
            "categories": row['funding_categories'],
            "types": row['applicant_types'],
            "instruments": row['funding_instruments'],
            "summary": row['summary_description'],
            "eligibility": row['eligibility_description'],
            "embedding": row['embedding']
        })
    else:
        print(f"Skipping row {idx} - missing required fields")

print(f"Valid rows: {len(valid_rows)}")

Valid rows: 3138


In [17]:
import pandas as pd
import torch
from sqlalchemy import create_engine, text
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import os

# Configuration
MODEL_REPO = "Tarive/findandfund"
DATABASE_URL = userdata.get('DATABASE_URL')
engine = create_engine(DATABASE_URL)

# Generate embeddings
embeddings = []
batch_size = 128

for i in tqdm(range(0, len(valid_grants), batch_size), desc="Generating embeddings"):
    batch = valid_grants['full_text'].iloc[i:i+batch_size].tolist()
    try:
        with torch.no_grad(), torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            batch_emb = model.encode(
                batch,
                batch_size=batch_size,
                device='cuda',
                convert_to_tensor=True
            )
            embeddings.extend(batch_emb.cpu().numpy().tolist())
    except Exception as e:
        print(f"Failed batch {i//batch_size}: {str(e)}")
        embeddings.extend([None] * len(batch))

# Insert data in smaller chunks
chunk_size = 100  # Reduced from 1000
with engine.connect() as conn:
    for i in tqdm(range(0, len(valid_grants), chunk_size), desc="Inserting data"):
        chunk_data = []
        for j in range(i, min(i + chunk_size, len(valid_grants))):
            if j >= len(embeddings) or embeddings[j] is None:
                continue

            chunk_data.append({
                "id": valid_grants.iloc[j]['opportunity_id'],
                "title": valid_grants.iloc[j]['opportunity_title'],
                "agency": valid_grants.iloc[j]['agency'],
                "agency_name": valid_grants.iloc[j]['agency_name'],
                "ceiling": valid_grants.iloc[j]['award_ceiling'],
                "floor": valid_grants.iloc[j]['award_floor'],
                "funding": valid_grants.iloc[j]['estimated_total_funding'],
                "post_date": pd.to_datetime(valid_grants.iloc[j]['post_date']),
                "close_date": pd.to_datetime(valid_grants.iloc[j]['close_date']),
                "categories": valid_grants.iloc[j]['funding_categories'],
                "types": valid_grants.iloc[j]['applicant_types'],
                "instruments": valid_grants.iloc[j]['funding_instruments'],
                "summary": valid_grants.iloc[j]['summary_description'],
                "eligibility": valid_grants.iloc[j]['eligibility_description'],
                "embedding": [float(x) for x in embeddings[j]]
            })

        if chunk_data:
            try:
                conn.execute(
                    text("""
                        INSERT INTO cleaned_grants AS t (
                            opportunity_id, opportunity_title, agency, agency_name,
                            award_ceiling, award_floor, estimated_total_funding,
                            post_date, close_date, funding_categories, applicant_types,
                            funding_instruments, summary_description, eligibility_description, embedding
                        ) VALUES (
                            :id, :title, :agency, :agency_name, :ceiling, :floor, :funding,
                            :post_date, :close_date, :categories, :types, :instruments,
                            :summary, :eligibility, :embedding
                        )
                        ON CONFLICT (opportunity_id) DO UPDATE SET
                            opportunity_title = EXCLUDED.opportunity_title,
                            agency = EXCLUDED.agency,
                            agency_name = EXCLUDED.agency_name
                        WHERE t.opportunity_id = EXCLUDED.opportunity_id
                    """),
                    chunk_data
                )
                conn.commit()
            except Exception as e:
                print(f"Error in chunk {i//chunk_size}: {str(e)}")
                continue

Generating embeddings: 100%|██████████| 25/25 [00:20<00:00,  1.22it/s]
Inserting data: 100%|██████████| 32/32 [09:50<00:00, 18.46s/it]


# Testing Embeddings.

In [18]:
from sentence_transformers import SentenceTransformer, util

# Load models
models = {
    "all-mpnet-base-v2": SentenceTransformer('all-mpnet-base-v2'),
    "all-MiniLM-L6-v2": SentenceTransformer('all-MiniLM-L6-v2'),
    "multi-qa-mpnet-base-dot-v1": SentenceTransformer('multi-qa-mpnet-base-dot-v1')
}

# Test on sample grants
sample_texts = [
    "Funding for AI research in healthcare",
    "Grants for renewable energy projects",
    "Support for early-stage startups"
]

# Compare embeddings
for model_name, model in models.items():
    print(f"\nTesting {model_name}:")
    embeddings = model.encode(sample_texts)
    similarities = util.cos_sim(embeddings, embeddings)
    print("Similarity matrix:\n", similarities)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Testing all-mpnet-base-v2:
Similarity matrix:
 tensor([[1.0000, 0.3193, 0.2633],
        [0.3193, 1.0000, 0.2865],
        [0.2633, 0.2865, 1.0000]])

Testing all-MiniLM-L6-v2:
Similarity matrix:
 tensor([[1.0000, 0.2580, 0.1581],
        [0.2580, 1.0000, 0.1243],
        [0.1581, 0.1243, 1.0000]])

Testing multi-qa-mpnet-base-dot-v1:
Similarity matrix:
 tensor([[1.0000, 0.5120, 0.4102],
        [0.5120, 1.0000, 0.3912],
        [0.4102, 0.3912, 1.0000]])


# can be used by combining embeddings with metadata to get better results.

In [19]:
def hybrid_score(grant, researcher_embedding):
    semantic_sim = cosine_similarity(researcher_embedding, grant['embedding'])
    funding_score = grant['award_ceiling'] / 1_000_000  # Normalize
    deadline_score = 1 / (1 + grant['days_until_deadline'])
    return 0.7 * semantic_sim + 0.2 * funding_score + 0.1 * deadline_score