<a href="https://colab.research.google.com/github/Tar-ive/find-fund/blob/main/find%26fund_feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install hopsworks

Collecting hopsworks
  Downloading hopsworks-4.1.4-py3-none-any.whl.metadata (11 kB)
Collecting pyhumps==1.6.1 (from hopsworks)
  Downloading pyhumps-1.6.1-py3-none-any.whl.metadata (3.7 kB)
Collecting furl (from hopsworks)
  Downloading furl-2.1.3-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting boto3 (from hopsworks)
  Downloading boto3-1.35.92-py3-none-any.whl.metadata (6.7 kB)
Collecting pandas<2.2.0 (from hopsworks)
  Downloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting pyjks (from hopsworks)
  Downloading pyjks-20.0.0-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting mock (from hopsworks)
  Downloading mock-5.1.0-py3-none-any.whl.metadata (3.0 kB)
Collecting avro==1.11.3 (from hopsworks)
  Downloading avro-1.11.3.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.6/90.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting require

## Collect Texas State University researcher data using the OpenAlex API.

In [None]:
import pandas as pd
import requests
import time
from tqdm import tqdm
from datetime import datetime
from collections import Counter

In [None]:
import pandas as pd
import requests
import time
from tqdm import tqdm
from datetime import datetime
from collections import Counter

def call_openalex_api(endpoint, params=None, timeout=30):
    """Call an OpenAlex endpoint and return the decoded JSON payload.

    Args:
        endpoint: Path appended to ``https://api.openalex.org/``
            (for example ``'authors'`` or ``'works'``).
        params: Optional query-string parameters forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up, so that a
            single stalled connection cannot hang a long crawl.

    Returns:
        The parsed JSON body, or ``None`` when the request fails, the server
        answers with an HTTP error status, or the body is not valid JSON.
    """
    base_url = f"https://api.openalex.org/{endpoint}"
    headers = {'User-Agent': 'mailto:your_email@example.com'}  # Replace with your email

    try:
        response = requests.get(base_url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/timeout/HTTP-status errors;
        # ValueError covers a body that cannot be decoded as JSON.
        print(f"Error calling {endpoint} API: {str(e)}")
        return None
    finally:
        # Rate limiting: pause after every call, failed ones included, so a
        # burst of errors does not turn into a burst of requests.
        time.sleep(0.2)

def get_texas_state_id():
    """Look up the OpenAlex identifier of Texas State University.

    Returns:
        The ``id`` of the first institution returned by the name search, or
        ``None`` when the API call fails or yields no results.
    """
    payload = call_openalex_api(
        'institutions',
        {
            'filter': 'display_name.search:texas state university',
            'per-page': 1,
        },
    )

    # Guard clauses: bail out on a failed call or an empty result list.
    if not payload or 'results' not in payload:
        return None
    matches = payload['results']
    if not matches:
        return None
    return matches[0]['id']

def fetch_top_cited_researchers(institution_id, max_researchers=500):
    """Collect the most-cited authors whose last known institution matches.

    Pages through the ``authors`` endpoint with cursor pagination, sorted by
    citation count (descending), until ``max_researchers`` records have been
    gathered, the API stops returning results, or an error occurs.

    Args:
        institution_id: OpenAlex institution identifier used in the filter.
        max_researchers: Upper bound on the number of records returned.

    Returns:
        A list of at most ``max_researchers`` author records.
    """
    collected = []
    next_cursor = '*'  # '*' requests the first page

    while next_cursor and len(collected) < max_researchers:
        try:
            page = call_openalex_api('authors', {
                'filter': f'last_known_institutions.id:{institution_id}',
                'per-page': 100,  # Max per page
                'sort': 'cited_by_count:desc',
                'cursor': next_cursor,
            })

            # A failed call, a payload without 'results', or an empty page all end the crawl.
            batch = page['results'] if page and 'results' in page else None
            if not batch:
                break

            collected.extend(batch)

            # Enough records gathered; any excess is trimmed on return.
            if len(collected) >= max_researchers:
                break

            # A missing/empty cursor makes the loop condition fail: no more pages.
            next_cursor = page.get('meta', {}).get('next_cursor')

        except Exception as e:
            print(f"Error fetching researchers: {str(e)}")
            break

    return collected[:max_researchers]

def get_researcher_works(researcher_id):
    """Fetch every work of one author via cursor pagination.

    Args:
        researcher_id: OpenAlex author identifier; only the segment after the
            last ``/`` is used in the API filter.

    Returns:
        A list of work records sorted by citation count (descending) as
        returned by the API; whatever was collected so far if a page fails.
    """
    clean_id = researcher_id.split('/')[-1]
    collected = []
    next_cursor = '*'  # '*' requests the first page

    while next_cursor:
        try:
            page = call_openalex_api('works', {
                'filter': f'author.id:{clean_id}',
                'per-page': 200,
                'sort': 'cited_by_count:desc',
                'cursor': next_cursor,
            })

            # A failed call, a payload without 'results', or an empty page all end the crawl.
            batch = page['results'] if page and 'results' in page else None
            if not batch:
                break

            collected.extend(batch)

            # A missing/empty cursor makes the loop condition fail: no more pages.
            next_cursor = page.get('meta', {}).get('next_cursor')

        except Exception as e:
            print(f"Error fetching works for researcher {clean_id}: {str(e)}")
            break

    return collected

def get_top_collaborators(works, researcher_id):
    """Return the IDs of the researcher's five most frequent co-authors.

    Args:
        works: Iterable of work records; each may carry an ``authorships``
            list whose entries hold an ``author`` dict with an ``id``.
        researcher_id: ID of the researcher themself, excluded from the count.

    Returns:
        Up to five co-author IDs ordered by number of shared works (most
        frequent first; ties keep first-seen order).
    """
    collaborator_counts = Counter()

    for work in works:
        # `or []` / `or {}` tolerate records where the key is present but null,
        # which would otherwise raise and make the caller drop the researcher.
        for authorship in work.get('authorships') or []:
            coauthor_id = (authorship.get('author') or {}).get('id')
            if coauthor_id and coauthor_id != researcher_id:
                collaborator_counts[coauthor_id] += 1

    return [collab_id for collab_id, _ in collaborator_counts.most_common(5)]

def _venue_name(work):
    """Return the display name of a work's primary source, or None if absent."""
    source = (work.get('primary_location') or {}).get('source') or {}
    return source.get('display_name')


def _is_open_access(work):
    """Return the work's ``open_access.is_oa`` flag, or None when it is missing."""
    return (work.get('open_access') or {}).get('is_oa')


def _concept_features(researcher, count=2):
    """Return name/score columns for the researcher's first ``count`` concepts."""
    concepts = researcher.get('x_concepts') or []
    features = {}
    for i in range(count):
        concept = concepts[i] if i < len(concepts) else None
        features[f'concept_{i+1}'] = concept['display_name'] if concept else None
        features[f'concept_{i+1}_score'] = concept['score'] if concept else None
    return features


def _works_features(works, researcher_id, current_year):
    """Derive activity, venue, open-access, top-work and collaborator columns.

    ``works`` must be a non-empty list (the averages divide by its length) and
    is expected to be sorted by citation count, most cited first.
    """
    features = {}

    # Years active: span between first and last known publication year.
    publication_years = [w['publication_year'] for w in works if w.get('publication_year')]
    features['years_active'] = (
        max(publication_years) - min(publication_years) + 1 if publication_years else 0
    )

    # Recent = published within the last five years. `or 0` keeps a null
    # publication_year from raising TypeError in the comparison.
    recent_works = [w for w in works if (w.get('publication_year') or 0) >= current_year - 5]
    features['recent_works_count'] = len(recent_works)
    features['recent_citations'] = sum((w.get('cited_by_count') or 0) for w in recent_works)

    # Unique venues (works without a named source are ignored).
    features['unique_venues'] = len({name for name in map(_venue_name, works) if name})

    # Average co-authors: every author on a work except the researcher.
    total_coauthors = sum(len(w.get('authorships') or []) - 1 for w in works)
    features['avg_coauthors'] = total_coauthors / len(works)

    # Open access ratio
    oa_works = sum(1 for w in works if _is_open_access(w))
    features['open_access_ratio'] = oa_works / len(works)

    # Top 5 works (already sorted by cited_by_count from API); slots beyond
    # the number of available works are filled with None.
    for i in range(1, 6):
        work = works[i - 1] if i <= len(works) else None
        if work is None:
            features[f'top_work_{i}_id'] = None
            features[f'top_work_{i}_type'] = None
            features[f'top_work_{i}_is_oa'] = None
            features[f'top_work_{i}_keywords'] = None
            features[f'top_work_{i}_source'] = None
            continue
        features[f'top_work_{i}_id'] = work['id']
        features[f'top_work_{i}_type'] = work['type']
        features[f'top_work_{i}_is_oa'] = _is_open_access(work)
        features[f'top_work_{i}_keywords'] = ','.join(
            c['display_name'] for c in (work.get('concepts') or [])[:5]
        )
        features[f'top_work_{i}_source'] = _venue_name(work) or ''

    # Top collaborators, padded with None up to five slots.
    top_collaborators = get_top_collaborators(works, researcher_id)
    for i in range(1, 6):
        features[f'top_collaborator_{i}'] = (
            top_collaborators[i - 1] if i <= len(top_collaborators) else None
        )

    return features


def fetch_texas_state_researchers():
    """Fetch and process researchers affiliated with Texas State University.

    Resolves the institution ID, downloads the 500 most-cited affiliated
    authors, enriches each with features derived from their works, writes the
    table to ``top_500_researchers.csv`` and prints a short summary.

    Researchers for whom no works are returned keep only the basic and
    concept columns (the derived columns are left missing). A researcher whose
    processing raises is reported and skipped.

    Returns:
        The researchers DataFrame, or ``None`` if the institution ID or the
        researcher list could not be fetched.
    """
    # Get Texas State University ID
    texas_state_id = get_texas_state_id()
    if not texas_state_id:
        print("Could not find Texas State University ID")
        return None

    print(f"Found Texas State University ID: {texas_state_id}")

    # Fetch top 500 cited researchers affiliated with Texas State University
    print("Fetching top 500 cited researchers from Texas State University...")
    researchers = fetch_top_cited_researchers(texas_state_id, max_researchers=500)

    if not researchers:
        print("Failed to fetch researchers")
        return None

    researchers_data = []
    current_year = datetime.now().year

    print("\nProcessing researcher data...")
    for researcher in tqdm(researchers):
        try:
            summary_stats = researcher.get('summary_stats') or {}
            researcher_info = {
                'researcher_id': researcher['id'],
                'researcher_name': researcher['display_name'],
                'total_citations': researcher['cited_by_count'],
                'total_works': researcher['works_count'],
                'h_index': summary_stats.get('h_index', 0),
                'i10_index': summary_stats.get('i10_index', 0),
            }
            researcher_info.update(_concept_features(researcher))

            # Get works with cursor pagination
            works = get_researcher_works(researcher['id'])
            if works:
                researcher_info.update(
                    _works_features(works, researcher['id'], current_year)
                )

            researchers_data.append(researcher_info)

        except Exception as e:
            # Best effort per record: report the failure and keep going.
            print(f"Error processing researcher {researcher.get('id')}: {str(e)}")
            continue

    # Convert to dataframe and save
    df_researchers = pd.DataFrame(researchers_data)
    df_researchers.to_csv('top_500_researchers.csv', index=False)

    print("\nTop 500 Researchers CSV head:")
    print(df_researchers.head())

    print(f"\nNumber of researchers: {len(df_researchers)}")
    print(f"Number of columns: {len(df_researchers.columns)}")
    print("\nColumn names:")
    print(df_researchers.columns.tolist())

    return df_researchers

# Generate the researchers CSV: runs the whole collection pipeline, which
# writes top_500_researchers.csv and returns the DataFrame (or None when the
# institution ID or the researcher list cannot be fetched).
df_researchers = fetch_texas_state_researchers()

Found Texas State University ID: https://openalex.org/I13511017
Fetching top 500 cited researchers from Texas State University...

Processing researcher data...


100%|██████████| 500/500 [10:22<00:00,  1.24s/it]


Top 500 Researchers CSV head:
                      researcher_id     researcher_name  total_citations  \
0  https://openalex.org/A5046299069      Larry R. Price            26301   
1  https://openalex.org/A5039371296   Michael A. Huston            21477   
2  https://openalex.org/A5070163403       Marcus Felson            21278   
3  https://openalex.org/A5017593645  Togay Ozbakkaloglu            18853   
4  https://openalex.org/A5048215687          Eric Kirby            12428   

   total_works  h_index  i10_index          concept_1  concept_1_score  \
0          208       52        108   Computer science             73.1   
1           93       43         61            Biology             93.5   
2          161       40         70          Sociology             72.7   
3          345       75        207  Materials science             95.9   
4          278       49         88            Geology             93.2   

            concept_2  concept_2_score  ...                     top




In [None]:
# Reload the researcher table from the CSV written by fetch_texas_state_researchers()
df_researchers = pd.read_csv('top_500_researchers.csv')

In [None]:
# Display the researchers DataFrame
df_researchers

Unnamed: 0,researcher_id,researcher_name,total_citations,total_works,h_index,i10_index,concept_1,concept_1_score,concept_2,concept_2_score,...,top_work_5_id,top_work_5_type,top_work_5_is_oa,top_work_5_keywords,top_work_5_source,top_collaborator_1,top_collaborator_2,top_collaborator_3,top_collaborator_4,top_collaborator_5
0,https://openalex.org/A5046299069,Larry R. Price,26301,208,52,108,Computer science,73.1,Mathematics,70.7,...,https://openalex.org/W2025106538,article,False,"Psychology,Exposure therapy,Virtual reality,Ps...",Journal of Consulting and Clinical Psychology,https://openalex.org/A5019375929,https://openalex.org/A5009004096,https://openalex.org/A5101870896,https://openalex.org/A5085670077,https://openalex.org/A5107483970
1,https://openalex.org/A5039371296,Michael A. Huston,21477,93,43,61,Biology,93.5,Ecology,86.0,...,https://openalex.org/W2088443113,article,False,"Microsite,Facilitation,Competition (biology),C...",Ecology,https://openalex.org/A5024934767,https://openalex.org/A5110483019,https://openalex.org/A5038398105,https://openalex.org/A5008653559,https://openalex.org/A5103521151
2,https://openalex.org/A5070163403,Marcus Felson,21278,161,40,70,Sociology,72.7,Psychology,65.2,...,https://openalex.org/W2311383940,article,False,"Consumption (sociology),Sociology,Computer sci...",American Behavioral Scientist,https://openalex.org/A5046958460,https://openalex.org/A5102936013,https://openalex.org/A5037722191,https://openalex.org/A5109563801,https://openalex.org/A5072647442
3,https://openalex.org/A5017593645,Togay Ozbakkaloglu,18853,345,75,207,Materials science,95.9,Composite material,95.1,...,https://openalex.org/W2330198871,review,False,"Materials science,Shrinkage,Flexural strength,...",Journal of Materials Science,https://openalex.org/A5082132566,https://openalex.org/A5043329482,https://openalex.org/A5040407357,https://openalex.org/A5011064916,https://openalex.org/A5019907987
4,https://openalex.org/A5048215687,Eric Kirby,12428,278,49,88,Geology,93.2,Biology,87.8,...,https://openalex.org/W2145347294,article,True,"Thermochronology,Geology,Denudation,Fission tr...",Tectonics,https://openalex.org/A5038349665,https://openalex.org/A5090086069,https://openalex.org/A5100685756,https://openalex.org/A5038670501,https://openalex.org/A5009378840
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,https://openalex.org/A5023485684,Sarah A. Blue,495,40,11,12,Political science,87.5,Sociology,72.5,...,https://openalex.org/W3128923529,article,True,"Refugee,Precarity,Political science,Immigratio...",Social Sciences,https://openalex.org/A5033260789,https://openalex.org/A5061971733,https://openalex.org/A5071631028,https://openalex.org/A5077134305,https://openalex.org/A5090006328
496,https://openalex.org/A5090287107,Nestor Guillen,494,61,11,14,Mathematics,98.4,Mathematical analysis,86.9,...,https://openalex.org/W2043066177,article,True,"Mathematics,Obstacle problem,Hypersurface,Dime...",Calculus of Variations and Partial Differentia...,https://openalex.org/A5008620763,https://openalex.org/A5004919774,https://openalex.org/A5052239059,https://openalex.org/A5113182229,https://openalex.org/A5046772699
497,https://openalex.org/A5037599311,Bob Edward Vásquez,492,18,7,7,Psychology,94.4,Political science,88.9,...,https://openalex.org/W2005094432,article,False,"Juvenile delinquency,Friendship,Closeness,Psyc...",Journal of Criminal Justice,https://openalex.org/A5069510426,https://openalex.org/A5055937408,https://openalex.org/A5038854444,https://openalex.org/A5030063444,https://openalex.org/A5063852285
498,https://openalex.org/A5005882875,Darrell L. Ward,491,25,4,3,Computer science,80.0,Programming language,48.0,...,https://openalex.org/W1537711190,article,False,"Computer science,Programming language,Computer...",,https://openalex.org/A5113668156,https://openalex.org/A5078340012,https://openalex.org/A5073304952,https://openalex.org/A5043121653,https://openalex.org/A5074662740


In [None]:
# Inspect the dtype pandas inferred for each researcher column
df_researchers.dtypes

Unnamed: 0,0
researcher_id,object
researcher_name,object
total_citations,int64
total_works,int64
h_index,int64
i10_index,int64
concept_1,object
concept_1_score,float64
concept_2,object
concept_2_score,float64


In [None]:
# (rows, columns) of the researchers table
df_researchers.shape

(500, 46)

In [None]:
# Load the grants table from grants.csv
grants = pd.read_csv('grants.csv')

In [None]:
# Inspect the dtype pandas inferred for each grants column
grants.dtypes

Unnamed: 0,0
opportunity_id,int64
opportunity_number,object
opportunity_title,object
opportunity_status,object
agency,object
agency_code,object
agency_name,object
category,object
category_explanation,object
award_ceiling,float64


In [None]:
# (rows, columns) of the grants table
grants.shape

(5000, 20)

In [None]:
import pandas as pd
import numpy as np

# Read the raw researcher and grant tables from disk
researchers_df = pd.read_csv('top_500_researchers.csv')
grants_df = pd.read_csv('grants.csv')

# Researchers: per-column NaN counts, then the same counts as a share of all rows
researcher_null_counts = researchers_df.isnull().sum()
print("Missing values in researchers dataset:")
print(researcher_null_counts)
print("\nPercentage of missing values:")
print((researcher_null_counts / len(researchers_df)) * 100)

# Grants: per-column NaN counts, then the same counts as a share of all rows
grant_null_counts = grants_df.isnull().sum()
print("\nMissing values in grants dataset:")
print(grant_null_counts)
print("\nPercentage of missing values:")
print((grant_null_counts / len(grants_df)) * 100)

# Handle missing values in researchers dataset
def clean_researchers_df(df):
    """Fill missing values in the researchers table, modifying ``df`` in place.

    Missing values become ``False`` in the ``*is_oa*`` flag columns, ``0`` in
    int64/float64 columns and ``''`` in the remaining object columns.

    Args:
        df: Researchers DataFrame. It is mutated; pass a copy to keep the
            original untouched.

    Returns:
        The same DataFrame object, for call chaining.
    """
    # Boolean flags go first: a flag column that contains NaN has object dtype,
    # so the generic object fill below would otherwise turn its gaps into ''
    # and leave nothing for the False fill to act on.
    bool_cols = [col for col in df.columns if 'is_oa' in col]
    if bool_cols:
        df[bool_cols] = df[bool_cols].fillna(False)

    # Fill numeric columns with appropriate values
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    df[numeric_cols] = df[numeric_cols].fillna(0)

    # Fill object (string) columns
    object_cols = df.select_dtypes(include=['object']).columns
    df[object_cols] = df[object_cols].fillna('')

    return df

# Handle missing values in grants dataset
def clean_grants_df(df):
    """Fill missing values in the grants table, modifying ``df`` in place.

    Monetary/count columns get ``0``; categorical, date and free-text columns
    get an empty string. Returns the same DataFrame object.
    """
    # Amounts and counts: a gap is treated as zero
    zero_filled = [
        'award_ceiling',
        'award_floor',
        'estimated_total_funding',
        'expected_awards',
    ]
    for column in zero_filled:
        df[column] = df[column].fillna(0)

    # Categorical, date and description columns: a gap becomes an empty string
    blank_filled = [
        'opportunity_number', 'opportunity_title', 'opportunity_status',
        'agency', 'agency_code', 'agency_name', 'category', 'category_explanation',
        'funding_categories', 'applicant_types', 'funding_instruments',
        'post_date', 'close_date',
        'summary_description', 'eligibility_description',
    ]
    df[blank_filled] = df[blank_filled].fillna('')

    return df

# Clean copies of both tables; the cleaners modify their argument in place,
# so passing copies keeps the raw frames available for the shape report below
researchers_clean = clean_researchers_df(researchers_df.copy())
grants_clean = clean_grants_df(grants_df.copy())

# Grand total of NaNs left in each cleaned table
remaining_researcher_nulls = researchers_clean.isnull().sum().sum()
remaining_grant_nulls = grants_clean.isnull().sum().sum()
print("\nAfter cleaning - Missing values in researchers dataset:")
print(remaining_researcher_nulls)
print("\nAfter cleaning - Missing values in grants dataset:")
print(remaining_grant_nulls)

# Persist the cleaned tables
researchers_clean.to_csv('researchers_clean.csv', index=False)
grants_clean.to_csv('grants_clean.csv', index=False)

# Data quality report: shapes before and after cleaning
researchers_shape_before = researchers_df.shape
researchers_shape_after = researchers_clean.shape
grants_shape_before = grants_df.shape
grants_shape_after = grants_clean.shape
print("\nData Quality Report:")
print("\nResearchers Dataset:")
print(f"Original shape: {researchers_shape_before}")
print(f"Cleaned shape: {researchers_shape_after}")
print("\nGrants Dataset:")
print(f"Original shape: {grants_shape_before}")
print(f"Cleaned shape: {grants_shape_after}")

# Summary statistics of the numeric columns, rounded to two decimals
researchers_stats = researchers_clean.describe().round(2)
grants_stats = grants_clean.describe().round(2)
print("\nSample statistics after cleaning:")
print("\nResearchers numeric columns:")
print(researchers_stats)
print("\nGrants numeric columns:")
print(grants_stats)

Missing values in researchers dataset:
researcher_id           0
researcher_name         0
total_citations         0
total_works             0
h_index                 0
i10_index               0
concept_1               0
concept_1_score         0
concept_2               0
concept_2_score         0
years_active            0
recent_works_count      0
recent_citations        0
unique_venues           0
avg_coauthors           0
open_access_ratio       0
top_work_1_id           0
top_work_1_type         0
top_work_1_is_oa        0
top_work_1_keywords     0
top_work_1_source      24
top_work_2_id           5
top_work_2_type         5
top_work_2_is_oa        5
top_work_2_keywords     5
top_work_2_source      34
top_work_3_id           5
top_work_3_type         5
top_work_3_is_oa        5
top_work_3_keywords     5
top_work_3_source      33
top_work_4_id           6
top_work_4_type         6
top_work_4_is_oa        6
top_work_4_keywords     6
top_work_4_source      38
top_work_5_id           8

In [None]:
# Preview the first rows of the raw grants table
grants_df.head()

Unnamed: 0,opportunity_id,opportunity_number,opportunity_title,opportunity_status,agency,agency_code,agency_name,category,category_explanation,award_ceiling,award_floor,estimated_total_funding,expected_awards,post_date,close_date,funding_categories,applicant_types,funding_instruments,summary_description,eligibility_description
0,15936,07HQPA0028,"Cooperative Ecosystem Studies Unit, Rocky Moun...",archived,DOI-USGS1,DOI-USGS1,Geological Survey,discretionary,,24957.0,24957.0,24957.0,1.0,2007-07-09,2007-07-19,science_technology_and_other_research_and_deve...,other,cooperative_agreement,The U.S. Geological Surveys is offering a coo...,This financial assistance opportunity is being...
1,15937,ED-GRANTS-070907-001,Youth violence and related issues in persisten...,archived,ED,ED,Department of Education,discretionary,,0.0,0.0,8594000.0,13.0,2007-07-09,2007-08-08,education,other,grant,Note: Each funding opportunity description i...,Eligible Applicants: LEAs in which at least on...
2,15939,ED-GRANTS-070907-002,State data collection; technical assistance CF...,archived,ED,ED,Department of Education,discretionary,,400000.0,0.0,13500000.0,0.0,2007-07-09,2007-08-23,education,other,grant,Note: Each funding opportunity description i...,Note: Eligible entities must submit separate a...
3,15941,NNH07ZEA001N-EDL1,"ROA 2007: A.6 Entry, Descent, and Landing 1",archived,NASA-HQ,NASA-HQ,NASA Headquarters,discretionary,,0.0,0.0,0.0,0.0,2007-07-09,2007-08-20,science_technology_and_other_research_and_deve...,unrestricted,"cooperative_agreement,grant",The National Aeronautics and Space Administrat...,Unrestricted
4,17675,CNCS-GRANTS-071307-001,Volunteer Management,archived,CNCS,CNCS,Corporation for National and Community Service,discretionary,,200000.0,50000.0,800000.0,6.0,2007-07-13,2007-09-06,"disaster_prevention_and_relief,regional_develo...","county_governments,nonprofits_non_higher_educa...",grant,The Corporation for National and Community Ser...,The Corporation wants to ensure that all eligi...


# Removed the data points with missing eligibility descriptions, because this field is essential and needs to be present.

In [None]:
import pandas as pd
import numpy as np

# Load the cleaned grants table and discard the category_explanation column
grants_df = pd.read_csv('grants_clean.csv').drop(columns='category_explanation')

# Keep only the rows that have an eligibility description
has_eligibility = grants_df['eligibility_description'].notna()
grants_df_clean = grants_df[has_eligibility]

# Summary of the change (the "original" shape is taken after the column drop)
print("Original dataset shape:", grants_df.shape)
print("New dataset shape:", grants_df_clean.shape)
print(f"Removed {grants_df.shape[0] - grants_df_clean.shape[0]} rows with missing eligibility descriptions")

# List the remaining columns, one per line
print("\nColumns in cleaned dataset:")
for col in grants_df_clean.columns:
    print(f"- {col}")

# Per-column NaN counts that remain after filtering
remaining_nulls = grants_df_clean.isnull().sum()
print("\nMissing values in cleaned dataset:")
print(remaining_nulls)

# Descriptive statistics of the monetary columns
money_cols = ['award_ceiling', 'award_floor', 'estimated_total_funding']
money_summary = grants_df_clean[money_cols].describe()
print("\nMonetary fields summary:")
print(money_summary)

# Persist the filtered table
grants_df_clean.to_csv('grants_final.csv', index=False)

# Preview the result with no limit on displayed columns or line width
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
print("\nSample of cleaned dataset:")
print(grants_df_clean.head())

Original dataset shape: (5000, 19)
New dataset shape: (3470, 19)
Removed 1530 rows with missing eligibility descriptions

Columns in cleaned dataset:
- opportunity_id
- opportunity_number
- opportunity_title
- opportunity_status
- agency
- agency_code
- agency_name
- category
- award_ceiling
- award_floor
- estimated_total_funding
- expected_awards
- post_date
- close_date
- funding_categories
- applicant_types
- funding_instruments
- summary_description
- eligibility_description

Missing values in cleaned dataset:
opportunity_id               0
opportunity_number           0
opportunity_title            0
opportunity_status           0
agency                       0
agency_code                  0
agency_name                  0
category                     1
award_ceiling                0
award_floor                  0
estimated_total_funding      0
expected_awards              0
post_date                    0
close_date                 332
funding_categories           2
applicant_type

# Feature Engineering


In [None]:
def clean_grants_data(grants_df):
    """Return a copy of the grants table with missing values filled.

    Args:
        grants_df: Grants DataFrame; it is not modified.

    Returns:
        A new DataFrame in which missing ``close_date`` and ``category``
        values are ``'unknown'``, missing monetary/count values are ``0.0``
        and missing text fields are ``''``.
    """
    # Make a copy so the caller's frame is left untouched
    df = grants_df.copy()

    # Only fill the gaps: known close dates are kept rather than being
    # overwritten wholesale with 'unknown'.
    df['close_date'] = df['close_date'].fillna('unknown')
    df['category'] = df['category'].fillna('unknown')  # Fill missing category

    # Monetary values and award counts: a missing value is treated as zero
    numeric_cols = ['award_ceiling', 'award_floor', 'estimated_total_funding', 'expected_awards']
    df[numeric_cols] = df[numeric_cols].fillna(0.0)

    # Text fields: a missing value becomes an empty string
    text_cols = ['funding_categories', 'applicant_types', 'funding_instruments',
                 'summary_description', 'eligibility_description']
    df[text_cols] = df[text_cols].fillna('')

    return df

In [None]:
def _count_listed_values(series):
    """Count comma-separated entries per row; blank or missing rows count as 0."""
    text = series.fillna('').astype(str).str.strip()
    counts = text.str.count(',') + 1
    # Without this, an empty string would be reported as one listed value.
    return counts.where(text != '', 0).astype(int)


def _first_word_or_unknown(name):
    """Return the first whitespace-separated word of ``name``, or 'unknown'."""
    if pd.isna(name):
        return 'unknown'
    words = str(name).split()
    # A blank name has no words; indexing [0] on it would raise IndexError.
    return words[0] if words else 'unknown'


def engineer_grant_features(grants_df):
    """Return a copy of the grants table with derived feature columns added.

    Added columns:
        has_funding_limit: 1 when ``award_ceiling`` is positive, else 0.
        funding_range: ``award_ceiling`` minus ``award_floor``.
        description_length / eligibility_length: character counts of the
            summary and eligibility texts.
        funding_types_count / applicant_types_count: number of
            comma-separated entries (0 for blank or missing values).
        agency_group: first word of ``agency_name`` ('unknown' when the name
            is blank or missing).

    Args:
        grants_df: Grants DataFrame; it is not modified.
    """
    df = grants_df.copy()

    # Create amount-related features
    df['has_funding_limit'] = (df['award_ceiling'] > 0).astype(int)
    df['funding_range'] = df['award_ceiling'] - df['award_floor']

    # Text-based features
    df['description_length'] = df['summary_description'].str.len()
    df['eligibility_length'] = df['eligibility_description'].str.len()

    # Categorical encoding
    df['funding_types_count'] = _count_listed_values(df['funding_instruments'])
    df['applicant_types_count'] = _count_listed_values(df['applicant_types'])

    # Agency grouping
    df['agency_group'] = df['agency_name'].map(_first_word_or_unknown)

    return df

In [None]:
def engineer_researcher_features(researchers_df):
    """Return a copy of the researchers table with derived metric columns.

    Adds impact ratios (citations per work, overall and recent), a
    collaboration score, venues per work, career duration and a yearly
    productivity rate. Denominators are clipped to a minimum of 1 so that
    zero counts do not cause division by zero. The input is not modified.
    """
    # Denominators, floored at 1
    works_floor = researchers_df['total_works'].clip(lower=1)
    recent_works_floor = researchers_df['recent_works_count'].clip(lower=1)
    years_floor = researchers_df['years_active'].clip(lower=1)

    return researchers_df.assign(
        # Research impact features
        impact_ratio=researchers_df['total_citations'] / works_floor,
        recent_impact_ratio=researchers_df['recent_citations'] / recent_works_floor,
        # Collaboration metrics
        collaboration_score=researchers_df['avg_coauthors'] * researchers_df['unique_venues'],
        # Research diversity
        venue_per_work=researchers_df['unique_venues'] / works_floor,
        # Career stage indicators
        career_duration=researchers_df['years_active'],
        productivity_rate=researchers_df['total_works'] / years_floor,
    )

In [None]:
def create_matching_features(grants_df, researchers_df):
    """Compute grant-to-researcher text similarity scores.

    Encodes every grant's ``summary_description`` and every researcher's
    ``concept_1`` with the same sentence-embedding model and compares them.

    Args:
        grants_df: Grants DataFrame with a ``summary_description`` column.
        researchers_df: Researchers DataFrame with a ``concept_1`` column.

    Returns:
        The result of ``cosine_similarity(grant_embeddings,
        researcher_embeddings)``; presumably a (grants x researchers) matrix
        in row order of the two frames — TODO confirm against the library in use.

    NOTE(review): ``SentenceTransformer``, ``cosine_similarity`` and
    ``settings.FEATURES_EMBEDDING_MODEL_ID`` are not imported/defined in the
    visible part of this notebook (the ``Settings`` class shown here has no
    such attribute); they must be in scope before this function is called.
    """
    # Convert summary descriptions to embeddings using the same model from the notebook
    model = SentenceTransformer(settings.FEATURES_EMBEDDING_MODEL_ID)

    # Missing texts become '' so the encoder only ever receives strings;
    # filling before astype(str) avoids encoding the literal text "nan".
    grant_texts = grants_df['summary_description'].fillna('').astype(str).tolist()
    researcher_texts = researchers_df['concept_1'].fillna('').astype(str).tolist()

    grant_embeddings = model.encode(grant_texts)
    researcher_embeddings = model.encode(researcher_texts)

    # Create similarity features
    similarities = cosine_similarity(grant_embeddings, researcher_embeddings)

    return similarities

In [None]:
def prepare_training_data(grants_df, researchers_df, similarities):
    """Build one feature row for every (grant, researcher) pair.

    Args:
        grants_df: Grants DataFrame with ``opportunity_id`` and ``award_floor``.
        researchers_df: Researchers DataFrame with ``researcher_id``,
            ``total_citations`` and ``years_active``.
        similarities: 2-D array-like indexed as ``similarities[g][r]``, where
            ``g`` and ``r`` are the row POSITIONS of the grant and researcher
            in their frames.

    Returns:
        A DataFrame with columns ``grant_id``, ``researcher_id``,
        ``similarity_score``, ``amount_match`` and ``career_stage_match``,
        ordered grant-major.

    NOTE(review): ``amount_match`` compares a citation count with a monetary
    award floor; this looks like a placeholder rule — confirm the intent.
    """
    columns = ['grant_id', 'researcher_id', 'similarity_score',
               'amount_match', 'career_stage_match']

    # Pull the needed columns out once instead of materialising a Series per
    # row with iterrows().
    grant_ids = grants_df['opportunity_id'].tolist()
    award_floors = grants_df['award_floor'].tolist()
    researcher_ids = researchers_df['researcher_id'].tolist()
    citations = researchers_df['total_citations'].tolist()
    years_active = researchers_df['years_active'].tolist()

    # enumerate() yields row positions, which is what `similarities` is
    # indexed by; DataFrame index labels differ from positions as soon as a
    # frame has been filtered or re-indexed.
    training_data = [
        {
            'grant_id': grant_id,
            'researcher_id': researcher_id,
            'similarity_score': similarities[g_pos][r_pos],
            'amount_match': 1 if cited > award_floor else 0,
            'career_stage_match': 1 if years >= 5 else 0,
            # Add more matching features
        }
        for g_pos, (grant_id, award_floor) in enumerate(zip(grant_ids, award_floors))
        for r_pos, (researcher_id, cited, years) in enumerate(
            zip(researcher_ids, citations, years_active)
        )
    ]

    return pd.DataFrame(training_data, columns=columns)

# Feature engineering using Hopsworks


In [None]:
!pip install hopsworks

Collecting hopsworks
  Downloading hopsworks-4.1.4-py3-none-any.whl.metadata (11 kB)
Collecting pyhumps==1.6.1 (from hopsworks)
  Downloading pyhumps-1.6.1-py3-none-any.whl.metadata (3.7 kB)
Collecting furl (from hopsworks)
  Downloading furl-2.1.3-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting boto3 (from hopsworks)
  Downloading boto3-1.35.94-py3-none-any.whl.metadata (6.7 kB)
Collecting pandas<2.2.0 (from hopsworks)
  Downloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting pyjks (from hopsworks)
  Downloading pyjks-20.0.0-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting mock (from hopsworks)
  Downloading mock-5.1.0-py3-none-any.whl.metadata (3.0 kB)
Collecting avro==1.11.3 (from hopsworks)
  Downloading avro-1.11.3.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.6/90.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting require

In [None]:
!pip install loguru

Collecting loguru
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Downloading loguru-0.7.3-py3-none-any.whl (61 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: loguru
Successfully installed loguru-0.7.3


In [None]:
# Install required libraries
!pip install hopsworks pandas loguru

# Import required libraries
import hopsworks
import pandas as pd
from loguru import logger

# Define settings (replace with your actual settings)
class Settings:
    """Runtime configuration for the Hopsworks connection.

    The API key is read from the ``HOPSWORKS_API_KEY`` environment variable
    when the object is created, so the secret never has to be written into
    the notebook. The attribute stays ``None`` when the variable is unset or
    empty and can still be assigned directly afterwards.
    """

    def __init__(self):
        import os  # local import: keeps this cell runnable on its own

        # `or None` normalises an empty variable to "no key configured"
        self.HOPSWORKS_API_KEY = os.environ.get("HOPSWORKS_API_KEY") or None

    def get_secret_value(self):
        """Return the configured Hopsworks API key (``None`` when unset)."""
        return self.HOPSWORKS_API_KEY

# Initialize settings
settings = Settings()

# Define constants (replace with your actual constants)
class Constants:
    """Namespace for fixed configuration values used by the notebook."""

    # Presumably the name of the project's feature store.
    # NOTE(review): the connection log captured for this cell reports the
    # feature store as "findandfund_featurestore" (no underscore before
    # "store"), and this constant is not referenced in the visible code;
    # confirm the correct spelling before relying on it.
    FEATURE_STORE_NAME = "findandfund_feature_store"

constants = Constants()

def get_feature_store():
    """Log in to the "findandfund" Hopsworks project and fetch its feature store.

    When ``settings.HOPSWORKS_API_KEY`` is truthy it is passed to
    ``hopsworks.login`` as ``api_key_value``; otherwise ``hopsworks.login``
    is called without an explicit key.

    Returns:
        tuple: ``(project, fs)`` - the object returned by ``hopsworks.login``
        and the result of its ``get_feature_store()`` call.
    """
    # Arguments shared by both login paths.
    login_kwargs = {
        'host': 'c.app.hopsworks.ai',
        'project': "findandfund",
    }

    api_key = settings.HOPSWORKS_API_KEY
    if not api_key:
        logger.info("Login to Hopsworks using cached API key.")
    else:
        logger.info("Logging to Hopsworks using HOPSWORKS_API_KEY env var.")
        login_kwargs['api_key_value'] = api_key

    project = hopsworks.login(**login_kwargs)

    feature_store = project.get_feature_store()
    logger.info("Successfully retrieved the feature store.")

    return project, feature_store

# Example usage
try:
    # SECURITY: a literal Hopsworks API key used to be assigned here. Secrets
    # must not be stored in the notebook source; the key that was exposed
    # should be revoked and replaced. The key is now taken from the
    # HOPSWORKS_API_KEY environment variable; if that is unset, whatever is
    # already on `settings` is kept, and a falsy value makes
    # get_feature_store() log in without an explicit key.
    import os  # local import: this cell does not import os at the top

    settings.HOPSWORKS_API_KEY = (
        os.environ.get("HOPSWORKS_API_KEY") or settings.HOPSWORKS_API_KEY
    )

    # Connect to Hopsworks
    project, fs = get_feature_store()
    logger.info(f"Connected to Hopsworks project: {project.name}")
    logger.info(f"Feature store: {fs.name}")

except Exception as e:
    # Any failure while logging in is reported and the notebook carries on.
    logger.error(f"Failed to connect to Hopsworks: {str(e)}")
    logger.warning("Proceeding with local file storage only.")



[32m2025-01-08 19:43:07.092[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_feature_store[0m:[36m29[0m - [1mLogging to Hopsworks using HOPSWORKS_API_KEY env var.[0m
[32m2025-01-08 19:43:08.617[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_feature_store[0m:[36m44[0m - [1mSuccessfully retrieved the feature store.[0m
[32m2025-01-08 19:43:08.625[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 49>[0m:[36m55[0m - [1mConnected to Hopsworks project: findandfund[0m
[32m2025-01-08 19:43:08.628[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 49>[0m:[36m56[0m - [1mFeature store: findandfund_featurestore[0m



Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1207504


In [None]:
def verify_hopsworks_connection():
    """Try to open a Hopsworks connection for the 'findandfund' project.

    Returns:
        The feature store obtained from the connection, or None when the
        module-level ``HOPSWORKS_API_KEY`` is falsy or any step raises.
    """
    # Guard clause: without a key there is nothing to attempt.
    if not HOPSWORKS_API_KEY:
        logger.warning("Hopsworks API key not found. Proceeding with local file storage only.")
        return None

    connection_args = dict(
        host='your_hopsworks_host',  # Placeholder: replace with your Hopsworks host (e.g., [UUID].cloud.hopsworks.ai)
        port=443,
        project="findandfund",
        api_key_value=HOPSWORKS_API_KEY,
        hostname_verification=True,
    )

    try:
        logger.info("Attempting to connect to Hopsworks...")
        conn = hopsworks.connection(**connection_args)
        logger.info("Connected to Hopsworks successfully!")

        feature_store = conn.get_feature_store()
        logger.info("Successfully retrieved the feature store.")
    except Exception as err:
        # Every failure is logged and reported to the caller as None.
        logger.error(f"Failed to connect to Hopsworks: {str(err)}")
        return None

    return feature_store

In [None]:
# Install required libraries with Hopsworks Python extras
# (`!` is an IPython/Colab shell escape, not Python syntax.)
!pip install hopsworks[python] pandas numpy sentence-transformers scikit-learn loguru tqdm

# Restart the runtime to apply the changes
# Signal 9 (SIGKILL) is sent to the current process, i.e. the running kernel.
# NOTE(review): as exported, this kill sits in the same cell as the pipeline
# code below, so running the cell top-to-bottom stops here every time.
# Consider moving the install + restart into a cell of its own - TODO confirm
# how the original notebook was split.
import os
os.kill(os.getpid(), 9)  # This will restart the Colab runtime

# After the runtime restarts, re-run the following code:

# Import required libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import hopsworks
from tqdm import tqdm
import time
from loguru import logger
from google.colab import drive, userdata
import os

# Make Google Drive available inside the Colab VM
drive.mount('/content/drive', force_remount=True)

# Folder on Drive where every artefact produced by this notebook is written
SAVE_PATH = '/content/drive/My Drive/research_grants_data'
os.makedirs(SAVE_PATH, exist_ok=True)

# Look up the Hopsworks API key in Colab's user secrets; stays None when the
# lookup raises for any reason.
HOPSWORKS_API_KEY = None
try:
    HOPSWORKS_API_KEY = userdata.get('HOPSWORKS_API_KEY')
except Exception as err:
    logger.warning(f"Failed to retrieve Hopsworks API key: {str(err)}")

def get_feature_store():
    """Log in to the "findandfund" Hopsworks project and return its feature store.

    The module-level ``HOPSWORKS_API_KEY`` is forwarded to ``hopsworks.login``
    as ``api_key_value`` when it is truthy; otherwise the login call is made
    without an explicit key.

    Returns:
        tuple: ``(project, fs)`` - the object returned by ``hopsworks.login``
        and the result of ``project.get_feature_store()``.
    """
    if HOPSWORKS_API_KEY:
        logger.info("Logging to Hopsworks using HOPSWORKS_API_KEY env var.")
        credentials = {'api_key_value': HOPSWORKS_API_KEY}
    else:
        logger.info("Login to Hopsworks using cached API key.")
        credentials = {}

    # Host and project are the same for both paths; only the key differs.
    project = hopsworks.login(
        host='c.app.hopsworks.ai',
        project="findandfund",
        **credentials,
    )

    store = project.get_feature_store()
    logger.info("Successfully retrieved the feature store.")
    return project, store

def clean_and_engineer_grants(grants_df):
    """Clean the grants table and derive grant-level features.

    Works on a copy of ``grants_df`` and writes the result to
    ``{SAVE_PATH}/processed_grants.csv``.

    Args:
        grants_df: DataFrame with the columns ``category``, ``award_ceiling``,
            ``award_floor``, ``estimated_total_funding``, ``expected_awards``,
            ``post_date``, ``summary_description``,
            ``eligibility_description``, ``funding_instruments``,
            ``applicant_types`` and ``agency_name``. ``close_date`` is
            optional.

    Returns:
        The cleaned DataFrame with the added columns ``has_funding_limit``,
        ``funding_range``, ``description_length``, ``eligibility_length``,
        ``funding_types_count``, ``applicant_types_count`` and
        ``agency_group``.
    """
    df = grants_df.copy()

    # Basic cleaning: only fill the gaps. Assigning the scalar 'unknown' to
    # the whole column would throw away every close date that is present.
    if 'close_date' in df.columns:
        df['close_date'] = df['close_date'].fillna('unknown')
    else:
        df['close_date'] = 'unknown'
    df['category'] = df['category'].fillna('unknown')

    # Convert monetary values and handle missing. Values that cannot be
    # parsed as numbers become NaN and are then treated like missing ones, so
    # the comparisons and subtraction below always see numeric data.
    numeric_cols = ['award_ceiling', 'award_floor', 'estimated_total_funding', 'expected_awards']
    df[numeric_cols] = (
        df[numeric_cols].apply(pd.to_numeric, errors='coerce').fillna(0.0)
    )

    # Convert post_date to datetime (TIMESTAMP); invalid dates become NaT
    df['post_date'] = pd.to_datetime(df['post_date'], errors='coerce')

    def _first_token(name):
        # First whitespace-separated word of the agency name; 'unknown' for
        # missing or blank names (a blank name has no first word to take).
        if pd.isna(name):
            return 'unknown'
        tokens = str(name).split()
        return tokens[0] if tokens else 'unknown'

    # Feature Engineering
    df['has_funding_limit'] = (df['award_ceiling'] > 0).astype(int)
    df['funding_range'] = df['award_ceiling'] - df['award_floor']
    df['description_length'] = df['summary_description'].str.len()
    df['eligibility_length'] = df['eligibility_description'].str.len()
    # Number of comma-separated entries = number of commas + 1
    df['funding_types_count'] = df['funding_instruments'].str.count(',') + 1
    df['applicant_types_count'] = df['applicant_types'].str.count(',') + 1
    df['agency_group'] = df['agency_name'].map(_first_token)

    # Save processed grants
    df.to_csv(f'{SAVE_PATH}/processed_grants.csv', index=False)

    return df

def engineer_researcher_features(researchers_df):
    """Add derived impact, collaboration and productivity columns.

    Operates on a copy of ``researchers_df`` and writes the result to
    ``{SAVE_PATH}/processed_researchers.csv``.

    Returns:
        The copy with ``impact_ratio``, ``recent_impact_ratio``,
        ``collaboration_score``, ``venue_per_work`` and ``productivity_rate``
        added.
    """
    df = researchers_df.copy()

    def _floor_at_one(column):
        # Denominator clipped to a minimum of 1 so a zero count cannot
        # produce a division by zero.
        return df[column].clip(lower=1)

    # Citations per work, overall and for the recent window
    df['impact_ratio'] = df['total_citations'] / _floor_at_one('total_works')
    df['recent_impact_ratio'] = df['recent_citations'] / _floor_at_one('recent_works_count')

    # Product of average co-author count and number of distinct venues
    df['collaboration_score'] = df['avg_coauthors'] * df['unique_venues']

    # Venue diversity and output rate
    df['venue_per_work'] = df['unique_venues'] / _floor_at_one('total_works')
    df['productivity_rate'] = df['total_works'] / _floor_at_one('years_active')

    # Save processed researchers
    output_file = f'{SAVE_PATH}/processed_researchers.csv'
    df.to_csv(output_file, index=False)

    return df

def create_embedding_features(grants_df, researchers_df, model_name='all-MiniLM-L6-v2'):
    """Embed grants and researchers and compute their pairwise cosine similarity.

    Grant text is the ``summary_description`` column; researcher text is
    ``concept_1`` and ``concept_2`` joined by a space. Missing values in any
    of these columns are treated as empty strings. The two embedding arrays
    and the similarity matrix are saved under ``SAVE_PATH`` as
    ``grant_embeddings.npy``, ``researcher_embeddings.npy`` and
    ``similarities.npy``.

    Args:
        grants_df: DataFrame with a ``summary_description`` column.
        researchers_df: DataFrame with ``concept_1`` and ``concept_2`` columns.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        The result of ``cosine_similarity(grant_embeddings,
        researcher_embeddings)``; row i / column j follow the row order of
        ``grants_df`` / ``researchers_df``.
    """
    # Load model
    model = SentenceTransformer(model_name)

    # Create embeddings
    logger.info("Creating grant embeddings...")
    grant_descriptions = grants_df['summary_description'].fillna('').tolist()
    grant_embeddings = model.encode(grant_descriptions, show_progress_bar=True)

    logger.info("Creating researcher embeddings...")
    # Blank out missing concepts first: formatting NaN into the text would
    # feed the literal word "nan" to the model. This mirrors the fillna('')
    # applied to the grant descriptions above.
    concepts = researchers_df[['concept_1', 'concept_2']].fillna('').astype(str)
    researcher_descriptions = (
        concepts['concept_1'] + ' ' + concepts['concept_2']
    ).tolist()
    researcher_embeddings = model.encode(researcher_descriptions, show_progress_bar=True)

    # Save embeddings
    np.save(f'{SAVE_PATH}/grant_embeddings.npy', grant_embeddings)
    np.save(f'{SAVE_PATH}/researcher_embeddings.npy', researcher_embeddings)

    # Compute similarities
    logger.info("Computing similarities...")
    similarities = cosine_similarity(grant_embeddings, researcher_embeddings)
    np.save(f'{SAVE_PATH}/similarities.npy', similarities)

    return similarities

def create_matching_features(grants_df, researchers_df):
    """Build one feature row for every (grant, researcher) pair.

    The similarity matrix is loaded from ``{SAVE_PATH}/similarities.npy`` and
    is indexed by row *position*, so it must have been computed from these
    same two dataframes in the same row order.

    Args:
        grants_df: Processed grants with ``opportunity_id``,
            ``award_ceiling``, ``funding_range``, ``funding_types_count`` and
            ``summary_description``.
        researchers_df: Processed researchers with ``researcher_id``,
            ``impact_ratio``, ``recent_impact_ratio``,
            ``collaboration_score``, ``productivity_rate``, ``concept_1`` and
            ``years_active``.

    Returns:
        DataFrame with ``len(grants_df) * len(researchers_df)`` rows.

    Raises:
        ValueError: If the saved similarity matrix does not have shape
            ``(len(grants_df), len(researchers_df))``.
    """
    # Load saved similarities
    similarities = np.load(f'{SAVE_PATH}/similarities.npy')

    expected_shape = (len(grants_df), len(researchers_df))
    if similarities.shape != expected_shape:
        raise ValueError(
            f"Similarity matrix shape {similarities.shape} does not match "
            f"{expected_shape[0]} grants x {expected_shape[1]} researchers; "
            "recompute the similarities for these dataframes."
        )

    # Everything that depends only on the researcher is computed once here
    # instead of once per grant.
    researcher_rows = []
    for _, researcher in researchers_df.iterrows():
        concept = researcher['concept_1']
        # A missing or empty concept can never count as a field match. It is
        # also kept away from the substring test, which needs a string.
        concept_text = str(concept) if pd.notna(concept) else ''
        researcher_rows.append({
            'researcher_id': researcher['researcher_id'],
            'impact': researcher['impact_ratio'],
            'recent_impact': researcher['recent_impact_ratio'],
            'collaboration': researcher['collaboration_score'],
            'productivity': researcher['productivity_rate'],
            'concept': concept_text,
            'career_stage_match': 1 if researcher['years_active'] >= 5 else 0,
        })

    matches = []

    logger.info(f"Creating matches for {len(grants_df)} grants and {len(researchers_df)} researchers...")
    # enumerate() supplies positions into the matrix; the dataframe's index
    # labels are not used because they need not be 0..n-1.
    for grant_pos, (_, grant) in enumerate(tqdm(grants_df.iterrows(), total=len(grants_df))):
        grant_text = str(grant['summary_description'])
        for researcher_pos, researcher in enumerate(researcher_rows):
            concept = researcher['concept']
            matches.append({
                'grant_id': grant['opportunity_id'],
                'researcher_id': researcher['researcher_id'],
                'similarity_score': similarities[grant_pos, researcher_pos],

                # Grant features
                'grant_award_ceiling': grant['award_ceiling'],
                'grant_funding_range': grant['funding_range'],
                'grant_types_count': grant['funding_types_count'],

                # Researcher features
                'researcher_impact': researcher['impact'],
                'researcher_recent_impact': researcher['recent_impact'],
                'researcher_collaboration': researcher['collaboration'],
                'researcher_productivity': researcher['productivity'],

                # Matching features
                'field_match': 1 if concept and concept in grant_text else 0,
                'career_stage_match': researcher['career_stage_match'],
            })

    matching_df = pd.DataFrame(matches)
    return matching_df

def create_hopsworks_feature_groups(grants_df, researchers_df, matching_features):
    """Upload the three feature tables to Hopsworks feature groups.

    The groups ``grants``, ``researchers`` and ``grant_researcher_matching``
    (all version 1) are fetched or created and the matching dataframe is
    inserted into each, in that order. If anything raises, the error is
    logged and the three dataframes are written as CSV files under
    ``SAVE_PATH`` instead.

    Returns:
        tuple: ``(grants_fg, researchers_fg, matching_fg)`` on success,
        ``(None, None, None)`` after a failure.
    """
    try:
        # Connect to Hopsworks
        _, fs = get_feature_store()
        if fs is None:
            raise Exception("Failed to connect to Hopsworks")

        # (log line, feature-group definition, data) for each group, in
        # upload order.
        group_specs = [
            (
                "Creating grants feature group...",
                dict(
                    name='grants',
                    version=1,
                    primary_key=['opportunity_id'],
                    description='Processed grants features',
                    online_enabled=True,
                    event_time='post_date',  # Ensure this is a valid TIMESTAMP column
                ),
                grants_df,
            ),
            (
                "Creating researchers feature group...",
                dict(
                    name='researchers',
                    version=1,
                    primary_key=['researcher_id'],
                    description='Processed researcher features',
                    online_enabled=True,
                ),
                researchers_df,
            ),
            (
                "Creating matching feature group...",
                dict(
                    name='grant_researcher_matching',
                    version=1,
                    primary_key=['grant_id', 'researcher_id'],
                    description='Grant-researcher matching features',
                    online_enabled=True,
                ),
                matching_features,
            ),
        ]

        created_groups = []
        for announcement, definition, frame in group_specs:
            logger.info(announcement)
            feature_group = fs.get_or_create_feature_group(**definition)
            feature_group.insert(frame, write_options={"wait_for_job": True})
            created_groups.append(feature_group)

        return tuple(created_groups)

    except Exception as err:
        logger.error(f"Error creating feature groups: {str(err)}")
        # Save the data locally if Hopsworks upload fails
        local_copies = (
            (grants_df, 'grants_features.csv'),
            (researchers_df, 'researchers_features.csv'),
            (matching_features, 'matching_features.csv'),
        )
        for frame, filename in local_copies:
            frame.to_csv(f'{SAVE_PATH}/{filename}', index=False)
        logger.info(f"Data saved locally to {SAVE_PATH}")
        return None, None, None

def main():
    """Run the full pipeline: load, engineer, embed, match, upload, split.

    Reads ``grants_clean.csv`` and ``researchers_clean.csv`` from the working
    directory. If the run is interrupted or any step raises, whichever of the
    processed tables already exist are written to ``SAVE_PATH`` and the
    function returns normally.
    """
    started_at = time.time()

    # Pre-bind the results so the except handler can tell which stages
    # finished before a failure.
    grants_processed = researchers_processed = matching_features = None

    try:
        # Load your existing dataframes from the Colab environment
        logger.info("Loading data...")
        raw_grants = pd.read_csv('grants_clean.csv')
        raw_researchers = pd.read_csv('researchers_clean.csv')

        # Feature engineering on each table
        logger.info("Processing grants data...")
        grants_processed = clean_and_engineer_grants(raw_grants)

        logger.info("Processing researcher data...")
        researchers_processed = engineer_researcher_features(raw_researchers)

        # Embeddings + similarity matrix (persisted to disk by the callee)
        logger.info("Creating embeddings and computing similarities...")
        create_embedding_features(grants_processed, researchers_processed)

        # Pairwise grant/researcher features
        logger.info("Creating matching features...")
        matching_features = create_matching_features(grants_processed, researchers_processed)
        matching_features.to_csv(f'{SAVE_PATH}/matching_features.csv', index=False)

        # Upload; the callee returns three Nones when the upload failed
        logger.info("Creating Hopsworks feature groups...")
        feature_groups = create_hopsworks_feature_groups(
            grants_processed, researchers_processed, matching_features
        )

        if all(feature_groups):
            # 80/20 split with a fixed seed
            logger.info("Creating train/validation split...")
            train_split = matching_features.sample(frac=0.8, random_state=42)
            val_split = matching_features.drop(train_split.index)

            # Save splits
            train_split.to_csv(f'{SAVE_PATH}/train_data.csv', index=False)
            val_split.to_csv(f'{SAVE_PATH}/val_data.csv', index=False)

        elapsed = time.time() - started_at
        logger.info(f"Total execution time: {elapsed/60:.2f} minutes")
        logger.info(f"All files saved to: {SAVE_PATH}")

    except (KeyboardInterrupt, Exception) as exc:
        # Both kinds of failure end the same way; only the log lines differ.
        if isinstance(exc, KeyboardInterrupt):
            logger.warning("Process interrupted by user. Saving intermediate results...")
        else:
            logger.error(f"Error in main execution: {str(exc)}")
            logger.info("Saving intermediate results...")

        # Save any intermediate results that were generated
        partial_results = {
            'grants_processed.csv': grants_processed,
            'researchers_processed.csv': researchers_processed,
            'matching_features.csv': matching_features,
        }
        for filename, frame in partial_results.items():
            if frame is not None:
                frame.to_csv(f'{SAVE_PATH}/{filename}', index=False)

# Verify Hopsworks connection before running main.
# get_feature_store() raises when the login fails instead of returning None,
# so the failure is caught here; otherwise the cell would abort before main()
# and the local-storage fallback below could never be reached.
try:
    project, fs = get_feature_store()
except Exception as e:
    logger.warning(f"Could not connect to Hopsworks: {str(e)}")
    project, fs = None, None

if fs is None:
    logger.warning("Proceeding with local file storage only")

if __name__ == "__main__":
    main()

