In [1]:
# Install required libraries (run once per session)
!pip install transformers sentence-transformers fuzzywuzzy python-Levenshtein prophet -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m88.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━

### AI AGENT FOR ROUTING

In [2]:
import pandas as pd
import gc
from fuzzywuzzy import process, fuzz
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split

2025-05-31 17:57:25.145615: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748714245.347599      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748714245.406249      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Custom Dataset for batching
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# Fuzzy matching function
def fuzzy_match_orgs(source, target, threshold=80):
    matches = {}
    for org in source['org_name'].dropna().unique():
        result = process.extractOne(org.lower().strip(), target['NGO Name'].str.lower().str.strip(), scorer=fuzz.partial_ratio)
        if result is not None:
            best_match, score, index = result
            if score >= threshold:
                matches[org] = best_match
        matches[org] = matches.get(org, org)
    return matches


In [4]:
# Load and preprocess datasets
refugee_data = pd.read_csv('/kaggle/input/refugeai-nexus-project-dataset/data_3countries_refugees_public.csv', 
                           low_memory=False, dtype={'surveylocation': 'category', 'education_years3': 'float32', 'health': 'float32'})
op_data = pd.read_csv('/kaggle/input/refugeai-nexus-project-dataset/hdx_hapi_operational_presence_global.csv', 
                      low_memory=False, dtype={'org_name': 'category', 'location_code': 'category', 'sector_name': 'category'})
ngo_data = pd.read_csv('/kaggle/input/refugeai-nexus-project-dataset/NGO_List_with_Focus_Areas.csv', 
                       dtype={'NGO Name': 'category', 'Country': 'category', 'Area of Focus': 'category'})

# Apply fuzzy matching
op_data['org_name'] = op_data['org_name'].astype(str).str.lower().str.strip()
ngo_data['NGO Name'] = ngo_data['NGO Name'].astype(str).str.lower().str.strip()
org_matches = fuzzy_match_orgs(op_data, ngo_data)
op_data['matched_ngo'] = op_data['org_name'].map(lambda x: org_matches.get(x, x))

# Merge datasets
merged_data = pd.merge(op_data, ngo_data, left_on='matched_ngo', right_on='NGO Name', how='left')
merged_data = pd.merge(merged_data, refugee_data[['surveylocation', 'education_years3', 'health']], 
                      left_on='location_code', right_on='surveylocation', how='left')

# Remove duplicates to prevent data explosion
merged_data = merged_data.drop_duplicates(subset=['org_name', 'location_code', 'sector_name'])

# Prepare data for DistilBERT
sector_map = {'Health': 0, 'Protection': 1, 'Education': 2}  # Adjust based on actual sectors
merged_data['sector_label'] = merged_data['sector_name'].map(lambda x: sector_map.get(x, -1))

In [5]:
# Filter for labeled data and split into train and validation sets
train_data = merged_data[merged_data['sector_label'] != -1].copy()
texts = (train_data['sector_name'].astype(str).fillna('') + " " + 
         train_data['org_name'].astype(str).fillna('') + " " + 
         train_data['location_code'].astype(str).fillna('')).tolist()
labels = train_data['sector_label'].tolist()
texts_train, texts_val, labels_train, labels_val = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [6]:
refugee_data.head(10)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,hhid_new,individualid_new,hh_dummy3,surveylocation,nationality_cat,context,strata3,su1,fpc_su1,weight_ind,...,skillednonmanual,otherwork,incentive,remittances_USD_w99,hhincome_ww,hhincome_ww_pc,hh_m_income_ALL_USD_w99_ww_pc,hh_remittances_USD_w99_ww_pc,hh_remittances_USD_w99_ww_d,hh_m_income_aidsupport_USD_pc
0,11100040,1110004001,Yes,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100040,654,13.625,...,,,,156.49449,771.29431,85.699371,31.795708,53.90366,1.0,0.0
1,11100040,1110004002,No,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100040,654,20.4375,...,0.0,0.0,0.0,156.49449,,,,,,
2,11100040,1110004003,No,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100040,654,20.4375,...,0.0,0.0,0.0,62.597797,,,,,,
3,11100040,1110004004,No,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100040,654,20.4375,...,0.0,0.0,0.0,0.0,,,,,,
4,11100040,1110004005,No,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100040,654,20.4375,...,,,,0.0,,,,,,
5,11100043,1110004301,Yes,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100043,654,13.625,...,,,,312.98898,578.4707,72.308838,11.178178,61.130661,1.0,0.0
6,11100043,1110004302,No,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100043,654,17.03125,...,,,,140.84505,,,,,,
7,11100043,1110004303,No,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100043,654,17.03125,...,0.0,0.0,0.0,0.0,,,,,,
8,11100043,1110004304,No,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100043,654,17.03125,...,,,,0.0,,,,,,
9,11100043,1110004305,No,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100043,654,17.03125,...,,,,0.0,,,,,,


In [7]:
refugee_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8996 entries, 0 to 8995
Columns: 231 entries, hhid_new to hh_m_income_aidsupport_USD_pc
dtypes: category(1), float32(2), float64(46), int64(6), object(176)
memory usage: 15.7+ MB


In [8]:
refugee_data.describe()

Unnamed: 0,strata3,su1,fpc_su1,weight_ind,somali,location3_id,enumerator_loc_id,year_leave,year_arrive,years_hostsite,...,skillednonmanual,otherwork,incentive,remittances_USD_w99,hhincome_ww,hhincome_ww_pc,hh_m_income_ALL_USD_w99_ww_pc,hh_remittances_USD_w99_ww_pc,hh_remittances_USD_w99_ww_d,hh_m_income_aidsupport_USD_pc
count,8996.0,8996.0,8996.0,8972.0,8996.0,8996.0,8989.0,8906.0,8906.0,8906.0,...,2824.0,2824.0,2824.0,8967.0,3639.0,3631.0,3642.0,3645.0,3654.0,3650.0
mean,151855.1,1186265.0,1443.658181,27.532659,0.602601,132.939195,82.766826,2010.882551,2011.643611,5.913317,...,0.041431,0.036827,0.081445,30.850685,197.958816,44.158239,14.588708,20.137084,0.425835,9.336445
std,346749.9,3340397.0,1830.36983,37.075568,0.489387,76.248566,45.527359,5.315991,4.754747,4.734442,...,0.199319,0.188371,0.273565,84.447633,203.34679,57.74264,33.337021,48.73748,0.494537,10.889144
min,1111.0,101.0,1.0,1.0,0.0,1.0,2.0,1968.0,1968.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2012.0,1148.0,69.0,10.577586,0.0,79.0,44.0,2009.0,2010.0,3.0,...,0.0,0.0,0.0,0.0,81.889435,13.648239,0.0,0.0,0.0,0.0
50%,3011.0,10965.5,426.0,13.962656,1.0,101.0,78.0,2011.0,2012.0,6.0,...,0.0,0.0,0.0,0.0,136.48239,22.546541,1.340447,0.0,0.0,9.572502
75%,31117.0,76199.25,2941.0,42.155338,1.0,220.0,120.0,2014.0,2015.0,7.0,...,0.0,0.0,0.0,6.415094,233.380915,52.637836,14.588611,18.527424,1.0,13.648239
max,1012200.0,12500160.0,5759.0,625.15631,1.0,264.0,169.0,2018.0,2019.0,50.0,...,1.0,1.0,1.0,751.17358,2773.6785,751.17358,423.28033,751.17358,1.0,177.42711


In [9]:
refugee_data.columns.tolist()

['hhid_new',
 'individualid_new',
 'hh_dummy3',
 'surveylocation',
 'nationality_cat',
 'context',
 'strata3',
 'su1',
 'fpc_su1',
 'weight_ind',
 'somali',
 'urban',
 'location3',
 'location3_id',
 'enumerator_loc_id',
 'year_leave',
 'year_arrive',
 'years_hostsite',
 'age',
 'gender',
 'relationship3',
 'maritalstatus',
 'maritalstatus_married',
 'religion3',
 'education_years3',
 'educfather_years3',
 'educmother_years3',
 'vocational',
 'likert_local',
 'local_language',
 'likert_well_en',
 'english',
 'hf_hhh_gender',
 'hf_people',
 'hf_dependency',
 'hf_hhhistory_urban',
 'job',
 'findjob3',
 'm_income_TOT_ALL_3',
 'hhincome_ww_lcu',
 'hhincome_ww_lcu_pc',
 'wageincome_pc',
 'remit_pc',
 'activity3',
 'employee3',
 'remittances_monthly',
 'remittances_d',
 'assets',
 'assetshh_radio',
 'assetshh_television',
 'assetshh_refrigerator',
 'assetshh_solarpanel',
 'assetshh_table',
 'assetshh_chair',
 'assetshh_sofa',
 'assetshh_bed',
 'assetshh_cupboard',
 'assetshh_clock',
 'assetsi

In [10]:
op_data.head(10)

Unnamed: 0,location_code,has_hrp,in_gho,provider_admin1_name,provider_admin2_name,admin1_code,admin1_name,admin2_code,admin2_name,admin_level,...,org_type_description,sector_code,sector_name,reference_period_start,reference_period_end,dataset_hdx_id,resource_hdx_id,warning,error,matched_ngo
0,#country+code,#meta+has_hrp,#meta+in_gho,#adm1+name+provider,#adm2+name+provider,#adm1+code,#adm1+name,#adm2+code,#adm2+name,#adm+level,...,#org+type+desc,#sector+code,#sector+name,#date+start,#date+end,#meta+dataset_id,#meta+resource_id,#meta+warning,#meta+error,#org+name
1,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,National NGO,PRO,Protection,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,afghanistan development & welfare services org...
2,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,International NGO,PRO,Protection,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,aga khan agency for habitat
3,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,International Organization,HEA,Health,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,hope foundation
4,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,International NGO,HEA,Health,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,aga khan health service
5,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,International NGO,NUT,Nutrition,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,aga khan health service
6,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,National NGO,NUT,Nutrition,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,bakhtar development network
7,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,National NGO,PRO,Protection,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,coordination of humanitarian assistance
8,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,International NGO,SHL,Emergency Shelter and NFI,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,concern worldwide
9,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,International NGO,WSH,Water Sanitation Hygiene,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,concern worldwide


In [11]:
op_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44037 entries, 0 to 44036
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   location_code           44037 non-null  category
 1   has_hrp                 44037 non-null  object  
 2   in_gho                  44037 non-null  object  
 3   provider_admin1_name    43209 non-null  object  
 4   provider_admin2_name    42008 non-null  object  
 5   admin1_code             43737 non-null  object  
 6   admin1_name             43737 non-null  object  
 7   admin2_code             41696 non-null  object  
 8   admin2_name             41696 non-null  object  
 9   admin_level             44037 non-null  object  
 10  org_acronym             43848 non-null  object  
 11  org_name                44037 non-null  object  
 12  org_type_description    42100 non-null  object  
 13  sector_code             43407 non-null  object  
 14  sector_name           

In [12]:
op_data.describe()

Unnamed: 0,location_code,has_hrp,in_gho,provider_admin1_name,provider_admin2_name,admin1_code,admin1_name,admin2_code,admin2_name,admin_level,...,org_type_description,sector_code,sector_name,reference_period_start,reference_period_end,dataset_hdx_id,resource_hdx_id,warning,error,matched_ngo
count,44037,44037,44037,43209,42008,43737,43737,41696,41696,44037,...,42100,43407,43407,44037,44037,44037,44037,1836,643,44037
unique,27,3,2,427,2930,364,354,2983,2841,4,...,12,19,19,17,17,27,27,93,14,1918
top,AFG,Y,Y,Ta'iz,Sucre,YE15,Ta'iz,CM007005,Sucre,2,...,National NGO,PRO,Protection,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,PCode length TD14->TCD14,Unknown sector Coord. & Log. & Support Services,interaction
freq,10934,41019,44036,1152,156,1152,1152,149,156,41864,...,15303,8979,8979,21735,18498,10934,10934,264,438,3168


In [13]:
# Initialize tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(sector_map))

# Set up device and multi-GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = torch.nn.DataParallel(model)
model = model.to(device)
print(f"Using device: {device}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 2 GPUs!
Using device: cuda


In [14]:
# Create dataset and dataloader with batching
dataset_train = TextDataset(texts_train, labels_train, tokenizer)
dataloader_train = DataLoader(dataset_train, batch_size=64, shuffle=True)
dataset_val = TextDataset(texts_val, labels_val, tokenizer)
dataloader_val = DataLoader(dataset_val, batch_size=64)

# Training loop with proper loss aggregation
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [15]:
# Training loop with proper loss aggregation
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.train()
for epoch in range(10):
    total_loss = 0
    for batch in dataloader_train:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        if torch.cuda.device_count() > 1:
            loss = loss.mean()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader_train)
    print(f"Epoch {epoch+1}, Training Loss: {avg_loss}")



Epoch 1, Training Loss: 0.43069850011118527
Epoch 2, Training Loss: 0.010836858430813098
Epoch 3, Training Loss: 0.004061167650245901
Epoch 4, Training Loss: 0.002514495690963392
Epoch 5, Training Loss: 0.0017262563703517462
Epoch 6, Training Loss: 0.0012919620394000206
Epoch 7, Training Loss: 0.000960826311774295
Epoch 8, Training Loss: 0.0007577341447327415
Epoch 9, Training Loss: 0.0006048112710263452
Epoch 10, Training Loss: 0.000490230045498124


In [16]:
# Validation loop
model.eval()
total_val_loss = 0
correct = 0
total = 0
with torch.no_grad():
    for batch in dataloader_val:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        if torch.cuda.device_count() > 1:
            loss = loss.mean()
        total_val_loss += loss.item()
        predictions = torch.argmax(outputs.logits, dim=1)
        total += batch['labels'].size(0)
        correct += (predictions == batch['labels']).sum().item()
avg_val_loss = total_val_loss / len(dataloader_val)
val_accuracy = correct / total
print(f"Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}")

Validation Loss: 0.00029091862961649895, Validation Accuracy: 1.0


In [17]:
# Save model
model.module.save_pretrained('/kaggle/working/ai_agent_model') if hasattr(model, 'module') else model.save_pretrained('/kaggle/working/ai_agent_model')
tokenizer.save_pretrained('/kaggle/working/ai_agent_model')
print("AI Agent model trained and saved!")

# Clean up
del refugee_data, op_data, ngo_data, merged_data, train_data, dataset_train, dataset_val
gc.collect()

AI Agent model trained and saved!


152

In [18]:
print(f"Validation set size: {len(texts_val)}")
print(f"Unique labels in validation: {np.unique(labels_val)}")
print(f"Sample validation texts: {texts_val[:5]}")  # First 5 examples

Validation set size: 459
Unique labels in validation: [0 1 2]
Sample validation texts: ['Health association pour le développement et le renforcement des actions humanitaires TCD', 'Education alsalam organization for rehabilitation and development SDN', 'Protection aisha association for woman and child protection PSE', 'Protection cooperazione internazionale TCD', 'Health rufaida health foundation SDN']


In [19]:
model.eval()
test_texts = ["Protection unicef in location_code_456", "Education save the children in location_code_789", 
              "Health who in location_code_101"]
inputs = tokenizer(test_texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=1)
    for text, pred in zip(test_texts, predictions):
        print(f"Text: {text}, Predicted sector: {list(sector_map.keys())[pred.item()]}")

Text: Protection unicef in location_code_456, Predicted sector: Protection
Text: Education save the children in location_code_789, Predicted sector: Education
Text: Health who in location_code_101, Predicted sector: Health


### AI RECOMMENDER AGENT

In [20]:
import pandas as pd
import gc
from sentence_transformers import SentenceTransformer, util
import torch

# Load datasets
funding_data = pd.read_csv('/kaggle/input/refugeai-nexus-project-dataset/fts_outgoing_funding_global.csv', 
                           low_memory=False, comment='#', 
                           dtype={'destOrganization': 'category', 'destOrganizationTypes': 'category', 
                                  'destGlobalClusters': 'category', 'destLocations': 'category'})
funding_data['amountUSD'] = pd.to_numeric(funding_data['amountUSD'], errors='coerce').astype('float32')

appeals_data = pd.read_csv('/kaggle/input/refugeai-nexus-project-dataset/fts_requirements_funding_global.csv', 
                           low_memory=False, comment='#')
appeals_data['requirements'] = pd.to_numeric(appeals_data['requirements'], errors='coerce').astype('float32')
appeals_data['percentFunded'] = pd.to_numeric(appeals_data['percentFunded'], errors='coerce').astype('float32')

# Prepare text for embedding
# For funding: Combine destGlobalClusters, destLocations, and destOrganization
funding_texts = (funding_data['destGlobalClusters'].astype(str).fillna('') + " " + 
                 funding_data['destLocations'].astype(str).fillna('') + " " + 
                 funding_data['destOrganization'].astype(str).fillna('')).tolist()

# For appeals: Combine name, typeName, countryCode, and requirements
appeals_texts = (appeals_data['name'].astype(str).fillna('') + " " + 
                 appeals_data['typeName'].astype(str).fillna('') + " " + 
                 appeals_data['countryCode'].astype(str).fillna('') + " " + 
                 appeals_data['requirements'].astype(str).fillna('')).tolist()

In [21]:
# Load Sentence-BERT model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)
print(f"Using device: {device}")

# Generate embeddings
funding_embeddings = model.encode(funding_texts, convert_to_tensor=True, device=device, batch_size=16)
appeals_embeddings = model.encode(appeals_texts, convert_to_tensor=True, device=device, batch_size=16)

# Compute cosine similarities
cosine_scores = util.cos_sim(funding_embeddings, appeals_embeddings)

# Find top matches (top 3 for each funding entry) and filter by funding criteria
top_k = min(3, len(appeals_texts))
for i in range(len(funding_texts)):
    top_results = torch.topk(cosine_scores[i], k=top_k)
    funding_amount = funding_data.iloc[i]['amountUSD']
    print(f"Funding Entry: {funding_texts[i]} (Amount: ${funding_amount:,.2f})")
    for score, idx in zip(top_results[0], top_results[1]):
        idx = idx.item()  # Convert tensor to integer
        appeal = appeals_data.iloc[idx]
        if appeal['percentFunded'] < 100 and funding_amount >= 0.1 * appeal['requirements']:
            print(f"  Match: {appeals_texts[idx]}, Score: {score.item():.4f}, Requirements: ${appeal['requirements']:,.2f}, Percent Funded: {appeal['percentFunded']:.2f}%")

# Clean up
del funding_data, appeals_data, funding_embeddings, appeals_embeddings
gc.collect()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Using device: cuda


Batches:   0%|          | 0/143 [00:00<?, ?it/s]

Batches:   0%|          | 0/232 [00:00<?, ?it/s]

Funding Entry: Camp Coordination / Management,Emergency Shelter and NFI,Protection,Protection - Child Protection,Protection - Gender-Based Violence,Protection - Mine Action SDN United Nations High Commissioner for Refugees (Amount: $5,000,000.00)
Funding Entry: nan nan Pending distribution (Asia, Pacific HF) (Amount: $3,215,434.00)
Funding Entry: nan PHL Food and Agriculture Organization of the United Nations (Amount: $0.00)
Funding Entry: nan PHL Food and Agriculture Organization of the United Nations (Amount: $1,042,943.00)
Funding Entry: nan PHL Food and Agriculture Organization of the United Nations (Amount: $625,752.00)
Funding Entry: nan PHL Food and Agriculture Organization of the United Nations (Amount: $208,586.00)
Funding Entry: Education MMR United Nations Children's Fund (Amount: $990,753.00)
Funding Entry: nan FJI United Nations Children's Fund (Amount: $310,174.00)
Funding Entry: nan BRB United Nations Children's Fund (Amount: $61,652.00)
Funding Entry: Food Security MMR 

  if appeal['percentFunded'] < 100 and funding_amount >= 0.1 * appeal['requirements']:


Funding Entry: Food Security VNM Food and Agriculture Organization of the United Nations (Amount: $-1.00)
Funding Entry: Food Security VNM Food and Agriculture Organization of the United Nations (Amount: $569,245.00)
Funding Entry: Food Security VNM Food and Agriculture Organization of the United Nations (Amount: $569,246.00)
Funding Entry: Food Security VNM Food and Agriculture Organization of the United Nations (Amount: $142,311.00)
Funding Entry: Food Security VNM Food and Agriculture Organization of the United Nations (Amount: $142,312.00)
Funding Entry: nan AFG Afghanistan Humanitarian Fund (Amount: $1,384,133.00)
Funding Entry: nan COD Democratic Republic of the Congo Humanitarian Fund (Amount: $692,066.00)
Funding Entry: nan LBN Lebanon Humanitarian Fund (Amount: $726,670.00)
Funding Entry: nan SSD South Sudan Humanitarian Fund (Amount: $726,670.00)
Funding Entry: nan SDN Sudan Humanitarian Fund (Amount: $830,480.00)
Funding Entry: nan SYR Syrian Arab Republic Humanitarian Fund 

16