In [1]:
# Install required libraries (run once per session)
!pip install transformers sentence-transformers fuzzywuzzy python-Levenshtein prophet -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### AI AGENT FOR ROUTING

In [2]:
import pandas as pd
import gc
from fuzzywuzzy import process, fuzz
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split

2025-06-01 08:39:19.913597: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748767160.135486      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748767160.198389      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Custom Dataset for batching
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# Fuzzy matching function
def fuzzy_match_orgs(source, target, threshold=80):
    matches = {}
    for org in source['org_name'].dropna().unique():
        result = process.extractOne(org.lower().strip(), target['NGO Name'].str.lower().str.strip(), scorer=fuzz.partial_ratio)
        if result is not None:
            best_match, score, index = result
            if score >= threshold:
                matches[org] = best_match
        matches[org] = matches.get(org, org)
    return matches


In [4]:
# Load and preprocess datasets
refugee_data = pd.read_csv('/kaggle/input/refugeai-nexus-project-dataset/data_3countries_refugees_public.csv', 
                           low_memory=False, dtype={'surveylocation': 'category', 'education_years3': 'float32', 'health': 'float32'})
op_data = pd.read_csv('/kaggle/input/refugeai-nexus-project-dataset/hdx_hapi_operational_presence_global.csv', 
                      low_memory=False, dtype={'org_name': 'category', 'location_code': 'category', 'sector_name': 'category'})
ngo_data = pd.read_csv('/kaggle/input/refugeai-nexus-project-dataset/NGO_List_with_Focus_Areas.csv', 
                       dtype={'NGO Name': 'category', 'Country': 'category', 'Area of Focus': 'category'})

In [5]:
# Apply fuzzy matching
op_data['org_name'] = op_data['org_name'].astype(str).str.lower().str.strip()
ngo_data['NGO Name'] = ngo_data['NGO Name'].astype(str).str.lower().str.strip()
org_matches = fuzzy_match_orgs(op_data, ngo_data)
op_data['matched_ngo'] = op_data['org_name'].map(lambda x: org_matches.get(x, x))

# Merge datasets
merged_data = pd.merge(op_data, ngo_data, left_on='matched_ngo', right_on='NGO Name', how='left')
merged_data = pd.merge(merged_data, refugee_data[['surveylocation', 'education_years3', 'health']], 
                      left_on='location_code', right_on='surveylocation', how='left')

# Remove duplicates to prevent data explosion
merged_data = merged_data.drop_duplicates(subset=['org_name', 'location_code', 'sector_name'])

print(merged_data['sector_name'])

0         #sector+name
1           Protection
2           Protection
3               Health
4               Health
             ...      
51823        Education
51832        Nutrition
51841        Education
51961    Food Security
51985    Food Security
Name: sector_name, Length: 6164, dtype: category
Categories (19, object): ['#sector+name', 'Camp Coordination / Management', 'Cash programming', 'Child Protection', ..., 'Multi-sector (unspecified)', 'Nutrition', 'Protection', 'Water Sanitation Hygiene']


In [6]:
merged_data.drop(0, axis = 0, inplace = True)

In [7]:
sector_names = merged_data['sector_name'].unique().tolist()
sector_map = {name:index for index,name in enumerate(sector_names)}

In [8]:
sector_map

{'Protection': 0,
 'Health': 1,
 'Nutrition': 2,
 'Emergency Shelter and NFI': 3,
 'Water Sanitation Hygiene': 4,
 'Food Security': 5,
 'Education': 6,
 'Gender Based Violence': 7,
 'Child Protection': 8,
 nan: 9,
 'Housing, Land and Property': 10,
 'Logistics': 11,
 'Camp Coordination / Management': 12,
 'Multi-sector (unspecified)': 13,
 'Emergency Telecommunications': 14,
 'Mine Action': 15,
 'Early Recovery': 16,
 'Cash programming': 17,
 'Humanitarian assistance (unspecified)': 18}

In [9]:
# Prepare data for DistilBERT
merged_data['sector_label'] = merged_data['sector_name'].map(lambda x: sector_map.get(x, -1))

In [10]:
# Filter for labeled data and split into train and validation sets
train_data = merged_data[merged_data['sector_label'] != 9].copy()
texts = (train_data['sector_name'].astype(str).fillna('') + " " + 
         train_data['org_name'].astype(str).fillna('') + " " + 
         train_data['location_code'].astype(str).fillna('')).tolist()
labels = train_data['sector_label'].tolist()
texts_train, texts_val, labels_train, labels_val = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [11]:
refugee_data.head(10)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,hhid_new,individualid_new,hh_dummy3,surveylocation,nationality_cat,context,strata3,su1,fpc_su1,weight_ind,...,skillednonmanual,otherwork,incentive,remittances_USD_w99,hhincome_ww,hhincome_ww_pc,hh_m_income_ALL_USD_w99_ww_pc,hh_remittances_USD_w99_ww_pc,hh_remittances_USD_w99_ww_d,hh_m_income_aidsupport_USD_pc
0,11100040,1110004001,Yes,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100040,654,13.625,...,,,,156.49449,771.29431,85.699371,31.795708,53.90366,1.0,0.0
1,11100040,1110004002,No,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100040,654,20.4375,...,0.0,0.0,0.0,156.49449,,,,,,
2,11100040,1110004003,No,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100040,654,20.4375,...,0.0,0.0,0.0,62.597797,,,,,,
3,11100040,1110004004,No,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100040,654,20.4375,...,0.0,0.0,0.0,0.0,,,,,,
4,11100040,1110004005,No,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100040,654,20.4375,...,,,,0.0,,,,,,
5,11100043,1110004301,Yes,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100043,654,13.625,...,,,,312.98898,578.4707,72.308838,11.178178,61.130661,1.0,0.0
6,11100043,1110004302,No,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100043,654,17.03125,...,,,,140.84505,,,,,,
7,11100043,1110004303,No,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100043,654,17.03125,...,0.0,0.0,0.0,0.0,,,,,,
8,11100043,1110004304,No,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100043,654,17.03125,...,,,,0.0,,,,,,
9,11100043,1110004305,No,Nairobi,Congolese,Kenya; urban; DRC,1011111,11100043,654,17.03125,...,,,,0.0,,,,,,


In [12]:
refugee_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8996 entries, 0 to 8995
Columns: 231 entries, hhid_new to hh_m_income_aidsupport_USD_pc
dtypes: category(1), float32(2), float64(46), int64(6), object(176)
memory usage: 15.7+ MB


In [13]:
refugee_data.describe()

Unnamed: 0,strata3,su1,fpc_su1,weight_ind,somali,location3_id,enumerator_loc_id,year_leave,year_arrive,years_hostsite,...,skillednonmanual,otherwork,incentive,remittances_USD_w99,hhincome_ww,hhincome_ww_pc,hh_m_income_ALL_USD_w99_ww_pc,hh_remittances_USD_w99_ww_pc,hh_remittances_USD_w99_ww_d,hh_m_income_aidsupport_USD_pc
count,8996.0,8996.0,8996.0,8972.0,8996.0,8996.0,8989.0,8906.0,8906.0,8906.0,...,2824.0,2824.0,2824.0,8967.0,3639.0,3631.0,3642.0,3645.0,3654.0,3650.0
mean,151855.1,1186265.0,1443.658181,27.532659,0.602601,132.939195,82.766826,2010.882551,2011.643611,5.913317,...,0.041431,0.036827,0.081445,30.850685,197.958816,44.158239,14.588708,20.137084,0.425835,9.336445
std,346749.9,3340397.0,1830.36983,37.075568,0.489387,76.248566,45.527359,5.315991,4.754747,4.734442,...,0.199319,0.188371,0.273565,84.447633,203.34679,57.74264,33.337021,48.73748,0.494537,10.889144
min,1111.0,101.0,1.0,1.0,0.0,1.0,2.0,1968.0,1968.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2012.0,1148.0,69.0,10.577586,0.0,79.0,44.0,2009.0,2010.0,3.0,...,0.0,0.0,0.0,0.0,81.889435,13.648239,0.0,0.0,0.0,0.0
50%,3011.0,10965.5,426.0,13.962656,1.0,101.0,78.0,2011.0,2012.0,6.0,...,0.0,0.0,0.0,0.0,136.48239,22.546541,1.340447,0.0,0.0,9.572502
75%,31117.0,76199.25,2941.0,42.155338,1.0,220.0,120.0,2014.0,2015.0,7.0,...,0.0,0.0,0.0,6.415094,233.380915,52.637836,14.588611,18.527424,1.0,13.648239
max,1012200.0,12500160.0,5759.0,625.15631,1.0,264.0,169.0,2018.0,2019.0,50.0,...,1.0,1.0,1.0,751.17358,2773.6785,751.17358,423.28033,751.17358,1.0,177.42711


In [14]:
refugee_data.columns.tolist()

['hhid_new',
 'individualid_new',
 'hh_dummy3',
 'surveylocation',
 'nationality_cat',
 'context',
 'strata3',
 'su1',
 'fpc_su1',
 'weight_ind',
 'somali',
 'urban',
 'location3',
 'location3_id',
 'enumerator_loc_id',
 'year_leave',
 'year_arrive',
 'years_hostsite',
 'age',
 'gender',
 'relationship3',
 'maritalstatus',
 'maritalstatus_married',
 'religion3',
 'education_years3',
 'educfather_years3',
 'educmother_years3',
 'vocational',
 'likert_local',
 'local_language',
 'likert_well_en',
 'english',
 'hf_hhh_gender',
 'hf_people',
 'hf_dependency',
 'hf_hhhistory_urban',
 'job',
 'findjob3',
 'm_income_TOT_ALL_3',
 'hhincome_ww_lcu',
 'hhincome_ww_lcu_pc',
 'wageincome_pc',
 'remit_pc',
 'activity3',
 'employee3',
 'remittances_monthly',
 'remittances_d',
 'assets',
 'assetshh_radio',
 'assetshh_television',
 'assetshh_refrigerator',
 'assetshh_solarpanel',
 'assetshh_table',
 'assetshh_chair',
 'assetshh_sofa',
 'assetshh_bed',
 'assetshh_cupboard',
 'assetshh_clock',
 'assetsi

In [15]:
op_data.head(10)

Unnamed: 0,location_code,has_hrp,in_gho,provider_admin1_name,provider_admin2_name,admin1_code,admin1_name,admin2_code,admin2_name,admin_level,...,org_type_description,sector_code,sector_name,reference_period_start,reference_period_end,dataset_hdx_id,resource_hdx_id,warning,error,matched_ngo
0,#country+code,#meta+has_hrp,#meta+in_gho,#adm1+name+provider,#adm2+name+provider,#adm1+code,#adm1+name,#adm2+code,#adm2+name,#adm+level,...,#org+type+desc,#sector+code,#sector+name,#date+start,#date+end,#meta+dataset_id,#meta+resource_id,#meta+warning,#meta+error,#org+name
1,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,National NGO,PRO,Protection,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,afghanistan development & welfare services org...
2,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,International NGO,PRO,Protection,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,aga khan agency for habitat
3,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,International Organization,HEA,Health,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,hope foundation
4,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,International NGO,HEA,Health,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,aga khan health service
5,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,International NGO,NUT,Nutrition,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,aga khan health service
6,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,National NGO,NUT,Nutrition,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,bakhtar development network
7,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,National NGO,PRO,Protection,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,coordination of humanitarian assistance
8,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,International NGO,SHL,Emergency Shelter and NFI,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,concern worldwide
9,AFG,Y,Y,Badakhshan,Arghanj Khwah,AF17,Badakhshan,AF1703,Arghanj Khwah,2,...,International NGO,WSH,Water Sanitation Hygiene,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,,,concern worldwide


In [16]:
op_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44037 entries, 0 to 44036
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   location_code           44037 non-null  category
 1   has_hrp                 44037 non-null  object  
 2   in_gho                  44037 non-null  object  
 3   provider_admin1_name    43209 non-null  object  
 4   provider_admin2_name    42008 non-null  object  
 5   admin1_code             43737 non-null  object  
 6   admin1_name             43737 non-null  object  
 7   admin2_code             41696 non-null  object  
 8   admin2_name             41696 non-null  object  
 9   admin_level             44037 non-null  object  
 10  org_acronym             43848 non-null  object  
 11  org_name                44037 non-null  object  
 12  org_type_description    42100 non-null  object  
 13  sector_code             43407 non-null  object  
 14  sector_name           

In [17]:
op_data.describe()

Unnamed: 0,location_code,has_hrp,in_gho,provider_admin1_name,provider_admin2_name,admin1_code,admin1_name,admin2_code,admin2_name,admin_level,...,org_type_description,sector_code,sector_name,reference_period_start,reference_period_end,dataset_hdx_id,resource_hdx_id,warning,error,matched_ngo
count,44037,44037,44037,43209,42008,43737,43737,41696,41696,44037,...,42100,43407,43407,44037,44037,44037,44037,1836,643,44037
unique,27,3,2,427,2930,364,354,2983,2841,4,...,12,19,19,17,17,27,27,93,14,1918
top,AFG,Y,Y,Ta'iz,Sucre,YE15,Ta'iz,CM007005,Sucre,2,...,National NGO,PRO,Protection,2025-01-01,2025-03-31,4aef067d-0761-4977-9020-e83d8f6908a4,4986afde-52ba-4dd7-9f01-b7965ccc07ae,PCode length TD14->TCD14,Unknown sector Coord. & Log. & Support Services,interaction
freq,10934,41019,44036,1152,156,1152,1152,149,156,41864,...,15303,8979,8979,21735,18498,10934,10934,264,438,3168


In [18]:
# Initialize tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(sector_map))

# Set up device and multi-GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = torch.nn.DataParallel(model)
model = model.to(device)
print(f"Using device: {device}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 2 GPUs!
Using device: cuda


In [19]:
# Create dataset and dataloader with batching
dataset_train = TextDataset(texts_train, labels_train, tokenizer)
dataloader_train = DataLoader(dataset_train, batch_size=64, shuffle=True)
dataset_val = TextDataset(texts_val, labels_val, tokenizer)
dataloader_val = DataLoader(dataset_val, batch_size=64)

# Training loop with proper loss aggregation
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [20]:
# Training loop with proper loss aggregation
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.train()
for epoch in range(10):
    total_loss = 0
    for batch in dataloader_train:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        if torch.cuda.device_count() > 1:
            loss = loss.mean()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader_train)
    print(f"Epoch {epoch+1}, Training Loss: {avg_loss}")



Epoch 1, Training Loss: 0.8542921611821497
Epoch 2, Training Loss: 0.06060895647901993
Epoch 3, Training Loss: 0.022932876215933207
Epoch 4, Training Loss: 0.012929356288068093
Epoch 5, Training Loss: 0.007840710695131451
Epoch 6, Training Loss: 0.005276711833022245
Epoch 7, Training Loss: 0.004050230816030851
Epoch 8, Training Loss: 0.00318168193047455
Epoch 9, Training Loss: 0.00241051695019297
Epoch 10, Training Loss: 0.0019371151446376915


In [21]:
# Validation loop
model.eval()
total_val_loss = 0
correct = 0
total = 0
with torch.no_grad():
    for batch in dataloader_val:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        if torch.cuda.device_count() > 1:
            loss = loss.mean()
        total_val_loss += loss.item()
        predictions = torch.argmax(outputs.logits, dim=1)
        total += batch['labels'].size(0)
        correct += (predictions == batch['labels']).sum().item()
avg_val_loss = total_val_loss / len(dataloader_val)
val_accuracy = correct / total
print(f"Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}")

Validation Loss: 0.001419169083237648, Validation Accuracy: 1.0


In [22]:
# Save model
model.module.save_pretrained('/kaggle/working/ai_agent_model') if hasattr(model, 'module') else model.save_pretrained('/kaggle/working/ai_agent_model')
tokenizer.save_pretrained('/kaggle/working/ai_agent_model')
print("AI Agent model trained and saved!")

# Clean up
del refugee_data, op_data, ngo_data, merged_data, train_data, dataset_train, dataset_val
gc.collect()

AI Agent model trained and saved!


23

In [23]:
print(f"Validation set size: {len(texts_val)}")
print(f"Unique labels in validation: {np.unique(labels_val)}")
print(f"Sample validation texts: {texts_val[:5]}")  # First 5 examples

Validation set size: 1224
Unique labels in validation: [ 0  1  2  3  4  5  6  7  8 10 11 12 13 14 15 16 17]
Sample validation texts: ['Food Security adventist development and relief agency TCD', 'Food Security al-takhadoum TCD', 'Emergency Shelter and NFI association des femmes allaintantes TCD', 'Nutrition première urgence internationale MLI', 'Protection hospice and palliative care association of zimbabwe (hospaz) ZWE']


In [24]:
model.eval()
test_texts = ["Protection unicef in location_code_456", "Education save the children in location_code_789", 
              "Health who in location_code_101"]
inputs = tokenizer(test_texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=1)
    for text, pred in zip(test_texts, predictions):
        print(f"Text: {text}, Predicted sector: {list(sector_map.keys())[pred.item()]}")

Text: Protection unicef in location_code_456, Predicted sector: Protection
Text: Education save the children in location_code_789, Predicted sector: Education
Text: Health who in location_code_101, Predicted sector: Health


### AI RECOMMENDER AGENT

In [25]:
import pandas as pd
import gc
from sentence_transformers import SentenceTransformer, util
import torch
from fuzzywuzzy import fuzz

# Load datasets
funding_data = pd.read_csv('/kaggle/input/refugeai-nexus-project-dataset/fts_outgoing_funding_global.csv', 
                           low_memory=False, comment='#', 
                           dtype={'destOrganization': 'category', 'destOrganizationTypes': 'category', 
                                  'destGlobalClusters': 'category', 'destLocations': 'category'})
funding_data['amountUSD'] = pd.to_numeric(funding_data['amountUSD'], errors='coerce').astype('float32')

appeals_data = pd.read_csv('/kaggle/input/refugeai-nexus-project-dataset/fts_requirements_funding_global.csv', 
                           low_memory=False, comment='#')
appeals_data['requirements'] = pd.to_numeric(appeals_data['requirements'], errors='coerce').astype('float32')
appeals_data['percentFunded'] = pd.to_numeric(appeals_data['percentFunded'], errors='coerce').astype('float32')

# Filter out invalid entries
funding_data = funding_data[funding_data['destGlobalClusters'].notna()]
appeals_data = appeals_data[appeals_data['requirements'].notna() & appeals_data['percentFunded'].notna()]

# Filter appeals by recent years (2023-2025)
appeals_data = appeals_data[appeals_data['year'].astype(int).between(2023, 2025)]

# Prepare text for embedding
funding_texts = (funding_data['destGlobalClusters'].astype(str).fillna('Unknown Cluster') + " " + 
                 funding_data['destLocations'].astype(str).fillna('Unknown Location') + " " + 
                 funding_data['destOrganization'].astype(str).fillna('Unknown Organization') + " " + 
                 funding_data['description'].astype(str).fillna('')).tolist()

appeals_texts = (appeals_data['name'].astype(str).fillna('Unknown Appeal') + " " + 
                 appeals_data['typeName'].astype(str).fillna('Unknown Type') + " " + 
                 appeals_data['countryCode'].astype(str).fillna('Unknown Country') + " " + 
                 appeals_data['requirements'].astype(str).fillna('0')).tolist()

In [26]:
from fuzzywuzzy import fuzz

# Load Sentence-BERT model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)
print(f"Using device: {device}")

# Generate embeddings
funding_embeddings = model.encode(funding_texts, convert_to_tensor=True, device=device, batch_size=16)
appeals_embeddings = model.encode(appeals_texts, convert_to_tensor=True, device=device, batch_size=16)

# Compute cosine similarities
cosine_scores = util.cos_sim(funding_embeddings, appeals_embeddings)

# Initialize counters
total_entries = 0
funded_matches = 0
closest_matches = 0
no_matches = 0

# Find top matches (top 1 for closest match, top 3 for funded matches)
top_k = min(3, len(appeals_texts))
for i in range(len(funding_texts)):
    top_results = torch.topk(cosine_scores[i], k=top_k)
    funding_amount = funding_data.iloc[i]['amountUSD']
    funding_location = funding_data.iloc[i]['destLocations']
    funding_org = funding_data.iloc[i]['destOrganization']
    total_entries += 1
    print(f"Funding Entry: {funding_texts[i]} (Amount: ${funding_amount:,.2f})")
    matches_found = False
    top_potential_idx = 0
    top_score = 0
    top_reason = ""
    for score, idx in zip(top_results[0], top_results[1]):
        idx = idx.item()
        appeal = appeals_data.iloc[idx]
        appeal_location = appeal['countryCode']
        appeal_org = appeal['name'].split()[0] if appeal['name'] else "Unknown"  # Simple org extraction
        location_match = str(funding_location) == str(appeal_location)
        # Fuzzy match for organizations
        org_similarity = fuzz.partial_ratio(str(funding_org).lower(), str(appeal_org).lower())
        org_match = org_similarity >= 80  # Threshold for organization match
        score_boost = 0.1 if location_match else 0
        adjusted_score = score.item() + score_boost
        # Check for funded matches
        if adjusted_score < 0.3:
            reason = "Low similarity score"
        elif appeal['percentFunded'] >= 100:
            reason = "Already fully funded"
        elif funding_amount < max(0.01 * appeal['requirements'], 100_000):
            reason = f"Funding too small: ${funding_amount:,.2f} vs min ${max(0.01 * appeal['requirements'], 100_000):,.2f}"
        else:
            matches_found = True
            funded_matches += 1
            print(f"  Funded Match: {appeals_texts[idx]} (Location Match: {location_match}, Org Similarity: {org_similarity}%), Score: {adjusted_score:.4f}, Requirements: ${appeal['requirements']:,.2f}, Percent Funded: {appeal['percentFunded']:.2f}%")
            continue
        # Track the top potential match for closest match
        if adjusted_score > top_score:
            top_score = adjusted_score
            top_potential_idx = idx
            top_reason = reason
    # Check for closest match (top scoring entry with org or location match)
    if not matches_found and top_potential_idx:
        appeal = appeals_data.iloc[top_potential_idx]
        appeal_location = appeal['countryCode']
        appeal_org = appeal['name'].split()[0] if appeal['name'] else "Unknown"
        location_match = str(funding_location) == str(appeal_location)
        org_similarity = fuzz.partial_ratio(str(funding_org).lower(), str(appeal_org).lower())
        org_match = org_similarity >= 80
        if top_score >= 0.3 and (location_match or org_match):
            closest_matches += 1
            print(f"  Closest Match: {appeals_texts[top_potential_idx]} (Location Match: {location_match}, Org Similarity: {org_similarity}%), Score: {top_score:.4f}, Requirements: ${appeal['requirements']:,.2f}, Percent Funded: {appeal['percentFunded']:.2f}%")
            matches_found = True
    if not matches_found:
        no_matches += 1
        if top_potential_idx:
            appeal = appeals_data.iloc[top_potential_idx]
            print(f"  No matches found. Closest potential: {appeals_texts[top_potential_idx]}, Score: {top_score:.4f}, Reason: {top_reason}")
        else:
            print("  No potential matches found.")

# Summary
print(f"\nSummary: Total Entries: {total_entries}, Funded Matches: {funded_matches}, Closest Matches: {closest_matches}, No Matches: {no_matches}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Using device: cuda


Batches:   0%|          | 0/109 [00:00<?, ?it/s]

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Funding Entry: Camp Coordination / Management,Emergency Shelter and NFI,Protection,Protection - Child Protection,Protection - Gender-Based Violence,Protection - Mine Action SDN United Nations High Commissioner for Refugees Sudan Humanitarian Response Plan 2024 (Amount: $5,000,000.00)
  Closest Match: Sudan Humanitarian Needs and Response Plan 2024 Humanitarian needs and response plan SDN 2695680800.0 (Location Match: True, Org Similarity: 40%), Score: 0.8596, Requirements: $2,695,680,768.00, Percent Funded: 70.00%
Funding Entry: Education MMR United Nations Children's Fund Myanmar: Learning Together Phase II programme (Amount: $990,753.00)
  Closest Match: Myanmar Original Humanitarian Needs and Response Plan 2025 Humanitarian needs and response plan MMR 1137811600.0 (Location Match: True, Org Similarity: 29%), Score: 0.7297, Requirements: $1,137,811,584.00, Percent Funded: 8.00%
Funding Entry: Food Security MMR World Food Programme Food Security - Food Assistance (Amount: $4,962,779.0

In [27]:
# Clean up
del funding_data, appeals_data, funding_embeddings, appeals_embeddings
gc.collect()

48

### CRISIS ALERTS

In [28]:
import pandas as pd
import numpy as np
from scipy import stats
from geopy.distance import geodesic
import datetime

# Load datasets
refugee_data = pd.read_csv('/kaggle/input/refugeai-nexus-project-dataset/data_3countries_refugees_public.csv', 
                           low_memory=False, dtype={'surveylocation': 'category', 'education_years3': 'float32', 'health': 'float32'})
appeals_data = pd.read_csv('/kaggle/input/refugeai-nexus-project-dataset/fts_requirements_funding_global.csv', 
                           low_memory=False, comment='#')
appeals_data['requirements'] = pd.to_numeric(appeals_data['requirements'], errors='coerce').astype('float32')
appeals_data['percentFunded'] = pd.to_numeric(appeals_data['percentFunded'], errors='coerce').astype('float32')

# Load GDACS dataset
gdacs_data = pd.read_csv('/kaggle/input/refugeai-nexus-project-dataset/gdacs_rss_information.csv', 
                         low_memory=False, comment='#')

# Convert GDACS dates to datetime
gdacs_data['from_date'] = pd.to_datetime(gdacs_data['from_date'], errors='coerce', utc=True)
current_date = pd.to_datetime("2025-06-01 08:18:00", utc=True)
gdacs_data = gdacs_data[gdacs_data['from_date'] >= (current_date - pd.Timedelta(days=30))]  # Last 30 days

# Filter appeals for recent years (2023-2025)
appeals_data = appeals_data[appeals_data['year'].astype(int).between(2023, 2025)]

# Define the surveylocation to lat/lon mapping
location_coords = {
    'Addis': (9.03, 38.74),
    'Kakuma': (3.72, 34.86),
    'Kampala': (0.3177137, 32.5813539),
    'Melkadida': (4.5277801, 41.7250996),
    'Nairobi': (-1.2832533, 36.817245),
    'Nakivale': (-0.7818948, 30.9475798)
}

# Add lat/lon columns to refugee_data
refugee_data['lat'] = refugee_data['surveylocation'].map(lambda x: location_coords[x][0])
refugee_data['lon'] = refugee_data['surveylocation'].map(lambda x: location_coords[x][1])

# Analyze refugee data trends
refugee_stats = refugee_data.groupby('surveylocation', observed=True).agg({
    'health': 'mean',
    'education_years3': 'mean',
    'surveylocation': 'count',
    'lat': 'first',
    'lon': 'first'
}).rename(columns={'surveylocation': 'refugee_count', 'health': 'avg_health', 'education_years3': 'avg_education'})

# Debug: Print refugee_stats to verify lat/lon
print("Refugee Stats with Coordinates:")
print(refugee_stats[['lat', 'lon']])

# Add z-scores for anomaly detection
refugee_stats['health_zscore'] = np.abs(stats.zscore(refugee_stats['avg_health'].fillna(refugee_stats['avg_health'].mean())))
refugee_stats['education_zscore'] = np.abs(stats.zscore(refugee_stats['avg_education'].fillna(refugee_stats['avg_education'].mean())))
refugee_stats['count_zscore'] = np.abs(stats.zscore(refugee_stats['refugee_count']))

Refugee Stats with Coordinates:
                     lat        lon
surveylocation                     
Addis           9.030000  38.740000
Kakuma          3.720000  34.860000
Kampala         0.317714  32.581354
Melkadida       4.527780  41.725100
Nairobi        -1.283253  36.817245
Nakivale       -0.781895  30.947580


In [29]:
# Flag locations with high anomalies
crisis_threshold = 1.5
crisis_locations = refugee_stats[
    (refugee_stats['health_zscore'] > crisis_threshold) |
    (refugee_stats['education_zscore'] > crisis_threshold) |
    (refugee_stats['count_zscore'] > crisis_threshold)
]

# Filter GDACS events for East Africa
east_africa_countries = ['Kenya', 'Ethiopia', 'Uganda', 'Nigeria', 'The Democratic Republic of Congo']
gdacs_data_east_africa = gdacs_data[gdacs_data['country'].isin(east_africa_countries)]

# Process East African GDACS events
print("\nProcessing East African GDACS Events:")
for _, gdacs_event in gdacs_data_east_africa.iterrows():
    event_id = gdacs_event['id']
    event_type = gdacs_event['event_type']
    severity = gdacs_event['severity_value']
    lat = gdacs_event['geo_lat']
    lon = gdacs_event['geo_long']
    country_code = gdacs_event['iso3'] if pd.notna(gdacs_event['iso3']) else 'Unknown'
    summary = gdacs_event['summary']
    deaths = gdacs_event.get('deaths', 0) if pd.notna(gdacs_event.get('deaths', 0)) else 0
    displaced = gdacs_event.get('displaced', 0) if pd.notna(gdacs_event.get('displaced', 0)) else 0
    
    # Dynamic radius: 1000 km for high-impact events (>50 deaths or >500 displaced), otherwise 500 km
    radius = 1000 if deaths > 50 or displaced > 500 else 500
    
    print(f"\nGDACS Alert: {event_type} (ID: {event_id}) - Severity: {severity}M, Location: ({lat}, {lon})")
    print(f"  Summary: {summary}")
    print(f"  Radius used: {radius} km (Deaths: {deaths}, Displaced: {displaced})")

    nearby_locations = []
    for location in refugee_stats.index:
        location_lat = refugee_stats.loc[location, 'lat']
        location_lon = refugee_stats.loc[location, 'lon']
        try:
            distance = geodesic((lat, lon), (location_lat, location_lon)).km
            print(f"  Debug: Distance from {location} ({location_lat}, {location_lon}) to event: {distance:.2f} km")
            if distance <= radius:
                nearby_locations.append((location, distance))
        except ValueError as e:
            print(f"  Warning: Could not calculate distance for {location} due to invalid coordinates: {e}")
            continue

    # Store distances for visualization (specifically for Kenyan flood)
    if event_id == "FL1103273":  # Kenyan flood
        kenyan_flood_distances = [(loc, dist) for loc, dist in zip(refugee_stats.index, [
            geodesic((lat, lon), (refugee_stats.loc[loc, 'lat'], refugee_stats.loc[loc, 'lon'])).km
            for loc in refugee_stats.index
        ])]

    if nearby_locations:
        print(f"  Nearby Refugee Locations (within {radius} km):")
        for location, distance in nearby_locations:
            health_zscore = refugee_stats.loc[location, 'health_zscore']
            education_zscore = refugee_stats.loc[location, 'education_zscore']
            count_zscore = refugee_stats.loc[location, 'count_zscore']
            print(f"  - {location}: Distance: {distance:.2f} km, Health Z-Score: {health_zscore:.2f}, Education Z-Score: {education_zscore:.2f}, Refugee Count Z-Score: {count_zscore:.2f}")
            
            location_appeals = appeals_data[appeals_data['countryCode'] == country_code]
            if location_appeals.empty:
                print(f"  Alert: {location} - High Risk due to {event_type} (No Appeals Data)")
                print("    Recommendation: Initiate appeal for this location due to lack of funding data.")
            else:
                avg_percent_funded = location_appeals['percentFunded'].mean()
                # Escalate to High Risk if displaced > 100, regardless of funding
                if displaced > 100 or avg_percent_funded < 65:  # Adjusted to 65%
                    print(f"  Alert: {location} - High Risk due to {event_type} {'(Displaced > 100)' if displaced > 100 else '(Underfunded)'}")
                    print(f"    Average Percent Funded: {avg_percent_funded:.2f}%")
                    print("    Recommendation: Increase funding allocation for this location.")
                else:
                    print(f"  Alert: {location} - At Risk due to {event_type} (Funded but Monitor)")
                    print(f"    Average Percent Funded: {avg_percent_funded:.2f}%")
                    print("    Recommendation: Monitor closely and ensure funding is effectively utilized.")
    else:
        print(f"  No nearby refugee locations within {radius} km.")

# Process remaining GDACS events
print("\nProcessing Other GDACS Events:")
for _, gdacs_event in gdacs_data[~gdacs_data['country'].isin(east_africa_countries)].iterrows():
    event_id = gdacs_event['id']
    event_type = gdacs_event['event_type']
    severity = gdacs_event['severity_value']
    lat = gdacs_event['geo_lat']
    lon = gdacs_event['geo_long']
    country_code = gdacs_event['iso3'] if pd.notna(gdacs_event['iso3']) else 'Unknown'
    summary = gdacs_event['summary']
    deaths = gdacs_event.get('deaths', 0) if pd.notna(gdacs_event.get('deaths', 0)) else 0
    displaced = gdacs_event.get('displaced', 0) if pd.notna(gdacs_event.get('displaced', 0)) else 0
    
    radius = 1000 if deaths > 50 or displaced > 500 else 500
    
    print(f"\nGDACS Alert: {event_type} (ID: {event_id}) - Severity: {severity}M, Location: ({lat}, {lon})")
    print(f"  Summary: {summary}")

    nearby_locations = []
    for location in refugee_stats.index:
        location_lat = refugee_stats.loc[location, 'lat']
        location_lon = refugee_stats.loc[location, 'lon']
        try:
            distance = geodesic((lat, lon), (location_lat, location_lon)).km
            if distance <= radius:
                nearby_locations.append((location, distance))
        except ValueError as e:
            print(f"  Warning: Could not calculate distance for {location} due to invalid coordinates: {e}")
            continue

    if nearby_locations:
        print(f"  Nearby Refugee Locations (within {radius} km):")
        for location, distance in nearby_locations:
            health_zscore = refugee_stats.loc[location, 'health_zscore']
            education_zscore = refugee_stats.loc[location, 'education_zscore']
            count_zscore = refugee_stats.loc[location, 'count_zscore']
            print(f"  - {location}: Distance: {distance:.2f} km, Health Z-Score: {health_zscore:.2f}, Education Z-Score: {education_zscore:.2f}, Refugee Count Z-Score: {count_zscore:.2f}")
            
            location_appeals = appeals_data[appeals_data['countryCode'] == country_code]
            if location_appeals.empty:
                print(f"  Alert: {location} - High Risk due to {event_type} (No Appeals Data)")
                print("    Recommendation: Initiate appeal for this location due to lack of funding data.")
            else:
                avg_percent_funded = location_appeals['percentFunded'].mean()
                if displaced > 100 or avg_percent_funded < 65:
                    print(f"  Alert: {location} - High Risk due to {event_type} {'(Displaced > 100)' if displaced > 100 else '(Underfunded)'}")
                    print(f"    Average Percent Funded: {avg_percent_funded:.2f}%")
                    print("    Recommendation: Increase funding allocation for this location.")
                else:
                    print(f"  Alert: {location} - At Risk due to {event_type} (Funded but Monitor)")
                    print(f"    Average Percent Funded: {avg_percent_funded:.2f}%")
                    print("    Recommendation: Monitor closely and ensure funding is effectively utilized.")
    else:
        print(f"  No nearby refugee locations within {radius} km.")

# Additional alerts for crisis locations
for location in crisis_locations.index:
    if 'nearby_locations' not in locals() or not any(loc[0] == location for loc in nearby_locations):
        location_appeals = appeals_data[appeals_data['countryCode'] == location]
        if location_appeals.empty:
            print(f"\nAlert: {location} - Potential Crisis (No Appeals Data)")
            print(f"  Health Z-Score: {crisis_locations.loc[location, 'health_zscore']:.2f}, Education Z-Score: {crisis_locations.loc[location, 'education_zscore']:.2f}, Refugee Count Z-Score: {crisis_locations.loc[location, 'count_zscore']:.2f}")
            print("  Recommendation: Initiate appeal for this location due to lack of funding data.")
        elif location_appeals['percentFunded'].mean() < 65:
            print(f"\nAlert: {location} - Potential Crisis (Underfunded)")
            print(f"  Health Z-Score: {crisis_locations.loc[location, 'health_zscore']:.2f}, Education Z-Score: {crisis_locations.loc[location, 'education_zscore']:.2f}, Refugee Count Z-Score: {crisis_locations.loc[location, 'count_zscore']:.2f}")
            print(f"  Average Percent Funded: {location_appeals['percentFunded'].mean():.2f}%")
            print("  Recommendation: Increase funding allocation for this location.")
    


Processing East African GDACS Events:

GDACS Alert: Flood (ID: FL1103303) - Severity: 0.0M, Location: (9.2957202, 5.0544281)
  Summary: On 28/05/2025, a flood started in Nigeria, lasting until 30/05/2025 (last update). The flood caused 97 deaths and 0 displaced .
  Radius used: 500 km (Deaths: 0, Displaced: 0)
  Debug: Distance from Addis (9.03, 38.74) to event: 3701.04 km
  Debug: Distance from Kakuma (3.72, 34.86) to event: 3352.06 km
  Debug: Distance from Kampala (0.3177137, 32.5813539) to event: 3207.70 km
  Debug: Distance from Melkadida (4.5277801, 41.7250996) to event: 4084.60 km
  Debug: Distance from Nairobi (-1.2832533, 36.817245) to event: 3711.13 km
  Debug: Distance from Nakivale (-0.7818948, 30.9475798) to event: 3079.34 km
  No nearby refugee locations within 500 km.

GDACS Alert: Wildfire (ID: WF1023915) - Severity: 6532.0M, Location: (-1.4152222041245606, 29.23917053396887)
  Summary: On 23/05/2025, a forest fire started in The Democratic Republic of Congo,  until 30