In [1]:
# 1) Imports dependencies
import os
import math
import random
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
import joblib
print('Environment ready. pandas:', pd.__version__, 'numpy:', np.__version__)


Environment ready. pandas: 2.2.2 numpy: 2.0.2


In [2]:
# 2) Helper functions: haversine and feature engineering
EARTH_RADIUS_KM = 6371.0

def haversine_km(lat1, lon1, lat2, lon2):
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlambda = math.radians(lon2 - lon1)
    a = math.sin(dphi / 2.0) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2.0) ** 2
    return 2 * EARTH_RADIUS_KM * math.asin(math.sqrt(a))

def build_features_for_model(df):
    d = df.copy()
    d['alert_timestamp'] = pd.to_datetime(d['alert_timestamp'])
    d['last_donation_date'] = pd.to_datetime(d['last_donation_date'])
    d['days_since_last_donation'] = (d['alert_timestamp'] - d['last_donation_date']).dt.days.clip(lower=0)
    d['is_available'] = (d['availability_status'].astype(str).str.lower() == 'yes').astype(int)
    d['blood_type_match'] = (d['blood_group_donor'] == d['blood_type_needed']).astype(int)
    d['log_donations_365'] = np.log1p(d['donations_last_365d'])
    d['log_distance'] = np.log1p(d['distance_km'])
    d['hour'] = d['alert_timestamp'].dt.hour
    d['weekday'] = d['alert_timestamp'].dt.weekday
    urgency_map = {'Low':0, 'Medium':1, 'High':2, 'Critical':3}
    d['urgency_num'] = d['urgency'].map(urgency_map).fillna(0).astype(int)
    X = d[['age','blood_type_match','distance_km','log_distance','days_since_last_donation',
           'log_donations_365','past_response_rate','is_available','urgency_num','hour','weekday']]
    X = X.fillna(0)

In [3]:
# 3) Simulate a small dataset (fast) - donors, hospitals, and historical alerts
random.seed(42)
np.random.seed(42)

# Parameters (you can increase n_donors and days_history for larger experiments)
n_donors = 500
n_hospitals = 4
days_history = 90  # days of historical alerts

city_center = (22.5726, 88.3639)  # example center (Kolkata)
blood_groups = ['A+','A-','B+','B-','O+','O-','AB+','AB-']
genders = ['M','F','O']
urgencies = ['Low','Medium','High','Critical']

# Create hospitals
hospitals = []
for i in range(n_hospitals):
    lat = city_center[0] + random.uniform(-0.06, 0.06)
    lon = city_center[1] + random.uniform(-0.06, 0.06)
    hospitals.append({'hospital_id': i+1, 'lat':lat, 'lon':lon, 'name':f'Hospital_{i+1}'})

hosp_df = pd.DataFrame(hospitals)

# Create donors
donors = []
for i in range(n_donors):
    lat = city_center[0] + random.uniform(-0.12, 0.12)
    lon = city_center[1] + random.uniform(-0.12, 0.12)
    last_donation = datetime.utcnow() - timedelta(days=random.randint(30, 720))
    donations_last_365 = random.choices([0,1,2,3,4,5], weights=[30,30,20,10,8,2])[0]
    pr = min(1.0, max(0.0, np.random.beta(2,5)))
    donors.append({'donor_id': i+1, 'lat':lat, 'lon':lon, 'age':random.randint(18,65),
                   'gender':random.choice(genders), 'blood_group':random.choice(blood_groups),
                   'last_donation_date': last_donation, 'donations_last_365d': donations_last_365,
                   'availability_status': random.choice(['Yes']*8 + ['No']*2),
                   'past_response_rate': round(pr,3)})

donors_df = pd.DataFrame(donors)
print('Donors:', len(donors_df), 'Hospitals:', len(hosp_df))

Donors: 500 Hospitals: 4


In [4]:
# 4) Generate historical blood requests and donor-alert pairs with simulated responses
pairs = []
for day_offset in range(days_history, 0, -1):
    date = datetime.utcnow() - timedelta(days=day_offset)
    for _ in range(random.randint(0,3)):
        hosp = hosp_df.sample(1).iloc[0]
        req_bg = random.choice(blood_groups)
        urgency = random.choices(urgencies, weights=[40,30,20,10])[0]
        candidate_ids = donors_df.sample(min(80, len(donors_df))).index
        for idx in candidate_ids:
            donor = donors_df.loc[idx]
            dist = haversine_km(hosp['lat'], hosp['lon'], donor['lat'], donor['lon'])
            base = 0.05
            if donor['blood_group'] == req_bg:
                base += 0.25
            base += max(0, (10 - dist) * 0.02)
            days_since = (date - donor['last_donation_date']).days
            if 30 < days_since < 365:
                base += 0.05
            base += donor['past_response_rate'] * 0.2
            if donor['availability_status'] == 'Yes':
                base += 0.05
            if urgency == 'Critical':
                base += 0.05
            base = min(0.98, max(0.0, base))
            responded = random.random() < base
            response_time_hours = random.uniform(0.5,24.0) if responded else None
            pairs.append({
                'alert_timestamp': date,
                'hospital_id': hosp['hospital_id'],
                'hospital_lat': hosp['lat'],
                'hospital_lon': hosp['lon'],
                'donor_id': donor['donor_id'],
                'lat': donor['lat'],
                'lon': donor['lon'],
                'age': donor['age'],
                'gender': donor['gender'],
                'blood_group_donor': donor['blood_group'],
                'blood_type_needed': req_bg,
                'last_donation_date': donor['last_donation_date'],
                'donations_last_365d': donor['donations_last_365d'],
                'availability_status': donor['availability_status'],
                'past_response_rate': donor['past_response_rate'],
                'urgency': urgency,
                'distance_km': dist,
                'responded': int(responded)
            })

pairs_df = pd.DataFrame(pairs)
print('Generated donor-alert pairs:', len(pairs_df))


Generated donor-alert pairs: 10320


In [6]:

import numpy as np
import pandas as pd
import math
from datetime import datetime

In [18]:
def build_features_for_model(df):
    df = df.copy()

# --- Updated build_features_for_model ---
def build_features_for_model(df):
    df = df.copy()

    # Encode categorical gender
    if 'gender' in df.columns:
        df['gender_encoded'] = df['gender'].astype('category').cat.codes
    else:
        df['gender_encoded'] = 0

    # Encode categorical blood_type
    if 'blood_type' in df.columns:
        df['blood_type_encoded'] = df['blood_type'].astype('category').cat.codes
    else:
        df['blood_type_encoded'] = 0

    # Urgency level encoding
    if 'urgency_level' in df.columns:
        df['urgency_encoded'] = df['urgency_level'].astype('category').cat.codes
    else:
        df['urgency_encoded'] = 0

    # Days since last donation
    if 'last_donation_date' in df.columns:
        df['last_donation_days_ago'] = (pd.to_datetime(df['alert_timestamp']) -
                                        pd.to_datetime(df['last_donation_date'])).dt.days.fillna(9999)
    else:
        df['last_donation_days_ago'] = 9999

    # Fill missing numerical features
    for col in ['age', 'distance_km', 'donation_frequency_per_year', 'past_response_rate']:
        if col not in df.columns:
            df[col] = 0
        df[col] = df[col].fillna(0)

    feature_cols = [
        'age',
        'gender_encoded',
        'blood_type_encoded',
        'distance_km',
        'last_donation_days_ago',
        'donation_frequency_per_year',
        'past_response_rate',
        'urgency_encoded'
    ]
    return df[feature_cols]


In [19]:

pairs_df['alert_timestamp'] = pd.to_datetime(pairs_df['alert_timestamp'], errors='coerce')

# Drop bad timestamps if any
n_bad_ts = pairs_df['alert_timestamp'].isna().sum()
if n_bad_ts > 0:
    print(f"Warning: {n_bad_ts} rows have invalid 'alert_timestamp' and will be dropped.")
    pairs_df = pairs_df.dropna(subset=['alert_timestamp']).reset_index(drop=True)


In [20]:
# Sort chronologically
pairs_df = pairs_df.sort_values('alert_timestamp').reset_index(drop=True)


In [21]:
# Ensure label exists
if 'responded' not in pairs_df.columns:
    raise KeyError("pairs_df is missing required column 'responded' (the label).")
pairs_df['label'] = pairs_df['responded'].astype(int)


In [22]:
# Build features
X = build_features_for_model(pairs_df)
if X is None or not hasattr(X, 'iloc'):
    raise TypeError("build_features_for_model returned None or a non-DataFrame.")

In [23]:
# Target labels
y = pairs_df['label'].reset_index(drop=True)


In [24]:
# Train/test split (80/20, time-based)
split_idx = int(len(pairs_df) * 0.8)
if split_idx < 1 or split_idx >= len(pairs_df):
    raise ValueError("Not enough rows to split into train/test.")

In [25]:
X_train = X.iloc[:split_idx].reset_index(drop=True)
X_test  = X.iloc[split_idx:].reset_index(drop=True)
y_train = y.iloc[:split_idx].reset_index(drop=True)
y_test  = y.iloc[split_idx:].reset_index(drop=True)

In [26]:
# Diagnostics
print(f"Total rows: {len(pairs_df)}")
print(f"Train rows: {len(X_train)}  |  Test rows: {len(X_test)}")
print("Feature columns:", list(X_train.columns))
print("Any NaNs in X_train?", X_train.isna().any().any())

display(X_train.head())
display(y_train.head())

Total rows: 10320
Train rows: 8256  |  Test rows: 2064
Feature columns: ['age', 'gender_encoded', 'blood_type_encoded', 'distance_km', 'last_donation_days_ago', 'donation_frequency_per_year', 'past_response_rate', 'urgency_encoded']
Any NaNs in X_train? False


Unnamed: 0,age,gender_encoded,blood_type_encoded,distance_km,last_donation_days_ago,donation_frequency_per_year,past_response_rate,urgency_encoded
0,41,0,0,13.641309,44,0,0.231,0
1,37,2,0,9.850419,-7,0,0.264,0
2,33,0,0,14.576033,394,0,0.063,0
3,33,2,0,10.996911,356,0,0.108,0
4,37,0,0,19.602812,473,0,0.398,0


Unnamed: 0,label
0,0
1,1
2,0
3,1
4,0


In [27]:
# 6) Train Logistic Regression and Random Forest; evaluate
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
lr.fit(X_train_scaled, y_train)
probs_lr = lr.predict_proba(X_test_scaled)[:,1]
auc_lr = roc_auc_score(y_test, probs_lr)
ap_lr = average_precision_score(y_test, probs_lr)

# Random Forest (no scaling needed)
rf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
rf.fit(X_train, y_train)
probs_rf = rf.predict_proba(X_test)[:,1]
auc_rf = roc_auc_score(y_test, probs_rf)
ap_rf = average_precision_score(y_test, probs_rf)

def precision_at_k(y_true, y_scores, k=5):
    order = np.argsort(-y_scores)
    topk = order[:k]
    return y_true.iloc[topk].sum() / float(k)

p5_lr = precision_at_k(y_test.reset_index(drop=True), pd.Series(probs_lr), 5)
p5_rf = precision_at_k(y_test.reset_index(drop=True), pd.Series(probs_rf), 5)

print(f'LogReg AUC={auc_lr:.4f} AP={ap_lr:.4f} P@5={p5_lr:.4f}')
print(f'RF     AUC={auc_rf:.4f} AP={ap_rf:.4f} P@5={p5_rf:.4f}')


LogReg AUC=0.5688 AP=0.2995 P@5=0.6000
RF     AUC=0.5428 AP=0.2792 P@5=0.6000


In [28]:
# 7) Choose best by average precision and save pipeline
model_dir = '/mnt/data/models'
os.makedirs(model_dir, exist_ok=True)

if ap_rf >= ap_lr:
    best_model = rf
    best_name = 'random_forest'
    best_probs = probs_rf
else:
    best_model = lr
    best_name = 'logistic_regression'
    best_probs = probs_lr

artifact = {'model': best_model, 'scaler': scaler, 'feature_columns': list(X.columns)}
joblib.dump(artifact, os.path.join(model_dir, f'{best_name}_pipeline.joblib'))
print('Saved best model pipeline to', os.path.join(model_dir, f'{best_name}_pipeline.joblib'))

Saved best model pipeline to /mnt/data/models/logistic_regression_pipeline.joblib


In [29]:
# 8) Minimal inference demo: rank candidates for a new request
artifact = joblib.load('/mnt/data/models/' + best_name + '_pipeline.joblib')
model = artifact['model']
scaler = artifact['scaler']
feat_cols = artifact['feature_columns']

new_hosp = hosp_df.sample(1).iloc[0]
req_bg = random.choice(blood_groups)
alert_time = datetime.utcnow()

candidates = donors_df.sample(200).copy().reset_index(drop=True)
candidates['blood_type_needed'] = req_bg
candidates['alert_timestamp'] = alert_time
candidates['distance_km'] = candidates.apply(lambda r: haversine_km(new_hosp['lat'], new_hosp['lon'], r['lat'], r['lon']), axis=1)
candidates['blood_group_donor'] = candidates['blood_group']
candidates['urgency'] = 'Critical'
candidates['last_donation_date'] = candidates['last_donation_date']

X_new = build_features_for_model(candidates)
X_new_scaled = scaler.transform(X_new)
scores = model.predict_proba(X_new_scaled)[:,1] if hasattr(model, 'predict_proba') else model.predict(X_new_scaled)
candidates['score'] = scores
top = candidates.sort_values('score', ascending=False).head(10)[['donor_id','blood_group','distance_km','past_response_rate','availability_status','score']]
print('Top candidate donors for', req_bg, 'at hospital', new_hosp['name'])
top


Top candidate donors for AB- at hospital Hospital_4


Unnamed: 0,donor_id,blood_group,distance_km,past_response_rate,availability_status,score
66,194,AB-,4.34328,0.724,Yes,0.68819
59,431,AB+,1.011897,0.562,No,0.668038
136,55,A+,2.268506,0.461,Yes,0.659357
62,262,AB+,4.786185,0.547,Yes,0.642803
74,122,AB+,3.216165,0.539,Yes,0.642335
139,470,O+,4.64832,0.462,Yes,0.639609
71,156,A-,6.719357,0.579,Yes,0.639402
99,88,O-,0.584165,0.305,Yes,0.62806
126,401,AB+,2.640588,0.406,Yes,0.623091
86,202,AB-,2.541377,0.464,Yes,0.620497
