In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler


In [2]:
# Load the cleaned 2019 dataset and discard the "Taille" column right away,
# so the cell yields the same frame no matter how often it is re-run.
df = pd.read_csv("data/data_2019_cleaned.csv").drop(columns=["Taille"])

In [3]:
# Keep only the rows whose eligibility status is exactly "Eligible".
# The copy makes df_eligible independent of df for later column assignments.
is_eligible = df["ELIGIBILITE AU DON."] == "Eligible"
df_eligible = df.loc[is_eligible].copy()

In [4]:
# Maps each raw profession label to a broader profession group.
# Keys are matched exactly (case- and accent-sensitive) by Series.map.
#
# NOTE: "Intendant infirmier superieur" used to be listed twice (under
# "Administration & Gestion" and under "Sante & Social"). In a dict literal the
# last occurrence wins, so the effective group was always "Sante & Social";
# the overridden duplicate has been removed to make that explicit.
profession_groups = {
    # Industry & construction
    "Chaudronnier": "Industrie & BTP", "Soudeur": "Industrie & BTP", "Mecanicien": "Industrie & BTP",
    "Macon": "Industrie & BTP", "Technicien en metallurgie": "Industrie & BTP", "Technicien genie civil": "Industrie & BTP",
    "Electrotechnicien": "Industrie & BTP", "Electricien en batiment": "Industrie & BTP", "Peintre": "Industrie & BTP",
    "Plombier": "Industrie & BTP", "Menuisier": "Industrie & BTP", "Carreleur": "Industrie & BTP",
    "Decorateur batiment": "Industrie & BTP", "Technicien etancheite": "Industrie & BTP",

    # Trade & entrepreneurship
    "Commercant": "Commerce & Entrepreneuriat", "Negociant bois": "Commerce & Entrepreneuriat",
    "Vendeur": "Commerce & Entrepreneuriat", "Entrepreneur": "Commerce & Entrepreneuriat",
    "Business man": "Commerce & Entrepreneuriat", "Trader": "Commerce & Entrepreneuriat",
    "Agent commercial": "Commerce & Entrepreneuriat", "Agent immobilier": "Commerce & Entrepreneuriat",
    "Restaurateur": "Commerce & Entrepreneuriat", "Magasinier": "Commerce & Entrepreneuriat",

    # Transport & logistics
    "Chauffeur": "Transport & Logistique", "Machiniste": "Transport & Logistique", "Docker": "Transport & Logistique",
    "Grutier": "Transport & Logistique", "Logisticien": "Transport & Logistique", "Transitaire": "Transport & Logistique",
    "Agent fret airport": "Transport & Logistique", "Gestionnaire de vols": "Transport & Logistique",
    "Conducteur": "Transport & Logistique",

    # Administration & management
    "Secretaire comptable": "Administration & Gestion", "Comptable": "Administration & Gestion",
    "Comptable financier": "Administration & Gestion", "Gestionnaire": "Administration & Gestion",
    "Assistant administratif": "Administration & Gestion", "Auditeur interne": "Administration & Gestion",
    "Administrateur": "Administration & Gestion", "Charge de clientele": "Administration & Gestion",
    "Charge de communication": "Administration & Gestion",

    # IT & telecommunications
    "Informaticien": "Informatique & Telecommunications", "Developpeur en informatique": "Informatique & Telecommunications",
    "Technicien reseaux telecoms": "Informatique & Telecommunications", "Analyste-programmeur": "Informatique & Telecommunications",
    "Informaticien de reseau": "Informatique & Telecommunications", "Infographe": "Informatique & Telecommunications",
    "Content manager": "Informatique & Telecommunications",

    # Education & research
    "Enseignant": "Education & Recherche", "Professeur": "Education & Recherche",
    "Etudiant": "Education & Recherche", "Eleve": "Education & Recherche", "Stagiaire": "Education & Recherche",
    "Assistant juridique": "Education & Recherche",

    # Security & defense
    "Agent de securite": "Securite & Defense", "Chef de securite": "Securite & Defense",
    "Gendarme": "Securite & Defense", "Militaire": "Securite & Defense", "Brancardier": "Securite & Defense",

    # Health & social
    "Medecin": "Sante & Social", "Personnel de sante": "Sante & Social",
    "Technicien de laboratoire": "Sante & Social", "Aide chirurgien": "Sante & Social",
    "Assistant infirmier": "Sante & Social", "Intendant infirmier superieur": "Sante & Social",

    # Art & culture
    "Beat maker": "Art & Culture", "Realisateur": "Art & Culture",
    "Chantre musicien": "Art & Culture", "Serigraphe": "Art & Culture", "Coiffeur": "Art & Culture",

    # Services & other
    "Agent d'entretien": "Services & Autres", "Agent technique": "Services & Autres",
    "Technicien": "Services & Autres", "Electricien": "Services & Autres", "Hotelier": "Services & Autres",
    "Patissier": "Services & Autres", "Agent de maintenance industrielle": "Services & Autres",
    "Employe": "Services & Autres", "Operateur economique": "Services & Autres",

    # Unemployed / unspecified
    "Sans emploi": "Sans emploi & Divers", "Pas precise": "Sans emploi & Divers"
}

# Apply the grouping: look each profession up in profession_groups and label
# every profession that has no entry there as "Autres".
mapped_groups = df_eligible["Profession"].map(profession_groups)
df_eligible["Profession_Groupe"] = mapped_groups.fillna("Autres")


In [None]:
# Preprocessed data for ML Model
def preprocessed_data(df):
    """Clean the donor dataframe and split it into train and test sets.

    The function parses the date columns, derives ``days_since_last_don``,
    drops the unused columns, encodes the ``eligibility`` target as integers,
    converts the binary flag columns to integers and finally performs an
    80/20 train/test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. ``form_fill_date`` and ``eligibility`` are required; every
        other column is processed only when present. ``df`` is not modified.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split(X, y, test_size=0.2, random_state=42)``.

    Raises
    ------
    KeyError
        If ``form_fill_date`` or ``eligibility`` is missing from ``df``.
    """
    data = df.copy()

    # Parse the dates and derive "days_since_last_don". This has to happen
    # BEFORE 'last_donation_date' is dropped below; otherwise the feature can
    # never be built.
    data['form_fill_date'] = pd.to_datetime(data['form_fill_date'], errors='coerce')
    if 'last_donation_date' in data.columns:
        last_donation = pd.to_datetime(data['last_donation_date'], errors='coerce')
        elapsed_days = (data['form_fill_date'] - last_donation).dt.days
        # 9999 is the sentinel for a missing or negative gap (NaN >= 0 is False,
        # so missing values are replaced as well).
        data['days_since_last_don'] = elapsed_days.where(elapsed_days >= 0, 9999).astype(int)

    # Drop unwanted columns (only those actually present)
    columns_to_drop = ['height', 'last_donation_date', 'submission_status','last_menstrual_date','other_total_ineligible_reasons']
    existing_cols_to_drop = [col for col in columns_to_drop if col in data.columns]
    print("  Columns to drop found in data:", existing_cols_to_drop)
    data = data.drop(columns=existing_cols_to_drop)

    # Encode the target first, then remove it from the features. drop() is used
    # without inplace=True because the in-place form returns None.
    target = 'eligibility'
    y = data[target].map({'Eligible': 1, 'Temporairement Non-eligible': 2, 'Definitivement non-eligible':3}).fillna(0).astype(int)
    X = data.drop(columns=[target])

    binary_features = ["has_donated_before", "ineligible_antibiotics",
                      "ineligible_low_hemoglobin", "ineligible_recent_donation",
                      "ineligible_recent_sti", "female_ineligible_menstrual",
                      "female_ineligible_breastfeeding", "female_ineligible_postpartum",
                      "female_ineligible_miscarriage", "female_ineligible_pregnant",
                      "total_ineligible_transfusion_history",
                      "total_ineligible_hiv_hbs_hcv",
                      "total_ineligible_surgery", "total_ineligible_sickle_cell",
                      "total_ineligible_diabetes", "total_ineligible_hypertension",
                      "total_ineligible_asthma", "total_ineligible_heart_disease",
                      "total_ineligible_tattoo", "total_ineligible_scarification"]
    for col in binary_features:
        if col in X.columns:
            # Plain column assignment replaces the column, so the integer dtype
            # is kept; non-numeric and missing entries become 0.
            X[col] = pd.to_numeric(X[col], errors='coerce').fillna(0).astype(int)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return  X_train, X_test, y_train, y_test

def train_Model(X_train, y_train):
    """Grid-search a random-forest pipeline and return the best estimator.

    A preprocessing step is built from the feature lists below (restricted to
    the columns actually present in ``X_train``): numerical columns are
    median-imputed and standardised, categorical columns are imputed with the
    most frequent value and one-hot encoded, binary columns are passed through
    unchanged. All other columns are dropped by the ColumnTransformer.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; columns are selected by name.
    y_train : array-like
        Training labels. With more than two distinct labels the grid search is
        scored with one-vs-rest ROC AUC, otherwise with plain ROC AUC.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``best_estimator_`` of the fitted 5-fold ``GridSearchCV``.
    """
    categorical_features = ["education_level", 'gender', "marital_status", 
                            'profession', "residence_district", "residence_neighborhood", 
                            "nationality", 'religion']
    numerical_features = ['age', "weight", "hemoglobin_level", 'days_since_last_don']

    binary_features = ["has_donated_before", "ineligible_antibiotics",
                      "ineligible_low_hemoglobin", "ineligible_recent_donation",
                      "ineligible_recent_sti", "female_ineligible_menstrual",
                      "female_ineligible_breastfeeding", "female_ineligible_postpartum",
                      "female_ineligible_miscarriage", "female_ineligible_pregnant",
                      "total_ineligible_transfusion_history",
                      "total_ineligible_hiv_hbs_hcv",
                      "total_ineligible_surgery", "total_ineligible_sickle_cell",
                      "total_ineligible_diabetes", "total_ineligible_hypertension",
                      "total_ineligible_asthma", "total_ineligible_heart_disease",
                      "total_ineligible_tattoo", "total_ineligible_scarification"] 

    # Only reference columns that exist, e.g. 'days_since_last_don' is absent
    # when the input had no 'last_donation_date'.
    categorical_cols = [col for col in categorical_features if col in X_train.columns]
    numerical_cols = [col for col in numerical_features if col in X_train.columns]
    binary_cols = [col for col in binary_features if col in X_train.columns]

    # The pipeline previously referenced an undefined `preprocessor`; it is now
    # built here from the feature lists above.
    # NOTE(review): the imputation/encoding choices are a reasonable default,
    # not recovered from the original code; adjust if another scheme was intended.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Categories unseen in a CV training fold must not make scoring fail.
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        ('bin', 'passthrough', binary_cols)
    ])

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced'))  # Add balancing
    ])
    
    param_grid = {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5]
    }

    # The plain 'roc_auc' scorer only supports binary targets; a multiclass
    # target needs the one-vs-rest variant.
    n_classes = len(np.unique(y_train))
    scoring = 'roc_auc' if n_classes <= 2 else 'roc_auc_ovr'

    # Announce the step before the (potentially long) fit, not after it.
    print("Step 4: Training model...")
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring=scoring, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    print("  Best parameters:", grid_search.best_params_)
    print("  Best CV AUC:", grid_search.best_score_)
    return grid_search.best_estimator_


In [None]:
    donor_candidates_birth, donors = data[0], data[1]
    if donor_candidates_birth is not None:
        filtered_df1, filtered_df12 = apply_filters(age_range, weight_range, gender, district, data)
        cluster_col, profile_col = st.columns([2, 1])
        with cluster_col:
            st.markdown("<div class='chart-container'>", unsafe_allow_html=True)
            st.subheader("Donor Candidates Clustering")
            if 'age' in filtered_df1.columns and 'weight' in filtered_df1.columns:
                X = filtered_df1[['age', 'weight']].dropna()
                if len(X) > 3:
                    kmeans = KMeans(n_clusters=3, random_state=42)
                    clusters = kmeans.fit_predict(X)
                    cluster_df = pd.DataFrame({'Age': X['age'], 'Weight': X['weight'], 'Cluster': ['Cluster ' + str(i+1) for i in clusters]})
                    fig = px.scatter(cluster_df, x='Age', y='Weight', color='Cluster', color_discrete_sequence=['#B22222', '#FF8C00', '#4682B4'], title="Donor Segments by Age and Weight")
                    fig.update_layout(height=500)
                    st.plotly_chart(fig, use_container_width=True)
            st.markdown("</div>", unsafe_allow_html=True)
        with profile_col:
            st.markdown("<div class='metric-container'>", unsafe_allow_html=True)
            st.subheader("Donor Personas")
            st.write("### Cluster 1: Regular Donors")
            st.write("- *Age Range:* 30-45\n- *Weight:* 70-90 kg\n- *Key Motivator:* Altruism")
            st.write("### Cluster 2: Occasional Donors")
            st.write("- *Age Range:* 20-35\n- *Weight:* 60-80 kg\n- *Key Motivator:* Social recognition")
            st.write("### Cluster 3: Family Donors")
            st.write("- *Age Range:* 35-60\n- *Weight:* 65-85 kg\n- *Key Motivator:* Family needs")
            st.markdown("</div>", unsafe_allow_html=True)


# District / locality labels, in their original order.
# NOTE(review): "LimbÃ©" looks like a mis-encoded "Limbé" (UTF-8 read as
# Latin-1). It is kept verbatim because it may mirror the raw data; confirm
# against the dataset before normalising it.
arrondissements = [
    "Douala III",
    "Douala",
    "Douala V",
    "Douala I",
    "Yaoundé",
    "Douala II",
    "Douala IV",
    "Bafoussam",
    "Dschang",
    "Buea",
    "Non precise",
    "Kribi",
    "Njombé",
    "Tiko",
    "Edéa",
    "Manjo",
    "West",
    "Oyack",
    "Deido",
    "Douala VI",
    "Batie",
    "Bomono ba mbegue",
    "Meiganga",
    "Sud ouest tombel",
    "Ngodi bakoko",
    "LimbÃ©",
    "Dcankongmondo",
    "Boko",
]