In [1]:
import pandas as pd
import re
import requests
from io import BytesIO
import numpy as np
from typing import Optional

In [2]:
file_path = "/Users/shraddhakakade/Documents/Assessments/GHI Project/ghi_project/ors_2015.csv"
main_df = pd.read_csv(file_path)

In [3]:
new_header = main_df.iloc[1]
main_df = main_df.iloc[3:]
main_df.columns = new_header
main_df = main_df.reset_index(drop=True)
main_df.head()

1,Country,WHO Region,Population,Geographical Region,WHO Group,DALY,Adult DALYs,Children DALYs,Retention Rate,Retention Rate (ADULT),...,All ages,Children (0-14),Adults (15+),Year,NaN,http://apps.who.int/gho/data/node.main.626?lang=en,Estimated antiretroviral therapy coverage among people living with HIV (%),Reported number of people receiving antiretroviral therapy,Cleaned coverage,Cleaned number of people receiving antiretroviral therapy
0,Afghanistan,EMR,33736494.0,"East, South and South-East Asia",A,10752.55,9224.37,1528.18,72.0,73.0,...,92,100.0,92.0,2015,,Afghanistan,5 [3-12],364,5.00%,364.0
1,Albania,EUR,2880703.0,Europe and Central Asia,A,98.5,96.6,1.9,92.0,92.0,...,92,77.0,92.0,2015,,Albania,No data,423,,423.0
2,Algeria,AFR,39871528.0,Middle East and North Africa,A,11586.04,11055.12,530.92,92.0,92.0,...,100,,,2015,,Algeria,90 [70->95],7 915,90.00%,7915.0
3,American Samoa,WPR,55537.0,,A,28.52,25.85,2.67,97.14,97.14,...,66,,,2015,,Andorra,High-income country,No data,,
4,Andorra,EUR,78014.0,,A,83.37,83.2,0.17,97.14,97.14,...,85,100.0,85.0,2015,,Angola,29 [20-40],90 204,29.00%,90204.0


In [4]:
# Check data types of columns
print(main_df[['Country', 'WHO Region', 'Population', 'WHO Group', 'DALY', 
    'Adult DALYs', 'Children DALYs', 'Retention Rate', 
    'Retention Rate (ADULT)', 'Retention Rate (CHILD)',
    '3TC', 'ABC', 'AZT', 'ddl', 'd4T', 'EFV', 'FTC', 
    'LPV/r', 'NVP', 'TDF', 'ATV/r', 'Overall Treatment Impact', 
    'Cleaned coverage']].dtypes)

1
Country                     object
Country                     object
WHO Region                  object
Population                  object
WHO Group                   object
DALY                        object
Adult DALYs                 object
Children DALYs              object
Retention Rate              object
Retention Rate (ADULT)      object
Retention Rate (CHILD)      object
3TC                         object
ABC                         object
AZT                         object
ddl                         object
d4T                         object
EFV                         object
FTC                         object
LPV/r                       object
NVP                         object
TDF                         object
ATV/r                       object
Overall Treatment Impact    object
Cleaned coverage            object
dtype: object


In [5]:
main_df.columns

Index([                                                                   'Country',
                                                                       'WHO Region',
                                                                       'Population',
                                                              'Geographical Region',
                                                                        'WHO Group',
                                                                             'DALY',
                                                                      'Adult DALYs',
                                                                   'Children DALYs',
                                                                   'Retention Rate',
                                                           'Retention Rate (ADULT)',
                                                           'Retention Rate (CHILD)',
                                                            '# Re

In [6]:
main_df = main_df.iloc[:, [i for i in range(len(main_df.columns)) if i != 73]]

In [7]:
# ---------- Excel column letters -> 0-based position ----------
def _col_idx(col_letters: str) -> int:
    col_letters = col_letters.strip().upper()
    n = 0
    for ch in col_letters:
        n = n * 26 + (ord(ch) - ord("A") + 1)
    return n - 1

def _to_float(x):
    """Robust float conversion; handles %, commas, blanks."""
    if pd.isna(x):
        return np.nan
    if isinstance(x, str):
        s = x.strip().replace(",", "")
        if s == "" or s.lower() == "nan":
            return np.nan
        if s.endswith("%"):
            try:
                return float(s[:-1]) / 100.0
            except Exception:
                return np.nan
        try:
            return float(s)
        except Exception:
            return np.nan
    try:
        return float(x)
    except Exception:
        return np.nan

def _drug_token_from_colname(colname) -> str:
    """
    Returns the drug name directly from the column header.
    Since headers are already clean (e.g., '3TC'), we just strip whitespace.
    """
    return str(colname).strip()

def _contains_drug(regimen_text, drug_token: str) -> bool:
    if pd.isna(regimen_text):
        return False
    reg = str(regimen_text).upper().replace(" ", "")
    tok = str(drug_token).upper().replace(" ", "")
    return tok != "" and tok in reg

def _regimen_size(regimen_text) -> float:
    """
    Excel uses /3 or /4 depending on how many drugs are in the regimen.
    We infer from the regimen string (e.g., 'AZT + 3TC + NVP' -> 3).
    """
    if pd.isna(regimen_text):
        return 3.0
    s = str(regimen_text)
    parts = [p.strip() for p in re.split(r"\+", s) if p.strip() and p.strip().lower() != "nan"]
    if len(parts) >= 2:
        return float(len(parts))
    # fallback if someone used commas
    parts2 = [p.strip() for p in s.split(",") if p.strip() and p.strip().lower() != "nan"]
    if len(parts2) >= 2:
        return float(len(parts2))
    return 3.0

def _impact_term(base, const, coef1, var, coef2, denom) -> float:
    """
    Matches pattern:
      base * const * coef1 * var * coef2 / (1 - const*coef1*var*coef2) / denom
    """
    if denom == 0 or pd.isna(denom):
        return 0.0
    if any(pd.isna(v) for v in [base, const, coef1, var, coef2]):
        return 0.0
    prod = const * coef1 * var * coef2
    d = 1.0 - prod
    if d == 0:
        return 0.0
    return (base * prod / d) / denom

In [37]:
def compute_impact_score_2015_dynamic(
    main_df: pd.DataFrame,
    drug_col_index: int,
    output_col: Optional[str] = None,
    excel_row0: int = 5,
    # b_regimen_excel_rows = range(8, 21),
    # a_regimen_excel_rows = range(27, 38),
    first_line_reg_col: str = "AS",
    second_line_reg_col: str = "AS",
    debug: bool = False,
    debug_show_first_n_rows: int = 3,
):


    # --- indices for the country-row inputs (same as your 2017 notebook positions) ---
    idx_E = _col_idx("E")   # WHO Group (A/B)
    idx_G = _col_idx("G")   # Adult DALYs
    idx_H = _col_idx("H")   # Children DALYs
    idx_I = _col_idx("I")   # Retention Rate (overall) used in normalization
    idx_Q = _col_idx("Q")   # adult coverage factor (matches your 2017 code: iloc[16])
    idx_T = _col_idx("T")   # child coverage factor (matches your 2017 code: iloc[19])

    # --- constants columns ---
    idx_AP = _col_idx("AP")
    idx_AQ = _col_idx("AQ")

    # --- regimen block start columns (we’ll use offsets from the regimen-name col) ---
    idx_FL_REG = _col_idx(first_line_reg_col)   # AS
    idx_SL_REG = _col_idx(second_line_reg_col)  # AS

    # First-line: AS (reg), AT (adult%), AU (adult eff), AV (child%), AW (child eff)
    idx_FL_AT = idx_FL_REG + 1
    idx_FL_AU = idx_FL_REG + 2
    idx_FL_AV = idx_FL_REG + 3
    idx_FL_AW = idx_FL_REG + 4

    # Second-line: AS (reg), AT (adult%), AU (adult eff), AV (child%), AW (child eff)
    idx_SL_AT = idx_SL_REG + 1
    idx_SL_AU = idx_SL_REG + 2
    idx_SL_AV = idx_SL_REG + 3
    idx_SL_AW = idx_SL_REG + 4

    # --- excel row -> df index ---
    def r(excel_row: int) -> int:
        return excel_row - excel_row0

    # b_rows = [r(x) for x in b_regimen_excel_rows if 0 <= r(x) < len(main_df)]
    # a_rows = [r(x) for x in a_regimen_excel_rows if 0 <= r(x) < len(main_df)]

    # --- pull constants from absolute rows (row numbers match the Excel formula) ---
    # B constants (AP)
    AP5  = _to_float(main_df.iat[r(5),  idx_AP])
    AP6  = _to_float(main_df.iat[r(6),  idx_AP])
    AP10 = _to_float(main_df.iat[r(10), idx_AP])
    AP11 = _to_float(main_df.iat[r(11), idx_AP])

    # # A constants (AQ)
    # AQ5  = _to_float(main_df.iat[r(5),  idx_AQ])
    # AQ6  = _to_float(main_df.iat[r(6),  idx_AQ])
    # AQ10 = _to_float(main_df.iat[r(10), idx_AQ])
    # AQ11 = _to_float(main_df.iat[r(11), idx_AQ])

    colname = main_df.columns[drug_col_index]
    drug_token = _drug_token_from_colname(colname)

    if output_col is None:
        output_col = f"Computed Impact Score ({drug_token})"

    if debug:
        print("---- DEBUG (constants) ----")
        print(f"Drug column name: {colname!r}  -> token used in search: {drug_token!r}")
        print(f"AP5={AP5}, AP6={AP6}, AP10={AP10}, AP11={AP11}")
        # print(f"AQ5={AQ5}, AQ6={AQ6}, AQ10={AQ10}, AQ11={AQ11}")
        print(f"First-line regimen col: {first_line_reg_col} (idx {idx_FL_REG})")
        print(f"Second-line regimen col: {second_line_reg_col} (idx {idx_SL_REG})")
        print("---------------------------")

    scores = []

    for i in range(len(main_df)):
        row = main_df.iloc[i]

        try:
            who = str(row.iloc[idx_E]).strip().upper()
            G = _to_float(row.iloc[idx_G])
            H = _to_float(row.iloc[idx_H])
            Q = _to_float(row.iloc[idx_Q])
            T = _to_float(row.iloc[idx_T])
            I = _to_float(row.iloc[idx_I])
        except Exception:
            scores.append(0.0)
            continue

        # # Choose regimen block + constants by WHO group
        # if who == "B":
        #     rows_idx = b_rows
        #     k_FL_adult, k_FL_child = AP5, AP10
        #     k_SL_adult, k_SL_child = AP6, AP11
        # else:
        #     rows_idx = a_rows
        #     k_FL_adult, k_FL_child = AQ5, AQ10
        #     k_SL_adult, k_SL_child = AQ6, AQ11

        total = 0.0
        fl_hits = 0
        sl_hits = 0

        # k_FL_adult = _to_float(main_df.iat[r(5),  idx_AP])  # Always use AP5
        # k_SL_adult = _to_float(main_df.iat[r(6),  idx_AP])  # Always use AP6
        # k_FL_child = _to_float(main_df.iat[r(10), idx_AP]) # Always use AP10
        # k_SL_child = _to_float(main_df.iat[r(11), idx_AP]) # Always use AP11

        k_FL_adult = AP5
        k_SL_adult = AP6
        k_FL_child = AP10
        k_SL_child = AP11

        fl_rows = range(0, 14)  # First-line Table: Excel Rows 5 to 18
        sl_rows = range(16, 31) # Second-line Table: Excel Rows 21 to 35

        # First-line Table Loop
        for rr in fl_rows:
            fl_reg = main_df.iat[rr, idx_FL_REG]
            if _contains_drug(fl_reg, drug_token):
                denom = _regimen_size(fl_reg)
                AT = _to_float(main_df.iat[rr, idx_FL_AT])
                AU = _to_float(main_df.iat[rr, idx_FL_AU])
                AV = _to_float(main_df.iat[rr, idx_FL_AV])
                AW = _to_float(main_df.iat[rr, idx_FL_AW])
                total += _impact_term(G, k_FL_adult, AT, Q, AU, denom)
                total += _impact_term(H, k_FL_child, AV, T, AW, denom)
                fl_hits += 1
        
        # Second-line Table Loop
        for rr in sl_rows:
            sl_reg = main_df.iat[rr, idx_SL_REG]
            if _contains_drug(sl_reg, drug_token):
                denom2 = _regimen_size(sl_reg)
                AT = _to_float(main_df.iat[rr, idx_SL_AT])
                AU = _to_float(main_df.iat[rr, idx_SL_AU])
                AV = _to_float(main_df.iat[rr, idx_SL_AV])
                AW = _to_float(main_df.iat[rr, idx_SL_AW])
                total += _impact_term(G, k_SL_adult, AT, Q, AU, denom2)
                total += _impact_term(H, k_SL_child, AV, T, AW, denom2)
                sl_hits += 1

        # Normalize like Excel: /(100/(100-I))
        try:
            norm = 100.0 / (100.0 - I)
            result = total / norm if norm != 0 and not pd.isna(norm) else 0.0
        except Exception:
            result = 0.0

        if debug and i < debug_show_first_n_rows:
            print(f"[DEBUG row {i}] WHO={who}, FL_hits={fl_hits}, SL_hits={sl_hits}, result={result}")

        scores.append(result)

    main_df[output_col] = scores
    return main_df

In [39]:
main_df = compute_impact_score_2015_dynamic(
    main_df,
    drug_col_index=24,   # Impact of 3TC
    output_col="Computed Impact Score (3TC)",
    debug=True
)

main_df[["Country", "WHO Group", "3TC", "Computed Impact Score (3TC)"]].head(10)

---- DEBUG (constants) ----
Drug column name: '3TC'  -> token used in search: '3TC'
AP5=0.9084, AP6=0.0917, AP10=0.8933, AP11=0.10679999999999999
First-line regimen col: AS (idx 44)
Second-line regimen col: AS (idx 44)
---------------------------
[DEBUG row 0] WHO=A, FL_hits=11, SL_hits=10, result=35.65697005376054
[DEBUG row 1] WHO=A, FL_hits=11, SL_hits=10, result=0.0
[DEBUG row 2] WHO=A, FL_hits=11, SL_hits=10, result=186.1546512229944


1,Country,WHO Group,3TC,Computed Impact Score (3TC)
0,Afghanistan,A,35.67,35.65697
1,Albania,A,0.0,0.0
2,Algeria,A,186.17,186.154651
3,American Samoa,A,0.0,0.054973
4,Andorra,A,0.18,0.176934
5,Angola,A,1085.33,1084.641846
6,Anguilla,B,0.0,0.0
7,Antigua and Barbuda,B,0.83,0.826551
8,Argentina,B,3783.18,3782.141604
9,Armenia,A,5.91,5.907704


In [41]:
drug_start_idx = 24
drug_end_idx = 35

for idx in range(drug_start_idx, drug_end_idx):
    # Get the drug name directly from the column header (e.g., "3TC")
    drug = main_df.columns[idx]
    
    # Run the dynamic calculator
    # We pass the index directly so the function knows which drug name to search for
    main_df = compute_impact_score_2015_dynamic(
        main_df,
        drug_col_index=idx,
        output_col=f"Computed Impact Score ({drug})",
        debug=False
    )

In [42]:
computed_cols = [c for c in main_df.columns if "Computed Impact Score" in str(c)]
print(f"Successfully calculated impact for: {computed_cols}")

Successfully calculated impact for: ['Computed Impact Score (3TC)', 'Computed Impact Score (ABC)', 'Computed Impact Score (AZT)', 'Computed Impact Score (ddl)', 'Computed Impact Score (d4T)', 'Computed Impact Score (EFV)', 'Computed Impact Score (FTC)', 'Computed Impact Score (LPV/r)', 'Computed Impact Score (NVP)', 'Computed Impact Score (TDF)', 'Computed Impact Score (ATV/r)']


In [43]:
main_df.columns

Index([                                                                   'Country',
                                                                       'WHO Region',
                                                                       'Population',
                                                              'Geographical Region',
                                                                        'WHO Group',
                                                                             'DALY',
                                                                      'Adult DALYs',
                                                                   'Children DALYs',
                                                                   'Retention Rate',
                                                           'Retention Rate (ADULT)',
                                                           'Retention Rate (CHILD)',
                                                            '# Re

In [13]:
# --- your computed drug impact columns ---
computed_drug_cols_wanted = [
    'Computed Impact Score (3TC)',
    'Computed Impact Score (ABC)',
    'Computed Impact Score (AZT)',
    'Computed Impact Score (ddl)',
    'Computed Impact Score (d4T)',
    'Computed Impact Score (EFV)',
    'Computed Impact Score (FTC)',
    'Computed Impact Score (LPV/r)',
    'Computed Impact Score (NVP)',
    'Computed Impact Score (TDF)',
    'Computed Impact Score (ATV/r)',
]

# only keep the ones that exist right now
computed_drug_cols = [c for c in computed_drug_cols_wanted if c in main_df.columns]
missing = [c for c in computed_drug_cols_wanted if c not in main_df.columns]
if missing:
    print("⚠️ Missing computed columns (not included in overall):")
    for m in missing:
        print("   -", m)

# ensure numeric
for c in computed_drug_cols:
    main_df[c] = pd.to_numeric(main_df[c], errors="coerce")

# per-country overall computed impact
main_df["Computed Overall Treatment Impact"] = main_df[computed_drug_cols].sum(axis=1, skipna=True)


In [14]:
global_computed_total = main_df["Computed Overall Treatment Impact"].sum(skipna=True)
global_actual_total = pd.to_numeric(main_df["Overall Treatment Impact"], errors="coerce").sum(skipna=True)

print("Global totals (sum across countries):")
print("  Computed:", global_computed_total)
print("  Actual  :", global_actual_total)
print("  Diff    :", global_computed_total - global_actual_total)


Global totals (sum across countries):
  Computed: 2628695.054668932
  Actual  : 18060.239999999998
  Diff    : 2610634.814668932


In [15]:
from IPython.display import display

pairs = [
    ("3TC",   "3TC",   "Computed Impact Score (3TC)"),
    ("ABC",   "ABC",   "Computed Impact Score (ABC)"),
    ("AZT",   "AZT",   "Computed Impact Score (AZT)"),
    ("ddl",   "ddl",   "Computed Impact Score (ddl)"),
    ("d4T",   "d4T",   "Computed Impact Score (d4T)"),
    ("EFV",   "EFV",   "Computed Impact Score (EFV)"),
    ("FTC",   "FTC",   "Computed Impact Score (FTC)"),
    ("LPV/r", "LPV/r", "Computed Impact Score (LPV/r)"),
    ("NVP",   "NVP",   "Computed Impact Score (NVP)"),
    ("TDF",   "TDF",   "Computed Impact Score (TDF)"),
    ("ATV/r", "ATV/r", "Computed Impact Score (ATV/r)"),
    ("Overall", "Overall Treatment Impact", "Computed Overall Treatment Impact"),
]

# Keep only pairs where both columns exist
pairs_existing = []
for drug, a, c in pairs:
    if a in main_df.columns and c in main_df.columns:
        pairs_existing.append((drug, a, c))
    else:
        print(f"⚠️ Skipping {drug}: missing column(s):",
              [x for x in [a, c] if x not in main_df.columns])

# Make sure numeric
for _, a, c in pairs_existing:
    main_df[a] = main_df[a].astype(str).str.replace(',', '')
    # main_df[a] = pd.to_numeric(main_df[a], errors="coerce")
    # main_df[c] = pd.to_numeric(main_df[c], errors="coerce")
    main_df[a] = main_df[a].apply(_to_float)
    main_df[c] = main_df[c].apply(_to_float)

# Build comparison table
out = main_df[["Country", "WHO Group"]].copy()

for drug, a, c in pairs_existing:
    out[f"{drug} | Actual"] = main_df[a]
    out[f"{drug} | Computed"] = main_df[c]
    out[f"{drug} | Diff (Comp-Act)"] = main_df[c] - main_df[a]

display(out.head(50))


1,Country,WHO Group,3TC | Actual,3TC | Computed,3TC | Diff (Comp-Act),ABC | Actual,ABC | Computed,ABC | Diff (Comp-Act),AZT | Actual,AZT | Computed,...,NVP | Diff (Comp-Act),TDF | Actual,TDF | Computed,TDF | Diff (Comp-Act),ATV/r | Actual,ATV/r | Computed,ATV/r | Diff (Comp-Act),Overall | Actual,Overall | Computed,Overall | Diff (Comp-Act)
0,Afghanistan,A,35.67,35.65697,-0.01303,3.48,3.473098,-0.006902,15.83,15.803499,...,-0.021208,23.75,23.304535,-0.445465,0.41,0.41325,0.00325,129.23,128.739844,-0.490156
1,Albania,A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Algeria,A,186.17,186.154651,-0.015349,3.71,3.70587,-0.00413,70.8,70.774092,...,-0.017841,164.99,162.15127,-2.83873,2.51,2.514908,0.004908,710.73,707.831168,-2.898832
3,American Samoa,A,0.0,0.054973,0.054973,0.0,0.000219,0.000219,0.0,0.020445,...,0.022447,0.0,0.050376,0.050376,0.0,0.000863,0.000863,0.0,0.211947,0.211947
4,Andorra,A,0.18,0.176934,-0.003066,0.0,0.000706,0.000706,0.07,0.065803,...,0.002248,0.17,0.162137,-0.007863,0.0,0.002778,0.002778,0.69,0.682165,-0.007835
5,Angola,A,1085.33,1084.641846,-0.688154,84.82,84.582495,-0.237505,466.43,465.590231,...,-0.595435,780.41,765.713507,-14.696493,13.09,13.1016,0.0116,3984.25,3967.187773,-17.062227
6,Anguilla,B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Antigua and Barbuda,B,0.83,0.826551,-0.003449,0.0,0.003299,0.003299,0.31,0.307399,...,-0.002492,0.77,0.757427,-0.012573,0.01,0.012978,0.002978,3.2,3.186746,-0.013254
8,Argentina,B,3783.18,3782.141604,-1.038396,122.26,121.954029,-0.305971,1502.67,1501.509892,...,-0.872007,3192.02,3134.186965,-57.833035,50.93,50.953069,0.023069,14313.3,14251.899679,-61.400321
9,Armenia,A,5.91,5.907704,-0.002296,0.02,0.024004,0.004004,2.21,2.206543,...,0.000181,5.51,5.405492,-0.104508,0.09,0.094398,0.004398,22.88,22.779895,-0.100105


In [16]:
pd.set_option('display.max_rows', None)
main_df

1,Country,WHO Region,Population,Geographical Region,WHO Group,DALY,Adult DALYs,Children DALYs,Retention Rate,Retention Rate (ADULT),...,Computed Impact Score (AZT),Computed Impact Score (ddl),Computed Impact Score (d4T),Computed Impact Score (EFV),Computed Impact Score (FTC),Computed Impact Score (LPV/r),Computed Impact Score (NVP),Computed Impact Score (TDF),Computed Impact Score (ATV/r),Computed Overall Treatment Impact
0,Afghanistan,EMR,33736494.0,"East, South and South-East Asia",A,10752.55,9224.37,1528.18,72.0,73.0,...,15.803499,0.10269,0.523876,21.382818,7.064748,4.605568,16.408792,23.304535,0.41325,128.739844
1,Albania,EUR,2880703.0,Europe and Central Asia,A,98.5,96.6,1.9,92.0,92.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Algeria,AFR,39871528.0,Middle East and North Africa,A,11586.04,11055.12,530.92,92.0,92.0,...,70.774092,0.088308,0.455957,140.024797,49.135041,15.754115,77.072159,162.15127,2.514908,707.831168
3,American Samoa,WPR,55537.0,,A,28.52,25.85,2.67,97.14,97.14,...,0.020445,0.0,0.0,0.042529,0.015481,0.004614,0.022447,0.050376,0.000863,0.211947
4,Andorra,EUR,78014.0,,A,83.37,83.2,0.17,97.14,97.14,...,0.065803,0.0,0.0,0.136882,0.049825,0.014851,0.072248,0.162137,0.002778,0.682165
5,Angola,AFR,27859305.0,Sub-Saharan Africa,A,648951.4,485015.9,163935.5,97.14,97.14,...,465.590231,2.456196,12.553451,689.599617,232.443192,127.941072,488.564565,765.713507,13.1016,3967.187773
6,Anguilla,AMR,14723.0,,B,0.0,0.0,0.0,97.14,97.14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Antigua and Barbuda,AMR,99923.0,,B,413.85,388.67,25.18,97.14,97.14,...,0.307399,0.0,0.0,0.639448,0.232759,0.069375,0.337508,0.757427,0.012978,3.186746
8,Argentina,AMR,43417765.0,Latin America and the Caribbean,B,78326.48,73886.85,4439.63,66.0,97.14,...,1501.509892,3.138379,16.204321,2719.546475,953.955856,350.571096,1617.737993,3134.186965,50.953069,14251.899679
9,Armenia,EUR,2916950.0,Europe and Central Asia,A,776.95,773.96,2.98,85.0,85.0,...,2.206543,0.0,0.0,4.553953,1.664224,0.503396,2.420181,5.405492,0.094398,22.779895


In [17]:
main_df.to_csv("updated_HIV2015_nb.csv", index=False)
print("File saved as updated_HIV2015_nb.csv")

File saved as updated_HIV2015_nb.csv


In [1]:
# Define the pairs of columns to compare (Actual from dataset vs. your Computed scores)
pairs = [
    ("3TC",   "3TC",   "Computed Impact Score (3TC)"),
    ("ABC",   "ABC",   "Computed Impact Score (ABC)"),
    ("AZT",   "AZT",   "Computed Impact Score (AZT)"),
    ("ddl",   "ddl",   "Computed Impact Score (ddl)"),
    ("d4T",   "d4T",   "Computed Impact Score (d4T)"),
    ("EFV",   "EFV",   "Computed Impact Score (EFV)"),
    ("FTC",   "FTC",   "Computed Impact Score (FTC)"),
    ("LPV/r", "LPV/r", "Computed Impact Score (LPV/r)"),
    ("NVP",   "NVP",   "Computed Impact Score (NVP)"),
    ("TDF",   "TDF",   "Computed Impact Score (TDF)"),
    ("ATV/r", "ATV/r", "Computed Impact Score (ATV/r)"),
    ("Overall", "Overall Treatment Impact", "Computed Overall Treatment Impact"),
]

# Build the comparison dataframe
impact_comparison_df = main_df[["Country", "WHO Group"]].copy()

for drug_label, actual_col, computed_col in pairs:
    if actual_col in main_df.columns and computed_col in main_df.columns:
        # Convert to float to ensure clean comparison
        impact_comparison_df[f"{drug_label}_Actual"] = main_df[actual_col].apply(_to_float)
        impact_comparison_df[f"{drug_label}_Computed"] = main_df[computed_col].apply(_to_float)
        # Optional: Add a difference column for easy validation
        impact_comparison_df[f"{drug_label}_Diff"] = impact_comparison_df[f"{drug_label}_Computed"] - impact_comparison_df[f"{drug_label}_Actual"]

# Save to CSV
impact_comparison_df.to_csv("impact_score.csv", index=False)
print("File 'impact_score.csv' has been created successfully.")

NameError: name 'main_df' is not defined