In [1]:
import pandas as pd
import json

from common_utils import get_config
from pseudo_anon_utils import load_sid_codes

In [2]:
# No logger is set up in this notebook; None is handed to the loader as-is.
logger = None

# Pick the server to query out of the project configuration.
config = get_config()
server = config["test_server"]

In [3]:
# Load the SID codes table; it supplies the StudyID and TCode columns used below.
sid_df = load_sid_codes(
    server=server,
    logger=logger,
)

In [4]:
# Keep one row per (StudyID, TCode) pair and cast the key to int so that it
# has the same dtype as the CSV key it is merged against.
# The cast is chained instead of assigned column-wise: writing a column into
# the frame returned by drop_duplicates() can trigger SettingWithCopyWarning.
sid_df = (
    sid_df[["StudyID", "TCode"]]
    .drop_duplicates()
    .astype({"StudyID": int})
)

In [5]:
# Absolute path (N: drive) of the input CSV that is read with pd.read_csv below.
csv_path = r"N:\CancerEpidem\BrBreakthrough\DeliveryProcess\Data_Output_Testing\temp_fam_hist\Q14famhistcancer.csv"

In [6]:
# Name of the participant-key column in the CSV.
csv_studyid = "studyid"

# Family-history columns to read from the CSV. Each column is listed next to
# its "N"-suffixed companion (renamed to an "...Num" column later on).
fam_hist_cols = [
    "famca", "famcaN",
    "fambrca", "fambrcaN",
    "famovca", "famovcaN",
    "famcolorec", "famcolorecN",
    "famprostate", "famprostateN",
]

In [7]:
# Read just the key column and the family-history columns, then force the key
# to int so it can be joined against the StudyID column of the SID codes table.
df_csv = pd.read_csv(
    csv_path,
    usecols=[csv_studyid, *fam_hist_cols],
).astype({csv_studyid: int})

In [8]:
# Attach the family-history columns to every SID row.
# how="left" keeps each row of sid_df; validate="m:1" makes pandas raise if a
# study id occurs more than once in the CSV.
# NOTE(review): with a left join, CSV rows whose study id is absent from
# sid_df do not appear in the result — confirm this is the intended direction.
df_merged = pd.merge(
    sid_df,
    df_csv,
    how="left",
    left_on="StudyID",
    right_on=csv_studyid,
    validate="m:1",
)

In [9]:
# Report CSV study ids that cannot be mapped to a TCode.
# This is derived from the two input frames rather than from df_merged: the
# merge is a left join from sid_df, so CSV rows without a matching StudyID
# never reach df_merged and a "TCode is null" test on it cannot detect them.
# SID rows with a null TCode are excluded so that a CSV id matched only to
# such a row is still reported.
known_ids = sid_df.loc[sid_df["TCode"].notna(), "StudyID"]
missing = df_csv.loc[~df_csv[csv_studyid].isin(known_ids), csv_studyid].unique()

if len(missing) > 0:
    print(f"WARNING: {len(missing)} StudyID(s) in the CSV were not found in SIDCodes")

In [10]:
# Remove both copies of the participant identifier so that TCode is the only
# identifier left in each record. The CSV key is referenced through
# csv_studyid (the name used for the read and the merge) rather than a
# repeated literal, so the three places cannot drift apart.
df_merged = df_merged.drop(columns=[csv_studyid, "StudyID"])

In [None]:
# Output column names: TCode and every CSV column get an "R0_"-prefixed name.
rename_map = {
    "TCode": "R0_TCode",
    "famca": "R0_FamHistCancer",
    "famcaN": "R0_FamHistCancerNum",
    "fambrca": "R0_FamHistBC",
    "fambrcaN": "R0_FamHistBCNum",
    "famovca": "R0_FamHistOV",
    "famovcaN": "R0_FamHistOVNum",
    "famcolorec": "R0_FamHistColo",
    "famcolorecN": "R0_FamHistColoNum",
    "famprostate": "R0_FamHistProst",
    "famprostateN": "R0_FamHistProstNum",
}

# Apply the mapping to the column labels; labels not in the map are left alone.
df_merged = df_merged.rename(rename_map, axis="columns")

In [12]:
# Convert the frame to a list of per-row dicts for JSON export.
# Missing values are replaced by None first: json.dump would otherwise write
# NaN tokens, which are not valid JSON and are rejected by strict parsers.
# The cast to object dtype is needed so the columns can actually hold None.
result = (
    df_merged.astype(object)
    .where(df_merged.notna(), None)
    .to_dict(orient="records")
)

In [13]:
# Destination file for the JSON export.
output_path = r"N:\CancerEpidem\BrBreakthrough\DeliveryProcess\Data_Output_Testing\temp_fam_hist\fam_hist.json"

# Serialise the records with two-space indentation straight into the file.
with open(output_path, mode="w") as json_file:
    json.dump(result, fp=json_file, indent=2)

In [14]:
# Preview only the first few records; echoing the whole list floods the
# notebook output and bloats the saved file.
result[:5]

[{'R0_TCode': 'T261E618',
  'R0_FamHistCancer': 1.0,
  'R0_FamHistCancerNum': 1.0,
  'R0_FamHistBC': 1.0,
  'R0_FamHistBCNum': 1.0,
  'R0_FamHistOV': 0.0,
  'R0_FamHistOVNum': 0.0,
  'R0_FamHistColo': 0.0,
  'R0_FamHistColoNum': 0.0,
  'R0_FamHistProst': 0.0,
  'R0_FamHistProstNum': 0.0},
 {'R0_TCode': 'T261F619',
  'R0_FamHistCancer': 1.0,
  'R0_FamHistCancerNum': 1.0,
  'R0_FamHistBC': 0.0,
  'R0_FamHistBCNum': 0.0,
  'R0_FamHistOV': 0.0,
  'R0_FamHistOVNum': 0.0,
  'R0_FamHistColo': 0.0,
  'R0_FamHistColoNum': 0.0,
  'R0_FamHistProst': 0.0,
  'R0_FamHistProstNum': 0.0},
 {'R0_TCode': 'T261H621',
  'R0_FamHistCancer': 0.0,
  'R0_FamHistCancerNum': 0.0,
  'R0_FamHistBC': 0.0,
  'R0_FamHistBCNum': 0.0,
  'R0_FamHistOV': 0.0,
  'R0_FamHistOVNum': 0.0,
  'R0_FamHistColo': 0.0,
  'R0_FamHistColoNum': 0.0,
  'R0_FamHistProst': 0.0,
  'R0_FamHistProstNum': 0.0},
 {'R0_TCode': 'T261J623',
  'R0_FamHistCancer': 0.0,
  'R0_FamHistCancerNum': 0.0,
  'R0_FamHistBC': 0.0,
  'R0_FamHistBCNum': 0.0