# Process Data Table over Lung Cancer Patient
Reference on [Process Description](https://www.notion.so/Causal-Analysis-over-Lung-Cancer-Patient-38668ea27d0e49df88791d7e4b841e98#e044372a34994bc9b547b7e4994d33f5)

# Data Processing into 'generated' directory

In the relational database, each patient doesn't have multiple records for same family with same cancer. 
For example, we don't have patient1 who has 2 aunt with breast cancer

SELECT t.ehr, COUNT(CONCAT(t.family_member, ',', t.cancer_type)) as num, COUNT(DISTINCT CONCAT(t.family_member, ',', t.cancer_type)) as uniq_num FROM `family_antecedents_treatment_line` t group by t.ehr,t.treatment_line ORDER by num DESC

# Analysis

In [13]:
import pandas as pd
import numpy as np
import copy
from collections import Counter
from itertools import combinations

def process_df(df1):
    for col in df1.columns:
        if 'Patient_id' == col:
            continue
        if ('UNK' in df1[col].unique()) :
            print(col, 'UNK')
        if ('-' in df1[col].unique()):
            print(col, '-')
        df1[col] = df1[col].fillna("UNK")
        df1[col] = df1[col].astype(str).apply(lambda x: x.strip())
    df1 = df1.replace(to_replace={"UNK":np.nan, '-': np.nan})
    return df1

In [14]:
##########################################################################################
##               Data Preprocessing  load patient's info
##########################################################################################

# 1. single value
gender_age_smoker_df = pd.read_csv('genereated/gender_age_smoker.csv')
age_threshold = 50
gender_age_smoker_df.Age = gender_age_smoker_df.Age.apply(lambda x: 'Young' if x <= age_threshold else 'Old' if x > age_threshold else np.nan)
gender_age_smoker_df = process_df(gender_age_smoker_df)

# 2. multi-vlaue
mutation_df = process_df(pd.read_csv('genereated/mutation.csv'))
family_df = process_df(pd.read_csv('genereated/family_cancer_fgender_degree.csv'))


Family UNK
FamilyCancer -
FamilyDegree UNK


In [37]:
print(gender_age_smoker_df.Smoker.unique())
print(gender_age_smoker_df.Gender.unique())
print(gender_age_smoker_df.Age.unique())
print()
print(mutation_df.Biomarker.unique())
print()
print(family_df.Family.unique())
print(family_df.FamilyGender.astype(str).unique())
print(family_df.FamilyDegree.unique())
print(family_df.FamilyCancer.unique())

s1 = set(family_df.loc[(family_df.Family=='No')].Patient_id)
s2 = set(family_df.loc[(family_df.Family!='No') &(family_df.Family.notnull())].Patient_id)
print(s1.intersection(s2))

# family_df.loc[(family_df.Family.notnull())].Family.unique()

['Current-Smoker' 'Former-Smoker' nan 'Never-Smoker']
['Male' 'Female']
['Old' 'Young' nan]

['KRAS' 'HER2Mut' 'MET' 'BRAF' 'RET' 'FGFR1' 'ALK' 'EGFR' 'HER2' 'ROS1'
 'PDL1' 'NoMutation' nan]

['Father' 'Brother' 'No' nan 'Uncle' 'Sister' 'Mother' 'Female_Cousin'
 'Grandfather' 'Aunt' 'Grandmother' 'Male_Cousin' 'Daughter' 'Son'
 'Greatgrandmother' 'Niece' 'Nephew' 'Halfbrother' 'Greatgrandfather'
 'Grandson' 'Halfsister' 'Granddaughter']
['Male' 'No' 'nan' 'Female']
['First_Degree' 'No' nan 'Second_Degree' 'Third_Degree']
['Lung' 'No' nan 'Esophagogastric' 'Head and neck' 'Others' 'Breast'
 'Renal' 'Prostate' 'Uterus/cervical' 'Leukemia' 'Colorrectal' 'Liver'
 'Pancreatic' 'Unknown origin carcinoma' 'Melanoma'
 'Central nervous system' 'Bladder/urinary tract' 'Lymphoma' 'Sarcoma'
 'Skin no melanoma' 'Ovarian' 'Germinal tumors' 'Gall bladder']
set()


In [1]:
def sub_pop(gender=None, age=None, biomarker=None, smoking=None):
    df = copy.deepcopy(gender_age_smoker_df)
    
    if gender:
        if gender not in df.Gender.unique():
            raise Exception("the value of Gender {gender} is not acceptable".format(gender))
        df = df.loc[(df.Gender == gender)]
    if age:
        if age not in df.Age.unique():
            raise Exception("the value of Age {age} is not acceptable".format(age))
        df = df.loc[(df.Age == age)]
    
    if smoking:
        if smoking not in df.Smoker.unique():
            raise Exception("the value of smoker {smoking} is not acceptable".format(smoking))
        df = df.loc[(df.Smoker == smoking)]

    if biomarker:
        if biomarker not in mutation_df.Biomarker.unique():
            raise Exception("the value of biomarker {biomarker} is not acceptable")
        p_ids = set(mutation_df.loc[(mutation_df.Biomarker == biomarker)].Patient_id)
        df = df.loc[(df.Patient_id.isin(p_ids))]
    
    # print('total sub_pop:', len(df.Patient_id.unique()))
    # sub population
    return set(df.Patient_id)


def get_normalized_family_cancer(p_ids, allow_mutation):
    '''
    biomarker not null,
    smoker not null,
    age not null,
    gender not null,
    family not null,
    '''
    
    # allow biomarker, gender, smoker, age, family, familycancer shouldn't be null
    p_ids = p_ids.intersection(set(mutation_df.loc[mutation_df.Biomarker.isin(allow_mutation)].Patient_id))
    p_ids = p_ids.intersection(set(gender_age_smoker_df.loc[(gender_age_smoker_df.Smoker.notnull()) & 
    (gender_age_smoker_df.Gender.notnull()) & 
    (gender_age_smoker_df.Age.notnull())].Patient_id))
    df1 = family_df.loc[(family_df.Family.notnull()) & family_df.FamilyCancer.notnull() & (family_df.Patient_id.isin(p_ids))][['Patient_id', 'Family', 'FamilyCancer']]
    
    # normalized familiar cancer frequency
    df1['family_cancer'] = df1['Family'] + df1['FamilyCancer']
    df1['family_cancer_num'] = df1.groupby('Patient_id')['family_cancer'].transform('count')
    df1['family_cancer_num'].mask(df1.Family == 'No', 0)    # no family, the number should be 0
    df1['Normalized Familial Cancer Frequency'] = df1['family_cancer_num'] / df1['family_cancer_num'].max()
    df1 = df1.drop(columns=['Family', 'FamilyCancer', 'family_cancer', 'family_cancer_num'])
    
    # include the biomarker and smoker type of patients for box plot
    df1 = pd.merge(left=df1, right=mutation_df[['Patient_id', 'Biomarker']], on='Patient_id', how='inner')
    df1 = pd.merge(left=df1, right=gender_age_smoker_df[['Patient_id', 'Smoker']], on='Patient_id', how='inner')
    
    return df1

def jaccard(ls1:list, ls2:list):
    s1 = set(ls1)
    s2 = set(ls2)
    c1 = Counter(ls1)
    c2 = Counter(ls2)

    up = 0
    down = 0
    for e in s1.intersection(s2):
        up += min(c1[e], c2[e])
        down += max(c1[e], c2[2])
    return 0.0 if up == 0 else up / (down + sum([c1[e] for e in s1.difference(s2)]) + sum([c2[e] for e in s2.difference(s1)]))

def Jaccard_df(df1, simple_version=True, jacard_choice=0):
    
    if jacard_choice == 0:  # family type + cancer type
        df1 = df1.loc[(df1.FamilyCancer.notnull()) & (df1.Family.notnull()) & (df1.Family.notna()) & (df1.FamilyCancer.notna())]
        df1['jacard_attribute'] = df1['Family']+ df1['FamilyCancer']
    elif jacard_choice == 1:    # fanily type
        df1 = df1.loc[(df1.Family.notnull()) & (df1.Family.notna())]
        df1['jacard_attribute'] = df1['Family']
    elif jacard_choice == 2:    # cancer type
        df1 = df1.loc[(df1.FamilyCancer.notnull()) & (df1.FamilyCancer.notna())]
        df1['jacard_attribute'] = df1['FamilyCancer']
    elif jacard_choice == 3:    # degree
        df1 = df1.loc[(df1.FamilyDegree.notnull()) & (df1.FamilyDegree.notna())]
        df1['jacard_attribute'] = df1['FamilyDegree']
    elif jacard_choice == 4:    # degree + cancer type
        df1 = df1.loc[(df1.FamilyDegree.notnull()) & (df1.FamilyCancer.notnull()) & (df1.FamilyCancer.notna()) & (df1.FamilyDegree.notna())]
        df1['jacard_attribute'] = df1['FamilyDegree'] + df1['FamilyCancer']
    elif jacard_choice == 5:    # gender
        df1 = df1.loc[(df1.FamilyGender.notnull()) & (df1.FamilyGender.notna())]
        df1['jacard_attribute'] = df1['FamilyGender']
    elif jacard_choice == 6:    # gender + cancer
        df1 = df1.loc[(df1.FamilyGender.notnull()) & (df1.FamilyCancer.notnull()) & (df1.FamilyCancer.notna()) & (df1.FamilyGender.notna())]
        df1['jacard_attribute'] = df1['FamilyGender'] + df1['FamilyCancer']
    else:
        raise Exception("No such a choice={choice}")
    
    pop_ids = set(df1.Patient_id.unique())
    family_cancer_series = df1.groupby('Patient_id').apply(lambda x: x.jacard_attribute.tolist())

    result_df = {'Familial Cancer Connectedness':[]}
    
    for p_id1, p_id2 in list(combinations(df1.Patient_id.unique().tolist(), 2)):
        list1 = family_cancer_series[p_id1]
        list2 = family_cancer_series[p_id2]
        s1 = set(list1)
        s2 = set(list2)

        if len(s1) == 0 or len(s2) == 0:
            result_df['Familial Cancer Connectedness'].append(0)
            continue

        if 'No' in s1 or 'No' in s2 or 'NoNo' in s1 or 'NoNo' in s2:    # one of patient has no family with cancer
            result_df['Familial Cancer Connectedness'].append(0)
            continue
        
        jaccard_val = len(s1.intersection(s2))/len(s1.union(s2)) if simple_version else jaccard(list1, list2)
        
        if np.isnan(jaccard_val):
            raise Exception("jaccard_val is np.nan; ", "s1:", s1, "s2", s2)
        result_df['Familial Cancer Connectedness'].append(jaccard_val)
    
    result_df = pd.DataFrame(result_df)
    # print( gender, age, smoker, biomarker, "\t\tpop size", len(df1.Patient_id.unique()), '\t\tavg-Jaccard:', '{:.4f}'.format(result_df['Familial Cancer Connectedness'].mean()))
    return result_df, pop_ids


def box_plot(df, y_var, x_var, hue_var, title, fn):
    import seaborn as sns
    import matplotlib.pyplot as plt
    sns.set_theme(style="ticks", palette="husl")
    sns.set_palette("pastel")
    
    # Draw a nested boxplot to show bills by day and time
    ax = sns.boxplot(x=x_var, y=y_var,
                hue=hue_var, 
                hue_order=["Never-Smoker", "Former-Smoker", 'Current-Smoker'],
                order=sorted(df[x_var].unique().tolist()),
                palette={"Never-Smoker": "green", "Former-Smoker": "blue", 'Current-Smoker': "red"},
                data=df)
    ax.set(title = title)
    
    plt.subplots_adjust(bottom=0.25)  
    if max([len(e) for e in df[x_var].unique()]) > 7:
        plt.setp(ax.get_xticklabels(), rotation=25)
        
    plt.ylim([0, 1+0.13])
    
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.0),
          ncol=3, fancybox=True, framealpha=0.5, shadow=False, fontsize=9)
          
    ax.get_figure().savefig(fn)
    plt.clf()
    



# Normalized Familial Cancer Frequency
the gender, age, smoker, biomarker, family should not be NULL

In [15]:
def family_cancer_frequency():
    import os
    for gender in ['Male', 'Female']:
        for age in ['Old', 'Young']:
            p_ids = sub_pop(gender=gender, age=age)
            df = get_normalized_family_cancer(p_ids, allow_mutation=['ALK', 'BRAF', 'EGFR', 'KRAS', 'MET', 'NoMutation', 'PDL1', 'ROS1', 'FGFR1', 'HER2', 'HER2Mut', 'RET'])
            
            pop_size = len(df.Patient_id.unique())
            if not os.path.exists('jacard_index/age={age_threshold}'.format(age_threshold=age_threshold)):
                os.mkdir('jacard_index/age={age_threshold}'.format(age_threshold=age_threshold))
            if not os.path.exists('jacard_index/age={age_threshold}/Family_Cancer_Frequency/'.format(age_threshold=age_threshold)):
                    os.mkdir('jacard_index/age={age_threshold}/Family_Cancer_Frequency/'.format(age_threshold=age_threshold))
            title = age+"-"+gender+" (age"+((" <= "+str(age_threshold)) if age == 'Young' else (" > "+str(age_threshold)))+")    pop-size="+str(pop_size)
            box_plot(df, y_var='Normalized Familial Cancer Frequency', x_var='Biomarker', hue_var='Smoker', title=title, fn='jacard_index/age={age_threshold}/Family_Cancer_Frequency/'.format(age_threshold=age_threshold)+title+".png")

# Jacard Index

## Jaccard Index for biomarker (ALK,EGFR), x=cancer
the gender, age, smoker, family should not be NULL
biomarker is limited to be ALK or EGFR

In [16]:
def jaccard_x_is_cancer():
    import os
    for gender in ['Male', 'Female']:
        for age in ['Old', 'Young']:
            for biomarker in ['ALK', 'EGFR']:
                for choice, choice_describe in {0: "Family Type + Familial Cancer Type", 1:"Family Type", 2: "Familial Cancer Type", 3: "Degree", 4: "Degree + Familial Cancer Type",
                                                5: 'Familial Gender', 6: "Familial Gender + Familial Cancer Type"}.items():
                    if 'Cancer' in choice_describe:
                        continue
                    
                    pop_ids = set()
                    df_analysis = pd.DataFrame({'Familial Cancer Connectedness':[], 'Familial Cancer':[], 'Smoker':[]})
                    
                    for cancer in ['Leukemia','Liver','Uterus/cervical','Pancreatic','Prostate','Head and neck','Esophagogastric','Colorrectal','Breast','Lung']:
                        for smoker in ['Never-Smoker', 'Former-Smoker', 'Current-Smoker']:
                            # subpopulation selection
                            p_ids = set(gender_age_smoker_df.loc[(gender_age_smoker_df.Gender==gender) & (gender_age_smoker_df.Age==age) & (gender_age_smoker_df.Smoker == smoker)].Patient_id)
                            p_ids = p_ids.intersection(set(mutation_df.loc[(mutation_df.Biomarker == biomarker)].Patient_id))
                            p_ids = p_ids.intersection(set(family_df.loc[(family_df.FamilyCancer==cancer)].Patient_id))
                            
                            df1 = family_df.loc[family_df.Patient_id.isin(p_ids)]
                            df2, sub_pop_ids = Jaccard_df(df1, simple_version=False, jacard_choice=choice)
                            
                            df2['Familial Cancer'] = [cancer]*len(df2)
                            df2['Smoker'] = [smoker]*len(df2)
                            
                            df_analysis = pd.concat([df_analysis, df2])
                            pop_ids = pop_ids | sub_pop_ids
                    
                    title = age+"-"+gender+"-"+biomarker+" (age"+((" <= "+str(age_threshold)) if age == 'Young' else (" > "+str(age_threshold)))+")    pop-size="+str(len(pop_ids))
                    if not os.path.exists('jacard_index/age={age_threshold}'.format(age_threshold=age_threshold)):
                        os.mkdir('jacard_index/age={age_threshold}'.format(age_threshold=age_threshold))
                    if not os.path.exists('jacard_index/age={age_threshold}/Jaccard_X_is_Cancer/'.format(age_threshold=age_threshold)):
                        os.mkdir('jacard_index/age={age_threshold}/Jaccard_X_is_Cancer/'.format(age_threshold=age_threshold))
                    if not os.path.exists('jacard_index/age={age_threshold}/Jaccard_X_is_Cancer/'.format(age_threshold=age_threshold)+choice_describe):
                        os.mkdir('jacard_index/age={age_threshold}/Jaccard_X_is_Cancer/'.format(age_threshold=age_threshold)+choice_describe)
                    if not os.path.exists(os.path.join('jacard_index/age={age_threshold}/Jaccard_X_is_Cancer/'.format(age_threshold=age_threshold)+choice_describe, biomarker)):
                        os.mkdir(os.path.join('jacard_index/age={age_threshold}/Jaccard_X_is_Cancer/'.format(age_threshold=age_threshold)+choice_describe, biomarker))
                    
                    box_plot(df_analysis, y_var='Familial Cancer Connectedness', x_var='Familial Cancer', hue_var='Smoker', title=title, 
                    fn=os.path.join('jacard_index/age={age_threshold}/Jaccard_X_is_Cancer/'.format(age_threshold=age_threshold)+choice_describe, biomarker,title+".png"))



# Generate all box plot
remember to change the age_threshold

In [19]:
# 1. single value
gender_age_smoker_df = pd.read_csv('genereated/gender_age_smoker.csv')
age_threshold = 65
gender_age_smoker_df.Age = gender_age_smoker_df.Age.apply(lambda x: 'Young' if x <= age_threshold else 'Old' if x > age_threshold else np.nan)
gender_age_smoker_df = process_df(gender_age_smoker_df)

# 2. multi-vlaue
mutation_df = process_df(pd.read_csv('genereated/mutation.csv'))
family_df = process_df(pd.read_csv('genereated/family_cancer_fgender_degree.csv'))


family_cancer_frequency()
jaccard_x_is_cancer()

Family UNK
FamilyCancer -
FamilyDegree UNK


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['jacard_attribute'] = df1['Family']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['jacard_attribute'] = df1['FamilyDegree']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['jacard_attribute'] = df1['FamilyGender']
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

<Figure size 432x288 with 0 Axes>