In [8]:
import numpy as np
import pandas as pd
import regex
from dateutil.parser import parse

In [5]:
# Load the four MAARS clinical exports; every file is read with a tab separator
path = '../data/'
clinical_all_df, clinical_ad_df, clinical_ctrl_df, clinical_pso_df = (
    pd.read_csv(path + file_name, sep='\t')
    for file_name in (
        'MAARS_all_Fri_Apr_04_14h_CEST_2014.csv',
        'MAARS_AD_full_20190131_12-34-49.csv',
        'MAARS_Control_full_20190131_12-40-12.csv',
        'MAARS_PSO_full_20190131_12-40-53.csv',
    )
)

In [6]:
# Create function to change column names
def changeColName(col):
    """ Build a normalised column name from a raw '#'-separated column header.

        Transformations, in order:
        1) Finds the last outermost parenthesised group and removes it
           (every occurrence of that exact "(...)" text is removed)
        2) Splits the string on '#' and trims each tag
        3) Removes duplicate tags, keeping the first occurrence of each
        4) Joins the tags with '->' and replaces spaces with _

        :param col: str, original column name
        :return: str, transformed column name
    """

    # The recursive pattern can only match when both kinds of parenthesis are
    # present, so skip the regex entirely otherwise.
    if '(' in col and ')' in col:
        # Raw string: '\(' is not a valid escape sequence in a normal string literal
        final = regex.findall(r'\(((?>[^\(\)]+|(?R))*)\)', col)
        if final:
            col = col.replace('(' + final[-1] + ')', '')

    # Trim BEFORE de-duplicating: removing the parenthesised part usually leaves
    # trailing whitespace ("Gender#Gender (12)" -> "Gender#Gender "), and tags that
    # differ only by surrounding whitespace are duplicates.
    # dict.fromkeys keeps the first occurrence of each tag in original order.
    tags = list(dict.fromkeys(tag.strip() for tag in col.split('#')))

    return '->'.join(tags).replace(' ', '_')

# Collect the distinct original column names of the AD, PSO and control tables
all_cols = pd.concat(
    [pd.Series(frame.columns) for frame in (clinical_ad_df, clinical_pso_df, clinical_ctrl_df)]
).drop_duplicates()

# Write the new-name / old-name mapping to the working directory
(
    pd.concat([all_cols.map(changeColName), pd.Series(all_cols)], axis=1)
    .rename(columns={0: 'new_name', 1: 'old_name'})
    .to_csv('columns_mapping.csv')
)

# Apply changeColName to the column labels of each table
for frame in (clinical_ad_df, clinical_pso_df, clinical_ctrl_df):
    frame.columns = frame.columns.map(changeColName)

# Write the renamed tables as tab-separated files without the index
file_suffix = '_new_cols'
ext = '.csv'
for frame, stem in (
    (clinical_ad_df, 'MAARS_AD_full_20190131_12-34-49'),
    (clinical_ctrl_df, 'MAARS_Control_full_20190131_12-40-12'),
    (clinical_pso_df, 'MAARS_PSO_full_20190131_12-40-53'),
):
    frame.to_csv(stem + file_suffix + ext, sep='\t', index=False)

In [9]:
def is_date(string, fuzzy=False):
    """
    Return whether the string can be interpreted as a date.

    :param string: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    :return: bool, True if dateutil's parse() accepts the string, False otherwise
    """
    try:
        parse(string, fuzzy=fuzzy)
    except (ValueError, OverflowError):
        # ValueError: the string is not a recognisable date.
        # OverflowError: dateutil documents this for values that exceed the
        # largest valid C integer (e.g. very long digit strings); such a value
        # is not a date either, so report False instead of crashing the scan.
        return False
    return True

In [10]:
def apply_mask(mask, dataframe):
    """
    Return the column labels of ``dataframe`` for which ``mask`` is truthy.

    :param mask: callable taking (column_name, dataframe) and returning a bool
    :param dataframe: pandas DataFrame whose columns are tested
    :return: pandas Index with the selected column labels, in original order
    """
    # Iterate over the labels directly instead of going through DataFrame.apply:
    # on a frame without rows, apply() probes the function with an empty, unnamed
    # Series, so the mask would be evaluated for the name None rather than for
    # each real column.
    keep = np.array(
        [bool(mask(name, dataframe)) for name in dataframe.columns],
        dtype=bool,  # explicit dtype so an empty list is still a boolean indexer
    )
    return dataframe.columns[keep]
def mask_clinic(x, dataframe):
    """
    Column mask: True for every column name except 'sample_id',
    'MAARS_identifier' and 'CUSTOM_Age'.

    :param x: column name
    :param dataframe: unused; present so the function has the (name, dataframe)
                      signature that apply_mask calls masks with
    :return: bool
    """
    excluded = ('sample_id', 'MAARS_identifier', 'CUSTOM_Age')
    return x not in excluded
def mask_all(x, dataframe):
    """
    Column mask: return True when column ``x`` of ``dataframe`` passes all of
    the exclusion checks below, False otherwise.

    A column is rejected (False) when any of the following holds:
      * its name contains 'remarks', 'comments', 'date' or 'maars' (case-insensitive)
      * every value in it is missing
      * any value, rendered as text, contains ';'
      * any value, rendered as text, is accepted by is_date

    :param x: str, column name
    :param dataframe: pandas DataFrame containing column ``x``
    :return: bool
    """
    name = x.lower()

    # Name-based checks are free, so do them first and skip the per-cell scans.
    if 'remarks' in name or 'comments' in name or 'date' in name or 'maars' in name:
        return False

    column = dataframe[x]

    # Test real missingness; a substring search for 'nan' in the text form would
    # also match ordinary values such as 'banana'.
    if column.isnull().all():
        return False

    as_text = column.astype(str)
    if as_text.str.contains(';').any():
        return False

    # Most expensive check last: one date parse per cell.
    # NOTE(review): dateutil appears to accept many bare numbers as dates, so
    # purely numeric columns may be rejected here - confirm this is intended.
    if as_text.apply(is_date).any():
        return False

    return True

In [11]:
from sklearn.preprocessing import LabelEncoder
def encode(dataframe, mask):
    """
    Label-encode the columns of ``dataframe`` selected by ``mask``.

    Selected columns are converted to text, lower-cased and passed through a
    LabelEncoder; cells that were null in the input are restored to their
    original (null) value afterwards. All other columns are copied unchanged.

    :param dataframe: pandas DataFrame to encode
    :param mask: callable (column_name, dataframe) -> bool, passed to apply_mask
    :return: tuple (new, lookup_table)
             new          - DataFrame with the same columns and index as the input
             lookup_table - DataFrame with one row per encoded column:
                            'column' (name) and 'lookup_values' (dict code -> label)
    """
    columns = apply_mask(mask, dataframe)

    encoded_columns = {}
    lookup_names = []
    lookup_values = []
    for col in dataframe.columns:
        if col not in columns:
            encoded_columns[col] = dataframe[col]
            continue

        mask_null = dataframe[col].isnull()
        le = LabelEncoder()
        codes = le.fit_transform(dataframe[col].astype(str).str.lower())

        # Codes are the positions of the labels in le.classes_. Nulls were
        # turned into the text 'nan' by astype(str); drop that placeholder from
        # the lookup by LABEL (the dict is keyed by code, so testing the keys
        # for 'nan' can never match). The placeholder still occupies a code,
        # so the remaining codes may have a gap.
        lookup_names.append(col)
        lookup_values.append(
            {code: label for code, label in enumerate(le.classes_) if label != 'nan'}
        )

        # Carry the input index so the null mask and the original values align
        # even when the frame does not have a default 0..n-1 index.
        encoded = pd.Series(codes, index=dataframe.index)
        encoded_columns[col] = encoded.where(~mask_null, dataframe[col])

    # Build both frames once instead of growing them column by column / row by row.
    new = pd.DataFrame(encoded_columns, index=dataframe.index)
    lookup_table = pd.DataFrame({'column': lookup_names, 'lookup_values': lookup_values})
    return new, lookup_table

In [12]:
# Encode every clinical table; each call yields the encoded frame and its lookup table.
# The combined table uses the clinic mask, the three per-group tables use mask_all.
new_clinic, lt_clinic = encode(dataframe=clinical_all_df, mask=mask_clinic)
new_ad_full, lt_ad_full = encode(dataframe=clinical_ad_df, mask=mask_all)
new_ctrl_full, lt_ctrl_full = encode(dataframe=clinical_ctrl_df, mask=mask_all)
new_pso_full, lt_pso_full = encode(dataframe=clinical_pso_df, mask=mask_all)

In [13]:
# Preview the first rows of the encoded combined clinical table
new_clinic.head()

Unnamed: 0,sample_id,clinical_group,anatomical_location,anatomical_location_label,lesional,MAARS_identifier,Institution,CUSTOM_Age,Gender,Known_Allergies_v2..Pseudo_Drug_Allergy,...,Other_concurrent_chronic_diseases_v2..Others,Other_concurrent_chronic_diseases_v2..Diabetes_.non.insulin.,Other_concurrent_chronic_diseases_v2..Thyroid_dysfunction,Other_concurrent_chronic_diseases_v2..Asthma,Other_concurrent_chronic_diseases_v2..Hypertension,Global_Assessment_Score,CUSTOM_Malignancies_._skin,CUSTOM_Malignancies_._other,CUSTOM_Fam._hist._Atopic_dermatitis,CUSTOM_Family_History_of_Psoriasis
0,MAARS_3_018_02,0,2,2,1,MAARS_3_018,2,25,1,0,...,0,0,0,0,0,2,0,0,0,
1,MAARS_3_018_01,0,2,2,0,MAARS_3_018,2,25,1,0,...,0,0,0,0,0,2,0,0,0,
2,MAARS_3_018_03,0,2,2,0,MAARS_3_018,2,25,1,0,...,0,0,0,0,0,2,0,0,0,
3,MAARS_3_018_04,0,2,2,1,MAARS_3_018,2,25,1,0,...,0,0,0,0,0,2,0,0,0,
4,MAARS_3_082_02,0,2,2,1,MAARS_3_082,2,62,1,0,...,0,0,0,0,0,2,0,0,2,
