# Create dataset for complete mapping

In [None]:
import pandas as pd
from map_vars import all_variables, common_variables
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# Variable mapping to apply. Despite the `_func` suffix this is used as a
# dict (it is iterated with `.items()` when the column lists are built).
map_variables_func = all_variables
# Name of the output sub-folder created under ../../data/.
test_name = 'test1'
# None -> load the plain US dataset; any other value is matched against the
# `momrace_new` column of the race-annotated US dataset.
use_race = None

In [None]:
# Load the US dataset. With no race selected the plain file is used;
# otherwise the race-annotated file is loaded and restricted to that race.
if use_race is None:
    df_us = pd.read_csv('../../data/CSL_StudyItems/dataset_americans.csv')
else:
    df_us = pd.read_csv('../../data/CSL_StudyItems/dataset_americans_with_race.csv')
    # Keep only rows of the selected race, then drop the now-constant column.
    # The non-inplace drop returns a fresh frame, so no slice of the original
    # is mutated.
    df_us = df_us.loc[df_us['momrace_new'] == use_race].drop(columns=['momrace_new'])

In [None]:
# Load the PT dataset (path is relative to the notebook's working directory).
df_pt = pd.read_csv('../../data/PT/PT_dataset.csv')

## get variables mapping

In [None]:
# Flatten the mapping into two parallel lists: pt_vars[i] is the PT column
# name that corresponds to the US column us_vars[i].
us_vars = []
pt_vars = []

for pt_name, us_target in map_variables_func.items():
    if not isinstance(us_target, dict):
        # Flat entry: one PT variable maps to one US variable.
        pt_vars.append(pt_name.replace(' ', ''))
        us_vars.append(us_target)
        continue
    # Nested entry: each sub-key becomes its own PT column "<key>_<subkey>"
    # (spaces removed).
    for suffix, us_name in us_target.items():
        pt_vars.append('_'.join((pt_name, suffix)).replace(' ', ''))
        us_vars.append(us_name)

In [None]:
# Sanity check: selecting every mapped US column raises KeyError if any name
# in us_vars is missing from df_us.
df_us[us_vars].head()

In [None]:
# Sanity check: same for the PT side; raises KeyError if any name in pt_vars
# is missing from df_pt.
df_pt[pt_vars].head()

# handle duplicate variables

In [None]:
# Two-column lookup table pairing each PT column name with its US counterpart.
map_vars = pd.DataFrame({'PT': pt_vars, 'US': us_vars})

In [None]:
# Mapping rows whose US variable is targeted by more than one PT variable
# (keep=False flags every occurrence, not just the repeats).
shared_us_target = map_vars['US'].duplicated(keep=False)
repetitive_vars = map_vars[shared_us_target]

In [None]:
# Display the PT/US pairs that share a US variable.
repetitive_vars

In [None]:
# Group the PT column names by the US variable they share:
# {us_name: [pt_name, pt_name, ...]}.
duplicate_vars = {}
for pt_name, us_name in repetitive_vars.itertuples(index=False):
    duplicate_vars.setdefault(us_name, []).append(pt_name)

In [None]:
# Display the US variable -> list of PT columns grouping.
duplicate_vars

In [None]:
# Collapse every group of PT columns that map to the same US variable into a
# single PT column named after that US variable. The merged value is the
# row-wise maximum of the group (pandas skips NaN by default), not the first
# column of the group.
for us, pts in duplicate_vars.items():
    merged = df_pt[pts].max(axis=1)
    # Drop the source columns before assigning, so the merged column survives
    # even if the US name coincides with one of the PT names.
    df_pt.drop(columns=pts, inplace=True)
    df_pt[us] = merged
    # Remove the old many-to-one rows from the mapping ...
    map_vars = map_vars.loc[~map_vars['PT'].isin(pts)]
    # ... and register the merged column, which has the same name on both sides.
    map_vars = pd.concat([map_vars, pd.DataFrame({'PT': [us], 'US': [us]})], ignore_index=True)

In [None]:
# Number of variable pairs left in the mapping after merging duplicates.
len(map_vars)

### Create datasets with the same columns

In [None]:
# PT dataset restricted to the mapped columns. An explicit copy makes data_pt
# independent of df_pt, so the dtype conversions applied later cannot touch
# the source frame.
data_pt = df_pt[map_vars['PT']].copy()

In [None]:
# US dataset restricted to the mapped columns (explicit copy, independent of
# df_us), then renamed to the PT column names so both frames share one schema.
# The rename is positional, relying on map_vars keeping PT and US aligned.
data_us = df_us[map_vars['US']].copy()
data_us.columns = map_vars['PT']

In [None]:
# Relative frequency of each Class value in the US dataset.
data_us.Class.value_counts(normalize=True)

In [None]:
# Relative frequency of each Class value in the PT dataset.
data_pt.Class.value_counts(normalize=True)

### confirm results

In [None]:
# Side-by-side table of column dtypes: one row per column, PT vs US.
dtypes = pd.concat([data_pt.dtypes, data_us.dtypes], axis=1, keys=['PT', 'US'])

In [None]:
# Keep only the columns whose dtype differs between the two datasets.
dtype_mismatch = dtypes['PT'].ne(dtypes['US'])
different_dtypes = dtypes[dtype_mismatch]

In [None]:
# For every mismatching column, print both dtypes and the distinct values
# seen on each side.
for feature in different_dtypes.index:
    pt_column = data_pt[feature]
    us_column = data_us[feature]
    print(feature)
    print(f'PT: {pt_column.dtype}, US: {us_column.dtype}')
    print('PT', pt_column.unique())
    print('US', us_column.unique())


### convert class to int

In [None]:
# Cast the target column to int in the PT dataset.
data_pt['Class'] = data_pt['Class'].astype(int)

In [None]:
# Same cast on the US dataset so both targets share a dtype.
data_us['Class'] = data_us['Class'].astype(int)

### convert boolean features from bool to float in US dataset

In [None]:
# Names of the US columns whose dtype is bool.
bool_features = data_us.select_dtypes(include=bool).columns

In [None]:
# Convert every bool column of the US dataset to float (False -> 0.0,
# True -> 1.0) in a single astype call.
data_us = data_us.astype(dict.fromkeys(bool_features, float))

### handle features with different values

In [None]:
# Recompute the PT-vs-US dtype table after the bool conversion and keep the
# columns that still disagree.
dtypes = pd.concat([data_pt.dtypes, data_us.dtypes], axis=1, keys=['PT', 'US'])
still_mismatched = dtypes['PT'] != dtypes['US']
different_dtypes = dtypes[still_mismatched]

In [None]:
# Report each remaining mismatch: column name, both dtypes, and the distinct
# values on each side.
for feature in different_dtypes.index:
    pt_column, us_column = data_pt[feature], data_us[feature]
    print(feature)
    print(f'PT: {pt_column.dtype}, US: {us_column.dtype}')
    print('PT', pt_column.unique())
    print('US', us_column.unique())


In [None]:
# Cast PT columns to fixed numeric dtypes; presumably these are the dtypes of
# the corresponding US columns -- confirm against the report printed above.
# NOTE: astype(int) raises if Paridade contains missing values.
data_pt['Paridade'] = data_pt['Paridade'].astype(int)

In [None]:
# Idade as float on the PT side.
data_pt['Idade'] = data_pt['Idade'].astype(float)

In [None]:
# AE as float on the PT side.
data_pt['AE'] = data_pt['AE'].astype(float)

### convert CSAAnt to float (nulls are kept)

In [None]:
# Give CSAAnt the same float dtype in both datasets (float keeps NaN as-is).
for frame in (data_us, data_pt):
    frame['CSAAnt'] = frame['CSAAnt'].astype(float)

### convert PPTAnterior to a binary indicator (1.0 if >= 1, else 0.0)

In [None]:
# Binarise PPTAnterior on the PT side: 1.0 where the value is >= 1, else 0.0.
# NOTE(review): a missing value compares as False and therefore becomes 0.0
# rather than staying missing -- confirm this is intended.
data_pt['PPTAnterior'] = (data_pt['PPTAnterior'] >= 1).astype(float)

In [None]:
# Best-effort cast of the hypothyroidism flag to float on the US side.
# The failure is reported and the notebook carries on, but only for the
# errors this statement can plausibly produce: the column is absent from the
# chosen mapping (KeyError) or holds values that cannot be converted
# (ValueError / TypeError). Anything else is a real bug and should surface.
try:
    data_us['PatologiasPrevias_2-hipotiroidismo'] = data_us['PatologiasPrevias_2-hipotiroidismo'].astype(float)
except (KeyError, ValueError, TypeError) as e:
    print(e)

In [None]:
# Round IG to the nearest whole number and store it as int (no unit
# conversion happens here; IG is presumably already in weeks -- confirm).
# NOTE: astype(int) raises if IG contains missing values.
data_us['IG'] = data_us.IG.round(0).astype(int) # round IG to whole numbers and store as int

# confirm transformations

In [None]:
# Final PT-vs-US dtype table after all conversions; different_dtypes should
# list only the columns that are still inconsistent.
dtypes = pd.concat([data_pt.dtypes, data_us.dtypes], axis=1, keys=['PT', 'US'])
different_dtypes = dtypes[dtypes['PT'].ne(dtypes['US'])]

In [None]:
# Print whatever mismatches are left: column name, both dtypes, and the
# distinct values on each side.
for feature in different_dtypes.index:
    pt_values = data_pt[feature]
    us_values = data_us[feature]
    print(feature)
    print(f'PT: {pt_values.dtype}, US: {us_values.dtype}')
    print('PT', pt_values.unique())
    print('US', us_values.unique())


### save datasets

In [None]:
import os
# Output folder for this run. exist_ok=True makes creation idempotent and
# avoids the check-then-create race of a separate os.path.exists() test.
save_path = f'../../data/{test_name}'
os.makedirs(save_path, exist_ok=True)

In [None]:
# Write the two harmonised datasets. The file name carries a `_<race>` suffix
# whenever a race filter is active. The test is `is not None` (the same
# criterion used when loading) so a falsy race value such as 0 cannot
# overwrite the unfiltered outputs.
race_suffix = '' if use_race is None else f'_{use_race}'
data_pt.to_csv(f'../../data/{test_name}/PT_dataset_mapped{race_suffix}.csv', index=False)
data_us.to_csv(f'../../data/{test_name}/US_dataset_mapped{race_suffix}.csv', index=False)

In [None]:
# Stack the PT rows on top of the US rows with a fresh 0..n-1 index.
# NOTE: no column records which dataset a row came from.
combine_dataset = pd.concat([data_pt, data_us], ignore_index=True)

In [None]:
# Write the combined dataset, with a `_<race>` suffix whenever a race filter
# is active. The test is `is not None` (the same criterion used when loading)
# so a falsy race value such as 0 cannot overwrite the unfiltered output.
race_suffix = '' if use_race is None else f'_{use_race}'
combine_dataset.to_csv(f'../../data/{test_name}/combine_dataset_mapped{race_suffix}.csv', index=False)

### reload the saved datasets and check them

In [None]:
# Read back the files for the current configuration. When a race filter is
# active the files carry a `_<race>` suffix; reading the unsuffixed names
# would load stale output from an unfiltered run, or fail if none exists.
race_suffix = '' if use_race is None else f'_{use_race}'
pt = pd.read_csv(f'../../data/{test_name}/PT_dataset_mapped{race_suffix}.csv')
us = pd.read_csv(f'../../data/{test_name}/US_dataset_mapped{race_suffix}.csv')

In [None]:
# Names of the reloaded US columns that contain at least one missing value.
us.columns[us.isnull().any()]

In [None]:
# Compare column dtypes of the reloaded CSVs; different_dtypes lists the
# columns whose dtype differs after the round-trip through disk.
dtypes = pd.concat([pt.dtypes, us.dtypes], axis=1, keys=['PT', 'US'])
reloaded_mismatch = dtypes['PT'] != dtypes['US']
different_dtypes = dtypes[reloaded_mismatch]

In [None]:
# Show the reloaded US columns that contain at least one missing value.
us[us.columns[us.isnull().any()]]