# Load Data

In [2]:
import pandas as pd 

def load_unitig_data(path) -> pd.DataFrame:
    """Load a unitig pattern table and return it with one row per sample.

    The file is read as a space-separated table. Its ``pattern_id`` column
    becomes the index, and every remaining column is treated as a sample.
    The table is then transposed so that samples are the rows and pattern ids
    are the columns.

    Args:
        path: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        DataFrame whose index holds the file's non-``pattern_id`` column
        labels and whose columns are the ``pattern_id`` values.
    """
    # `pattern_id` is consumed as the index while reading, so after the
    # transpose every row is already a real sample.  The earlier `.T[1:]`
    # slice therefore discarded the first sample, not a header row.
    patterns_by_sample = pd.read_csv(path, sep=' ', index_col='pattern_id')
    return patterns_by_sample.T

# One unitig table per drug, loaded in the order azm, cfx, cip.
azm_sr, cfx_sr, cip_sr = map(
    load_unitig_data,
    (
        "azm_sr_gwas_filtered_unitigs.Rtab",
        "cfx_sr_gwas_filtered_unitigs.Rtab",
        "cip_sr_gwas_filtered_unitigs.Rtab",
    ),
)

# Per-sample metadata, keyed by the Sample_ID column.
metadata = pd.read_csv('metadata.csv').set_index('Sample_ID')


In [3]:
# Summarise dtypes and non-null counts per column to see where values are missing.
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3786 entries, ERR1549286 to ERR2172354
Data columns (total 30 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Year            3536 non-null   float64
 1   Country         3785 non-null   object 
 2   Continent       3785 non-null   object 
 3   Beta.lactamase  1927 non-null   object 
 4   Azithromycin    3480 non-null   object 
 5   Ciprofloxacin   3129 non-null   object 
 6   Ceftriaxone     3436 non-null   object 
 7   Cefixime        3405 non-null   object 
 8   Tetracycline    1472 non-null   object 
 9   Penicillin      1465 non-null   object 
 10  NG_MAST         3779 non-null   object 
 11  Group           3786 non-null   int64  
 12  azm_mic         3478 non-null   float64
 13  cip_mic         3088 non-null   float64
 14  cro_mic         3434 non-null   float64
 15  cfx_mic         3401 non-null   float64
 16  tet_mic         1472 non-null   float64
 17  pen_mic         1465 no

In [4]:
# Preview the first five metadata rows.
metadata.head(5)

Unnamed: 0_level_0,Year,Country,Continent,Beta.lactamase,Azithromycin,Ciprofloxacin,Ceftriaxone,Cefixime,Tetracycline,Penicillin,...,log2_cro_mic,log2_cfx_mic,log2_tet_mic,log2_pen_mic,azm_sr,cip_sr,cro_sr,cfx_sr,tet_sr,pen_sr
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR1549286,2015.0,UK,Europe,,>256,,0.016,,,,...,-5.965784,,,,1.0,,0.0,,,
ERR1549290,2015.0,UK,Europe,,>256,,0.004,,,,...,-7.965784,,,,1.0,,0.0,,,
ERR1549291,2015.0,UK,Europe,,>256,,0.006,,,,...,-7.380822,,,,1.0,,0.0,,,
ERR1549287,2015.0,UK,Europe,,>256,,0.006,,,,...,-7.380822,,,,1.0,,0.0,,,
ERR1549288,2015.0,UK,Europe,,>256,,0.008,,,,...,-6.965784,,,,1.0,,0.0,,,


# Null/NA Cleaning in Labels

1. Remove rows with NaN in the labels we are trying to predict: 'azm_sr','cfx_sr', 'cip_sr'

---

Note to Jacob:
If I remove all rows with nulls, then we drop down to ~1k entries. Not ideal. I will remove only the nulls in the target labels. By doing that I was able to preserve ~2800 entries.

Additionally, I can technically replace the nulls in the feature set with averages whether they are continuous or discrete, but since we are going to be building some kind of predictive model later, it would be bad practice to run column averages in df.fillna() before we split our dataset into training and test sets.

Therefore, I am going to split the dataset into a training and test set first.

What do you think?

-Jacob
We could decide how to fill the NaN values based on how the data is skewed for each column we want to fill:
- if skew > 0 -> fill NaN with the mean: there are more 0s than 1s, so the mean will reflect the distribution of 0s and 1s
- else if skew < 0 -> fill NaN with the median: there are more 1s than 0s, and using the median ensures our negatively skewed data won't be affected by an uneven distribution of 0s and 1s when we fill it


In [5]:

def impute_cols_by_skew(df, columns):
    """Fill missing values in ``columns`` of ``df`` in place, picking the statistic by skew.

    For every column, the skewness of its non-null values selects the fill value:

    * skew > 0  -> the column mean
    * otherwise -> the column median (negative skew, zero skew, or a skew
      that is NaN because the column has too few values to compute it)

    A column without any non-null value is left untouched, since there is
    nothing to derive a fill value from.

    Args:
        df: DataFrame to modify in place.
        columns: Iterable of numeric column names present in ``df``.

    Returns:
        None. ``df`` is mutated.
    """
    for column_name in columns:
        column = df[column_name]
        if column.notna().sum() == 0:
            # Entirely missing: mean/median would be NaN as well, so skip.
            continue

        # A NaN skew compares False here and falls through to the median,
        # so a fill value is always defined (the old code passed None to
        # fillna for zero/NaN skew, which raises).
        if column.skew() > 0:
            impute_value = column.mean()
        else:
            impute_value = column.median()

        # Assign the filled column back instead of calling
        # `df[col].fillna(..., inplace=True)`: the chained in-place form acts
        # on a temporary object and does not update `df` under Copy-on-Write.
        df[column_name] = column.fillna(impute_value)

#metadata.dropna(axis=0, how='any', inplace=True, subset=['azm_sr', 'cfx_sr', 'cip_sr'])

It looks like they're all positively skewed, so we can use mean values. Let's try treating them as continuous for now and see what happens. It was worth trying this, though.

In [6]:
# Preview the first five metadata rows again; nothing has modified `metadata` since the last preview.
metadata.head(5)

Unnamed: 0_level_0,Year,Country,Continent,Beta.lactamase,Azithromycin,Ciprofloxacin,Ceftriaxone,Cefixime,Tetracycline,Penicillin,...,log2_cro_mic,log2_cfx_mic,log2_tet_mic,log2_pen_mic,azm_sr,cip_sr,cro_sr,cfx_sr,tet_sr,pen_sr
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR1549286,2015.0,UK,Europe,,>256,,0.016,,,,...,-5.965784,,,,1.0,,0.0,,,
ERR1549290,2015.0,UK,Europe,,>256,,0.004,,,,...,-7.965784,,,,1.0,,0.0,,,
ERR1549291,2015.0,UK,Europe,,>256,,0.006,,,,...,-7.380822,,,,1.0,,0.0,,,
ERR1549287,2015.0,UK,Europe,,>256,,0.006,,,,...,-7.380822,,,,1.0,,0.0,,,
ERR1549288,2015.0,UK,Europe,,>256,,0.008,,,,...,-6.965784,,,,1.0,,0.0,,,


# Removing Unused Labels (Ground Truth)
2. Remove 'Year' and the labels we are not predicting: 'cro_sr', 'tet_sr', 'pen_sr'

In [7]:
# 'Year' plus the resistance labels that are dropped before modelling.
useless_columns = ['Year', 'cro_sr', 'tet_sr', 'pen_sr']

# Rebind to the trimmed frame rather than mutating in place.
metadata = metadata.drop(columns=useless_columns)

In [8]:
# List the remaining column labels to confirm the drop.
metadata.columns

Index(['Country', 'Continent', 'Beta.lactamase', 'Azithromycin',
       'Ciprofloxacin', 'Ceftriaxone', 'Cefixime', 'Tetracycline',
       'Penicillin', 'NG_MAST', 'Group', 'azm_mic', 'cip_mic', 'cro_mic',
       'cfx_mic', 'tet_mic', 'pen_mic', 'log2_azm_mic', 'log2_cip_mic',
       'log2_cro_mic', 'log2_cfx_mic', 'log2_tet_mic', 'log2_pen_mic',
       'azm_sr', 'cip_sr', 'cfx_sr'],
      dtype='object')

# Remove Duplicates

3. Removing Duplicate entries in the dataframe

In [9]:
# DataFrame.drop_duplicates compares column values only and ignores the index.
# With Sample_ID as the index, two different samples whose metadata values
# happen to match would be collapsed into one.  Move Sample_ID into the
# comparison so only rows repeating both the id and every value are removed.
metadata = (
    metadata
    .reset_index()
    .drop_duplicates()
    .set_index('Sample_ID')
)

# Cleaning non-numeric entries in numeric fields to NaN.

4. Turn non-numeric entries in numeric columns into NaN
5. Cast all numeric columns to float32

---


Notes: Turning them into NaN for now. Will engineer values for all NaNs after train and test splits are made

In [73]:
# Columns that are coerced to numeric values, grouped by kind.
numeric_columns = [
    # raw per-drug readings
    'Azithromycin', 'Ciprofloxacin', 'Ceftriaxone',
    'Cefixime', 'Tetracycline', 'Penicillin',
    # typing / grouping columns
    'NG_MAST', 'Group',
    # MIC columns
    'azm_mic', 'cip_mic', 'cro_mic', 'cfx_mic', 'tet_mic', 'pen_mic',
    # log2 MIC columns
    'log2_azm_mic', 'log2_cip_mic', 'log2_cro_mic',
    'log2_cfx_mic', 'log2_tet_mic', 'log2_pen_mic',
    # resistance labels
    'azm_sr', 'cip_sr', 'cfx_sr',
]

# errors='coerce' turns anything unparseable into NaN; downcast="float" asks
# pandas for the smallest float dtype that can hold the column.
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_numeric.html
metadata = metadata.assign(**{
    name: pd.to_numeric(metadata[name], errors='coerce', downcast="float")
    for name in numeric_columns
})

In [None]:
# Re-check dtypes and non-null counts after the numeric coercion.
metadata.info()

In [None]:
# Preview the first ten rows after the numeric coercion.
metadata.head(10)

# One Hot Encode Categorical Columns

6. Turn Categorical Location entries into numerical representation https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html
7. Handle 'Beta.lactamase' special case, as it is discrete with many NaNs. Cannot be engineered in the same way as continuous features. One hot encode like the other discrete features but set to all 0 if NaN

In [76]:
geographic_columns = ['Country', 'Continent']

# (prefix, columns) pairs, encoded in this order so the resulting column layout
# is: untouched columns, Country dummies, Continent dummies, Beta.lactamase dummies.
# get_dummies is left at its default of not emitting a NaN indicator, so a row
# with a missing value gets 0.0 in every dummy column for that feature.
encoding_plan = (
    ("Encoded", geographic_columns),
    ("Encoded_Beta.lactamase", ['Beta.lactamase']),
)
for dummy_prefix, source_columns in encoding_plan:
    metadata = pd.get_dummies(
        data=metadata,
        prefix=dummy_prefix,
        columns=source_columns,
        dtype=float,
    )


In [None]:
# Preview the first five rows with the one-hot encoded columns appended.
metadata.head(5)

In [None]:
# Check dtypes and non-null counts after one-hot encoding.
metadata.info()

# Split Dataframe into Train and Test

8. Split the data before normalizing it, since it is good practice to split first and normalize afterwards

In [None]:
from sklearn.model_selection import train_test_split

# Shuffle the rows and hold out 20% of them; the fixed random_state keeps the
# partition the same on every run.
train_inputs, test_inputs = train_test_split(
    metadata,
    test_size=0.20,
    random_state=42,
)
print(f"{train_inputs.shape} : {test_inputs.shape}")

# Engineer/Impute Values for NaN numerical values

9. Fill the gaps in the numeric columns with skew-based imputation, so that we keep more data instead of dropping every row that contains a NaN

In [None]:
# Apply skew-based imputation to both splits.
#
# NOTE(review): `targets` is the full `numeric_columns` list, which also holds
# the label columns azm_sr / cip_sr / cfx_sr, so missing labels are filled in
# too. Confirm that is intended.
# NOTE(review): each split is imputed from its own column statistics; the test
# split does not reuse the values derived from the training split.
targets = numeric_columns
for split_frame in (train_inputs, test_inputs):
    impute_cols_by_skew(split_frame, targets)




In [None]:
# Preview the first five training rows after imputation.
train_inputs.head(5)

# Normalize Numerical Features 

10. Scale all numerical columns' values to be between 0 and 1

In [None]:
#from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler

normalizer = MinMaxScaler()

# Learn each column's min/max from the training split only and scale it to [0, 1].
train_inputs[numeric_columns] = normalizer.fit_transform(train_inputs[numeric_columns])

# Reuse the training min/max for the held-out rows.  Calling fit_transform here
# would re-fit the scaler on the test split, putting the two splits on different
# scales and replacing the training-derived parameters stored in `normalizer`.
# Test values outside the training range can land slightly outside [0, 1].
test_inputs[numeric_columns] = normalizer.transform(test_inputs[numeric_columns])

train_inputs.head(5)

### Data Stats

In [None]:
# Share of resistant samples per drug, measured over the samples that actually
# have a label.  The previous row-by-row truthiness test counted NaN labels as
# resistant (NaN is truthy) and printed a 0-1 fraction next to a "%" sign.
samples = metadata.index
drug_labels = {"azm": "azm_sr", "cfx": "cfx_sr", "cip": "cip_sr"}

j = []  # resistant-sample counts, in the order azm, cfx, cip
for drug, label_column in drug_labels.items():
    known_labels = metadata[label_column].dropna()
    resistant = int((known_labels != 0).sum())
    j.append(resistant)

    # Guard against a column with no labels at all.
    share = 100 * resistant / len(known_labels) if len(known_labels) else float("nan")
    print(f"{share:.2f} % of the {len(known_labels)} labelled samples have resistance to {drug}")


In [None]:
samples = azm_sr.index

# Pick one unitig column of azm_sr at random and report how many samples carry it.

from random import randint
# randint includes BOTH endpoints, so the upper bound has to be the last valid
# column position; `shape[1]` itself would occasionally raise IndexError.
randomUnitig = azm_sr.columns[randint(0, azm_sr.shape[1] - 1)]
print(azm_sr.shape)

# Count the samples whose entry for this unitig is non-zero in one vectorised pass.
j = int((azm_sr[randomUnitig] != 0).sum())

# Convert the fraction to a real percentage before printing it next to "%".
share = 100 * j / azm_sr.shape[0]
print(randomUnitig, "\npresent in", f"{share:.2f}", "% of azm_sr samples (", j, "/", azm_sr.shape[0], ')')

In [None]:
# Final dtype / non-null summary of the training split.
train_inputs.info()

In [None]:
# Final dtype / non-null summary of the test split.
test_inputs.info()