# Load Data

In [10]:
import pandas as pd 

# columns correspond to unitigs so we must transpose this table
def load_unitig_data(path) -> pd.DataFrame:
    sr = pd.read_csv(path, sep=' ')
    sr.set_index('pattern_id',inplace=True)
    return sr.T[1:]

azm_sr = load_unitig_data("azm_sr_gwas_filtered_unitigs.Rtab")
cfx_sr = load_unitig_data("cfx_sr_gwas_filtered_unitigs.Rtab")
cip_sr = load_unitig_data("cip_sr_gwas_filtered_unitigs.Rtab")

metadata = pd.read_csv('metadata.csv')
metadata.set_index('Sample_ID',inplace=True)


### Visualization before cleaning

In [12]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3786 entries, ERR1549286 to ERR2172354
Data columns (total 30 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Year            3536 non-null   float64
 1   Country         3785 non-null   object 
 2   Continent       3785 non-null   object 
 3   Beta.lactamase  1927 non-null   object 
 4   Azithromycin    3480 non-null   object 
 5   Ciprofloxacin   3129 non-null   object 
 6   Ceftriaxone     3436 non-null   object 
 7   Cefixime        3405 non-null   object 
 8   Tetracycline    1472 non-null   object 
 9   Penicillin      1465 non-null   object 
 10  NG_MAST         3779 non-null   object 
 11  Group           3786 non-null   int64  
 12  azm_mic         3478 non-null   float64
 13  cip_mic         3088 non-null   float64
 14  cro_mic         3434 non-null   float64
 15  cfx_mic         3401 non-null   float64
 16  tet_mic         1472 non-null   float64
 17  pen_mic         1465 no

# Null/NA Cleaning

Note to Jacob:
If I remove all rows with nulls, then we drop down to ~1k entries. Not ideal. I will remove only the nulls in the target labels. By doing that I was able to preserve ~2800 entries.

Additionally, I can technically replace the nulls in the feature set with averages whether they are continuous or discrete, but since we are going to be building some kind of predictive model later, it would be bad practice to run column averages in df.fillna() before we split our dataset into training and test sets.

Therefore, I am going to split the dataset into a training and test set first.

What do you think?

In [13]:
metadata.dropna(axis=0, how='any', inplace=True, subset=['azm_sr', 'cfx_sr', 'cip_sr'])

### Visualization after Null Cleaning.

In [14]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2814 entries, SRR1661154 to SRR5827370
Data columns (total 30 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Year            2768 non-null   float64
 1   Country         2813 non-null   object 
 2   Continent       2813 non-null   object 
 3   Beta.lactamase  1865 non-null   object 
 4   Azithromycin    2814 non-null   object 
 5   Ciprofloxacin   2814 non-null   object 
 6   Ceftriaxone     2769 non-null   object 
 7   Cefixime        2814 non-null   object 
 8   Tetracycline    1266 non-null   object 
 9   Penicillin      1264 non-null   object 
 10  NG_MAST         2814 non-null   object 
 11  Group           2814 non-null   int64  
 12  azm_mic         2814 non-null   float64
 13  cip_mic         2814 non-null   float64
 14  cro_mic         2769 non-null   float64
 15  cfx_mic         2814 non-null   float64
 16  tet_mic         1266 non-null   float64
 17  pen_mic         1264 no

In [17]:
metadata.head(10)

Unnamed: 0_level_0,Year,Country,Continent,Beta.lactamase,Azithromycin,Ciprofloxacin,Ceftriaxone,Cefixime,Tetracycline,Penicillin,...,log2_cro_mic,log2_cfx_mic,log2_tet_mic,log2_pen_mic,azm_sr,cip_sr,cro_sr,cfx_sr,tet_sr,pen_sr
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR1661154,1996.0,Canada,America,,0.25,0.5,0.125,0.125,2,2.0,...,-3.0,-3.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
SRR1661156,1997.0,Canada,America,,0.25,16.0,0.125,0.063,2,2.0,...,-3.0,-3.988504,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
SRR1661157,1998.0,Canada,America,,0.125,0.016,0.125,0.063,>=128,4.0,...,-3.0,-3.988504,8.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0
SRR1661158,2000.0,Canada,America,,0.25,0.5,0.125,0.25,4,4.0,...,-3.0,-2.0,2.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0
SRR1661159,2001.0,Canada,America,,0.063,0.063,0.00025,0.001,0.5,0.008,...,-11.965784,-9.965784,-1.0,-6.965784,0.0,0.0,0.0,0.0,0.0,0.0
SRR1661160,2001.0,Canada,America,,0.125,0.004,0.002,0.004,1,0.063,...,-8.965784,-7.965784,0.0,-3.988504,0.0,0.0,0.0,0.0,0.0,0.0
SRR1661161,2001.0,Canada,America,,0.5,16.0,0.125,0.25,4,4.0,...,-3.0,-2.0,2.0,2.0,0.0,1.0,0.0,0.0,1.0,1.0
SRR1661162,2001.0,Canada,America,,0.063,0.004,0.004,0.008,0.125,0.063,...,-7.965784,-6.965784,-3.0,-3.988504,0.0,0.0,0.0,0.0,0.0,0.0
SRR1661163,2001.0,Canada,America,,0.5,0.032,0.125,0.032,4,4.0,...,-3.0,-4.965784,2.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0
SRR1661164,2001.0,Canada,America,,0.5,64.0,0.125,0.25,4,4.0,...,-3.0,-2.0,2.0,2.0,0.0,1.0,0.0,0.0,1.0,1.0


# Re-engineer features if possible

### Data Stats

In [15]:
j=[0,0,0]
samples = metadata.index
for sample in samples:
    if metadata['azm_sr'][sample]:
        j[0] +=1
    if metadata['cfx_sr'][sample]:
        j[1] +=1
    if metadata['cip_sr'][sample]:
        j[2] +=1

print(j[0]/len(samples), "% of samples have resistance to azm")
print(j[1]/len(samples), "% of samples have resistance to cfx")
print(j[2]/len(samples), "% of samples have resistance to cip")


0.05650319829424307 % of samples have resistance to azm
0.0017768301350390902 % of samples have resistance to cfx
0.4541577825159915 % of samples have resistance to cip


In [24]:
samples = azm_sr.index

# some random unitig from azm_sr

from random import randint
randomUnitig = azm_sr.columns[randint(0,azm_sr.shape[1])]
j = 0
print(azm_sr.shape)
for i in range(azm_sr.shape[0]):
    if azm_sr[randomUnitig][samples[i]]:
        j += 1

print(randomUnitig,"\npresent in", j/azm_sr.shape[0],"% of azm_sr samples (",j,"/",azm_sr.shape[0],')' )

IndexError: index 554 is out of bounds for axis 0 with size 515