# Load Data + Abridged Data Cleaning

This is a condensed/simplified data processing code from our Part 1. Visualizations have been removed along with steps we found to be un-useful for modeling. We utilize a set of features that we discovered to have the greatest correlations with our labels (from our EDA portion in part 1).

In [13]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler



metadata = pd.read_csv('metadata.csv')
metadata.set_index('Sample_ID',inplace=True)


def impute_cols_by_skew(df, columns):
    for column_name in columns:
        sr_skew = df[column_name].skew()
        impute_value = None
        if sr_skew > 0:
            impute_value = df[column_name].mean()
        elif sr_skew < 0:
            impute_value = df[column_name].median()
        df[column_name].fillna(impute_value, inplace=True)

#metadata.dropna(axis=0, how='any', inplace=True, subset=['azm_sr', 'cfx_sr', 'cip_sr'])
metadata[['azm_sr', 'cfx_sr', 'cip_sr']] = metadata[['azm_sr', 'cfx_sr', 'cip_sr']].fillna(0.0)

useless_columns = ['Year', 'cro_sr', 'tet_sr', 'pen_sr']

metadata.drop(labels=useless_columns, axis=1, inplace=True)

metadata.drop_duplicates(inplace=True)

def remove_symbols_from_column(col):
    if col.dtype == object:  #check if strong
        return col.str.replace(r'[^a-zA-Z0-9]', '', regex=True)
    else:  #do nothing
        return col

metadata = metadata.apply(remove_symbols_from_column)

numeric_columns = [
'Azithromycin',
'Ciprofloxacin',
'Ceftriaxone',
'Cefixime',
'Tetracycline',
'Penicillin',
'NG_MAST',
'Group',
'azm_mic',
'cip_mic',
'cro_mic',
'cfx_mic',
'tet_mic',
'pen_mic',
'log2_azm_mic',
'log2_cip_mic',
'log2_cro_mic',
'log2_cfx_mic',
'log2_tet_mic',
'log2_pen_mic'

]

for column in numeric_columns:
    metadata[column] = pd.to_numeric(metadata[column], errors='coerce', downcast="float") #https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_numeric.html

metadata = pd.get_dummies(data=metadata, prefix="Encoded_Beta.lactamase", columns=['Beta.lactamase'], dtype=float)


train_inputs,test_inputs = train_test_split(metadata, test_size=0.20, random_state=42)   # partition and mix entries
#print(train_inputs.shape,":", test_inputs.shape)

targets = numeric_columns
impute_cols_by_skew(train_inputs, targets)
impute_cols_by_skew(test_inputs, targets)


normalizer = MinMaxScaler()

train_inputs[numeric_columns] = normalizer.fit_transform(train_inputs[numeric_columns])
test_inputs[numeric_columns] = normalizer.fit_transform(test_inputs[numeric_columns])




#Best columns obtained from EDA step
best_columns = [
    'Group', 'azm_mic', 'log2_azm_mic', 'log2_cip_mic', 'cip_mic', 'cro_mic', 
    'cfx_mic', 'log2_cro_mic', 'log2_cfx_mic', 'Ciprofloxacin', 'Cefixime', 
    'Tetracycline', 'Penicillin', 'log2_tet_mic', 'log2_pen_mic'
]

label_columns = ['azm_sr', 'cfx_sr', 'cip_sr']



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column_name].fillna(impute_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column_name].fillna(impute_value, inplace=True)


Unnamed: 0_level_0,Country,Continent,Azithromycin,Ciprofloxacin,Ceftriaxone,Cefixime,Tetracycline,Penicillin,NG_MAST,Group,...,log2_tet_mic,log2_pen_mic,azm_sr,cip_sr,cfx_sr,Encoded_Beta.lactamase_0,Encoded_Beta.lactamase_1,Encoded_Beta.lactamase_2,Encoded_Beta.lactamase_R,Encoded_Beta.lactamase_S
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16933_2#60,Slovakia,Europe,0.000556,0.011905,1e-06,0.120968,0.057099,0.144674,0.731126,0.439703,...,0.526664,0.535567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16933_2#61,Hungary,Europe,0.000556,0.069711,1.571429e-06,0.120968,0.057099,0.144674,0.747715,0.460111,...,0.526664,0.535567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10625_6#93,Japan,Asia,9.4e-05,0.184524,1.771429e-05,0.298387,0.007843,0.023622,0.200149,0.189239,...,0.571353,0.68683,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
8727_5#76,USA,America,5.8e-05,0.083333,1e-06,0.112903,0.015686,0.188976,0.279045,0.000928,...,0.398628,0.436294,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
17428_6#28,Portugal,Europe,0.000179,0.011905,2.857143e-07,0.120968,0.057099,0.144674,0.293046,0.789425,...,0.526664,0.535567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Our cleaned data for modelling

In [15]:
train_labels = train_inputs[label_columns]
test_labels = test_inputs[label_columns]

filtered_train = train_inputs[best_columns]
filtered_test = test_inputs[best_columns]



filtered_train.head()
train_labels.head()

Unnamed: 0_level_0,azm_sr,cfx_sr,cip_sr
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
16933_2#60,0.0,0.0,0.0
16933_2#61,0.0,0.0,0.0
10625_6#93,0.0,0.0,1.0
8727_5#76,1.0,0.0,0.0
17428_6#28,0.0,0.0,0.0


(2490, 98) : (623, 98)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column_name].fillna(impute_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column_name].fillna(impute_value, inplace=True)


Unnamed: 0_level_0,Azithromycin,Ciprofloxacin,Ceftriaxone,Cefixime,Tetracycline,Penicillin,NG_MAST,Group,azm_mic,cip_mic,...,Encoded_Africa,Encoded_America,Encoded_Asia,Encoded_Europe,Encoded_Oceania,Encoded_Beta.lactamase_0,Encoded_Beta.lactamase_1,Encoded_Beta.lactamase_2,Encoded_Beta.lactamase_R,Encoded_Beta.lactamase_S
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16933_2#60,0.000556,0.011905,1e-06,0.120968,0.057099,0.144674,0.731126,0.439703,0.007431,3.1e-05,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
16933_2#61,0.000556,0.069711,1.571429e-06,0.120968,0.057099,0.144674,0.747715,0.460111,0.007431,0.116224,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
10625_6#93,9.4e-05,0.184524,1.771429e-05,0.298387,0.007843,0.023622,0.200149,0.189239,0.000229,0.499992,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8727_5#76,5.8e-05,0.083333,1e-06,0.112903,0.015686,0.188976,0.279045,0.000928,0.015611,0.000219,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
17428_6#28,0.000179,0.011905,2.857143e-07,0.120968,0.057099,0.144674,0.293046,0.789425,3.1e-05,3.1e-05,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# Extract important features
Impactful features for azm_sr
 * Group: Correlation = 0.09546140994798231
 * azm_mic: Correlation = 0.33309874367227105
 * log2_azm_mic: Correlation = 0.7249744667358493
 * log2_cip_mic: Correlation = -0.051520854139286525
Impactful features for cfx_sr
 * cip_mic: Correlation = 0.14119023870077058
 * cro_mic: Correlation = 0.8369508121198919
 * cfx_mic: Correlation = 0.8257728774892811
 * log2_cip_mic: Correlation = 0.06215456778249806
 * log2_cro_mic: Correlation = 0.1663635475783556
 * log2_cfx_mic: Correlation = 0.1894199429007991
Impactful features for cip_sr
 * Ciprofloxacin: Correlation = 0.2056506959431126
 * Cefixime: Correlation = 0.23242760949223804
 * Tetracycline: Correlation = -0.08755912049818396
 * Penicillin: Correlation = -0.09237877719385387
 * Group: Correlation = -0.1286073397391039
 * cip_mic: Correlation = 0.6896004321830833
 * cro_mic: Correlation = 0.19410381853153916
 * cfx_mic: Correlation = 0.16295955814421442
 * log2_azm_mic: Correlation = 0.0878175933781791
 * log2_cip_mic: Correlation = 0.9447762983474318
 * log2_cro_mic: Correlation = 0.5225341989836155
 * log2_cfx_mic: Correlation = 0.42873547331061773
 * log2_tet_mic: Correlation = 0.2315851892159167
 * log2_pen_mic: Correlation = 0.2175450093290776