### Convert a raw-data CSV file into a structured dataset CSV file

In [3]:
import pandas
import sys
import numpy
import math

In [14]:
def _phenotype_label(ast_phenotypes, antibiotic_name):
    """Return the binary label for one isolate's AST phenotype string.

    Args:
        ast_phenotypes: Comma-separated ``<antibiotic>=<result>`` tokens.
        antibiotic_name: Antibiotic whose result is looked up.

    Returns:
        1 if ``<antibiotic_name>=R`` is one of the tokens, 0 if
        ``<antibiotic_name>=S`` is one of the tokens (``R`` wins when both
        are present), otherwise ``None``.
    """
    tokens = ast_phenotypes.split(",")
    if f"{antibiotic_name}=R" in tokens:
        return 1
    if f"{antibiotic_name}=S" in tokens:
        return 0
    return None


def prepare_dataset(bacteria_name: str, bacteria_short_name: str, antibiotic_name: str) -> None:
    """Build a gene presence/absence dataset for one organism/antibiotic pair.

    Reads ``./raw_data_from_NCBI/<antibiotic_name>.csv``, keeps the rows whose
    ``#Organism group`` equals ``bacteria_name``, and writes
    ``./datasets/<bacteria_short_name>_<antibiotic_name>.csv`` with one row per
    isolate: the ``Isolate`` value, one 0/1 column per token found in
    ``AMR genotypes`` (columns ordered by first appearance), and an
    ``AST phenotypes`` column holding 1 for ``<antibiotic_name>=R`` and 0 for
    ``<antibiotic_name>=S``.

    Args:
        bacteria_name: Value of ``#Organism group`` to keep.
        bacteria_short_name: Prefix used in the output file name.
        antibiotic_name: Selects the input file and the phenotype tokens that
            are turned into the label.

    Raises:
        KeyError: If an expected column is missing from the input file.
    """
    raw_data = pandas.read_csv(f"./raw_data_from_NCBI/{antibiotic_name}.csv")

    # Dropping unwanted columns
    raw_data = raw_data.drop(
        columns=["Strain", "Isolate identifiers", "AMR genotypes core"]
    )

    # Dropping rows that belong to other organism groups
    raw_data = raw_data[raw_data["#Organism group"] == bacteria_name]

    # Label every remaining row. Rows that have phenotype data but no R/S
    # result for this antibiotic are dropped.
    # NOTE(review): rows whose AST phenotypes cell is missing are kept with a
    # missing label, exactly as before -- confirm this is intended.
    keep_row = []
    labels = []
    for ast_phenotypes in raw_data["AST phenotypes"]:
        if pandas.isna(ast_phenotypes):
            keep_row.append(True)
            labels.append(numpy.nan)
            continue
        label = _phenotype_label(ast_phenotypes, antibiotic_name)
        keep_row.append(label is not None)
        if label is not None:
            labels.append(label)

    # A boolean array (not a list) keeps this a row selection even when the
    # frame is empty.
    raw_data = raw_data.loc[numpy.array(keep_row, dtype=bool)].reset_index(drop=True)

    # One float 0/1 column per gene; dict insertion order preserves the order
    # in which genes are first seen.
    row_count = len(raw_data)
    gene_columns = {}
    for row, genotypes in enumerate(raw_data["AMR genotypes"]):
        if pandas.isna(genotypes):
            continue
        for gene in genotypes.split(","):
            if gene not in gene_columns:
                gene_columns[gene] = numpy.zeros(row_count)
            gene_columns[gene][row] = 1

    structured_data = pandas.DataFrame(
        {"Isolate": raw_data["Isolate"].tolist(), **gene_columns}
    )
    # Object dtype keeps the labels as integers 1/0 in the CSV even when some
    # rows have a missing label.
    structured_data["AST phenotypes"] = pandas.Series(labels, dtype=object)

    structured_data.to_csv(f"./datasets/{bacteria_short_name}_{antibiotic_name}.csv")


    

    


In [15]:
# Organism groups to export, with the short code used in each output file name.
bacteria_names = [
    "Klebsiella pneumoniae",
    "E.coli and Shigella",
    "Salmonella enterica",
    "Acinetobacter baumannii",
    "Campylobacter jejuni",
]
bacteria_short_names = ["KP", "EcS", "SE", "AB", "CJ"]
antibiotic_names = ["meropenem", "ciprofloxacin", "amoxicillin-clavulanic acid"]

# Write one dataset per (organism, antibiotic) combination.
for organism, short_name in zip(bacteria_names, bacteria_short_names):
    for antibiotic in antibiotic_names:
        prepare_dataset(organism, short_name, antibiotic)

