### Convert raw data CSV files to structured dataset CSV files

In [17]:
# Import Libraries

import pandas
import sys
import numpy
import math

### Function to convert unstructured data to structured dataset

In [18]:
def prepare_dataset(bacteria_name, bacteria_short_name, antibiotic_name, datasets_info,
                    *, raw_data_dir="./raw_data_from_NCBI/", output_dir="./datasets/",
                    min_rows=25):
    """Build a binary gene-presence dataset for one bacteria/antibiotic pair.

    Reads ``<raw_data_dir>/<antibiotic_name>.csv``, keeps the rows whose
    ``#Organism group`` equals ``bacteria_name`` and whose ``AST phenotypes``
    entry contains ``<antibiotic_name>=R`` or ``<antibiotic_name>=S``, and
    produces one row per remaining isolate with:

    * ``Isolate`` - the isolate identifier copied from the raw data,
    * one 0/1 column per gene listed in ``AMR genotypes`` (columns appear in
      order of first occurrence),
    * ``AST phenotypes`` - 1 for resistant, 0 for susceptible.  When both
      markers are present for an isolate, resistant wins.

    If the result has at least ``min_rows`` rows it is written to
    ``<output_dir>/<bacteria_short_name>_<antibiotic_name>.csv`` and a summary
    row is appended to ``datasets_info`` in place.

    Args:
        bacteria_name: Value of ``#Organism group`` to select.
        bacteria_short_name: Abbreviation used as the output file-name prefix.
        antibiotic_name: Antibiotic name; selects the raw file and the
            ``=R`` / ``=S`` phenotype markers.
        datasets_info: DataFrame with seven columns (dataset name, bacteria,
            antibiotic, row count, gene-column count, resistant count,
            susceptible count) that receives the summary row.
        raw_data_dir: Directory containing the raw per-antibiotic CSV files.
        output_dir: Directory for the structured CSV; created if missing.
        min_rows: Minimum number of rows required for the dataset to be saved.

    Returns:
        None.

    Raises:
        FileNotFoundError: If the raw CSV file does not exist.
        KeyError: If an expected column is missing from the raw CSV file.
    """
    import os  # local import: ``os`` is not among the file-level imports

    target_col = "AST phenotypes"
    genes_col = "AMR genotypes"

    # Markers for resistant and susceptible exactly as they appear in the raw data
    resistant_str = antibiotic_name + "=R"
    susceptible_str = antibiotic_name + "=S"

    raw_data = pandas.read_csv(os.path.join(raw_data_dir, antibiotic_name + ".csv"))

    # Drop unwanted columns, then keep only the requested organism group
    unwanted_columns = ["Strain", "Isolate identifiers", "AMR genotypes core"]
    raw_data = raw_data.drop(columns=unwanted_columns)
    raw_data = raw_data[raw_data["#Organism group"] == bacteria_name]

    def _label(phenotypes):
        """Return 1 (resistant), 0 (susceptible) or None (no usable result)."""
        # A missing cell is read as NaN (a float), so it has no usable label
        if not isinstance(phenotypes, str):
            return None
        tokens = phenotypes.split(",")
        if resistant_str in tokens:
            return 1
        if susceptible_str in tokens:
            return 0
        return None

    # Collect the usable rows in plain lists; rows without a label for this
    # antibiotic (including rows with a missing phenotype) are skipped.
    isolates = []
    labels = []
    gene_lists = []
    for isolate, phenotypes, genotypes in zip(
            raw_data["Isolate"], raw_data[target_col], raw_data[genes_col]):
        label = _label(phenotypes)
        if label is None:
            continue
        isolates.append(isolate)
        labels.append(label)
        gene_lists.append(genotypes.split(",") if isinstance(genotypes, str) else [])

    # Map every gene to a column position in order of first appearance
    gene_positions = {}
    for genes in gene_lists:
        for gene in genes:
            gene_positions.setdefault(gene, len(gene_positions))

    # Fill the integer presence matrix in one pass instead of growing the
    # DataFrame one row / one column at a time.
    presence = numpy.zeros((len(isolates), len(gene_positions)), dtype=int)
    for row, genes in enumerate(gene_lists):
        for gene in genes:
            presence[row, gene_positions[gene]] = 1

    structured_data = pandas.DataFrame(presence, columns=list(gene_positions))
    structured_data.insert(0, "Isolate", isolates)
    structured_data[target_col] = labels

    number_of_cols = len(gene_positions)
    resistant_isolates_count = sum(labels)
    susceptible_isolates_count = len(labels) - resistant_isolates_count

    # Save the structured data only if it has enough data points
    if len(structured_data) >= min_rows:
        os.makedirs(output_dir, exist_ok=True)
        file_name = f"{bacteria_short_name}_{antibiotic_name}.csv"
        structured_data.rename_axis('index').to_csv(os.path.join(output_dir, file_name))

        # Record details and properties of the dataset in the summary frame
        datasets_info.loc[len(datasets_info.index)] = [
            file_name,
            bacteria_name,
            antibiotic_name,
            len(structured_data),
            number_of_cols,
            resistant_isolates_count,
            susceptible_isolates_count,
        ]
    


### Driver code to invoke the function with raw data as arguments

In [19]:
# Organism groups to build datasets for (must match '#Organism group' in the raw data)
bacteria_names = [
    "Klebsiella pneumoniae",
    "E.coli and Shigella",
    "Salmonella enterica",
    "Acinetobacter baumannii",
    "Campylobacter jejuni",
]

# Abbreviations for the organism groups above, in the same order (used in file names)
bacteria_short_names = ["KP", "EcS", "SE", "AB", "CJ"]

# Antibiotics to process; one raw CSV file is read per antibiotic
antibiotic_names = [
    "meropenem",
    "ciprofloxacin",
    "amoxicillin-clavulanic acid",
    "gentamicin",
    "imipenem",
    "ertapenem",
    "azithromycin",
]

# Summary table: prepare_dataset appends one row per dataset it saves
datasets_info = pandas.DataFrame(
    columns=[
        "Dataset Name",
        "Bacteria_name",
        "Antibiotic Name",
        "Number of Rows",
        "Number of columns",
        "Resistant isolates count",
        "Susceptible isolates count",
    ]
)

# Build a dataset for every (bacteria, antibiotic) combination
for organism, short_name in zip(bacteria_names, bacteria_short_names):
    for antibiotic in antibiotic_names:
        prepare_dataset(organism, short_name, antibiotic, datasets_info)

datasets_info.rename_axis('index').to_csv("datasets_info.csv")
