In [2]:
import pandas as pd

<h1>Importing data from <i>.hapt</i> files with labels from <i>.tsv</i> and storing in a pandas dataframe</h1>
 All of the data used is stored in the <i>./data/</i> directory. The data is imported and stored in a pandas dataframe where
 the first column is the type of genome (Real or Artificial), the second column is the name of the sample, the next columns correspond to
 the SNPs (0: default state, 1: mutation) and the last column holds the real labels from a <i>.tsv</i> file, if we possess that information.

In [3]:
# Input files; the loading cell resolves both names relative to ./data/.
# `labels` is tested for truthiness there, so an empty value skips the label merge.
filename = 'REAL.hapt'
labels = 'REAL.tsv'

In [4]:
# Read the space-delimited haplotype table from the data directory.
df = pd.read_csv('./data/' + filename, delimiter=' ')
if labels:
    # Optional label table: tab-separated, first line is the header.
    labels_df = pd.read_csv('./data/' + labels, header=0, sep='\t')

    # We convert string superpopulation values to integers for simpler processing
    superpopulation_code_dict = {
        'EUR': 0,
        'EAS': 1,
        'AMR': 2,
        'SAS': 3,
        'AFR': 4,
        'EUR,AFR': 5,
    }

    # The second column identifies the sample. Drop its last two characters
    # (presumably a per-haplotype suffix -- confirm against the .hapt format)
    # so the value matches 'Sample name' in the label table.
    sample_col = df.columns[1]
    df[sample_col] = df[sample_col].astype(str).str[:-2]

    # Attach the superpopulation label to every row. The inner join drops rows
    # whose sample does not appear in the label table.
    df = df.merge(
        labels_df[['Sample name', 'Superpopulation code']],
        left_on=sample_col,
        right_on='Sample name',
        how='inner',
    )

    # Assign the recoded column back instead of calling
    # `df[col].replace(..., inplace=True)`: that chained in-place form acts on
    # a column selection and leaves `df` untouched under pandas Copy-on-Write.
    # Codes missing from the dictionary are kept unchanged via the .get fallback.
    df['Superpopulation code'] = df['Superpopulation code'].map(
        lambda code: superpopulation_code_dict.get(code, code)
    )

    # 'Sample name' duplicates the sample column after the join.
    df = df.drop(columns='Sample name')

In [5]:
# Preview the first five rows of the assembled table.
df.head(5)

Unnamed: 0,Type,Sample,0,1,2,3,4,5,6,7,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,Superpopulation code
0,Real,HG00096,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Real,HG00096,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Real,HG00097,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Real,HG00097,1,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Real,HG00099,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


<h2>Making it simple</h2>
To make the process of importing data simpler for our other notebooks, we moved the import logic into a module.

In [9]:
from my_knn_module.import_data import import_genome_data_as_df

In [10]:
# Call the packaged loader with its default arguments and preview the result;
# the rendered output matches the preview of the inline version above.
import_genome_data_as_df().head()

Unnamed: 0,Type,Sample,0,1,2,3,4,5,6,7,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,Superpopulation code
0,Real,HG00096,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Real,HG00096,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Real,HG00097,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Real,HG00097,1,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Real,HG00099,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
