# Converting our generated data to vcf so it can be used with GNOMIX and LAI-NET

In [1]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
from my_knn_module.import_data import import_genome_data_as_df

## Importing our data

In [2]:
ag_df = import_genome_data_as_df(filename='WGAN.hapt', labels=None)
ag_df.head()

Unnamed: 0,Type,Sample,0,1,2,3,4,5,6,7,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,AG,AG0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,AG,AG1,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,AG,AG2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AG,AG3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AG,AG4,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Using the paired by superpopulation .hapt because .vcf output requires 2 chromosomes 
ag_df = pd.read_csv('./generated/WGAN_paired.hapt', delimiter = ' ')
ag_df.head()

Unnamed: 0,Type,Sample,0,1,2,3,4,5,6,7,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,AG,AG0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,AG,AG18,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,AG,AG1,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,AG,AG2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AG,AG3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
data_dim = ag_df.shape
print(ag_df.shape, "\n")
print("We can see that we've got {} SNIPs and {} haplotypes.".format(data_dim[1] - 2, data_dim[0]))

(5006, 10002) 

We can see that we've got 10000 SNIPs and 5006 haplotypes.


## First we create the **.sample** file
This file should contain (number of haplotype)/2 entries.
Here that number is 2503.

This file also holds our inferred ancestry information.

In [5]:
samples_df = pd.DataFrame(ag_df["Sample"].copy())
samples_df["population"] = ag_df["Sample"]
samples_df["group"] = ag_df["Sample"]
samples_df["sex"] = 1 # Set it to 1 because we have no information on sex

samples_df = samples_df.drop(index=[i for i in range(int(data_dim[0]/2), data_dim[0])]) # We drop half of the values because .vcf needs full genotypes and we are working with haplotypes 

samples_df.to_csv("./generated/ag.sample", index=False, sep=" ")

print(samples_df.shape, "\n")
samples_df.head()

(2503, 4) 



Unnamed: 0,Sample,population,group,sex
0,AG0,AG0,AG0,1
1,AG18,AG18,AG18,1
2,AG1,AG1,AG1,1
3,AG2,AG2,AG2,1
4,AG3,AG3,AG3,1


## Then we convert our data to **.hap**

In [6]:
ag_df = ag_df.drop(columns=['Type', 'Sample'])
ag_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
ag_df = ag_df.to_numpy().transpose() # .hap is, simply put, the transpose of our .hapt data
ag_df = pd.DataFrame(ag_df)

ag_df.to_csv("./generated/ag.hap", header=False, index=False, sep=" ")

print(ag_df.shape, "\n")
ag_df.head()

(10000, 5006) 



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4996,4997,4998,4999,5000,5001,5002,5003,5004,5005
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Finaly we create our **.legend** file
It's created from an existing vcf files of real data used to train our model that was converted into a **.hap** + **.legend** using *bcftools*.
This file needs as many entries as there are SNIPs.

    ./bcftools convert --haplegendsample

In [8]:
label_df = pd.read_csv('./data/10k.hap', delimiter = ' ', header=None)
label_df = label_df.drop(columns=[i for i in range(5, 5013)])
label_df.rename(columns={0: 'Chromo num', 1: 'id', 2: 'position', 3: 'a0', 4: 'a1'}, inplace=True)
label_df.drop(columns=["Chromo num"], inplace=True)

label_df.to_csv("./generated/ag.legend", index=False, sep=" ")

print(label_df.shape, "\n")
label_df.head()

(10000, 4) 



Unnamed: 0,id,position,a0,a1
0,15:27379578_C_A,27379578,C,A
1,15:27379592_T_A,27379592,T,A
2,15:27379787_C_T,27379787,C,T
3,15:27379947_C_T,27379947,C,T
4,15:27380842_A_G,27380842,A,G


## With all of our files
We can then use bcftools to convert these three files into a **.vcf** file.

To achieve this we use the following command from *bcftools*:

    ./bcftools convert --haplegendsample2vcf [.hap],[.legend],[.sample] -o [.vcf]

## Creating **.smap**
This will be used by the model

In [9]:
smap_df = pd.read_csv('./generated/WGAN.tsv', delimiter = ',')
smap_df.head()

Unnamed: 0,Sample,Superpopulation code
0,AG0,AMR
1,AG1,EAS
2,AG2,EAS
3,AG3,EAS
4,AG4,AFR


In [11]:
smap_df.to_csv("./generated/WGAN.smap", sep=" ", index=False, header=False)