In [6]:
from datetime import datetime
import argparse
import math
import numpy as np
import os ; import sys
import subprocess
import pandas as pd
import vcf

VCFDIR = "/storage/szfeupe/Runs/650GTEx_estr/VCFs/"
tmp = '/storage/szfeupe/Runs/650GTEx_estr/temp_files/'
DEBUG = False
VCFLIST = "/storage/szfeupe/Runs/650GTEx_estr/population"
OUTFILE = "/storage/szfeupe/Runs/650GTEx_estr/VCFs/Merged_All_Samples_STRs.vcf"


def PROGRESS(msg):
    sys.stderr.write("%s\n"%msg.strip())

def combine(X):
    X['LIST']=X.astype(str).apply(lambda x: ','.join(x), axis=1)
    X['L']=X['LIST'].astype(str).apply(lambda x: ','.join([x for x in set(x.strip().split(',')) if str(x)!= 'nan']))
    return(list(X['L']))

def Mergesamples(X):
    S = list(X.keys())
    MG = X[S[0]][["#CHROM","POS","ID","REF","ALT","QUAL","FILTER", "FORMAT","U.size","END"]]
    for s in S:
        print(s,X[s].shape)
        MG = pd.merge(MG, X[s], how='outer', on=["#CHROM","POS","ID","ALT","QUAL","FILTER", "FORMAT","U.size","END"])
    ref = [x for x in list(MG.columns) if x[:3]=='REF']; ref=list(set(ref))
    n=len(ref)
    MG['combined_ref']=combine(MG[ref])
    MG['combined_info']=combine(MG[list(set([x for x in list(MG.columns) if x[:4]=='INFO']))])
    for x in ref:
        del MG[x]
    return(MG)

def Getgb(record):
    spl = record.samples
    if spl[0]['GB'] is None:
        gb='.'
    else:
        gb=spl[0]['GB']
    geno = ':'.join(['./.', gb])
    return(geno)

In [2]:
vcfs = open(VCFLIST,'r').readlines()
vcfs = [VCFDIR+s.strip() for s in vcfs]
attrib = ["#CHROM","POS","ID","REF","ALT","QUAL","FILTER", "INFO","FORMAT"]

In [3]:
    SAMPLE=[]
    vcf_table={}
    PROGRESS("There are %s Samples to be merged... "%str(len(vcfs)))
    PROGRESS('Begin ...  %s'%str(datetime.now().strftime('%H:%M:%S')))
    for VCF in vcfs[:8]:
        Spl = vcf.Reader(filename=VCF).samples[0]
        if Spl in os.listdir(tmp):
            df=pd.read_csv(tmp+Spl, sep='\t', low_memory=False)
            PROGRESS('****Sample %s has been formatted already. Moving on...'%Spl )
            vcf_table[Spl]=df
            continue
        vcf_reader = vcf.Reader(filename=VCF)
        PROGRESS('**VCF Opened ... %s'%str(datetime.now().strftime('%H:%M:%S')))
        geno = [Getgb(record) for record in vcf.Reader(filename=VCF) ]
        Pos = [record.INFO['START'] for record in vcf.Reader(filename=VCF)]
        CH = [record.CHROM for record in vcf.Reader(filename=VCF)]
        Ref = [record.REF for record in vcf.Reader(filename=VCF)]
        End = [record.INFO['END'] for record in vcf.Reader(filename=VCF)]
        Alt = ["."]*len(CH)
        Qual= ["."]*len(CH)
        Fil = ["."]*len(CH)
        Fmt = ["GT:GB"]*len(CH)
        Id = [record.ID for record in vcf.Reader(filename=VCF)]
        Info = [';'.join(['PERIOD='+str(record.INFO['PERIOD']), 'START='+str(record.INFO['START']), 'END='+str(record.INFO['END'])]) for record in vcf.Reader(filename=VCF)] 
        Unit = [str(record.INFO['PERIOD']) for record in vcf.Reader(filename=VCF)]
        df = pd.DataFrame({"#CHROM":CH,"POS":Pos,"ID":Id,"REF":Ref ,"ALT":Alt,"QUAL":Qual,"FILTER":Fil, "INFO":Info,"FORMAT":Fmt, Spl:geno, 'U.size':Unit , 'END':End})
        vcf_table[Spl]=df 
        df[attrib+[Spl,"U.size","END"]].to_csv(tmp+Spl,sep='\t', index=None)
    # Output
    ##Merging
    PROGRESS(".\nMerging samples\n%s"%str(datetime.now().strftime('%H:%M:%S')))
   

There are 20 Samples to be merged...
Begin ...  09:07:57
****Sample GTEX-PLZ4-0003 has been formatted already. Moving on...
****Sample GTEX-NFK9-0004 has been formatted already. Moving on...
****Sample GTEX-OHPM-0003 has been formatted already. Moving on...
****Sample GTEX-X4EO-0003 has been formatted already. Moving on...
****Sample GTEX-UTHO-0003 has been formatted already. Moving on...
****Sample GTEX-TMZS-0002 has been formatted already. Moving on...
****Sample GTEX-WY7C-0002 has been formatted already. Moving on...
****Sample GTEX-P44H-1126 has been formatted already. Moving on...
.
Merging samples
09:09:08


In [11]:
#PROGRESS(".\nMerging\n%s"%str(datetime.now().strftime('%H:%M:%S')))
#X = Mergesamples(vcf_table)
X['REF']=X['combined_ref']
X['INFO']=X['combined_info']
X1 =X[attrib+list(vcf_table.keys())+['U.size','END']].sort_values(['#CHROM','POS'])
#PROGRESS(".\nMerged ... %s"%str(datetime.now().strftime('%H:%M:%S')))

X1

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,GTEX-PLZ4-0003,GTEX-OHPM-0003,GTEX-X4EO-0003,GTEX-UTHO-0003,GTEX-WY7C-0002,GTEX-P44H-1126,GTEX-NFK9-0004,GTEX-TMZS-0002,U.size,END
0,1,16717,STR_2,GGTGGTGGTGGGGGCGGTGGGGGTGGTG,.,.,.,PERIOD=3;START=16717;END=16744,GT:GB,./.:0|9,./.:0|9,./.:.,./.:.,./.:.,./.:0|0,./.:0|9,./.:.,3,16744
1,1,26454,STR_3,"GTGTGTGTGTGT,TAGTGTGTGTGTGT",.,.,.,PERIOD=2;START=26454;END=26465,GT:GB,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,2,26465
2,1,28589,STR_4,"GGTTTTTTTTTTTTTTT,TTTTTTTTTTTTTTT",.,.,.,PERIOD=1;START=28589;END=28603,GT:GB,./.:.,./.:3|4,./.:-6|3,./.:.,./.:.,./.:.,./.:.,./.:.,1,28603
3,1,30863,STR_5,TCTCCCTCTCTCTCTCTCTCTCTCTCTCTCATTTCTCTCTATCTCA...,.,.,.,PERIOD=2;START=30863;END=30959,GT:GB,./.:.,./.:0|0,./.:0|0,./.:0|-6,./.:.,./.:.,./.:.,./.:.,2,30959
4,1,31720,STR_6,AAAAAAAAAAAAAA,.,.,.,PERIOD=1;START=31720;END=31733,GT:GB,./.:0|-1,./.:0|-1,./.:0|-1,./.:0|-1,./.:0|-1,./.:0|-1,./.:0|-1,./.:0|-1,1,31733
5,1,33450,STR_7,AAAAAAAAAAAAAAA,.,.,.,PERIOD=1;START=33450;END=33464,GT:GB,./.:.,./.:.,./.:.,./.:.,./.:.,./.:0|-2,./.:.,./.:0|0,1,33464
6,1,33521,STR_8,"GTTTTTTTTTCTTTTTTTTTTT,AAAGTTTTTTTTTCTTTTTTTTTTT",.,.,.,PERIOD=1;START=33521;END=33541,GT:GB,./.:.,./.:.,./.:.,./.:.,./.:-2|0,./.:0|0,./.:0|0,./.:.,1,33541
7,1,35484,STR_9,"ATAAAAAATAAATAAATAAATAAAAAC,ATAAAAAATAAATAAATA...",.,.,.,PERIOD=4;START=35484;END=35507,GT:GB,./.:0|0,,./.:0|0,./.:0|0,./.:.,./.:0|0,./.:0|0,./.:0|0,4,35507
8,1,36352,STR_10,AAAAAAAAAAAAA,.,.,.,PERIOD=1;START=36352;END=36364,GT:GB,./.:0|0,./.:.,./.:0|0,./.:0|0,./.:.,./.:0|-1,./.:.,./.:.,1,36364
9,1,39909,STR_11,AAATAAATAAATAAA,.,.,.,PERIOD=4;START=39909;END=39923,GT:GB,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,4,39923


In [15]:
print(sorted(list(set(list(X1['#CHROM'])))))


['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '3', '4', '5', '6', '7', '8', '9', 'X', 'Y']


In [5]:
SAMPLE=[]
vcf_table={}
print('Begin ... ', datetime.now().strftime('%H:%M:%S'))
for VCF in vcfs: 
    vcf_reader = vcf.Reader(filename=VCF)
    print('****VCF Opened ... ', datetime.now().strftime('%H:%M:%S'))
    Pos = [record.INFO['START'] for record in vcf.Reader(filename=VCF)]
    print(len(Pos),' Start done ',datetime.now().strftime('%H:%M:%S'))
    CH = [record.CHROM for record in vcf.Reader(filename=VCF)]
    print(len(CH),' CH done',datetime.now().strftime('%H:%M:%S'))
    Ref = [record.REF for record in vcf.Reader(filename=VCF)]
    print(len(Ref),' REF done ',datetime.now().strftime('%H:%M:%S'))
    Alt = ["."]*len(CH)
    Qual= ["."]*len(CH)
    Fil = ["."]*len(CH)
    Fmt = ["GT:GB"]*len(CH)
    print('... done', datetime.now().strftime('%H:%M:%S'))
    Id = [record.ID for record in vcf.Reader(filename=VCF)]
    print(len(Id),' ID done ',datetime.now().strftime('%H:%M:%S'))
    Info = [';'.join(['PERIOD='+str(record.INFO['PERIOD']),'START='+str(record.INFO['START']),'END='+str(record.INFO['END'])]) for record in vcf.Reader(filename=VCF)] 
    print('Info done... ',datetime.now().strftime('%H:%M:%S'))
    Unit = [str(record.INFO['PERIOD']) for record in vcf.Reader(filename=VCF)]
    print('Step size done...',datetime.now().strftime('%H:%M:%S'))
    geno = [getgb(record) for record in vcf.Reader(filename=VCF) ]
    print('data in frame.. ',datetime.now().strftime('%H:%M:%S'))
    Spl = vcf.Reader(filename=VCF).samples[0]
    df = pd.DataFrame({"#CHROM":CH,"POS":Pos,"ID":Id,"REF":Ref ,"ALT":Alt,"QUAL":Qual,"FILTER":Fil, "INFO":Info,"FORMAT":Fmt, Spl:geno })
    vcf_table[Spl]=df

Begin ...  10:49:12
****VCF Opened ...  10:49:12
1543420  Start done  10:53:23
1543420  CH done 10:57:34
1543420  REF done  11:01:44
... done 11:01:44
1543420  ID done  11:05:55
Info done...  11:10:12
Step size done... 11:14:22
data in frame..  11:18:37
****VCF Opened ...  11:18:38
1555715  Start done  11:22:49
1555715  CH done 11:27:00
1555715  REF done  11:31:11
... done 11:31:11
1555715  ID done  11:35:22
Info done...  11:39:41
Step size done... 11:43:54
data in frame..  11:48:10
****VCF Opened ...  11:48:11
1558762  Start done  11:52:23
1558762  CH done 11:56:34
1558762  REF done  12:00:45
... done 12:00:45
1558762  ID done  12:04:56
Info done...  12:09:15
Step size done... 12:13:29
data in frame..  12:17:46


In [6]:
for VCF in vcfs:
    
    
    
    
# Merging all
X = Mergesamples(vcf_table)   ;  
#Merged = X[attrib+SAMPLE+['U.size','END']]

KeyError: 'U.size'

In [None]:
## FILTERING phase 1

#Remove Homopolymers
    ##Remove homopolymers
        if HOMO_POLY:
            M = removehomopolymers(M)
            PROGRESS("\tHomopolymers removed in data.. %s"%str(datetime.now().strftime('%H:%M:%S')))
        M.to_csv(OUT+C+"_Merged.tsv",sep='\t', index=None)
        X1.append(M)
#    Merged_f = pd.concat(X1) ###
    
    
#remove STRs that fall in Segmental duplication
    if SEG_DUP:
        #PROGRESS("\n Remove segmental duplication... "%str(len(X1))); PROGRESS(".%s"%str(datetime.now().strftime('%H:%M:%S')))
        Seg_dup = pd.read_csv('/storage/resources/dbase/human/hg19/hg19_segmentalduplications.bed', sep='\t', header=None)
        Seg_dup.columns = ['CHROM', 'START','END','OTHERS','INFO','STRAND']
        fragments=[]
        t=0
        for C in CHR:
            X = X1[t]#Merge_f.loc[Merge_f['#CHROM']==C]
            Y = Seg_dup.loc[Seg_dup['CHROM']=='chr'+str(C)]
            X['POS'] = X["POS"].astype(int)
            X['END'] = X["END"].astype(int)
            for i in range(len(list(Y.index))):
                start = list(Y['START'])[i]
                end = list(Y['END'])[i]
                X2 = X.loc[(X["END"]<=start) | (X["POS"]>=end)]
                X = 0; X = X2
            fragments.append(X2.sort_values('POS'))
            print(C,'\t',X.shape)
            t=t+1
        Merged_f = pd.concat(fragments)
    
    #Remove penta- and hexanucleotides with homopolymer runs 
    



In [None]:
#Removing STRs overlapping segmental duplications


In [23]:


CHR = list(set(list(Merged['#CHROM'])))
Seg_dup = pd.read_csv('/storage/resources/dbase/human/hg19/hg19_segmentalduplications.bed', sep='\t', header=None)
Seg_dup.columns = ['CHROM', 'START','END','OTHERS','INFO','STRAND']
fragments = []
for C in CHR:
    X = Merged_f#.loc[Merged_f['#CHROM']==C]
    C=1
    Y = Seg_dup.loc[Seg_dup['CHROM']=='chr'+str(C)]
    
    X['POS'] = X["POS"].astype(int)
    X['END'] = X["END"].astype(int)
    
    for i in list(Y.index):
        start = Y.iloc[i]['START']
        end = Y.iloc[i]['END']
        #print(start, end )
        X1 = X.loc[(X["END"]<=start) & (end <= X["POS"])]
        X = 0; X = X1
    fragments.append(X1)
    print(X.shape)

X

Original data  (338, 21) 
 After homopoly filter  (198, 21)  left


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(0, 21)


Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,GTEX-OHPK-0003,...,GTEX-U3ZH-0003,GTEX-REY6-0004,GTEX-WK11-0002,GTEX-XOT4-0001,GTEX-XBED-0004,GTEX-S95S-0004,GTEX-WHSB-0004,GTEX-U3ZN-0001,U.size,END


In [40]:
vcf_table['GTEX-OHPK-0003'].loc[vcf_table['GTEX-OHPK-0003']['U.size']!='1']

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,GTEX-OHPK-0003,U.size,END
1,1,16717,STR_2,GGTGGTGGTGGGGGCGGTGGGGGTGGTG,.,.,.,PERIOD=3;START=16717;END=16744,GT:GB,./.:0|9,3,16744
2,1,26454,STR_3,GTGTGTGTGTGT,.,.,.,PERIOD=2;START=26454;END=26465,GT:GB,./.:0|0,2,26465
4,1,30863,STR_5,TCTCCCTCTCTCTCTCTCTCTCTCTCTCTCATTTCTCTCTATCTCA...,.,.,.,PERIOD=2;START=30863;END=30959,GT:GB,./.:.,2,30959
8,1,35484,STR_9,ATAAAAAATAAATAAATAAATAAA,.,.,.,PERIOD=4;START=35484;END=35507,GT:GB,./.:0|0,4,35507
10,1,39909,STR_11,AAATAAATAAATAAA,.,.,.,PERIOD=4;START=39909;END=39923,GT:GB,./.:0|0,4,39923
12,1,44836,STR_13,AAATAAATAAATAAATAAATAAATAAATAAATTAAATAAAT,.,.,.,PERIOD=4;START=44836;END=44876,GT:GB,./.:0|0,4,44876
15,1,50482,STR_16,GTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGT,.,.,.,PERIOD=2;START=50482;END=50513,GT:GB,./.:.,2,50513
16,1,50571,STR_17,TTTCCTTTCCTTTCCTTGCTCTTCTTTCTCTCCTATTGCTTTCCTT...,.,.,.,PERIOD=5;START=50571;END=50626,GT:GB,./.:0|0,5,50626
20,1,62232,STR_22,ACACACATACACACACACACACACA,.,.,.,PERIOD=2;START=62232;END=62256,GT:GB,./.:0|0,2,62256
26,1,82214,STR_29,ACACACACACAAACAC,.,.,.,PERIOD=2;START=82214;END=82229,GT:GB,./.:0|0,2,82229


In [38]:
M = X.sort_values('POS')
M

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,GTEX-OHPK-0003,...,GTEX-T5JC-0002,GTEX-R55D-0926,GTEX-O5YW-0003,GTEX-XXEK-0001,GTEX-P4PQ-0003,GTEX-14ICL-0004,GTEX-YF7O-0002,GTEX-148VI-0002,U.size,END
247,1,856020,STR_263,TGTGCTGTGTGTGTGT,.,.,.,PERIOD=2;START=856020;END=856035,GT:GB,./.:0|0,...,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,2,856035
248,1,859291,STR_264,GGCGGCGGCGTCGGCGGCGG,.,.,.,PERIOD=3;START=859291;END=859310,GT:GB,./.:0|0,...,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,3,859310
249,1,866512,STR_265,"CCCTCCCTCCCTCCC,CCCTCCCTCCCTCCCA",.,.,.,PERIOD=4;START=866512;END=866526,GT:GB,./.:0|0,...,./.:0|4,./.:0|4,./.:4|4,./.:4|4,./.:4|4,./.:.,./.:4|4,./.:0|4,4,866526
251,1,875669,STR_267,"CGCCGCCGCCCCCCCCCCCCGCCCCGCCGCGGA,CGCGCCGCCGCC...",.,.,.,PERIOD=5;START=875669;END=875696,GT:GB,./.:.,...,./.:.,./.:.,./.:.,./.:0|-2,./.:.,,./.:-2|-1,./.:0|-2,5,875696
253,1,879055,STR_269,CCCTCCCTCCCTCCC,.,.,.,PERIOD=4;START=879055;END=879069,GT:GB,./.:0|0,...,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,4,879069
254,1,879887,STR_270,"CCAGCCCAGCCCAGCCCAGCTCTCG,CCAGCCCAGCCCAGCCCAGC",.,.,.,PERIOD=5;START=879887;END=879906,GT:GB,./.:0|0,...,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,5,879906
255,1,886040,STR_271,CACACACACACAG,.,.,.,PERIOD=2;START=886040;END=886051,GT:GB,./.:-3|-3,...,./.:-3|-3,./.:-3|-3,./.:-3|-3,./.:-3|-3,./.:-3|-3,./.:-3|-3,./.:-3|-3,./.:-3|-3,2,886051
260,1,896150,STR_276,CGCCGCCGCCTCCACCGCCGCAGCCGCCG,.,.,.,PERIOD=3;START=896150;END=896178,GT:GB,,...,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,,./.:0|0,./.:0|0,3,896178
261,1,897456,STR_277,CCCCACCCCACCCCACCCCA,.,.,.,PERIOD=5;START=897456;END=897475,GT:GB,./.:.,...,./.:.,./.:.,./.:0|0,./.:0|0,./.:0|0,./.:.,./.:.,./.:.,5,897475
262,1,900684,STR_278,TTTATTTAGTTGTTTATTTATTTAGTTATTTATCTTATTTATTGAG...,.,.,.,PERIOD=4;START=900684;END=900726,GT:GB,./.:0|0,...,./.:0|0,./.:-4|0,./.:-4|-4,./.:-4|-4,./.:-4|0,./.:0|0,./.:-4|-4,./.:-4|0,4,900726


In [None]:
## Write VCF
Merged.to_csv("table.tab", sep='\t', index=None)
command = "grep '^##' "+vcfs[0]
vcfheader = subprocess.check_output(command, shell=True)
f=open('tmp','w')
f.write(vcfheader.decode('utf-8'))
f.close()
command = "cat tmp table.tab >"+OUTFILE 
MG = subprocess.check_output(command, shell=True)

In [None]:
"""Overlapping segmental duplications:
/storage/resources/dbase/human/hg19/hg19_segmentalduplications.bed
•Pentanucleotides with homopolymer runs >= 5bp long or 
hexanucleotides with homopolymer runs >= 6bp long:	/storage/resources/dbase/human/hg19/hg19.hipstr_reference_hrun.bed
•For GTEx analysis: remove homopolymers
"""
def loci_triage(X):
    Seg_dup = pd.read_csv('/storage/resources/dbase/human/hg19/hg19_segmentalduplications.bed', sep='\t')
    X1 = X.loc[X['U.size'] !='1'] #homopolymers out
    
    


In [None]:

    

hrun = pd.read_csv('/storage/resources/dbase/human/hg19/hg19.hipstr_reference_hrun.bed', sep='\t', header=None)
hrun.columns=['chrom', 'start', 'end', 'step', 'MaxRepeats']
hrun = hrun.loc[hrun['step']!=1]





In [26]:
Seg_dup = pd.read_csv('/storage/resources/dbase/human/hg19/hg19_segmentalduplications.bed', sep='\t', header=None)
S = Seg_dup.loc[Seg_dup[0]=='chr1']
S = S.sort_values(1)
S.loc[S[1]<990000]

Unnamed: 0,0,1,2,3,4,5
470,chr1,10000,19844,chrX:155249881,0,-
467,chr1,10000,87112,chr15:102446355,0,-
468,chr1,10000,20818,chr12:84886,0,-
469,chr1,10000,19844,chrY:59352887,0,-
471,chr1,10464,40733,chr2:114330297,0,-
472,chr1,10485,19844,chr16:60426,0,+
473,chr1,10485,40733,chr9:10843,0,+
474,chr1,18392,87112,chr19:60000,0,+
475,chr1,20863,40733,chr12:60839,0,-
476,chr1,34044,40733,chr12:147379,0,+


In [25]:
    for VCF in vcfs:
        PROGRESS("\tSample..  %s"%VCF)
        vcf_reader = vcf.Reader(filename=VCF)
        df=pd.DataFrame(index=[0], columns=attrib+[vcf_reader.samples[0],'U.size', 'END'])
        df= df.fillna(0)
        i=0
        for record in vcf_reader:
            i+=1
            ch  = record.CHROM
            pos = record.INFO['START']
            ref = record.REF
            alt = "."
            qual= "."
            fil = "."
            info= ';'.join(['PERIOD='+str(record.INFO['PERIOD']),'START='+str(record.INFO['START']),'END='+str(record.INFO['END'])])
            fmt = "GT:GB"
            spl = record.samples
            if spl[0]['GB'] is None:
                gb='.'
            else:
                gb=spl[0]['GB']
            geno=':'.join(['./.', gb])
            unit = str(record.INFO['PERIOD'])
            data = [ch,pos,record.ID,ref,alt,qual,fil,info,fmt,geno,unit,str(record.INFO['END'])]
            df.loc[i] = data
        df=df.drop([0])        
        SAMPLE.append(spl[0].sample)

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,GTEX-OHPK-0003,...,GTEX-U3ZH-0003,GTEX-REY6-0004,GTEX-WK11-0002,GTEX-XOT4-0001,GTEX-XBED-0004,GTEX-S95S-0004,GTEX-WHSB-0004,GTEX-U3ZN-0001,U.size,END
0,1,16717,STR_2,GGTGGTGGTGGGGGCGGTGGGGGTGGTG,.,.,.,PERIOD=3;START=16717;END=16744,GT:GB,./.:0|9,...,./.:.,./.:.,./.:.,./.:.,./.:0|0,./.:0|0,./.:.,./.:.,3,16744
1,1,26454,STR_3,"GTGTGTGTGTGT,TAGTGTGTGTGTGT",.,.,.,PERIOD=2;START=26454;END=26465,GT:GB,./.:0|0,...,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,2,26465
3,1,30863,STR_5,TCTCCCTCTCTCTCTCTCTCTCTCTCTCTCATTTCTCTCTATCTCA...,.,.,.,PERIOD=2;START=30863;END=30959,GT:GB,./.:.,...,./.:.,./.:.,./.:-13|0,./.:-15|0,./.:0|2,./.:.,./.:.,,2,30959
7,1,35484,STR_9,"ATAAAAAATAAATAAATAAATAAA,ATAAAAAATAAATAAATAAAT...",.,.,.,PERIOD=4;START=35484;END=35507,GT:GB,./.:0|0,...,./.:0|0,./.:0|4,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,,4,35507
9,1,39909,STR_11,AAATAAATAAATAAA,.,.,.,PERIOD=4;START=39909;END=39923,GT:GB,./.:0|0,...,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,4,39923
11,1,44836,STR_13,AAATAAATAAATAAATAAATAAATAAATAAATTAAATAAAT,.,.,.,PERIOD=4;START=44836;END=44876,GT:GB,./.:0|0,...,./.:0|0,./.:0|0,,./.:0|0,./.:0|0,./.:0|0,./.:.,,4,44876
14,1,50482,STR_16,GTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGT,.,.,.,PERIOD=2;START=50482;END=50513,GT:GB,./.:.,...,./.:.,./.:0|-2,./.:0|6,./.:0|8,./.:0|0,./.:.,./.:0|0,./.:.,2,50513
15,1,50571,STR_17,TTTCCTTTCCTTTCCTTGCTCTTCTTTCTCTCCTATTGCTTTCCTT...,.,.,.,PERIOD=5;START=50571;END=50626,GT:GB,./.:0|0,...,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,5,50626
19,1,62232,STR_22,ACACACATACACACACACACACACA,.,.,.,PERIOD=2;START=62232;END=62256,GT:GB,./.:0|0,...,./.:0|-2,./.:0|0,./.:0|0,./.:0|-2,./.:0|-2,./.:0|-2,./.:0|-2,./.:.,2,62256
25,1,82214,STR_29,ACACACACACAAACAC,.,.,.,PERIOD=2;START=82214;END=82229,GT:GB,./.:0|0,...,./.:0|0,./.:0|0,./.:.,./.:0|0,./.:.,./.:0|0,./.:0|0,,2,82229


In [None]:
    SAMPLE=[]
    vcf_table={}
    for VCF in vcfs: 
        vcf_reader = vcf.Reader(filename=VCF)
        df=pd.DataFrame(index=[0], columns=attrib+[vcf_reader.samples[0],'U.size', 'END'])
        df= df.fillna(0)
        i=0
        for record in vcf_reader:
            i+=1
            ch  = record.CHROM
            pos = record.INFO['START']
            ref = record.REF
            alt = "."
            qual= "."
            fil = "."
            info= ';'.join(['PERIOD='+str(record.INFO['PERIOD']),'START='+str(record.INFO['START']),'END='+str(record.INFO['END'])])
            fmt = "GT:GB"
            spl = record.samples
            if spl[0]['GB'] is None:
                gb='.'
            else:
                gb=spl[0]['GB']
            geno=':'.join(['./.', gb])
            unit = str(record.INFO['PERIOD'])
            data = [ch,pos,record.ID,ref,alt,qual,fil,info,fmt,geno,unit,str(record.INFO['END'])]
            df.loc[i] = data
            if pos>=990000 or ch==2:
                break
        df=df.drop([0])        
        SAMPLE.append(spl[0].sample)
        vcf_table[spl[0].sample]=df