In [26]:
!mkdir -p /data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/library/recheck

In [30]:
import pandas as pd
import glob

def fasta_to_df(file,file_name,out_dir):
    myDict = {}
    with open(file, 'rU') as fileData:
        tag = None
        for line in fileData:
            line = line.strip()
            if not line:
                continue
            if line[0] == '>':
                tag = line[1:]
                myDict[tag] = ''
            else:
                assert tag is not None, 'Invalid format, found gene without tag'
                myDict[tag] += line
    
    ### create df from the dict and wrangle it to get columns in the order:
    ### 'snp_id','context','ref','count','coordinates','position','ancestry','method','1000G','sequence'
    
    df = pd.DataFrame.from_dict(myDict, orient='index').reset_index()
    df = df.rename(columns={'index':'header',0:'sequence'})
    df[['snp_id','context','ref','count','coordinates','position','ancestry','method','1000G']] = df.header.str.split('/', expand=True)
    for name in ['context','ref','count','coordinates','position','ancestry','method','1000G']:
        df[name]=df[name].apply(lambda x: x.split('=')[1])
    
    df['rsID'] = df['snp_id'].apply(lambda x: x.split(':')[0])
    df['seq_allele'] = df['snp_id'].apply(lambda x: x.split(':')[1])
    df = df[['rsID','context','ref','seq_allele','count','coordinates','position','ancestry','method','1000G','sequence']]
#     df.to_csv('%s/%s.txt'%(out_dir,file_name), sep='\t', index=False)
    return df

fasta_files = [f for f in glob.glob('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/library/*.fasta')]

for file in fasta_files:
    name = file.split('/')[-1].split('-')[0]
    out_dir = '/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/library/recheck'
    fasta_to_df(file,name,out_dir)


  


In [40]:
file='/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/library/phase1-new.fasta'
name='phase1'

phase1_df = fasta_to_df(file, name, out_dir)
phase1_df_sorted = phase1_df.sort_values(ascending=True, by='rsID')
phase1_df_sorted.reset_index(inplace=True)
phase1_df_sorted[['sequence']].drop_duplicates()

  


Unnamed: 0,sequence
0,CCTGGTTTGAGATGCTGAGGTGGGAGGATCCCTTGAGGCCAGGAGT...
1,CCTGGTTTGAGATGCTGAGGTGGGAGGATCCCTTGAGGCCAGGAGT...
2,CCTGGTTTGAGATGCTGAGGTGGGAGGATCCCTTGAGGCCAGGAGT...
3,CAACATGTCAAAACTCTGTCTCTACCAAAAATACAAAAAATTACCT...
4,CAACATGTCAAAACTCTGTCTCTACCAAAAATACAAAAAATTACCT...
5,CAACATGTCAAAACTCTGTCTCTACCAAAAATACAAAAAATTACCT...
6,CATGAACCCACCGGAAGGAAGAAACTCTGAACACATCCAAACGTCA...
7,CACGAACCCACCGGAAGGAAGAAACTCTGAACACATCCAAACATCA...
8,CCTGCAAGAGCCAGGTTTGGCGAATGCAGTGATAAGCAATTGTCTA...
9,CCTGCAAGAGCCAGGTTTGGCGAATGCAATGATAAGCAATTGTCTA...


In [41]:
phase1_df_sorted.shape

(4924, 12)

In [10]:
import pandas as pd
import numpy as np
import re


def check_allele(row):
    pos_start = row['position']
    pos_end = row['position']+row['allele_len']
    if row['phase']=='phase1':
        return row['sequence'][pos_start:pos_end]
    if row['phase']=='phase2':
        return row['sequence'][pos_start:pos_end]
    if row['phase']=='phase3':
        return row['sequence'][pos_start:pos_end]

def trim_sequence(row):
    if row['diff']>0:
        n = row['diff']
        if ((row['phase']=='phase1') and (row['position']<=81)):
            return row['sequence'][:-n]
        elif ((row['phase']=='phase2') and (row['position']<=122)):
            return row['sequence'][:-n]
        elif ((row['phase']=='phase3') and (row['position']<=163)):
            return row['sequence'][:-n]
        else:
            return row['sequence'][n:]
    else:
        return row['sequence']
    

phase1 = pd.read_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/library/recheck/phase1.txt', sep='\t')
phase2 = pd.read_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/library/recheck/phase2.txt', sep='\t')
phase3 = pd.read_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/library/recheck/phase3.txt', sep='\t')


phase1['phase'] = 'phase1'
phase2['phase'] = 'phase2'
phase3['phase'] = 'phase3'


phase_combined = pd.concat([phase1, phase2, phase3], ignore_index=True)
phase_combined['sequence'] = phase_combined['sequence'].str.replace(' ','')  
phase_combined[['chr','start','end']] = phase_combined['coordinates'].str.split(':|-', expand=True)
phase_combined['allele_len']=phase_combined['end'].astype(int)-phase_combined['start'].astype(int)+1
phase_combined['diff'] = phase_combined['sequence'].apply(len)-246
phase_combined['check_allele'] = phase_combined.apply(check_allele, axis=1)
phase_combined.rename(columns={'ref':'ref_allele'}, inplace=True)
phase_combined['ref_allele'] = phase_combined['ref_allele'].str.replace(' ','')
phase_combined['seq_allele'] = phase_combined['seq_allele'].str.replace(' ','')

phase_combined['allele_type'] = np.where(phase_combined['ref_allele']==phase_combined['seq_allele'],'ref','alt')
phase_combined['new_sequence'] = phase_combined.apply(trim_sequence, axis=1)

# phase_combined_sub = phase_combined[['rsID','chr','start','end','position','ref','seq_allele','check_allele','method','phase','sequence','allele_len','diff']]
# phase_combined_sub_dup['seq_len'] = phase_combined_sub_dup['sequence'].apply(len)

phase_combined['ref/alt'] = phase_combined['ref_allele']+'/'+phase_combined['seq_allele']

phase_combined['5prime_adapter'] = 'TTTCCCTACACGACGCTCTTCCGATCT'
phase_combined['3prime_adapter'] = 'AGATCGGAAGAGCACACGTCTGAACTC'

phase_combined['final_sequence'] = phase_combined['5prime_adapter'].astype(str)+phase_combined['new_sequence'].astype(str)+phase_combined['3prime_adapter'].astype(str)
phase_combined['final_seq_len'] = phase_combined['final_sequence'].apply(len)
phase_combined['identifier'] = phase_combined['rsID']+':'+phase_combined['allele_type']+':'+phase_combined['ref_allele']+':'+phase_combined['seq_allele']+':'+phase_combined['phase']+':'+phase_combined['method']
phase_combined['allele_match'] = np.where(phase_combined['seq_allele']==phase_combined['check_allele'],'True','False')
# phase_combined['rsID_phase'] = phase_combined['rsID']+'_'+phase_combined['phase']
phase_combined.to_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/library/recheck/VariantLibrary_corrected.full_table.without_neg_controls.txt', sep='\t',index=False)

# phase_combined_dup = phase_combined[phase_combined_sub['final_sequence'].duplicated(keep=False)]

In [None]:
def check_allele(row):
    pos_start = row['position']
    pos_end = row['position']+row['allele_len']
    if row['phase']=='phase1':
        return row['sequence'][pos_start:pos_end]
    if row['phase']=='phase2':
        return row['sequence'][pos_start:pos_end]
    if row['phase']=='phase3':
        return row['sequence'][pos_start:pos_end]

df['check_allele'] = df.apply(check_allele, axis=1)
df['allele_match'] = np.where(df['seq_allele']==df['check_allele'],'True','False')

In [139]:
phase_combined_sub.to_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/library/recheck/VariantLibrary_corrected.without_neg_controls.txt', sep='\t',index=False)

In [122]:
phase_combined_dup[phase_combined_dup['allele_type']=='alt']['rsID'].unique()

array(['rs9270921', 'rs2187892', 'rs61270113', 'rs3823544', 'rs9270913',
       'rs9271560', 'rs545401578', 'rs2345571', 'rs9275369', 'rs7198606',
       'rs657769', 'rs223463', 'rs9271153', 'rs223462', 'rs5028523',
       'rs9275371', 'rs35898555', 'rs9271156', 'rs2345572', 'rs544452',
       'rs9271162', 'rs9886296', 'rs11864750', 'rs9270924', 'rs11748158',
       'rs9271568'], dtype=object)

In [137]:
phase_combined[(phase_combined['rsID']=='rs535777')&(phase_combined['method']=='haplotype ')]

Unnamed: 0,rsID,context,ref_allele,seq_allele,count,coordinates,position,ancestry,method,1000G,...,check_allele,allele_type,new_sequence,ref/alt,5prime_adapter,3prime_adapter,final_sequence,final_seq_len,identifier,allele_match
2169,rs535777,CTCAGTTCAG,G,G,219,chr6:32609856-32609856,79,EUR,haplotype,yes,...,G,ref,AAGGAATACATCTCTTTTCAGAGGCAGTACCCACAGACAGTTCTCA...,G/G,TTTCCCTACACGACGCTCTTCCGATCT,AGATCGGAAGAGCACACGTCTGAACTC,TTTCCCTACACGACGCTCTTCCGATCTAAGGAATACATCTCTTTTC...,297,rs535777:ref:G:G:phase1:haplotype,True
3218,rs535777,CTCAGTTCAG,G,C,104,chr6:32609856-32609856,81,EUR,haplotype,yes,...,C,alt,AAGGAATACATCTCTCTTTTCAGAGGCAGTACCCACAGACAGTTCT...,G/C,TTTCCCTACACGACGCTCTTCCGATCT,AGATCGGAAGAGCACACGTCTGAACTC,TTTCCCTACACGACGCTCTTCCGATCTAAGGAATACATCTCTCTTT...,299,rs535777:alt:G:C:phase1:haplotype,True
7872,rs535777,CTCAGTTCAG,G,C,105,chr6:32609856-32609856,122,EUR,haplotype,yes,...,C,alt,GAAATATCACTTGATAAAGAGAGAAAGATGAGTCGTTTTAGAAGGA...,G/C,TTTCCCTACACGACGCTCTTCCGATCT,AGATCGGAAGAGCACACGTCTGAACTC,TTTCCCTACACGACGCTCTTCCGATCTGAAATATCACTTGATAAAG...,299,rs535777:alt:G:C:phase2:haplotype,True
8052,rs535777,CTCAGTTCAG,G,G,219,chr6:32609856-32609856,120,EUR,haplotype,yes,...,G,ref,GAAATATCACTTGATAAAGAGAGAAAGATGAGTCGTTTTAGAAGGA...,G/G,TTTCCCTACACGACGCTCTTCCGATCT,AGATCGGAAGAGCACACGTCTGAACTC,TTTCCCTACACGACGCTCTTCCGATCTGAAATATCACTTGATAAAG...,297,rs535777:ref:G:G:phase2:haplotype,True
10289,rs535777,CTCAGTTCAG,G,C,105,chr6:32609856-32609856,163,EUR,haplotype,yes,...,C,alt,GTTGGATTTACTGAGCACTAATATTCTCATAATCATGCTAGGAAAT...,G/C,TTTCCCTACACGACGCTCTTCCGATCT,AGATCGGAAGAGCACACGTCTGAACTC,TTTCCCTACACGACGCTCTTCCGATCTGTTGGATTTACTGAGCACT...,299,rs535777:alt:G:C:phase3:haplotype,True
10830,rs535777,CTCAGTTCAG,G,G,219,chr6:32609856-32609856,161,EUR,haplotype,yes,...,G,ref,GTCGGATTTACTGAGCACTAATATTCTTATAATCATGCTAGGAAAT...,G/G,TTTCCCTACACGACGCTCTTCCGATCT,AGATCGGAAGAGCACACGTCTGAACTC,TTTCCCTACACGACGCTCTTCCGATCTGTCGGATTTACTGAGCACT...,297,rs535777:ref:G:G:phase3:haplotype,True


In [18]:
if phase_combined_sub_dup['sequence'][7501]==phase_combined_sub_dup['sequence'][11227]:
    print('True')

True


In [48]:
phase_combined_sub_dup_v1 = phase_combined_sub[phase_combined_sub['sequence'].duplicated(keep='first')]
len(phase_combined_sub_dup_v1.groupby(['rsID']).size().reset_index())

17

In [16]:
phase_combined_sub.sequence.drop_duplicates().shape

(14748,)

In [28]:
phase_combined['sequence'].apply(len).max()

275

In [30]:
275-246

29

### Check the final variant sequences sent to Maria

 - Total number of sequences: 17336

 - Total number of negative controls: 570 (3.2%)

 - Total number of library sequences present: 16766 (96.71%)

 - Total number of unique variants: 2277 (excluding negative controls)

#### Duplication:
 - 1084 out of the 16766 sequences (6.4%) are duplicate copies which maps to a total of 2135 sequence-ID pairs (12.73%)

##### Reference Allele Duplication: 
 - From 2135 sequences, 2083 sequences have the allele as ‘Reference’ - which 12.42% of all library sequences and 97.56% of all duplications. 
 - Out of the 2083 sequences (422 unique rsIDs), for 2065 sequences (99.13%) the reference sequence was repeated for both the methods – ‘haplotype’ and ‘substitution’, for the rest 18 sequences (mapped to 3 unique rsIDs - rs112008378, rs2392944, rs73923215), there are 2 different reference alleles – one is a single nucleotide allele while the second one is either a 2 nucleotide or 3 nucleotide allele (but the reference sequence remains the same)

##### Alternate Allele Duplication:
 - From 2135 sequences, 52 sequences have the allele as ‘Alternate’ – which is 0.31% of all library sequences and 2.44% of all duplications. 
 - All these 52 sequences (28 unique rsIDs) were duplicated in the phased information tables from Bill’s analysis. Upon closely examining the variants with duplicated sequences, these were haplotypes where phase2 or phase3 sequence of 1st variant was same as the phase1 sequence of the next variant. 





In [98]:
(52/16766)*100


0.31015149707741857

In [2]:
import pandas as pd
import sys

variant_seq = pd.read_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/library/Ciofani_STARRseqVarLib.txt', sep='\t')
variant_seq.rename(columns={'phase1':'phase'}, inplace=True)
variant_seq['len'] = variant_seq['final_sequence'].apply(len)
variant_seq['ref_allele'] = variant_seq['ref_allele'].str.replace(' ','')


def check_allele(row):
    if row['phase']=='phase1':
        return row['final_sequence'][81+27]
    if row['phase']=='phase2':
        return row['final_sequence'][122+27]
    if row['phase']=='phase3':
        return row['final_sequence'][163+27]
    
            
variant_seq['allele_check'] = variant_seq.apply(check_allele, axis=1)

variant_seq_duplicated = variant_seq[variant_seq.duplicated(subset=['final_sequence'],keep=False)]

### variant sequences that are duplicated with allele type as alternate allele
dup_seq_rsID = variant_seq_duplicated[variant_seq_duplicated['allele_type']=='alt']['rsID'].unique().tolist()
len(dup_seq_rsID)

28

In [71]:
## Checking the duplicate sequences with 'ref allele' 
## Checking if there are sequences that are repeated not because of the method

grp_table_ref_variant = variant_seq_duplicated[variant_seq_duplicated['allele_type']=='ref'].groupby(['rsID','phase','method']).size().reset_index()
grp_table_ref_variant.rename(columns={0:'count'}, inplace=True)
print(grp_table_ref_variant['count'].unique())

[1 2]


In [92]:
## identifying variants with the count=2 - those that are repeated outside of the method issue

ref_variant_exception_list = list(grp_table_ref_variant[grp_table_ref_variant['count']==2]['rsID'].unique())

## pulling out the sequences of the exceptional ref variants from the table

variant_seq_duplicated_excep = variant_seq_duplicated[variant_seq_duplicated['rsID'].isin(ref_variant_exception_list)]
variant_seq_duplicated_excep[variant_seq_duplicated_excep['method']=='haplotype '].shape

(18, 9)

In [5]:
## identify ref_allele rsID-phase pairs that are duplicated

variant_seq_duplicated_ref = variant_seq_duplicated[variant_seq_duplicated['allele_type']=='ref'].sort_values(by='rsID')
variant_seq_duplicated_ref[['rsID','phase']].drop_duplicates()


Unnamed: 0,rsID,phase
15395,rs1003521,phase1
11373,rs10051765,phase3
1526,rs10051765,phase1
6449,rs10051765,phase2
16576,rs10103637,phase3
15242,rs10103637,phase1
7964,rs10103637,phase2
16577,rs10103640,phase3
15243,rs10103640,phase1
15900,rs10103640,phase2


In [152]:
## counting the variants with allele type as 'alt'

variant_seq_duplicated[variant_seq_duplicated['allele_type']=='alt'].groupby(['rsID','phase']).size().reset_index().sort_values(by=['rsID'])

Unnamed: 0,rsID,phase,0
0,rs11748158,phase3,1
1,rs11864750,phase3,1
2,rs2187892,phase1,1
3,rs2187892,phase2,1
4,rs223462,phase2,1
5,rs223462,phase3,1
6,rs223463,phase1,1
7,rs223463,phase2,1
8,rs2345571,phase1,1
9,rs2345571,phase2,1


In [168]:
dup_seq_rsID_ref = variant_seq_duplicated[variant_seq_duplicated['allele_type']=='ref']['rsID'].unique().tolist()
len(dup_seq_rsID_ref)

422

In [105]:
variant_seq_duplicated[variant_seq_duplicated['allele_type']=='alt']

(52, 9)

In [173]:
phase_combined_dup_rsID = phase_combined_sub_dup['rsID'].unique().tolist()
len(phase_combined_dup_rsID)

33

In [101]:
list_one = dup_seq_rsID
list_two = phase_combined_dup_rsID

one_not_two = set(list_one).difference(list_two)
two_not_one = set(list_two).difference(list_one)

print(one_not_two)
print(two_not_one)

set()
{'rs34965214', 'rs9272443', 'rs1129735', 'rs9272435', 'rs28383460'}


In [171]:
list_one = dup_seq_rsID_ref
list_two = phase_combined_dup_rsID

one_not_two = set(list_one).difference(list_two)
two_not_one = set(list_two).difference(list_one)

len(one_not_two)


389

#### Creating the new table with modified identifiers

 - Tag 'ref' sequences that duplicated by methods in the identifier
 - Annotate in the identifier if there are multiple reference alleles

In [96]:
import pandas as pd

phase_table = pd.read_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/library/recheck/VariantLibrary_corrected.full_table.without_neg_controls.txt', sep='\t')
phase_table['rsID_phase'] = phase_table['rsID']+'_'+phase_table['allele_type']+'_'+phase_table['phase']
phase_table_ref = phase_table[phase_table['allele_type']=='ref']
phase_table_alt = phase_table[phase_table['allele_type']=='alt']

phase_table_ref_sub = phase_table_ref[['identifier','final_sequence','rsID_phase']]
phase_table_alt_sub = phase_table_alt[['identifier','final_sequence','rsID_phase']]

variant_table = pd.read_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/library/Ciofani_STARRseqVarLib.txt', sep='\t')
variant_table.rename(columns={'phase1':'phase'}, inplace=True)
variant_table['ref_allele'] = variant_table['ref_allele'].str.replace(' ','')
variant_table['identifier'] = variant_table['rsID']+':'+variant_table['allele_type']+':'+variant_table['ref_allele']+':'+variant_table['ref/alt']+':'+variant_table['phase']+':'+variant_table['method']
variant_table['rsID_phase'] = variant_table['rsID']+'_'+variant_table['allele_type']+'_'+variant_table['phase']
variant_table_ref = variant_table[variant_table['allele_type']=='ref']
variant_table_ref_dup_list = variant_table_ref[variant_table_ref['final_sequence'].duplicated(keep=False)]['rsID_phase'].unique()

phase_table_variant_table = phase_table_ref_sub[phase_table_ref_sub['rsID_phase'].isin(variant_table_ref_dup_list)]
phase_table_notin_variant_table = phase_table_ref_sub[~(phase_table_ref_sub['rsID_phase'].isin(variant_table_ref_dup_list))]


phase_table_variant_table_v1 = phase_table_variant_table[phase_table_variant_table['identifier'].str.contains('haplotype')]
phase_table_variant_table_v2 = phase_table_variant_table[phase_table_variant_table['identifier'].str.contains('substitution')]


phase_table_variant_table_v1['identifier'] = phase_table_variant_table_v1['identifier'].replace({'haplotype': 'haplotype/substitution'}, regex=True)
phase_table_variant_table_v2['identifier'] = phase_table_variant_table_v2['identifier'].replace({'substitution': 'haplotype/substitution'}, regex=True)



phase_table_variant_table_new = pd.concat([phase_table_variant_table_v1,phase_table_variant_table_v2,phase_table_notin_variant_table,phase_table_alt_sub])
phase_table_variant_table_new[['identifier','final_sequence']].to_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/library/recheck/VariantLibrary.without_negative_controls.corrected_identifiers.txt', sep='\t', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [89]:
phase_table_variant_table_new.loc[phase_table_variant_table_new['identifier'].str.contains('rs112008378'), ['identifier']].replace({'ref:C:C':'ref:C,CG:C'}, regex=False)


Unnamed: 0,identifier
1990,rs112008378:ref:C:C:phase1:haplotype/substitut...
7242,rs112008378:ref:C:C:phase2:haplotype/substitut...
12881,rs112008378:ref:C:C:phase3:haplotype/substitut...
974,rs112008378:alt:C:A:phase1:haplotype
1909,rs112008378:alt:C:A:phase1:substitution
3340,rs112008378:alt:CG:C:phase1:haplotype
6403,rs112008378:alt:C:A:phase2:haplotype
7576,rs112008378:alt:CG:C:phase2:haplotype
9709,rs112008378:alt:C:A:phase2:substitution
9958,rs112008378:alt:C:A:phase3:haplotype


In [None]:
rsID = ['rs112008378/CG', 'rs2392944/CT', 'rs73923215/AGC']

In [101]:
!ls /data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/library/recheck/VariantLibrary.without_negative_controls.corrected_identifiers.v1.txt

/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/library/recheck/VariantLibrary.without_negative_controls.corrected_identifiers.v1.txt


In [100]:
seq = 'TTTCCCTACACGACGCTCTTCCGATCTTGGGAGCTGGGGTTTCCCCGCCTCGGCAACAGCCTTCTCCTTGTCTTCCACCTCCTGCCATTCGCAGCGAGGACTCGCGCGAGCGCGCGCGCGCACACACACACACACACACACACACACGCACACACACACACACACACACTTTGGAATCACCCGGACCCCCTCAAAACGGGTCTCGGCTCCCCACCCCGCTGGCCGCTGGGTGTCGCTAGAGTCTAGCCCACGGCGCGTATTCCTGGCCCCGCAGATCGGAAGAGCACACGTCTGAACTC'
seq[81+27:84+27]

'AGC'

In [25]:
len(variant_table_ref[variant_table_ref['final_sequence'].duplicated(keep=False)]['rsID_phase'].unique())

1051

In [5]:
phase_table_ref_sub[phase_table_ref_sub['identifier'].str.contains("substitution")].sort_values(by='rsID_phase')

Unnamed: 0,identifier,final_sequence,rsID_phase
189,rs1048173:ref:A:A:phase1:substitution,TTTCCCTACACGACGCTCTTCCGATCTCAAGTCTCCCGTGACACTG...,rs1048173_phase1
7144,rs1048173:ref:A:A:phase2:substitution,TTTCCCTACACGACGCTCTTCCGATCTTGTTTCCTCACCACAGAGG...,rs1048173_phase2
10438,rs1048173:ref:A:A:phase3:substitution,TTTCCCTACACGACGCTCTTCCGATCTCAGAACTTCAGGGCAGAGC...,rs1048173_phase3
140,rs1050414:ref:C:C:phase1:substitution,TTTCCCTACACGACGCTCTTCCGATCTGTCAGCCTGTGCCTGGCGC...,rs1050414_phase1
5722,rs1050414:ref:C:C:phase2:substitution,TTTCCCTACACGACGCTCTTCCGATCTCTCTGGTTGTAGTAGCCGC...,rs1050414_phase2
12787,rs1050414:ref:C:C:phase3:substitution,TTTCCCTACACGACGCTCTTCCGATCTTCGTGACCTGCGCCCCGGG...,rs1050414_phase3
4708,rs10575904:ref:ATTAATTAAT:ATTAATTAAT:phase1:su...,TTTCCCTACACGACGCTCTTCCGATCTGAGGCTGTAGTAAGCCATG...,rs10575904_phase1
8981,rs10575904:ref:ATTAATTAAT:ATTAATTAAT:phase2:su...,TTTCCCTACACGACGCTCTTCCGATCTCTACTCAGGAGGCTGAGAT...,rs10575904_phase2
12321,rs10575904:ref:ATTAATTAAT:ATTAATTAAT:phase3:su...,TTTCCCTACACGACGCTCTTCCGATCTTTTAGTTAGACAAGCAAGA...,rs10575904_phase3
685,rs10582618:ref:AGAG:AGAG:phase1:substitution,TTTCCCTACACGACGCTCTTCCGATCTCTAGGGCATGGGCAGCATG...,rs10582618_phase1


In [6]:
1051/2

525.5

In [30]:
variant_table_ref = variant_table[variant_table['allele_type']=='ref']
variant_table_ref_dup_list = variant_table_ref[variant_table_ref['final_sequence'].duplicated(keep=False)]['rsID_phase'].unique()


In [15]:
phase_table_ref_sub.merge(variant_table_ref_sub, on=['rsID_phase','final_sequence'], how='inner')

Unnamed: 0,identifier_x,final_sequence,rsID_phase,identifier_y
0,rs4934730:ref:A:A:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTCAGAGGTGAGACGCTCGCG...,rs4934730_phase1,rs4934730:ref:A:G/A:phase1:haplotype
1,rs4934730:ref:A:A:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTCAGAGGTGAGACGCTCGCG...,rs4934730_phase1,rs4934730:ref:A:G/A:phase1:substitution
2,rs2288480:ref:C:C:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTGTGTGGGCGTGGCCAGGGA...,rs2288480_phase1,rs2288480:ref:C:G/C:phase1:haplotype
3,rs2288480:ref:C:C:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTGTGTGGGCGTGGCCAGGGA...,rs2288480_phase1,rs2288480:ref:C:G/C:phase1:substitution
4,rs2872516:ref:C:C:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTTTGCCTCAGCCTCCTGAGT...,rs2872516_phase1,rs2872516:ref:C:C/T:phase1:haplotype
5,rs139549144:ref:C:C:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTTCTGCGCGGCCCGTCGCCT...,rs139549144_phase1,rs139549144:ref:C:G/C:phase1:haplotype
6,rs2736347:ref:G:G:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTAGGAGCAGATCAAAGTGAA...,rs2736347_phase1,rs2736347:ref:G:G/A:phase1:haplotype
7,rs34698021:ref:CCCC:CCCC:phase1:substitution,TTTCCCTACACGACGCTCTTCCGATCTAAGAAAAGTAGGTGACCAG...,rs34698021_phase1,rs34698021:ref:CCCC:CCCC/CCC:phase1:substitution
8,rs658676:ref:C:C:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTCTCTCAGAAAAAAAAAAAA...,rs658676_phase1,rs658676:ref:C:C/T:phase1:haplotype
9,rs73243351:ref:G:G:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTCAGAAGTGAGACTTCTGAA...,rs73243351_phase1,rs73243351:ref:G:G/A:phase1:haplotype


In [14]:
variant_table_ref_sub

Unnamed: 0,identifier,final_sequence,rsID_phase
1,rs61766198:ref:C:C/T:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTGGCGGCGGGCCGGCACTTT...,rs61766198_phase1
3,rs2477678:ref:T:A/T:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTTGCTTGGTGCCTAGCGCCA...,rs2477678_phase1
4,rs1974044:ref:A:G/A:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTTCCTGGGATGTTGCTTAGG...,rs1974044_phase1
6,rs2227312:ref:C:A/C:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTTTTACCCTGTTCAGCAGAA...,rs2227312_phase1
10,rs2227313:ref:T:T/C:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTTGAGTGTGTGAGTGAGGTG...,rs2227313_phase1
11,rs4870:ref:A:G/A:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTGGGTTCCCGAGCTGCCGGT...,rs4870_phase1
13,rs1886730:ref:T:C/T:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTTCCCCGGCAGCCCTGACCT...,rs1886730_phase1
15,rs60733400:ref:G:G/A:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTTCTCCTGCACAGCCTGAGA...,rs60733400_phase1
18,rs9970196:ref:T:T/C:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTCATCTCTGACCCAGTTTTT...,rs9970196_phase1
20,rs4648665:ref:A:A/G:phase1:substitution,TTTCCCTACACGACGCTCTTCCGATCTGTGAGCATCTGACAGACTG...,rs4648665_phase1


#### Check the oligos not recovered 

Check whether the oligos not recovered from Alex's alignments had "-" in their sequences. To check compare it with the variant tables. 

In [39]:
import pandas as pd

unrecovered_oligos = pd.read_csv('/data/reddylab/Alex/collab/20240325_Deb/data/starr_seq/metadata/Ciofani_STARRseqVarLib_combined.corrected.oligos_not_recovered.txt', sep='\t', names=['identifier'])
unrecovered_oligos_list = list(unrecovered_oligos['identifier'])

variant_library = pd.read_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/library/recheck/VariantLibrary.without_negative_controls.corrected_identifiers.v1.txt', sep='\t')
variant_library['identifier'] = variant_library['identifier'].str.replace(' ','')

unrecovered_variant_library = variant_library[variant_library['identifier'].isin(unrecovered_oligos_list)]
unrecovered_variant_library[unrecovered_variant_library['final_sequence'].str.contains("N")]


Unnamed: 0,identifier,final_sequence
1829,rs2993833:ref:G:G:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTGCCGGGGCGCGGGCGCTCC...
4873,rs2993833:ref:G:G:phase2:haplotype,TTTCCCTACACGACGCTCTTCCGATCTGCTGGTGCCCTCTGAAGCG...
8641,rs2993833:alt:G:A:phase1:haplotype,TTTCCCTACACGACGCTCTTCCGATCTGCCGGGGCGCGGGCGCTCC...
9725,rs2993833:alt:G:A:phase2:haplotype,TTTCCCTACACGACGCTCTTCCGATCTGCTGGTGCCCTCTGAAGCG...


In [41]:
variant_library_sequence_del = variant_library[variant_library['final_sequence'].str.contains("-")]

In [42]:
unrecovered_variant_library.merge(variant_library_sequence_del, on=['identifier'], how='inner')

Unnamed: 0,identifier,final_sequence_x,final_sequence_y


In [44]:
variant_library_sequence_del['new_seq'] = variant_library_sequence_del['final_sequence'].str.replace('-','')
variant_library_sequence_del['new_seq_len'] = variant_library_sequence_del['new_seq'].apply(len)
variant_library_sequence_del['final_seq_len'] = variant_library_sequence_del['final_sequence'].apply(len)
variant_library_sequence_del

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,identifier,final_sequence,new_seq,new_seq_len,final_seq_len
6946,rs77442041:alt:A:-:phase1:substitution,TTTCCCTACACGACGCTCTTCCGATCTCTGCAGGAAGGAGGGTTCT...,TTTCCCTACACGACGCTCTTCCGATCTCTGCAGGAAGGAGGGTTCT...,298,299
7125,rs34455012:alt:CCAGTAT:-:phase1:substitution,TTTCCCTACACGACGCTCTTCCGATCTGCTTCCTCTCAGTCAGCAC...,TTTCCCTACACGACGCTCTTCCGATCTGCTTCCTCTCAGTCAGCAC...,292,293
7433,rs150694166:alt:CA:-:phase1:substitution,TTTCCCTACACGACGCTCTTCCGATCTATTTAAAACTAACAGCCAA...,TTTCCCTACACGACGCTCTTCCGATCTATTTAAAACTAACAGCCAA...,297,298
7699,rs147899174:alt:T:-:phase1:substitution,TTTCCCTACACGACGCTCTTCCGATCTTCAGTTTTTCCTATGGTGA...,TTTCCCTACACGACGCTCTTCCGATCTTCAGTTTTTCCTATGGTGA...,298,299
8069,rs35593987:alt:C:-:phase1:substitution,TTTCCCTACACGACGCTCTTCCGATCTCGCCTCTCCCAGGGAAGGT...,TTTCCCTACACGACGCTCTTCCGATCTCGCCTCTCCCAGGGAAGGT...,298,299
8156,rs57740008:alt:A:-:phase1:substitution,TTTCCCTACACGACGCTCTTCCGATCTTGGAGAACTGGTGGGCTAC...,TTTCCCTACACGACGCTCTTCCGATCTTGGAGAACTGGTGGGCTAC...,298,299
8260,rs71822640:alt:GG:-:phase1:substitution,TTTCCCTACACGACGCTCTTCCGATCTCCTGTGTGGGATGCAAAAA...,TTTCCCTACACGACGCTCTTCCGATCTCCTGTGTGGGATGCAAAAA...,296,297
8423,rs140670679:alt:AG:-:phase1:substitution,TTTCCCTACACGACGCTCTTCCGATCTTACTAGGCTCAGGAAATAG...,TTTCCCTACACGACGCTCTTCCGATCTTACTAGGCTCAGGAAATAG...,297,298
8650,rs3834458:alt:T:-:phase1:substitution,TTTCCCTACACGACGCTCTTCCGATCTCTGCCTTCCCCTCCCTTCC...,TTTCCCTACACGACGCTCTTCCGATCTCTGCCTTCCCCTCCCTTCC...,298,299
8839,rs34561244:alt:C:-:phase1:substitution,TTTCCCTACACGACGCTCTTCCGATCTGTATAGAGTATCAGTAAAG...,TTTCCCTACACGACGCTCTTCCGATCTGTATAGAGTATCAGTAAAG...,298,299


In [46]:
unrecovered_oligos[unrecovered_oligos['identifier'].str.contains('rs2993833')]

Unnamed: 0,identifier
8,rs2993833:ref:G:G:phase2:haplotype
14,rs2993833:ref:G:G:phase1:haplotype
19,rs2993833:alt:G:A:phase2:haplotype
37,rs2993833:alt:G:A:phase1:haplotype
