In [5]:
import pandas as pd

# 1. Convert the raw 23andMe data to vcf

We will remove all variants that correspond to insertions and deletions (indels), to make the file compatible with annotation tools:

`../plink_mac_20201019/plink --23file data/SNP_raw_v4_Full_20170514175358.txt --recode vcf --out snps_clean --output-chr MT --snps-only just-acgt`

We are interested only in clinically relevant SNPs, so we will intersect the result with the VCF from the ClinVar database (https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz).

Download and unzip:

`wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz`

`gzip -d clinvar.vcf.gz`

Intersect:

`bedtools intersect -a snps_clean.vcf -b clinvar.vcf  > intersected.vcf`

# 2. Annotation

We will use VEP (Variant Effect Predictor) online version (http://grch37.ensembl.org/Homo_sapiens/Tools/VEP).

Input:

<img src="img/vep.png">

The result is saved in `data/vep_result.txt`. Now let's look at it:

In [7]:
# Load the tab-separated VEP annotation table into a DataFrame.
vep_result_path = 'data/vep_result.txt'
df = pd.read_csv(vep_result_path, sep='\t')

# Preview the first five rows to check that the columns were parsed.
df.head()

Unnamed: 0,#Uploaded_variation,Location,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,...,AF,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS
0,rs2843159,1:2235672-2235672,T,intron_variant,MODIFIER,SKI,ENSG00000157933,Transcript,ENST00000378536.4,protein_coding,...,0.3498,-,-,-,1632788422984993,-,-,-,-,-
1,rs2843159,1:2235672-2235672,T,downstream_gene_variant,MODIFIER,SKI,ENSG00000157933,Transcript,ENST00000478223.2,processed_transcript,...,0.3498,-,-,-,1632788422984993,-,-,-,-,-
2,rs2843159,1:2235672-2235672,T,"intron_variant,non_coding_transcript_variant",MODIFIER,SKI,ENSG00000157933,Transcript,ENST00000507179.1,retained_intron,...,0.3498,-,-,-,1632788422984993,-,-,-,-,-
3,rs2843159,1:2235672-2235672,T,downstream_gene_variant,MODIFIER,SKI,ENSG00000157933,Transcript,ENST00000508416.1,processed_transcript,...,0.3498,-,-,-,1632788422984993,-,-,-,-,-
4,rs2234167,1:2494330-2494330,G,missense_variant,MODERATE,TNFRSF14,ENSG00000157873,Transcript,ENST00000355716.4,protein_coding,...,-,-,-,-,-,-,-,-,-,-


Now we will look at the `CLIN_SIG` column and try to find SNPs tagged `pathogenic` or `risk_factor`.

In [13]:
# Collect every distinct clinical-significance label.
# A single CLIN_SIG value may hold several comma-separated labels,
# so each unique value is split before the labels are gathered into a set.
# NOTE(review): the displayed result contains both 'risk_factor' and
# '_risk_factor' (likewise 'other' / '_other') -- presumably the same label
# with a stray leading underscore; confirm whether these should be merged.
clin_sigs = {
    label
    for entry in df['CLIN_SIG'].unique()
    for label in entry.split(',')
}

clin_sigs

{'-',
 '_other',
 '_risk_factor',
 'affects',
 'association',
 'association_not_found',
 'benign',
 'benign/likely_benign',
 'conflicting_interpretations_of_pathogenicity',
 'drug_response',
 'likely_benign',
 'likely_pathogenic',
 'not_provided',
 'other',
 'pathogenic',
 'protective',
 'risk_factor',
 'uncertain_significance'}

We will look at the `pathogenic` tag.

In [54]:
# Show only the rows whose SOMATIC value differs from '-'
# ('-' appears to mark an empty field in this table -- TODO confirm).
df.loc[df['SOMATIC'].ne('-')]

Unnamed: 0,#Uploaded_variation,Location,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,...,AF,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS
70,rs1135172,1:11301714-11301714,G,synonymous_variant,LOW,MTOR,ENSG00000198793,Transcript,ENST00000361445.4,protein_coding,...,0.6396,benign,01,11,3048774828977864,-,-,-,-,-
71,rs1135172,1:11301714-11301714,G,regulatory_region_variant,MODIFIER,-,-,RegulatoryFeature,ENSR00001492400,promoter_flanking_region,...,0.6396,benign,01,11,3048774828977864,-,-,-,-,-
72,rs1135172,1:11301714-11301714,G,synonymous_variant,LOW,MTOR,ENSG00000198793,Transcript,ENST00000361445.4,protein_coding,...,0.6396,benign,01,11,3048774828977864,-,-,-,-,-
73,rs1135172,1:11301714-11301714,G,regulatory_region_variant,MODIFIER,-,-,RegulatoryFeature,ENSR00001492400,promoter_flanking_region,...,0.6396,benign,01,11,3048774828977864,-,-,-,-,-
209,rs6687605,1:25889632-25889632,T,missense_variant,MODERATE,LDLRAP1,ENSG00000157978,Transcript,ENST00000374338.4,protein_coding,...,-,-,1,1,-,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18772,rs1183768,9:135203231-135203231,T,missense_variant,MODERATE,SETX,ENSG00000107290,Transcript,ENST00000224140.5,protein_coding,...,0.5561,benign,01,11,25741868240332662375720225382069,-,-,-,-,-
18773,rs1183768,9:135203231-135203231,T,missense_variant,MODERATE,SETX,ENSG00000107290,Transcript,ENST00000372169.2,protein_coding,...,0.5561,benign,01,11,25741868240332662375720225382069,-,-,-,-,-
18774,rs1183768,9:135203231-135203231,T,missense_variant,MODERATE,SETX,ENSG00000107290,Transcript,ENST00000393220.1,protein_coding,...,0.5561,benign,01,11,25741868240332662375720225382069,-,-,-,-,-
18872,rs2229971,9:139407932-139407932,G,synonymous_variant,LOW,NOTCH1,ENSG00000148400,Transcript,ENST00000277541.6,protein_coding,...,0.5278,"likely_benign,benign",01,11,16729972,-,-,-,-,-


In [62]:
# Select the annotation rows whose CLIN_SIG text matches any search term,
# then keep one row per variant ID (the first row of each group).
# NOTE(review): str.contains does a regex *substring* match, so 'pathogenic'
# also selects 'likely_pathogenic' and
# 'conflicting_interpretations_of_pathogenicity' -- confirm this is intended.
search_for = ['pathogenic']
clin_sig_pattern = '|'.join(search_for)
matches_term = df['CLIN_SIG'].str.contains(clin_sig_pattern)

per_variant = df[matches_term].groupby('#Uploaded_variation').first()

# Keep only the columns needed to look the variants up later.
pathogenic_df = per_variant[['Location', 'Allele', 'Codons', 'CLIN_SIG']]
pathogenic_df['CLIN_SIG']

#Uploaded_variation
i5005436                                             pathogenic
i5006568      likely_benign,benign/likely_benign,pathogenic,...
i6015290      uncertain_significance,likely_benign,conflicti...
i6015729                 pathogenic,benign/likely_benign,benign
i6058764                                  protective,pathogenic
i6060296      conflicting_interpretations_of_pathogenicity,b...
rs10151259         not_provided,pathogenic,likely_benign,benign
rs1024611                                pathogenic,risk_factor
rs1042503                              likely_pathogenic,benign
rs11085825                             benign,likely_pathogenic
rs11558492                      pathogenic,likely_benign,benign
rs16879498                                    benign,pathogenic
rs1800435                              likely_benign,pathogenic
rs2004640                                risk_factor,pathogenic
rs2301612                                     pathogenic,benign
rs2306283     confli

We will look up these SNPs in dbSNP (https://www.ncbi.nlm.nih.gov/snp/).

In [63]:
# Display the per-variant table (Location, Allele, Codons, CLIN_SIG) held in pathogenic_df.
pathogenic_df

Unnamed: 0_level_0,Location,Allele,Codons,CLIN_SIG
#Uploaded_variation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
i5005436,6:32008312-32008312,T,-,pathogenic
i5006568,2:71829924-71829924,G,Atc/Gtc,"likely_benign,benign/likely_benign,pathogenic,..."
i6015290,14:23887607-23887607,T,aaC/aaA,"uncertain_significance,likely_benign,conflicti..."
i6015729,1:156848918-156848918,T,Cat/Tat,"pathogenic,benign/likely_benign,benign"
i6058764,16:27356203-27356203,G,Atc/Gtc,"protective,pathogenic"
i6060296,19:13010520-13010520,G,-,"conflicting_interpretations_of_pathogenicity,b..."
rs10151259,14:21790040-21790040,T,Gct/Tct,"not_provided,pathogenic,likely_benign,benign"
rs1024611,17:32579788-32579788,G,-,"pathogenic,risk_factor"
rs1042503,12:103246700-103246700,T,gtG/gtA,"likely_pathogenic,benign"
rs11085825,19:13007458-13007458,T,-,"benign,likely_pathogenic"


In [64]:
# Same selection as for the pathogenic tag, now for 'risk_factor':
# keep rows whose CLIN_SIG text matches, one row per variant ID.
# NOTE: the result is stored under the existing name `pathogenic_df`,
# although it now holds the risk_factor variants.
search_for = ['risk_factor']
clin_sig_pattern = '|'.join(search_for)
matches_term = df['CLIN_SIG'].str.contains(clin_sig_pattern)

per_variant = df[matches_term].groupby('#Uploaded_variation').first()

# Keep only the columns needed to look the variants up later.
pathogenic_df = per_variant[['Location', 'Allele', 'Codons', 'CLIN_SIG']]
pathogenic_df['CLIN_SIG']

#Uploaded_variation
i3000469                                            risk_factor
i6007787                                     risk_factor,benign
i6058143                       drug_response,benign,risk_factor
i6059141                       likely_benign,benign,risk_factor
rs1024611                                pathogenic,risk_factor
rs1049296                        risk_factor,association,benign
rs1169288                                    benign,risk_factor
rs12150220                                          risk_factor
rs13266634                                          risk_factor
rs1801197                                           risk_factor
rs1801274                      drug_response,benign,risk_factor
rs1801275                                           risk_factor
rs1801394     uncertain_significance,benign,drug_response,ri...
rs1801968                      likely_benign,benign,risk_factor
rs2004640                                risk_factor,pathogenic
rs2073658           

In [65]:
# Display the per-variant table held in pathogenic_df; when the cells run in
# order, this name was last assigned by the 'risk_factor' search.
pathogenic_df

Unnamed: 0_level_0,Location,Allele,Codons,CLIN_SIG
#Uploaded_variation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
i3000469,2:138759649-138759649,T,aCa/aTa,risk_factor
i6007787,2:234183368-234183368,G,Act/Gct,"risk_factor,benign"
i6058143,1:161479745-161479745,G,cAt/cGt,"drug_response,benign,risk_factor"
i6059141,8:133909974-133909974,G,Atg/Gtg,"likely_benign,benign,risk_factor"
rs1024611,17:32579788-32579788,G,-,"pathogenic,risk_factor"
rs1049296,3:133494354-133494354,T,Cct/Tct,"risk_factor,association,benign"
rs1169288,12:121416650-121416650,C,Atc/Ctc,"benign,risk_factor"
rs12150220,17:5485367-5485367,T,cTc/cAc,risk_factor
rs13266634,8:118184783-118184783,T,Cgg/Tgg,risk_factor
rs1801197,7:93055753-93055753,G,cTg/cCg,risk_factor
