In [13]:
import pandas as pd

# File path
vep_file = "/home/subhashree/Project/CTX-Bioinformatics-Intern-Assignment/test1_data_1_vep.tsv"

# Read VEP TSV file and skip '##' metadata lines
df_raw = pd.read_csv(
    vep_file,
    sep="\t",
    comment='##',  # skip metadata lines
    engine='python'
)

# Remove leading '#' from header if present
df_raw.rename(columns=lambda x: x.lstrip('#'), inplace=True)

# Save cleaned dataframe starting from '#Uploaded_variation'
df = df_raw.copy()
df.head()


Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,...,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS
0,1_871334_G/T,1:871334,T,ENSG00000234711,ENST00000415481,Transcript,upstream_gene_variant,-,-,-,...,-,-,-,-,-,-,-,-,-,-
1,1_871334_G/T,1:871334,T,ENSG00000230368,ENST00000427857,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,...,-,-,-,-,-,-,-,-,-,-
2,1_871334_G/T,1:871334,T,ENSG00000230368,ENST00000432963,Transcript,upstream_gene_variant,-,-,-,...,-,-,-,-,-,-,-,-,-,-
3,1_871334_G/T,1:871334,T,ENSG00000230368,ENST00000446136,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,...,-,-,-,-,-,-,-,-,-,-
4,1_871334_G/T,1:871334,T,ENSG00000283040,ENST00000635557,Transcript,downstream_gene_variant,-,-,-,...,-,-,-,-,-,-,-,-,-,-


In [14]:
# Total variants
total_variants = len(df)
print("Total variants:", total_variants)

# Unique genes
unique_genes = df['Gene'].nunique()
print("Unique genes:", unique_genes)


Total variants: 228055
Unique genes: 14674


In [15]:
# Extract traits from PHENO column
phenos = df['PHENO'].dropna().astype(str)
unique_traits = set()
for p in phenos:
    for trait in p.replace(';', ',').split(','):
        trait = trait.strip()
        if trait:
            unique_traits.add(trait)

num_unique_traits = len(unique_traits)
print("Unique traits/disease conditions:", num_unique_traits)


Unique traits/disease conditions: 3


In [16]:
df["CLIN_SIG"].value_counts()

-                                                                    226521
benign                                                                 1078
uncertain_significance                                                   98
likely_benign,benign                                                     98
likely_benign                                                            89
benign,likely_benign                                                     61
not_provided,benign                                                      54
benign,not_provided                                                      27
benign/likely_benign,benign                                              16
likely_benign,conflicting_interpretations_of_pathogenicity,benign        13
Name: CLIN_SIG, dtype: int64

In [17]:
df_patho = df[df["CLIN_SIG"].str.contains("pathogenic", case=False, na=False)]
df_patho

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,...,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS
116551,11_534242_A/G,11:534242,G,ENSG00000161328,ENST00000270115,Transcript,upstream_gene_variant,-,-,-,...,gnomADg_AFR,"likely_benign,conflicting_interpretations_of_p...",11,111,"25741868,24033266,33390813,23757202,20569235,2...",-,-,-,-,-
116552,11_534242_A/G,11:534242,G,ENSG00000174775,ENST00000311189,Transcript,synonymous_variant,295,81,27,...,gnomADg_AFR,"likely_benign,conflicting_interpretations_of_p...",11,111,"25741868,24033266,33390813,23757202,20569235,2...",-,-,-,-,-
116553,11_534242_A/G,11:534242,G,ENSG00000174775,ENST00000397594,Transcript,synonymous_variant,134,81,27,...,gnomADg_AFR,"likely_benign,conflicting_interpretations_of_p...",11,111,"25741868,24033266,33390813,23757202,20569235,2...",-,-,-,-,-
116554,11_534242_A/G,11:534242,G,ENSG00000174775,ENST00000397596,Transcript,synonymous_variant,217,81,27,...,gnomADg_AFR,"likely_benign,conflicting_interpretations_of_p...",11,111,"25741868,24033266,33390813,23757202,20569235,2...",-,-,-,-,-
116555,11_534242_A/G,11:534242,G,ENSG00000174775,ENST00000417302,Transcript,synonymous_variant,295,81,27,...,gnomADg_AFR,"likely_benign,conflicting_interpretations_of_p...",11,111,"25741868,24033266,33390813,23757202,20569235,2...",-,-,-,-,-
116556,11_534242_A/G,11:534242,G,ENSG00000174775,ENST00000451590,Transcript,synonymous_variant,269,81,27,...,gnomADg_AFR,"likely_benign,conflicting_interpretations_of_p...",11,111,"25741868,24033266,33390813,23757202,20569235,2...",-,-,-,-,-
116557,11_534242_A/G,11:534242,G,ENSG00000174775,ENST00000462734,Transcript,"synonymous_variant,NMD_transcript_variant",1120,81,27,...,gnomADg_AFR,"likely_benign,conflicting_interpretations_of_p...",11,111,"25741868,24033266,33390813,23757202,20569235,2...",-,-,-,-,-
116558,11_534242_A/G,11:534242,G,ENSG00000174775,ENST00000468682,Transcript,synonymous_variant,569,81,27,...,gnomADg_AFR,"likely_benign,conflicting_interpretations_of_p...",11,111,"25741868,24033266,33390813,23757202,20569235,2...",-,-,-,-,-
116559,11_534242_A/G,11:534242,G,ENSG00000174775,ENST00000479482,Transcript,upstream_gene_variant,-,-,-,...,gnomADg_AFR,"likely_benign,conflicting_interpretations_of_p...",11,111,"25741868,24033266,33390813,23757202,20569235,2...",-,-,-,-,-
116560,11_534242_A/G,11:534242,G,ENSG00000174775,ENST00000482021,Transcript,downstream_gene_variant,-,-,-,...,gnomADg_AFR,"likely_benign,conflicting_interpretations_of_p...",11,111,"25741868,24033266,33390813,23757202,20569235,2...",-,-,-,-,-


In [18]:
df_likely_patho = df[df["CLIN_SIG"].str.contains("likely_pathogenic", case=False, na=False)]
df_likely_patho

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,...,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS


In [19]:
print("Variants with pathogenicity:", df_patho["Uploaded_variation"].nunique())

Variants with pathogenicity: 1


In [20]:
print("Variants with likely pathogenicity:", df_likely_patho["Uploaded_variation"].nunique())

Variants with likely pathogenicity: 0


In [21]:
#No. of variants per chromosome
df[['chr', 'position']] = df['Location'].str.split(':', expand=True)
df["chr"].value_counts()

1     22067
19    18625
11    18059
17    15733
2     14747
3     13761
12    11830
6     11408
4     10548
16    10547
5     10295
7      9355
10     9150
14     8390
9      8352
15     7696
8      6101
22     5717
20     4435
13     3821
18     2926
21     2450
X      2040
Y         2
Name: chr, dtype: int64

In [22]:
# Identify rows that do NOT start with 'rs'
no_rs = df[~df['Uploaded_variation'].str.startswith('rs', na=False)]

# Optional: rows with rsIDs
with_rs = df[df['Uploaded_variation'].str.startswith('rs', na=False)]

# Print or save
print("Variants without rsIDs:")
no_rs["Uploaded_variation"].nunique()


Variants without rsIDs:


30726

In [23]:
print("Variants with rsIDs:")
with_rs["Uploaded_variation"].nunique()

Variants with rsIDs:


0