In [1]:
import pandas as pd

# File path
vep_file = "/home/subhashree/Project/CTX-Bioinformatics-Intern-Assignment/test2_data_1_vep.tsv"

# Read VEP TSV file and skip '##' metadata lines
df_raw = pd.read_csv(
    vep_file,
    sep="\t",
    comment='##',  # skip metadata lines
    engine='python'
)

# Remove leading '#' from header if present
df_raw.rename(columns=lambda x: x.lstrip('#'), inplace=True)

# Save cleaned dataframe starting from '#Uploaded_variation'
df = df_raw.copy()
df.head()


Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,...,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS
0,rs3087742,MT:73,G,ENSG00000198888,ENST00000361390,Transcript,upstream_gene_variant,-,-,-,...,-,-,-,-,34002094354537883799872537881602,-,-,-,-,-
1,rs3087742,MT:73,G,ENSG00000198763,ENST00000361453,Transcript,upstream_gene_variant,-,-,-,...,-,-,-,-,34002094354537883799872537881602,-,-,-,-,-
2,rs3087742,MT:73,G,ENSG00000209082,ENST00000386347,Transcript,upstream_gene_variant,-,-,-,...,-,-,-,-,34002094354537883799872537881602,-,-,-,-,-
3,rs3087742,MT:73,G,ENSG00000210049,ENST00000387314,Transcript,upstream_gene_variant,-,-,-,...,-,-,-,-,34002094354537883799872537881602,-,-,-,-,-
4,rs3087742,MT:73,G,ENSG00000210077,ENST00000387342,Transcript,upstream_gene_variant,-,-,-,...,-,-,-,-,34002094354537883799872537881602,-,-,-,-,-


In [2]:
# Total variants
total_variants = len(df)
print("Total variants:", total_variants)

# Unique genes
unique_genes = df['Gene'].nunique()
print("Unique genes:", unique_genes)


Total variants: 4447820
Unique genes: 67066


In [4]:
# Extract traits from PHENO column
phenos = df['PHENO'].dropna().astype(str)
unique_traits = set()
for p in phenos:
    for trait in p.replace(';', ',').split(','):
        trait = trait.strip()
        if trait:
            unique_traits.add(trait)

num_unique_traits = len(unique_traits)
print("Unique traits/disease conditions:", num_unique_traits)


Unique traits/disease conditions: 3


In [5]:
df["CLIN_SIG"].value_counts()

-                                                                                             4433710
benign                                                                                           5510
likely_benign                                                                                    1601
uncertain_significance                                                                           1534
pathogenic                                                                                        915
                                                                                               ...   
pathogenic,pathogenic/likely_pathogenic,not_provided                                                8
likely_benign,uncertain_significance,conflicting_interpretations_of_pathogenicity                   8
risk_factor,affects,pathogenic,pathogenic/likely_pathogenic,likely_pathogenic,not_provided          2
not_provided,likely_benign                                                        

In [6]:
df_patho = df[df["CLIN_SIG"].str.contains("pathogenic", case=False, na=False)]
df_patho

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,...,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS
283,rs387906735,MT:608,G,ENSG00000198888,ENST00000361390,Transcript,upstream_gene_variant,-,-,-,...,-,"uncertain_significance,pathogenic",-,1,11231339,-,-,-,-,-
284,rs387906735,MT:608,G,ENSG00000198763,ENST00000361453,Transcript,upstream_gene_variant,-,-,-,...,-,"uncertain_significance,pathogenic",-,1,11231339,-,-,-,-,-
285,rs387906735,MT:608,G,ENSG00000209082,ENST00000386347,Transcript,upstream_gene_variant,-,-,-,...,-,"uncertain_significance,pathogenic",-,1,11231339,-,-,-,-,-
286,rs387906735,MT:608,G,ENSG00000210049,ENST00000387314,Transcript,non_coding_transcript_exon_variant,32,-,-,...,-,"uncertain_significance,pathogenic",-,1,11231339,-,-,-,-,-
287,rs387906735,MT:608,G,ENSG00000210077,ENST00000387342,Transcript,upstream_gene_variant,-,-,-,...,-,"uncertain_significance,pathogenic",-,1,11231339,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4301254,rs1555050,22:50627253,T,ENSG00000289244,ENST00000806467,Transcript,upstream_gene_variant,-,-,-,...,"gnomADe_AFR,gnomADe_AMR,gnomADe_ASJ,gnomADe_EA...",pathogenic,01,11,-,-,-,-,-,-
4301255,rs1555050,22:50627253,T,ENSG00000289244,ENST00000806468,Transcript,upstream_gene_variant,-,-,-,...,"gnomADe_AFR,gnomADe_AMR,gnomADe_ASJ,gnomADe_EA...",pathogenic,01,11,-,-,-,-,-,-
4301256,rs1555050,22:50627253,T,ENSG00000289244,ENST00000806469,Transcript,upstream_gene_variant,-,-,-,...,"gnomADe_AFR,gnomADe_AMR,gnomADe_ASJ,gnomADe_EA...",pathogenic,01,11,-,-,-,-,-,-
4301257,rs1555050,22:50627253,T,ENSG00000289244,ENST00000806470,Transcript,upstream_gene_variant,-,-,-,...,"gnomADe_AFR,gnomADe_AMR,gnomADe_ASJ,gnomADe_EA...",pathogenic,01,11,-,-,-,-,-,-


In [7]:
df_likely_patho = df[df["CLIN_SIG"].str.contains("likely_pathogenic", case=False, na=False)]
df_likely_patho

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,...,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS
2501,rs199474657,MT:3243,G,ENSG00000198888,ENST00000361390,Transcript,upstream_gene_variant,-,-,-,...,-,"likely_pathogenic,pathogenic,not_provided,path...",-,1,"25741868,33785019,16326995,26076356,20610441,2...",-,-,-,-,-
2502,rs199474657,MT:3243,G,ENSG00000198763,ENST00000361453,Transcript,upstream_gene_variant,-,-,-,...,-,"likely_pathogenic,pathogenic,not_provided,path...",-,1,"25741868,33785019,16326995,26076356,20610441,2...",-,-,-,-,-
2503,rs199474657,MT:3243,G,ENSG00000198804,ENST00000361624,Transcript,upstream_gene_variant,-,-,-,...,-,"likely_pathogenic,pathogenic,not_provided,path...",-,1,"25741868,33785019,16326995,26076356,20610441,2...",-,-,-,-,-
2504,rs199474657,MT:3243,G,ENSG00000198712,ENST00000361739,Transcript,upstream_gene_variant,-,-,-,...,-,"likely_pathogenic,pathogenic,not_provided,path...",-,1,"25741868,33785019,16326995,26076356,20610441,2...",-,-,-,-,-
2505,rs199474657,MT:3243,G,ENSG00000209082,ENST00000386347,Transcript,non_coding_transcript_exon_variant,14,-,-,...,-,"likely_pathogenic,pathogenic,not_provided,path...",-,1,"25741868,33785019,16326995,26076356,20610441,2...",-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3941093,rs886037859,19:1220449,G,ENSG00000118046,ENST00000714323,Transcript,missense_variant,1669,541,181,...,-,likely_pathogenic,0111,1111,25741868,-,-,-,-,-
3948139,rs121434498,19:4117551,C,ENSG00000126934,ENST00000262948,Transcript,missense_variant,418,171,57,...,gnomADe_ASJ,likely_pathogenic,01111,11111,2661901118456719164396211804226219156172,-,-,-,-,-
3948140,rs121434498,19:4117551,C,ENSG00000126934,ENST00000394867,Transcript,non_coding_transcript_exon_variant,610,-,-,...,gnomADe_ASJ,likely_pathogenic,01111,11111,2661901118456719164396211804226219156172,-,-,-,-,-
3948141,rs121434498,19:4117551,C,ENSG00000126934,ENST00000599345,Transcript,non_coding_transcript_exon_variant,368,-,-,...,gnomADe_ASJ,likely_pathogenic,01111,11111,2661901118456719164396211804226219156172,-,-,-,-,-


In [None]:
print("Variants with pathogenicity:", df_patho["Uploaded_variation"].nunique())

Variants with pathogenicity: 196


In [8]:
print("Variants with likely pathogenicity:", df_likely_patho["Uploaded_variation"].nunique())

Variants with likely pathogenicity: 73


In [1]:
#No. of variants per chromosome
df[['chr', 'position']] = df['Location'].str.split(':', expand=True)
df["chr"].value_counts()

NameError: name 'df' is not defined

In [9]:
# Identify rows that do NOT start with 'rs'
no_rs = df[~df['Uploaded_variation'].str.startswith('rs', na=False)]

# Optional: rows with rsIDs
with_rs = df[df['Uploaded_variation'].str.startswith('rs', na=False)]

# Print or save
print("Variants without rsIDs:")
no_rs["Uploaded_variation"].nunique()


: 

In [None]:
print("Variants with rsIDs:")
with_rs["Uploaded_variation"].nunique()