In [16]:
# Install wget if needed
!apt-get install wget -y

# Create a working directory
!mkdir -p /content/annovar

# Move into that directory
%cd /content/annovar

# Download ANNOVAR using your personal link (replace with your own if needed)
!wget http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz

# Extract it
!tar -zxvf annovar.latest.tar.gz

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
wget is already the newest version (1.21.2-2ubuntu1.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
/content/annovar
--2025-08-04 06:55:55--  http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz
Resolving www.openbioinformatics.org (www.openbioinformatics.org)... 67.205.156.247
Connecting to www.openbioinformatics.org (www.openbioinformatics.org)|67.205.156.247|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 280727629 (268M) [application/x-gzip]
Saving to: ‘annovar.latest.tar.gz.1’


2025-08-04 06:55:59 (59.4 MB/s) - ‘annovar.latest.tar.gz.1’ saved [280727629/280727629]

annovar/
annovar/example/
annovar/example/ex1.avinput
annovar/example/example.simple_region
annovar/example/example.tab_region
annovar/example/ex2.vcf
annovar/example/grantham.matrix
annovar/example/snplist.txt
annovar/example/README
annovar/example

In [17]:
# Sanity check: list the extracted annovar/ directory to confirm the Perl scripts are there
!ls annovar

annotate_variation.pl  humandb			     retrieve_seq_from_fasta.pl
coding_change.pl       input.avinput		     table_annovar.pl
convert2annovar.pl     output.hg19_multianno.csv     variants_reduction.pl
example		       output.refGene.invalid_input


In [18]:
# Smoke test: invoked without arguments, annotate_variation.pl prints its usage
# summary, which confirms that Perl can locate and execute the script.
!perl annovar/annotate_variation.pl

Usage:
     annotate_variation.pl [arguments] <query-file|table-name> <database-location>

     Optional arguments:
            -h, --help                      print help message
            -m, --man                       print complete documentation
            -v, --verbose                   use verbose output
        
            Arguments to download databases or perform annotations
                --downdb                    download annotation database
                --geneanno                  annotate variants by gene-based annotation (infer functional consequence on genes)
                --regionanno                annotate variants by region-based annotation (find overlapped regions in database)
                --filter                    annotate variants by filter-based annotation (find identical variants in database)
        
            Arguments to control input and output
                --outfile <file>            output file prefix
                --webfrom <string

In [19]:
import pandas as pd

# Convert a CHROM/POS/REF/ALT variant table into ANNOVAR's 5-column avinput
# format (chromosome, start, end, ref, alt; tab-separated, no header).

# Step 1: Read the CSV
df = pd.read_csv("/content/coding_variants.csv")  # Change this if your filename is different
print("✅ CSV loaded successfully.")
print("📋 Columns:", df.columns.tolist())

# Step 2: Normalize column names (strip spaces and convert to uppercase)
df.columns = df.columns.str.strip().str.upper()
print("📋 Normalized Columns:", df.columns.tolist())

# Step 3: Check if required columns exist
required_cols = {'CHROM', 'POS', 'REF', 'ALT'}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"❌ Missing columns: {missing}")

# Step 4: Create START and END columns
# ANNOVAR expects END = START + len(REF) - 1. Using END = POS for every row is
# only correct for single-base REF alleles; a multi-base REF (deletion / MNV)
# would otherwise get a span that disagrees with its REF string.
# A missing REF falls back to length 1, i.e. END == POS as before.
ref_len = df['REF'].astype('string').str.len().fillna(1).astype('int64')
df['START'] = df['POS']
df['END'] = df['POS'] + ref_len - 1

# Step 5: Reorder for AVinput format
avinput_df = df[['CHROM', 'START', 'END', 'REF', 'ALT']]

# Preview the first few rows
print("\n📄 Preview of AVinput-formatted data:")
print(avinput_df.head())

# Step 6: Save to file (relative path: lands in the current working directory)
avinput_df.to_csv("input.avinput", sep='\t', index=False, header=False)
print("\n✅ Saved to 'input.avinput'")


✅ CSV loaded successfully.
📋 Columns: ['CHROM', 'POS', 'REF', 'ALT']
📋 Normalized Columns: ['CHROM', 'POS', 'REF', 'ALT']

📄 Preview of AVinput-formatted data:
  CHROM    START      END REF ALT
0  chr1  2488173  2488173   T   G
1  chr1  3703383  3703383   T   C
2  chr1  3852235  3852235   A   G
3  chr1  4914953  4914953   C   G
4  chr1  5707004  5707004   T   A

✅ Saved to 'input.avinput'


In [39]:
from google.colab import files

# Use the absolute path of the file written by the conversion cell (which runs
# with /content/annovar as the working directory). A bare 'input.avinput'
# resolves against whatever directory the kernel is in when this cell runs,
# so running it after a later %cd would pick up a different file.
files.download('/content/annovar/input.avinput')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
%cd annovar/


/content/annovar/annovar


In [22]:
# Download the hg38 refGene gene-annotation database from the ANNOVAR mirror
# (-webfrom annovar) into humandb/. Must run from the ANNOVAR script directory.
!perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar refGene humandb/


NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGene.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGeneMrna.fa.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGeneVersion.txt.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished downloading annotation files for hg38 build version, with files saved at the 'humandb' directory


In [23]:
# Download the hg38 filter-based annotation databases into humandb/.

# ClinVar annotations (disease relevance)
!perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar clinvar_20220320 humandb/

# dbNSFP (functional predictions like SIFT, PolyPhen)
!perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar dbnsfp42a humandb/

# gnomAD population allele frequency
# NOTE(review): gnomad211_genome is not among the protocols passed to
# table_annovar.pl later in this notebook, and humandb/ is emptied by a later
# cell before the annotation run — confirm this download is still needed.
!perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar gnomad211_genome humandb/


NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_clinvar_20220320.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_clinvar_20220320.txt.idx.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished downloading annotation files for hg38 build version, with files saved at the 'humandb' directory
NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_dbnsfp42a.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_dbnsfp42a.txt.idx.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished downloading annotation files for hg38 build version, with files saved at the 'humandb' directory
NOTICE: Web-based checki

In [32]:
!rm -rf humandb/*
!perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar refGene humandb/
!perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar clinvar_20220320 humandb/
!perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar dbnsfp42a humandb/


NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGene.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGeneMrna.fa.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGeneVersion.txt.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished downloading annotation files for hg38 build version, with files saved at the 'humandb' directory
NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_clinvar_20220320.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_clinvar_20220320.txt.idx.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished d

In [35]:
!perl table_annovar.pl input.avinput humandb/ -buildver hg38 \
 -out output -remove -protocol refGene,clinvar_20220320,dbnsfp42a \
 -operation g,f,f -nastring .


NOTICE: the --polish argument is set ON automatically (use --nopolish to change this behavior)
-----------------------------------------------------------------
NOTICE: Processing operation=g protocol=refGene

NOTICE: Running with system command <annotate_variation.pl -geneanno -buildver hg38 -dbtype refGene -outfile output.refGene -exonsort -nofirstcodondel input.avinput humandb/>
NOTICE: Output files are written to output.refGene.variant_function, output.refGene.exonic_variant_function
NOTICE: Reading gene annotation from humandb/hg38_refGene.txt ... Done with 88819 transcripts (including 21511 without coding sequence annotation) for 28307 unique genes
NOTICE: Processing next batch with 5119 unique variants in 5119 input lines
NOTICE: Reading FASTA sequences from humandb/hg38_refGeneMrna.fa ... Done with 168 sequences
NOTICE: Variants with invalid input format are written to output.refGene.invalid_input

NOTICE: Running with system command <coding_change.pl  output.refGene.exonic_var

In [38]:
# Peek at the first lines of the combined annotation table produced by
# table_annovar.pl (tab-delimited; the header row is very wide).
!head output.hg38_multianno.txt


Chr	Start	End	Ref	Alt	Func.refGene	Gene.refGene	GeneDetail.refGene	ExonicFunc.refGene	AAChange.refGene	CLNALLELEID	CLNDN	CLNDISDB	CLNREVSTAT	CLNSIG	SIFT_score	SIFT_converted_rankscore	SIFT_pred	SIFT4G_score	SIFT4G_converted_rankscore	SIFT4G_pred	Polyphen2_HDIV_score	Polyphen2_HDIV_rankscore	Polyphen2_HDIV_pred	Polyphen2_HVAR_score	Polyphen2_HVAR_rankscore	Polyphen2_HVAR_pred	LRT_score	LRT_converted_rankscore	LRT_pred	MutationTaster_score	MutationTaster_converted_rankscore	MutationTaster_pred	MutationAssessor_score	MutationAssessor_rankscore	MutationAssessor_pred	FATHMM_score	FATHMM_converted_rankscore	FATHMM_pred	PROVEAN_score	PROVEAN_converted_rankscore	PROVEAN_pred	VEST4_score	VEST4_rankscore	MetaSVM_score	MetaSVM_rankscore	MetaSVM_pred	MetaLR_score	MetaLR_rankscore	MetaLR_pred	MetaRNN_score	MetaRNN_rankscore	MetaRNN_pred	M-CAP_score	M-CAP_rankscore	M-CAP_pred	REVEL_score	REVEL_rankscore	MutPred_score	MutPred_rankscore	MVP_score	MVP_rankscore	MPC_score	MPC_rankscore	PrimateAI_score	P