In [1]:
# STEP 1: Install ANNOVAR
# ===============================
!wget -O annovar.tar.gz 'https://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz'
!tar -xzvf annovar.tar.gz
!rm annovar.tar.gz

--2025-08-10 15:20:41--  https://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz
Resolving www.openbioinformatics.org (www.openbioinformatics.org)... 67.205.156.247
Connecting to www.openbioinformatics.org (www.openbioinformatics.org)|67.205.156.247|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 280727629 (268M) [application/x-gzip]
Saving to: ‘annovar.tar.gz’


2025-08-10 15:21:02 (13.7 MB/s) - ‘annovar.tar.gz’ saved [280727629/280727629]

annovar/
annovar/example/
annovar/example/ex1.avinput
annovar/example/example.simple_region
annovar/example/example.tab_region
annovar/example/ex2.vcf
annovar/example/grantham.matrix
annovar/example/snplist.txt
annovar/example/README
annovar/example/gene_xref.txt
annovar/example/gene_fullxref.txt
annovar/humandb/
annovar/humandb/hg19_refGene.txt
annovar/humandb/hg19_refGeneMrna.fa
annovar/humandb/hg19_refGeneVersion.txt
annovar/humandb/hg19_refGeneWithVer.txt
annovar/humandb/hg19_refGeneWithVer

In [2]:
# STEP 2: Download required databases for hg38
# ===============================
%cd /content/annovar

# Core gene annotation
!perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar refGene humandb/

# ClinVar
!perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar clinvar_20221231 humandb/

# dbSNP
!perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar avsnp150 humandb/

# gnomAD genomes
!perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar gnomad211_genome humandb/

# dbNSFP functional prediction
!perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar dbnsfp42a humandb/


/content/annovar
NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGene.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGeneMrna.fa.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGeneVersion.txt.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished downloading annotation files for hg38 build version, with files saved at the 'humandb' directory
NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_clinvar_20221231.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_clinvar_20221231.txt.idx.gz ... OK
NOTICE: Uncompressing downloaded files
N

In [3]:
from google.colab import files

import pandas as pd
import os

# files.upload() saves each file into the current working directory. Step 2
# leaves the kernel in /content/annovar, but the conversion cell reads the CSV
# from /content, so switch there for the upload only and then restore the
# previous directory (later cells rely on it).
previous_dir = os.getcwd()
os.chdir("/content")
try:
    uploaded = files.upload()
finally:
    os.chdir(previous_dir)

# A cancelled upload returns an empty mapping; fail with a clear message
# instead of an IndexError on the lookup below.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose the variant CSV.")

# Name of the first uploaded file (the mapping is keyed by file name).
uploaded_file = next(iter(uploaded))
print(f"Uploaded: {uploaded_file}")

Saving SRR766039_raw.csv to SRR766039_raw.csv
Uploaded: SRR766039_raw.csv


In [15]:
import pandas as pd

# Load CSV
df = pd.read_csv("/content/SRR766039_raw.csv")

# Ensure chromosome names match hg38 format (with 'chr' prefix)
df['#CHROM'] = df['#CHROM'].astype(str)
df['#CHROM'] = df['#CHROM'].apply(lambda x: x if x.startswith("chr") else "chr" + x)

# For AVINPUT: Start, End calculation
start_list, end_list = [], []
for ref, alt, pos in zip(df['REF'], df['ALT'], df['POS']):
    if len(ref) == 1 and len(alt) == 1:
        start, end = pos, pos
    else:
        start, end = pos, pos + len(ref) - 1
    start_list.append(start)
    end_list.append(end)

# Create avinput DataFrame
df_avinput = pd.DataFrame({
    'Chr': df['#CHROM'],
    'Start': start_list,
    'End': end_list,
    'Ref': df['REF'],
    'Alt': df['ALT']
})

# Save
avinput_path = "/content/input.avinput"
df_avinput.to_csv(avinput_path, sep="\t", index=False, header=False)
print(f"✅ Saved AVINPUT to {avinput_path}")


✅ Saved AVINPUT to /content/input.avinput


In [16]:
!cd /content/annovar

!perl table_annovar.pl /content/input.avinput humandb/ \
   -buildver hg38 \
   -out output_annotation \
   -remove \
   -protocol refGene,clinvar_20221231,avsnp150,gnomad211_genome \
   -operation g,f,f,f \
   -nastring .


NOTICE: the --polish argument is set ON automatically (use --nopolish to change this behavior)
-----------------------------------------------------------------
NOTICE: Processing operation=g protocol=refGene

NOTICE: Running with system command <annotate_variation.pl -geneanno -buildver hg38 -dbtype refGene -outfile output_annotation.refGene -exonsort -nofirstcodondel /content/input.avinput humandb/>
NOTICE: Output files are written to output_annotation.refGene.variant_function, output_annotation.refGene.exonic_variant_function
NOTICE: Reading gene annotation from humandb/hg38_refGene.txt ... Done with 88819 transcripts (including 21511 without coding sequence annotation) for 28307 unique genes
NOTICE: Processing next batch with 586939 unique variants in 586939 input lines
NOTICE: Reading FASTA sequences from humandb/hg38_refGeneMrna.fa ... Done with 26838 sequences
NOTICE: Variants with invalid input format are written to output_annotation.refGene.invalid_input

NOTICE: Running with 

In [17]:
!ls -lh /content/annovar/output_annotation.hg38_multianno.txt
!head /content/annovar/output_annotation.hg38_multianno.txt


-rw-r--r-- 1 root root 100M Aug 10 16:38 /content/annovar/output_annotation.hg38_multianno.txt
Chr	Start	End	Ref	Alt	Func.refGene	Gene.refGene	GeneDetail.refGene	ExonicFunc.refGene	AAChange.refGene	CLNALLELEID	CLNDN	CLNDISDB	CLNREVSTAT	CLNSIG	avsnp150	AF	AF_popmax	AF_male	AF_female	AF_raw	AF_afr	AF_sas	AF_amr	AF_eas	AF_nfe	AF_fin	AF_asj	AF_oth	non_topmed_AF_popmax	non_neuro_AF_popmax	non_cancer_AF_popmax	controls_AF_popmax
chr1	10150	10150	C	T	intergenic	NONE,DDX11L1	dist=NONE;dist=1724	.	.	.	.	.	.	.	rs371194064	0.0044	0.0041	0.0036	0.0056	0.0005	0	.	0	0	0.0041	0.0098	0	0.025	0.0030	0.0052	.	0.0068
chr1	10321	10321	C	T	intergenic	NONE,DDX11L1	dist=NONE;dist=1553	.	.	.	.	.	.	.	rs1002315756	0.0010	0.0015	0.0014	0.0006	0.0017	0.0015	.	0	0	0.0011	0	0	0	0.0015	0.0012	.	0.0015
chr1	14653	14653	C	T	ncRNA_exonic	WASH7P	.	.	.	.	.	.	.	.	rs62635297	0.1989	0.2740	0.1971	0.2010	0.2703	0.1490	.	0.176	0.2740	0.2496	0.1144	0.2256	0.1618	0.2739	0.2740	.	0.2786
chr1	14677	14677	G	A	ncRNA_exonic	WASH7P	.

In [18]:
import pandas as pd
from google.colab import files

# Destination of the comma-separated copy
csv_path = '/content/output_annotation.csv'

# Load the tab-delimited annotation table written by table_annovar.pl
df = pd.read_csv(
    '/content/annovar/output_annotation.hg38_multianno.txt',
    delimiter='\t',
)

# Re-save as CSV without the index column, then send it to the browser
df.to_csv(csv_path, index=False)
files.download(csv_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>