## Predict outputs for a DNA sequence

AlphaGenome is a model that makes predictions from DNA sequences. Let's load it
up:

In [None]:
# @title Install AlphaGenome

# @markdown Run this cell to install AlphaGenome.
from IPython.display import clear_output
! pip install alphagenome
clear_output()

In [None]:
# @title Setup and imports.

from io import StringIO
from alphagenome import colab_utils
from alphagenome.data import genome
from alphagenome.models import dna_client, variant_scorers
from google.colab import data_table, files
import pandas as pd
from tqdm import tqdm

# Turn on Colab's data_table formatter for DataFrames displayed in this
# notebook.
data_table.enable_dataframe_formatter()

# Load the model.
# The client is created from the API key returned by colab_utils.get_api_key()
# and kept in `dna_model`, which the scoring cell below uses.
dna_model = dna_client.create(colab_utils.get_api_key())

In [None]:
# @title Score batch of variants.

# This cell reads a tab-separated variant table, builds the list of variant
# scorers selected in the form, drops the ones that are not supported for the
# chosen organism, scores every variant with `dna_model`, and finally shows
# (and optionally downloads) the tidy score table.

# Load VCF file containing variants.
# Leave the field as 'placeholder' (or empty) to score the example variants
# below; otherwise give a path or URL to a tab-separated file that has the
# required columns.
vcf_file = 'placeholder'  # @param

# We provide an example list of variants to illustrate.
example_vcf = """variant_id\tCHROM\tPOS\tREF\tALT
chr3_58394738_A_T_b38\tchr3\t58394738\tA\tT
chr8_28520_G_C_b38\tchr8\t28520\tG\tC
chr16_636337_G_A_b38\tchr16\t636337\tG\tA
chr16_1135446_G_T_b38\tchr16\t1135446\tG\tT
"""

# Only fall back to the example when no file was supplied, so that the form
# field above is honoured instead of being silently overwritten.
if vcf_file in ('', 'placeholder'):
  vcf_source = StringIO(example_vcf)
else:
  vcf_source = vcf_file

vcf = pd.read_csv(vcf_source, sep='\t')

# Fail early, with a clear message, if the table cannot be turned into variants.
required_columns = ['variant_id', 'CHROM', 'POS', 'REF', 'ALT']
for column in required_columns:
  if column not in vcf.columns:
    raise ValueError(f'VCF file is missing required column: {column}.')

organism = 'human'  # @param ["human", "mouse"] {type:"string"}

# @markdown Specify length of sequence around variants to predict:
sequence_length = '1MB'  # @param ["16KB", "100KB", "500KB", "1MB"] { type:"string" }
# Map the chosen label to the matching entry of SUPPORTED_SEQUENCE_LENGTHS.
sequence_length = dna_client.SUPPORTED_SEQUENCE_LENGTHS[
    f'SEQUENCE_LENGTH_{sequence_length}'
]
# Scoring targets differ: each scorer covers a different molecular biology
# activity.
# @markdown Specify which scorers to use to score your variants:
score_rna_seq = True  # @param { type: "boolean"}
score_cage = True  # @param { type: "boolean" }
score_procap = True  # @param { type: "boolean" }
score_atac = True  # @param { type: "boolean" }
score_dnase = True  # @param { type: "boolean" }
score_chip_histone = True  # @param { type: "boolean" }
score_chip_tf = True  # @param { type: "boolean" }
score_polyadenylation = True  # @param { type: "boolean" }
score_splice_sites = True  # @param { type: "boolean" }
score_splice_site_usage = True  # @param { type: "boolean" }
score_splice_junctions = True  # @param { type: "boolean" }

# @markdown Other settings:
download_predictions = True  # @param { type: "boolean" }

# Parse organism specification.
organism_map = {
    'human': dna_client.Organism.HOMO_SAPIENS,
    'mouse': dna_client.Organism.MUS_MUSCULUS,
}
organism = organism_map[organism]

# Parse scorer specification.
scorer_selections = {
    'rna_seq': score_rna_seq,
    'cage': score_cage,
    'procap': score_procap,
    'atac': score_atac,
    'dnase': score_dnase,
    'chip_histone': score_chip_histone,
    'chip_tf': score_chip_tf,
    'polyadenylation': score_polyadenylation,
    'splice_sites': score_splice_sites,
    'splice_site_usage': score_splice_site_usage,
    'splice_junctions': score_splice_junctions,
}

# Keep the recommended scorers whose lower-cased key is ticked in the form;
# keys without a matching checkbox are treated as not selected.
all_scorers = variant_scorers.RECOMMENDED_VARIANT_SCORERS
selected_scorers = [
    all_scorers[key]
    for key in all_scorers
    if scorer_selections.get(key.lower(), False)
]

# Remove any scorers or output types that are not supported for the chosen organism.
# A scorer is unsupported when the organism is missing from its entry in
# SUPPORTED_ORGANISMS, or when it requests PROCAP output for mouse. Logical
# `or` / `and` are used (not bitwise `|` / `&`) so the second test is only
# evaluated when the first one did not already decide the outcome.
unsupported_scorers = [
    scorer
    for scorer in selected_scorers
    if (
        organism.value
        not in variant_scorers.SUPPORTED_ORGANISMS[scorer.base_variant_scorer]
    )
    or (
        scorer.requested_output == dna_client.OutputType.PROCAP
        and organism == dna_client.Organism.MUS_MUSCULUS
    )
]
if unsupported_scorers:
  print(
      f'Excluding {unsupported_scorers} scorers as they are not supported for'
      f' {organism}.'
  )
  for unsupported_scorer in unsupported_scorers:
    selected_scorers.remove(unsupported_scorer)


# Score variants in the VCF file.
results = []

# The row index is not needed, only the row contents.
for _, vcf_row in tqdm(vcf.iterrows(), total=len(vcf)):
  variant = genome.Variant(
      chromosome=str(vcf_row.CHROM),
      position=int(vcf_row.POS),
      reference_bases=vcf_row.REF,
      alternate_bases=vcf_row.ALT,
      name=vcf_row.variant_id,
  )
  # Resize the variant's reference interval to the requested sequence length.
  interval = variant.reference_interval.resize(sequence_length)

  variant_scores = dna_model.score_variant(
      interval=interval,
      variant=variant,
      variant_scorers=selected_scorers,
      organism=organism,
  )
  results.append(variant_scores)

# Collapse the per-variant results into a single DataFrame.
df_scores = variant_scorers.tidy_scores(results)

# Optionally write the scores to CSV and trigger a browser download.
if download_predictions:
  df_scores.to_csv('variant_scores.csv', index=False)
  files.download('variant_scores.csv')

df_scores

100%|██████████| 4/4 [00:05<00:00,  1.32s/it]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



Unnamed: 0,variant_id,scored_interval,gene_id,gene_name,gene_type,gene_strand,junction_Start,junction_End,output_type,variant_scorer,...,biosample_type,biosample_life_stage,data_source,endedness,genetically_modified,transcription_factor,histone_mark,gtex_tissue,raw_score,quantile_score
0,chr3:58394738:A>T,chr3:57870450-58919026:.,,,,,,,ATAC,"CenterMaskScorer(requested_output=ATAC, width=...",...,primary_cell,adult,encode,paired,False,,,,-0.004951,-0.259272
1,chr3:58394738:A>T,chr3:57870450-58919026:.,,,,,,,ATAC,"CenterMaskScorer(requested_output=ATAC, width=...",...,in_vitro_differentiated_cells,adult,encode,paired,False,,,,0.015554,0.343112
2,chr3:58394738:A>T,chr3:57870450-58919026:.,,,,,,,ATAC,"CenterMaskScorer(requested_output=ATAC, width=...",...,primary_cell,adult,encode,paired,False,,,,-0.001731,-0.137554
3,chr3:58394738:A>T,chr3:57870450-58919026:.,,,,,,,ATAC,"CenterMaskScorer(requested_output=ATAC, width=...",...,primary_cell,adult,encode,paired,False,,,,-0.009523,-0.449849
4,chr3:58394738:A>T,chr3:57870450-58919026:.,,,,,,,ATAC,"CenterMaskScorer(requested_output=ATAC, width=...",...,primary_cell,adult,encode,paired,False,,,,-0.009647,-0.421820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118281,chr16:1135446:G>T,chr16:611158-1659734:.,ENSG00000292423,ENSG00000292423,lncRNA,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,tissue,adult,gtex,paired,False,,,Skin_Not_Sun_Exposed_Suprapubic,-0.000437,-0.312224
118282,chr16:1135446:G>T,chr16:611158-1659734:.,ENSG00000292430,ENSG00000292430,lncRNA,+,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,tissue,adult,gtex,paired,False,,,Skin_Not_Sun_Exposed_Suprapubic,-0.001305,-0.641886
118283,chr16:1135446:G>T,chr16:611158-1659734:.,ENSG00000292431,ENSG00000292431,lncRNA,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,tissue,adult,gtex,paired,False,,,Skin_Not_Sun_Exposed_Suprapubic,0.000015,0.023068
118284,chr16:1135446:G>T,chr16:611158-1659734:.,ENSG00000292432,ENSG00000292432,lncRNA,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,tissue,adult,gtex,paired,False,,,Skin_Not_Sun_Exposed_Suprapubic,-0.000606,-0.402670




Unnamed: 0,variant_id,scored_interval,gene_id,gene_name,gene_type,gene_strand,junction_Start,junction_End,output_type,variant_scorer,...,biosample_type,biosample_life_stage,data_source,endedness,genetically_modified,transcription_factor,histone_mark,gtex_tissue,raw_score,quantile_score
0,chr3:58394738:A>T,chr3:57870450-58919026:.,,,,,,,ATAC,"CenterMaskScorer(requested_output=ATAC, width=...",...,primary_cell,adult,encode,paired,False,,,,-0.004951,-0.259272
168,chr3:58394738:A>T,chr3:57870450-58919026:.,,,,,,,DNASE,"CenterMaskScorer(requested_output=DNASE, width...",...,primary_cell,adult,encode,paired,False,,,,-0.030459,-0.680793
2109,chr3:58394738:A>T,chr3:57870450-58919026:.,,,,,,,CHIP_HISTONE,CenterMaskScorer(requested_output=CHIP_HISTONE...,...,primary_cell,adult,encode,single,False,,H3K27ac,,-0.000978,-0.301775
2110,chr3:58394738:A>T,chr3:57870450-58919026:.,,,,,,,CHIP_HISTONE,CenterMaskScorer(requested_output=CHIP_HISTONE...,...,primary_cell,adult,encode,single,False,,H3K27me3,,0.000441,0.080577
2111,chr3:58394738:A>T,chr3:57870450-58919026:.,,,,,,,CHIP_HISTONE,CenterMaskScorer(requested_output=CHIP_HISTONE...,...,primary_cell,adult,encode,single,False,,H3K36me3,,0.000000,-0.092027
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91363,chr16:1135446:G>T,chr16:611158-1659734:.,ENSG00000290756,ENSG00000290756,lncRNA,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,primary_cell,adult,encode,single,False,,,,-0.000391,-0.402670
91364,chr16:1135446:G>T,chr16:611158-1659734:.,ENSG00000292400,ENSG00000292400,lncRNA,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,primary_cell,adult,encode,single,False,,,,0.000173,0.280661
91365,chr16:1135446:G>T,chr16:611158-1659734:.,ENSG00000292423,ENSG00000292423,lncRNA,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,primary_cell,adult,encode,single,False,,,,-0.001031,-0.641886
91366,chr16:1135446:G>T,chr16:611158-1659734:.,ENSG00000292431,ENSG00000292431,lncRNA,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,primary_cell,adult,encode,single,False,,,,0.000020,0.103452
