# Construction of variant dictionary
Author: Sonja Aits

In [None]:
import pandas as pd
import json
import requests

In [None]:
# Extract Pango terms from lineages file
#
# Downloads the Pango designation table, keeps a local copy as lineages.csv
# and loads it into `df` (the `taxon` and `lineage` columns are used below).

# URL of the CSV file
url = "https://raw.githubusercontent.com/cov-lineages/pango-designation/refs/heads/master/lineages.csv"

# Download the file. Abort on any HTTP error: carrying on would either fail
# later with an unrelated error or silently load a stale lineages.csv left
# over from an earlier run. The timeout keeps a stalled connection from
# hanging the kernel indefinitely.
response = requests.get(url, timeout=300)
response.raise_for_status()
with open("lineages.csv", "wb") as file:
    file.write(response.content)
print("Download complete: lineages.csv")

# Load CSV into a Pandas DataFrame
df = pd.read_csv("lineages.csv")
print("CSV file loaded into DataFrame")

# Display the first few rows
df.head()

Download complete: lineages.csv
CSV file loaded into DataFrame


Unnamed: 0,taxon,lineage
0,Belgium/UZA-UA-48355442/2023,XBZ
1,Norway/Ahus-4881/2023,XBZ
2,England/PHEP-YYG8X8X/2023,XBZ
3,Germany/RP-RKI-I-1082931/2022,XBZ
4,Germany/BY-RKI-I-1083943/2022,XBZ


In [None]:
# Distinct values of the 'lineage' column, in order of first appearance
unique_lineages = pd.unique(df['lineage'])
print("Unique lineages extracted", len(unique_lineages), sep="\n")

# Distinct values of the 'taxon' column, in order of first appearance
unique_taxa = pd.unique(df['taxon'])
print("Unique taxa extracted", len(unique_taxa), sep="\n")

Unique lineages extracted
4651
Unique taxa extracted
2607130


In [None]:
# Import Pango terms from alias file
#
# Collects every alias (JSON key) and every lineage name it points to
# (JSON value, either a single string or a list of strings) into `terms`.

# URL of the JSON file
url = "https://raw.githubusercontent.com/cov-lineages/pango-designation/master/pango_designation/alias_key.json"

# Fetch the JSON content; fail loudly on an HTTP error instead of trying to
# parse an error page, and do not wait forever on a stalled connection.
response = requests.get(url, timeout=60)
response.raise_for_status()
data = response.json()

# Initialize an empty set to store the terms
terms = set()

# Add all keys and values (handling multiple values)
for key, value in data.items():
    terms.add(key)
    # Normalise to a list so single values and lists share one code path
    targets = value if isinstance(value, list) else [value]
    # Skip empty values: an empty string is not a term and would later be
    # expanded into junk entries such as "variant " and " lineage".
    # NOTE(review): the alias file appears to map root lineages (e.g. "A",
    # "B") to "" - confirm against the current alias_key.json.
    terms.update(target for target in targets if target)

# Print the collected terms
print("Terms extracted")
print(len(terms))

Terms extracted
927


In [None]:
# Combine lineage terms and terms from json file
#
# Only non-empty strings are kept: a missing lineage (NaN) or an empty alias
# target would otherwise be expanded into meaningless entries like
# "variant nan" or "variant ". Sorting makes the order (and therefore the
# saved files) reproducible; iterating a set of strings directly gives a
# different order from one kernel session to the next.
unique_terms = sorted(
    term
    for term in set(unique_lineages).union(terms)
    if isinstance(term, str) and term
)
print(f"Length of unique terms: {len(unique_terms)}")

# Generate lexical variants: four phrasings per term, in a fixed order
lexical_variants = [
    phrase
    for lineage in unique_terms
    for phrase in (
        f"variant {lineage}",
        f"lineage {lineage}",
        f"{lineage} variant",
        f"{lineage} lineage",
    )
]

# Verify the number of variants generated (expected: 4 x number of terms)
print(f"Total lexical variants: {len(lexical_variants)}")


Length of unique terms: 5190
Total lexical variants: 20760
All terms combined
2633468
  terms
0   XBZ
1     A
2   A.1
3  A.11
4  A.12


In [None]:
# Extract nextstrain clade terms

# URL of the TSV file
url = "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/clades.tsv"

# Download and load the TSV file into a pandas DataFrame.
# A dedicated name is used so the lineage table in `df` is not overwritten;
# reusing `df` here made the earlier lineage-extraction cell fail with a
# KeyError on 'lineage' when re-run after this one.
clades_df = pd.read_csv(url, sep='\t')

# Extract unique clade names from the 'clade' column
unique_clades = clades_df['clade'].unique()

# Print the unique clades
print("Unique clades:")
print(unique_clades)
print(f"Total unique clades: {len(unique_clades)}")

# Generate lexical clade variants: two phrasings per clade, in a fixed order
clade_variants = []
for term in unique_clades:
    variants = [
        f"clade {term}",
        f"{term} clade"]
    clade_variants.extend(variants)
print(clade_variants)

Unique clades:
['19A' '19B' '20A' '20B' '20C' '20D' '20E' '20F' '20G' '20H' '20I' '20J'
 '21A' '21B' '21C' '21D' '21E' '21F' '21G' '21H' '21I' '21J' '21K' '21L'
 '21M' '22A' '22B' '22C' '22D' '22E' '22F' '23A' '23B' '23C' '23D' '23E'
 '23F' '23G' '23H' '23I' '24A' '24B' '24C' '24D' '24E' '24F' '24G' '24H'
 '24I' '25A']
Total unique clades: 50
['clade 19A', '19A clade', 'clade 19B', '19B clade', 'clade 20A', '20A clade', 'clade 20B', '20B clade', 'clade 20C', '20C clade', 'clade 20D', '20D clade', 'clade 20E', '20E clade', 'clade 20F', '20F clade', 'clade 20G', '20G clade', 'clade 20H', '20H clade', 'clade 20I', '20I clade', 'clade 20J', '20J clade', 'clade 21A', '21A clade', 'clade 21B', '21B clade', 'clade 21C', '21C clade', 'clade 21D', '21D clade', 'clade 21E', '21E clade', 'clade 21F', '21F clade', 'clade 21G', '21G clade', 'clade 21H', '21H clade', 'clade 21I', '21I clade', 'clade 21J', '21J clade', 'clade 21K', '21K clade', 'clade 21L', '21L clade', 'clade 21M', '21M clade', 'cla

In [None]:
# Build the update list by concatenating, in this order: Pango lineages,
# their lexical variants, taxon names, alias-file terms, Nextstrain clades
# and the clade lexical variants. Duplicates are not removed at this stage.
all_terms = [
    *unique_lineages,
    *lexical_variants,
    *unique_taxa,
    *terms,
    *unique_clades,
    *clade_variants,
]
print("All terms combined", len(all_terms), sep="\n")

# Write the update list to disk as a single-column CSV with a 'term' header
new_df = pd.DataFrame({'term': all_terms})
new_df.to_csv("variants_addition20250319.csv", index=False)

# Show the last few terms as a quick sanity check
print(new_df.tail())

All terms combined
2633618
              term
2633613  24H clade
2633614  clade 24I
2633615  24I clade
2633616  clade 25A
2633617  25A clade


In [None]:
# Extract original variant dictionary and merge it with the update list

# URL of the published variant file (raw version of the text file)
url = "https://raw.githubusercontent.com/Aitslab/corona/master/manuscript_v2/Supplemental_file3.txt"

# Download the file. Abort on any HTTP error so that a stale local copy from
# an earlier run is never merged by accident; the timeout prevents a stalled
# connection from hanging the kernel.
response = requests.get(url, timeout=60)
response.raise_for_status()
with open("Supplemental_file3.txt", "wb") as file:
    file.write(response.content)
print("Download complete: Supplemental_file3.txt")

# Load the text file into a pandas DataFrame (one term per line).
# keep_default_na=False keeps every entry as the literal string it is in the
# file instead of turning strings like "NA" or "null" into NaN.
# NOTE(review): read_csv still treats commas as field separators; this
# assumes no term in the file contains a comma - confirm.
original_df = pd.read_csv(
    "Supplemental_file3.txt",
    header=None,
    names=["term"],
    keep_default_na=False,
)
print("Original length of original variant list")
print(len(original_df))
print(original_df.tail())

# Combine original dataframe with update
merged_df = pd.concat([new_df, original_df], ignore_index=True)

# Remove duplicates (first occurrence wins) and renumber the rows so the
# index labels shown below match the length of the deduplicated list
merged_df_unique = merged_df.drop_duplicates(ignore_index=True)
print("Length of updated variant list")
print(len(merged_df_unique))

# Save as a plain one-term-per-line text file (no header, no index)
merged_df_unique.to_csv("variants_v2.txt", index=False, header=False)

# Display the last few terms
print(merged_df_unique.tail())

Download complete: Supplemental_file3.txt
Original length of original variant list
5330
            term
5325  GH/501Y.V2
5326  GR/501Y.V3
5327   G/478K.V1
5328  GR/452Q.V1
5329     GH/490R
Length of updated variant list
2633758
               term
2638943  GH/501Y.V2
2638944  GR/501Y.V3
2638945   G/478K.V1
2638946  GR/452Q.V1
2638947     GH/490R


In [None]:
# Spot check: confirm that selected recent lineage names made it into the
# update list.

# Define the list of terms to check
terms_to_check = ["JN.1", "JN.1.18", "KP.2", "KP.3", "KP.3.1.1", "LB.1", "XEC", "LP.8.1", "BA.2*"]

# Check which terms are in all_terms. The list is converted to a set once so
# each lookup is a hash probe rather than a scan over the whole list.
all_terms_lookup = set(all_terms)
present_terms = {term: term in all_terms_lookup for term in terms_to_check}

# Print results
for term, present in present_terms.items():
    print(f"{term}: {'Present' if present else 'Not Present'}")

JN.1: Present
JN.1.18: Present
KP.2: Present
KP.3: Present
KP.3.1.1: Present
LB.1: Present
XEC: Present
LP.8.1: Present
BA.2*: Present
