For each sample, count how many sites have at least one improving mutation (delta_log_likelihood > 0) and the total number of positive mutations.

In [2]:
import pandas as pd

# Read the tab-separated table of per-mutation scores.
df = pd.read_csv("Thera_point_mutation.csv", sep="\t")


def _summarize_sample(sample_name):
    """Build the summary record for one sample.

    Only rows with ``delta_log_likelihood > 0`` are considered. The record
    holds the number of distinct (chain, pos) sites among those rows, the
    number of such rows, and the mean over sites of the per-site maximum
    ``delta_log_likelihood``.
    """
    sample_rows = df.loc[df["sample"] == sample_name]
    gains = sample_rows.loc[sample_rows["delta_log_likelihood"] > 0]

    # Largest positive delta at each (chain, pos) site.
    best_at_site = gains.groupby(["chain", "pos"])["delta_log_likelihood"].max()

    return {
        "sample": sample_name,
        "n_positions_with_positive_delta": len(gains[["chain", "pos"]].drop_duplicates()),
        "n_total_positive_mutations": len(gains),
        "mean_best_positive_delta": best_at_site.mean(),
    }


# One record per sample, in order of first appearance in the table.
summary = [_summarize_sample(name) for name in df["sample"].unique()]

summary_df = pd.DataFrame(summary)
summary_df.to_csv("Thera_point_mutation_summary.csv", sep="\t", index=False)
print("Wrote Thera_point_mutation_summary.csv with summary statistics.")


Wrote Thera_point_mutation_summary.csv with summary statistics.


In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the tab-separated per-mutation results table.
# NOTE(review): this cell filters on an `imgt_pos` column, while a later cell in
# this notebook prints a column list containing `kabat_pos` instead — confirm
# which column the current CSV actually has.
df = pd.read_csv("/home/eva/0_point_mutation/Thera_point_mutation.csv", sep="\t")

# (chain, position label) pairs to plot, one panel per pair.
sites = [('VH', 'H103'), ('VH', 'H104'), ('VH', 'H68')]

sns.set_theme(style="ticks")

n_sites = len(sites)
ncols = 2
# Ceiling division; keep at least one row so an empty `sites` list still
# produces a valid (blank) figure instead of failing on a zero-row grid.
nrows = max(1, (n_sites + ncols - 1) // ncols)

# squeeze=False always returns a 2-D array of axes, so flatten() is valid for
# every grid size.
fig, axes = plt.subplots(
    nrows=nrows, ncols=ncols, figsize=(12, nrows * 4), squeeze=False
)
axes = axes.flatten()

for ax, (chain, pos) in zip(axes, sites):
    site_df = df[(df["chain"] == chain) & (df["imgt_pos"] == pos)]
    if site_df.empty:
        ax.set_title(f"{chain} {pos} (no data)")
        continue

    # Violin per mutant amino acid.
    # NOTE(review): newer seaborn releases rename `scale` to `density_norm`
    # and warn about `palette` without `hue` — confirm against the installed
    # seaborn version before changing.
    sns.violinplot(
        data=site_df,
        x="mt",
        y="mut_log_likelihood",
        palette="vlag",
        inner=None,
        scale="width",
        cut=0,
        ax=ax
    )

    # Individual observations drawn on top of the violins.
    sns.stripplot(
        data=site_df,
        x="mt",
        y="mut_log_likelihood",
        color="k",
        size=1,
        alpha=0.8,
        jitter=True,
        ax=ax
    )

    ax.axhline(0, linestyle="--", color="gray")
    ax.set_title(f"{chain} position {pos} mutation scan")
    ax.set_xlabel("Mutant amino acid")
    ax.set_ylabel("mut_log_likelihood")

# Panels beyond the last site (e.g. the 4th slot of a 2x2 grid holding 3
# sites) would otherwise be saved as empty framed axes.
for unused_ax in axes[n_sites:]:
    unused_ax.set_visible(False)

# Operate on the figure created above rather than pyplot's implicit
# "current figure".
sns.despine(fig=fig)
fig.tight_layout()
fig.savefig("multi_site_mutscan_violin.png", dpi=300)
plt.close(fig)
print("Wrote multi_site_mutscan_violin.png")


Wrote multi_site_mutscan_violin.png


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# load results
df = pd.read_csv("/home/eva/0_point_mutation/Thera_point_mutation.csv", sep="\t")

# Display the rows whose mutant log-likelihood is missing (NaN).
# The original line lacked the leading `df[`, leaving an unbalanced `]`.
df[df['mut_log_likelihood'].isna()]

Unnamed: 0,chain,pos,wt,mt,delta_log_likelihood,mut_log_likelihood,wt_log_likelihood,imgt_pos,mutation_label,sample
2380,VL,1,D,A,4.629721,,,L1,D-L1-A,Abagovomab
2381,VL,1,D,C,7.897365,,,L1,D-L1-C,Abagovomab
2382,VL,1,D,D,0.000000,,,L1,D-L1-D,Abagovomab
2383,VL,1,D,E,-0.464841,,,L1,D-L1-E,Abagovomab
2384,VL,1,D,F,2.606165,,,L1,D-L1-F,Abagovomab
...,...,...,...,...,...,...,...,...,...,...
5092155,VL,106,I,S,0.037730,,,L126,I-L126-S,Zubotamig
5092156,VL,106,I,T,3.459853,,,L126,I-L126-T,Zubotamig
5092157,VL,106,I,V,-1.794743,,,L126,I-L126-V,Zubotamig
5092158,VL,106,I,W,-1.554804,,,L126,I-L126-W,Zubotamig


: 

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import re

# Read the tab-separated mutation table.
df = pd.read_csv("/home/eva/0_point_mutation/Thera_point_mutation.csv", sep="\t")

column_names = df.columns.tolist()
print("Columns in dataframe:", column_names)

sns.set_theme(style="ticks")

# One row per distinct (chain, kabat_pos) combination, keeping the first
# occurrence of each.
unique_sites = df.loc[:, ["chain", "kabat_pos"]].drop_duplicates()

# extract real chain and number
def parse_site(row):
    """Split a row's ``kabat_pos`` label into its components.

    Parameters
    ----------
    row : mapping-like
        Anything indexable with ``row["kabat_pos"]`` (e.g. a pandas Series
        or a dict). Only that entry is read.

    Returns
    -------
    tuple
        ``(chain_code, pos_num, pos_letter)`` where ``chain_code`` is ``"H"``
        or ``"L"``, ``pos_num`` is the integer position and ``pos_letter`` is
        the optional single trailing letter (``""`` when absent), e.g.
        ``"H100A" -> ("H", 100, "A")``. Labels that do not start with
        ``H``/``L`` followed by digits yield the fallback ``("pos", 0, "")``.
    """
    kabat_pos = str(row["kabat_pos"])
    # Anchored at the start only: anything after the optional letter is ignored.
    m = re.match(r"([HL])(\d+)([A-Za-z]?)", kabat_pos)
    if m is None:
        # fallback
        return "pos", 0, ""
    # Group 3 always participates in the match, so it is "" (never None)
    # when no letter is present.
    chain_code, digits, pos_letter = m.groups()
    return chain_code, int(digits), pos_letter
    
def sort_key(row):
    """Return the tuple used to order a site row.

    The tuple is ``(group_rank, pos_num, pos_letter)`` built from
    ``parse_site(row)``: rank 0 for chain code "H", 1 for "L", 3 for any
    other code. The "pos" code maps to the fixed key ``(2, 0, "")``.
    """
    chain_code, pos_num, pos_letter = parse_site(row)

    # The "pos" code gets a constant key regardless of the parsed numbers.
    if chain_code == "pos":
        return (2, 0, "")

    group_rank = {"H": 0, "L": 1}.get(chain_code, 3)
    return (group_rank, pos_num, pos_letter)

# Compute each site's ordering tuple, attach it as a column on a copy of the
# site table, and sort on that column.
site_order = unique_sites.apply(sort_key, axis=1)
unique_sites_sorted = unique_sites.assign(sort_order=site_order).sort_values(by="sort_order")

# Show the resulting page order for a visual check.
print("Final plotting order:")
print(unique_sites_sorted[["chain", "kabat_pos"]])

# Write one PDF page per (chain, kabat_pos) site, in the sorted order.

# Whether the plotted column exists does not change between pages, so check
# it once instead of on every iteration.
has_likelihood = "mut_log_likelihood" in df.columns

with PdfPages("all_sites_mutscan.pdf") as pdf:
    for _, row in unique_sites_sorted.iterrows():
        chain = row["chain"]
        pos = row["kabat_pos"]
        site_df = df[(df["chain"] == chain) & (df["kabat_pos"] == pos)]

        # Explicit figure/axes handles: every call below targets this page
        # rather than pyplot's implicit "current figure".
        fig, ax = plt.subplots(figsize=(7, 5))

        # A site is treated as having nothing to draw when it has no rows, the
        # column is absent, or every value is NaN (an earlier cell's output in
        # this notebook lists rows with missing mut_log_likelihood).
        nothing_to_plot = (
            not has_likelihood
            or site_df.empty
            or site_df["mut_log_likelihood"].isna().all()
        )
        if nothing_to_plot:
            ax.set_title(f"{chain} {pos} (no data or missing column)")
            pdf.savefig(fig)
            plt.close(fig)
            continue

        # Violin per mutant amino acid.
        sns.violinplot(
            data=site_df,
            x="mt",
            y="mut_log_likelihood",
            palette="vlag",
            inner=None,
            scale="width",
            cut=0,
            ax=ax
        )
        # Individual observations drawn on top of the violins.
        sns.stripplot(
            data=site_df,
            x="mt",
            y="mut_log_likelihood",
            color="k",
            alpha=0.5,
            size=2,
            jitter=True,
            ax=ax
        )

        ax.axhline(0, linestyle="--", color="gray")
        ax.set_title(f"{chain} position {pos}", fontsize=12)
        ax.set_xlabel("Mutant amino acid")
        ax.set_ylabel("mut_log_likelihood")
        sns.despine(fig=fig)
        fig.tight_layout()
        pdf.savefig(fig)
        # Close each page's figure so hundreds of figures do not pile up in memory.
        plt.close(fig)

print(f"Wrote all_sites_mutscan.pdf with {len(unique_sites_sorted)} pages")


Columns in dataframe: ['chain', 'pos', 'wt', 'mt', 'delta_log_likelihood', 'mut_log_likelihood', 'wt_log_likelihood', 'kabat_pos', 'mutation_label', 'sample']
Final plotting order:
     chain kabat_pos
0       VH        H1
19      VH        H2
38      VH        H3
57      VH        H4
76      VH        H5
...    ...       ...
4047    VL      L102
4066    VL      L103
4085    VL      L104
4104    VL      L105
4123    VL      L106

[266 rows x 2 columns]
Wrote all_sites_mutscan.pdf with 266 pages
