# **CPP2Vec Uptake Efficiency Prediction Tutorial Notebook**

### Load all necessary libraries and the pre-trained models.

In [None]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import pickle

# Load CPP2Vec's pretrained models.
# Both files are opened by bare file name, i.e. relative to the current
# working directory of the notebook.

# Word2Vec model: later passed to emb_seq_w2v, which reads k-mer vectors
# through its `.wv` attribute.
w2v = Word2Vec.load("W2V_Uptake-Efficiency.pt")

# ML model: unpickled object on which a later cell calls `predict_proba`.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files obtained from a trusted source.
with open("ML_Uptake-Efficiency.asv", "rb") as f:
    ml_model = pickle.load(f)

print("\nWord2Vec and Machine Learning models loaded successfully!\n")

### Define functions to process peptide sequences, generate embeddings and calculate evaluation metrics for CPP prediction.

1. pad_sequences: Pads (with "X") or truncates each sequence to a fixed length (seqwin)
2. sep_word: Splits sequences into overlapping k-mers
3. emb_seq_w2v: Converts sequences into numeric vectors using Word2Vec
4. evaluate_metrics: Computes Sensitivity, Specificity, Accuracy, AUC, MCC, F1-Score, etc.

In [None]:
def pad_sequences(seq_list, seqwin=61):
    """Bring every sequence to a common width of ``seqwin`` characters.

    Each entry is stripped of surrounding whitespace, cut down to its first
    ``seqwin`` characters when it is longer, and right-filled with "X" when
    it is shorter.

    Args:
        seq_list: Iterable of sequence strings.
        seqwin: Target width of every returned string.

    Returns:
        A new list of strings, in the same order as ``seq_list``.
    """
    # Slicing is a no-op for strings that are already short enough, so a
    # single expression covers both the truncate and the pad case.
    return [raw.strip()[:seqwin].ljust(seqwin, "X") for raw in seq_list]

def sep_word(sequences, k=2):
    """Split each sequence into its overlapping windows of ``k`` characters.

    Args:
        sequences: Iterable of sequence strings.
        k: Window (k-mer) length.

    Returns:
        A list with one inner list per input sequence, holding the k-mers in
        left-to-right order. A sequence shorter than ``k`` yields an empty
        inner list.
    """
    tokenised = []
    for seq in sequences:
        # A sequence of length n has n - k + 1 windows (none when n < k).
        n_windows = len(seq) - k + 1
        tokenised.append([seq[start:start + k] for start in range(n_windows)])
    return tokenised

def emb_seq_w2v(seq_list, w2v_model, k=2):
    """Embed sequences as flat vectors of concatenated k-mer embeddings.

    Every overlapping k-mer of a sequence is looked up in ``w2v_model.wv``
    and the resulting vectors are laid out one after another, giving one row
    per sequence.

    Args:
        seq_list: Sized collection of sequence strings. The final reshape
            needs every sequence to contribute the same number of values,
            so the sequences are expected to share one length (e.g. the
            output of ``pad_sequences``).
        w2v_model: Object whose ``.wv`` attribute maps a k-mer string to its
            embedding vector via ``wv[kmer]``.
        k: k-mer length used to slice each sequence.

    Returns:
        A 2-D NumPy array with ``len(seq_list)`` rows. For empty input an
        array of shape ``(0, 0)`` is returned.

    Raises:
        Whatever ``w2v_model.wv[kmer]`` raises for a k-mer it does not know.
    """
    num_seq = len(seq_list)
    if num_seq == 0:
        # Nothing to embed; return an empty matrix instead of failing on an
        # unbound accumulator.
        return np.empty((0, 0))

    per_seq = [
        np.array([w2v_model.wv[seq[i:i + k]] for i in range(len(seq) - k + 1)])
        for seq in seq_list
    ]
    # One concatenate at the end avoids re-copying the growing array on
    # every iteration (np.append in a loop is quadratic).
    return np.concatenate(per_seq, axis=0).reshape(num_seq, -1)

def evaluate_metrics(y_true, y_prob, threshold=0.5):
    """Compute binary-classification metrics from true labels and scores.

    Scores are turned into hard 0/1 predictions with ``y_prob >= threshold``;
    confusion-matrix counts are taken with 1 as the positive label.

    Args:
        y_true: Array-like of true labels (0 or 1).
        y_prob: Array-like of predicted scores for the positive class.
        threshold: Cut-off at or above which a score counts as positive.

    Returns:
        A dict with the keys "Sensitivity", "Specificity", "Accuracy", "AUC",
        "MCC", "Precision", "Recall", "F1-Score" and "AUPRC". Sensitivity,
        Specificity and MCC are computed from the confusion-matrix counts and
        fall back to 0 when their denominator is zero; the remaining values
        come from ``sklearn.metrics``.
    """
    # Imported here so the function does not depend on some other cell
    # having put `metrics` into the notebook namespace first.
    from sklearn import metrics

    # Accept plain lists as well as arrays.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    y_pred = (y_prob >= threshold).astype(int)

    # Python ints: the four-way product below cannot wrap around the way
    # fixed-width NumPy integers can.
    tp = int(((y_true == 1) & (y_pred == 1)).sum())
    tn = int(((y_true == 0) & (y_pred == 0)).sum())
    fp = int(((y_true == 0) & (y_pred == 1)).sum())
    fn = int(((y_true == 1) & (y_pred == 0)).sum())

    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    mcc_denom = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    mcc = (tp * tn - fp * fn) / np.sqrt(float(mcc_denom)) if mcc_denom > 0 else 0

    return {
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "Accuracy": metrics.accuracy_score(y_true, y_pred),
        "AUC": metrics.roc_auc_score(y_true, y_prob),
        "MCC": mcc,
        "Precision": metrics.precision_score(y_true, y_pred),
        "Recall": metrics.recall_score(y_true, y_pred),
        "F1-Score": metrics.f1_score(y_true, y_pred),
        "AUPRC": metrics.average_precision_score(y_true, y_prob)
    }


### Load Sequences to be Predicted from a TXT File.

**Acceptable input formats:**

| Format | Example | Notes |
|--------|---------|-------|
| **Sequences only** (no labels) | RRRRRRRGGIYLATALAKWALKQ<br>IYLATALAKWALKQGGRRRRRRR<br>KLAKLAKKLAKLAKGGRRRRRRR<br>SHMMGEFWGDEDMCYRHQRSYET | Each line contains a single peptide sequence. Metrics cannot be calculated without labels. Predictions only. |
| **Sequences with labels** | RRRRRRRGGIYLATALAKWALKQ,1<br>KLAKLAKKLAKLAKGGRRRRRRR,1<br>FQAYPCITAYKVMYID,0<br>SHMMGEFWGDEDMCYRHQRSYET,0 | Each line contains a sequence and a true label separated by a comma. Metrics will be calculated automatically. |

<br>

**Labels: 1 = High, 0 = Low**


In [None]:
input_file = "CPP_Uptake_Input.txt"

# Read sequences (and optional labels).
# Each non-empty line is either "SEQUENCE" or "SEQUENCE,LABEL"; the label is
# parsed with int().
seqs = []
true_labels = []

with open(input_file, "r") as f:
    lines = f.read().strip().split("\n")
    for line in lines:
        line = line.strip()
        if not line:
            continue
        parts = line.split(",")
        seqs.append(parts[0].strip())
        if len(parts) > 1:
            true_labels.append(int(parts[1].strip()))

# Labels are only usable when every sequence has one. With a partial set,
# true_labels no longer lines up with seqs, so drop the labels (predictions
# still work) instead of letting the metrics step fail on a length mismatch.
if true_labels and len(true_labels) != len(seqs):
    print(
        f"Warning: only {len(true_labels)} of {len(seqs)} sequences have a label; "
        "labels ignored, metrics calculation will be skipped."
    )
    true_labels = []

print(f"{len(seqs)} sequences loaded from {input_file}.")
if true_labels:
    print("Labels detected for metrics calculation.")

print("\nA preview of the first 5 sequences:\n")
seqs[:5]

### Preprocess sequences, generate embeddings, apply PCA, predict with SVM, and save results to CSV.


In [None]:
# Fixed sequence width and k-mer length passed to the helpers below.
# NOTE(review): presumably these must match the settings the pretrained
# models were built with — confirm before changing them.
seqwin = 61
kmer = 3

# Pad sequences: pad with "X" / truncate every sequence to `seqwin` characters
padded = pad_sequences(seqs, seqwin)

# Generate embeddings using Word2Vec: one flattened row of k-mer vectors per sequence
embedded = emb_seq_w2v(padded, w2v, kmer)

# Predict probabilities using SVM
# Column 1 of predict_proba is kept; presumably the probability of class 1
# ("High") — verify against the loaded model's class order.
prob = ml_model.predict_proba(embedded)[:, 1]

# Convert probabilities to labels (>= 0.5 -> "High", otherwise "Low")
pred_labels = ["High" if p >= 0.5 else "Low" for p in prob]

# Results table: the original (unpadded) sequences next to their predictions
df_results = pd.DataFrame({
    "Sequence": seqs,
    "Predicted_Label": pred_labels,
    "Probability": prob
})

print(df_results)

# Save to CSV (no index column), in the current working directory
output_file = "CPP2Vec_Uptake_Predictions.csv"
df_results.to_csv(output_file, index=False)
print(f"\nPredictions saved to '{output_file}'")


### If true labels are provided in the TXT file, evaluation metrics are calculated.

In [None]:
if true_labels:
    # sklearn is only needed when there are labels to score against.
    from sklearn import metrics

    y_true = np.array(true_labels)
    y_prob = prob

    # Reuse the shared helper rather than repeating its body here; 0.5 is
    # the same cut-off used to derive the "High"/"Low" labels.
    metrics_res = evaluate_metrics(y_true, y_prob, threshold=0.5)

    print("~~~ Evaluation Metrics ~~~\n")
    for k, v in metrics_res.items():
        print(f"{k}: {v:.4f}")
else:
    print("No true labels provided. Metrics calculation skipped.")