In [1]:
from filtering_functions import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import roc_curve, auc, accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import label_binarize
from scipy import interp
from sklearn.base import clone
from sklearn.decomposition import PCA, KernelPCA
from sklearn.cross_decomposition import CCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2

import warnings

imput_file_path = "/Users/joesouber/OneDrive - University of Bristol/MSc Data Science/data science mini project/dsmp-2024-group-13/vdjdb_full.txt"
data = pd.read_csv(imput_file_path, sep='\t')
df = pd.DataFrame(data)
df_cleaned = preprocess_data(df, relevant_columns=None, drop_duplicates=True)


  data = pd.read_csv(imput_file_path, sep='\t')


# SETE functions: making feature matrix based on cdr3 chain. kmer approach.



In [2]:

# Function to generate k-mers from a single sequence
def generate_kmers(sequence, k):
    return [sequence[i:i+k] for i in range(len(sequence)-k+1)]

def create_features_matrix(df, include_alpha=True, include_beta=True, alpha_col='cdr3.alpha', beta_col='cdr3.beta', label_col='antigen.epitope', k=3):
    """
    Create a feature matrix from sequences using k-mers from cdr3.alpha and/or cdr3.beta sequences, extract labels, 
    return a k-mer count dictionary, and a list of unique epitope names.
    
    Parameters:
    - df: DataFrame containing sequences and labels.
    - include_alpha: Boolean, whether to include cdr3.alpha sequences in the analysis.
    - include_beta: Boolean, whether to include cdr3.beta sequences in the analysis.
    - alpha_col: Column name with cdr3.alpha sequences.
    - beta_col: Column name with cdr3.beta sequences.
    - label_col: Column name with labels.
    - k: k-mer length.
    
    Returns:
    - X: Feature matrix.
    - y: Labels array.
    - feature_names: Unique k-mers used as features.
    - kmer_count_dict: Dictionary of k-mer counts.
    - epitope_names: List of unique epitope names.

    example usage:

X, y, kmer_names, kmer_count_dict, epitope_names = create_features_with_options(df_cleaned, include_alpha=False, 
                                                                include_beta=True, alpha_col='cdr3.alpha', 
                                                                beta_col='cdr3.beta', label_col='antigen.epitope', k=3)
    """
    # Filter rows where label is missing
    filtered_df = df.dropna(subset=[label_col])
    
    # Initialize documents for CountVectorizer and k-mer count dictionary
    kmer_docs = []  
    kmer_count_dict = {}
    
    # Process sequences based on inclusion flags
    for _, row in filtered_df.iterrows():
        kmers = []
        if include_alpha and pd.notna(row[alpha_col]):
            alpha_seq = row[alpha_col]
            kmers += generate_kmers(alpha_seq, k)
        if include_beta and pd.notna(row[beta_col]):
            beta_seq = row[beta_col]
            kmers += generate_kmers(beta_seq, k)
        
        # Concatenate k-mers into a single string for vectorization
        kmer_docs.append(' '.join(kmers))
        
    
        # Count occurrences of each k-mer
        for kmer in kmers:
            kmer_count_dict[kmer] = kmer_count_dict.get(kmer, 0) + 1
    
    # Vectorize k-mer documents into a feature matrix
    vectorizer = CountVectorizer(analyzer='word', token_pattern=r"(?u)\b\w+\b")
    X = vectorizer.fit_transform(kmer_docs).toarray()
    
    # Extract labels
    y = filtered_df[label_col].values
    
    # Get unique k-mer names used in the matrix
    feature_names = vectorizer.get_feature_names_out()
    
    # Extract unique epitope names
    epitope_names = filtered_df[label_col].unique().tolist()
    
    return X, y, feature_names, kmer_count_dict, epitope_names



# Create the feature matrix, labels, and k-mer list based on specified chains
X, y, kmer_names, kmer_count_dict, epitope_names = create_features_matrix(df_cleaned, include_alpha=False, include_beta=True, alpha_col='cdr3.alpha', beta_col='cdr3.beta', label_col='antigen.epitope', k=3)


In [4]:
# Output shapes to verify
print("Feature matrix shape:", X.shape)
print("Labels shape:", y.shape)
print("First few epitope names:", epitope_names[:5])
print("First few k-mer names:", kmer_names[:5])
print("First few k-mer counts:", kmer_count_dict)

Feature matrix shape: (29833, 5990)
Labels shape: (29833,)
First few epitope names: ['FLKEKGGL', 'FLKEQGGL', 'FLKETGGL', 'FLKEMGGL', 'ELAGIGILTV']
First few k-mer names: ['aaa' 'aad' 'aae' 'aaf' 'aag']
First few k-mer counts: {'CAS': 25033, 'ASS': 21594, 'SSY': 2019, 'SYL': 77, 'YLP': 7, 'LPG': 118, 'PGQ': 405, 'GQG': 1024, 'QGD': 117, 'GDH': 40, 'DHY': 12, 'HYS': 16, 'YSN': 160, 'SNQ': 505, 'NQP': 944, 'QPQ': 1554, 'PQH': 1643, 'QHF': 1666, 'SSF': 1408, 'SFE': 43, 'FEP': 7, 'EPG': 71, 'QGF': 77, 'GFY': 68, 'FYS': 34, 'SYE': 1448, 'YEP': 4, 'GQV': 75, 'QVS': 27, 'VSH': 5, 'SHY': 167, 'SSA': 746, 'SAL': 118, 'ALA': 65, 'LAS': 146, 'ASL': 106, 'SLN': 143, 'LNE': 126, 'NEQ': 3543, 'EQF': 4677, 'QFF': 4761, 'SFT': 100, 'FTP': 4, 'TPY': 24, 'PYN': 145, 'YNE': 1931, 'SSP': 2661, 'SPQ': 121, 'PQG': 101, 'QGL': 198, 'GLG': 365, 'LGT': 368, 'GTE': 551, 'TEA': 2598, 'EAF': 3248, 'AFF': 3387, 'CAE': 10, 'AEG': 48, 'EGQ': 116, 'GFV': 4, 'FVG': 16, 'VGQ': 122, 'GQP': 230, 'SLR': 395, 'LRS': 50, 'RS