In [6]:
from filtering_functions import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import roc_curve, auc, accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import label_binarize
from scipy import interp
from sklearn.base import clone
from sklearn.decomposition import PCA, KernelPCA
from sklearn.cross_decomposition import CCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2

import warnings

# Load the raw VDJdb export (read as tab-separated text) and clean it.
# NOTE(review): this is an absolute path on one user's machine, so the cell fails
# anywhere else until the path is changed. The name `imput_file_path` (sic) is left
# as spelled because other cells may refer to it.
imput_file_path = "/Users/joesouber/OneDrive - University of Bristol/MSc Data Science/data science mini project/dsmp-2024-group-13/vdjdb_full.txt"
data = pd.read_csv(imput_file_path, sep='\t')
# read_csv already returns a DataFrame; this wraps it in a second DataFrame object
# bound to `df`.
df = pd.DataFrame(data)
# preprocess_data is not defined in this notebook; presumably it is supplied by the
# star-import of filtering_functions -- TODO confirm. It is called here with no column
# restriction (relevant_columns=None) and drop_duplicates=True.
# NOTE(review): later cells use `df_cleaned` as a DataFrame (they call .dropna on it);
# verify that preprocess_data returns a DataFrame and not a tuple.
df_cleaned = preprocess_data(df, relevant_columns=None, drop_duplicates=True)


  data = pd.read_csv(imput_file_path, sep='\t')


(           cdr3.alpha          v.alpha    j.alpha             cdr3.beta  \
 0       CIVRAPGRADMRF      TRAV26-1*01  TRAJ43*01  CASSYLPGQGDHYSNQPQHF   
 1      CAVPSGAGSYQLTF        TRAV20*01  TRAJ28*01   CASSFEPGQGFYSNQPQHF   
 2         CAVKASGSRLT         TRAV2*01        NaN  CASSYEPGQVSHYSNQPQHF   
 3       CAYRPPGTYKYIF  TRAV38-2/DV8*01  TRAJ40*01        CASSALASLNEQFF   
 4       CIVRAPGRADMRF      TRAV26-1*01  TRAJ43*01  CASSYLPGQGDHYSNQPQHF   
 ...               ...              ...        ...                   ...   
 29828   CMDEGGSNYKLTF      TRAV26-1*01  TRAJ53*01         CASSVRSTDTQYF   
 29829     CSLYNNNDMRF      TRAV26-1*01  TRAJ43*01         CASSLRYTDTQYF   
 29830   CALSTDSWGKLQF         TRAV6*01  TRAJ24*01       CASSPGQGGDNEQFF   
 29831    CAPQGATNKLIF      TRAV12-2*01  TRAJ32*01       CASSLGAGGQETQYF   
 29832  CLVGGSGGYNKLIF         TRAV4*01   TRAJ4*01         CASSSTAQETQYF   
 
             v.beta d.beta      j.beta      species           mhc.a  \
 0        TRBV1

# SETE functions: build a feature matrix from the CDR3 chains using a k-mer approach.



In [7]:

def generate_kmers(sequence, k):
    """
    Split a single sequence into its overlapping k-mers (substrings of length k).

    Parameters:
    - sequence: Sequence to split; anything supporting len() and slicing (e.g. a str).
    - k: k-mer length. Must be at least 1.

    Returns:
    - List of the len(sequence) - k + 1 overlapping k-mers in left-to-right order,
      or an empty list when the sequence is shorter than k.

    Raises:
    - ValueError: if k is less than 1.
    """
    if k < 1:
        # Without this guard k == 0 yields a list of empty strings and a negative k
        # yields slices of inconsistent length, both of which silently corrupt the
        # downstream feature matrix.
        raise ValueError(f"k must be a positive integer, got {k!r}")
    return [sequence[start:start + k] for start in range(len(sequence) - k + 1)]

def create_features_matrix(df, include_alpha=True, include_beta=True, alpha_col='cdr3.alpha', beta_col='cdr3.beta', label_col='antigen.epitope', k=3):
    """
    Create a k-mer count feature matrix from cdr3.alpha and/or cdr3.beta sequences,
    extract labels, and return a k-mer count dictionary and the unique epitope names.

    Rows whose label is missing are dropped. For every remaining row the k-mers of the
    selected chains are concatenated (alpha first, then beta) and counted. A row whose
    selected sequences are all missing (or shorter than k) is kept and becomes an
    all-zero row of X, so X and y always have the same number of rows.

    Parameters:
    - df: DataFrame containing sequences and labels.
    - include_alpha: Boolean, whether to include cdr3.alpha sequences in the analysis.
    - include_beta: Boolean, whether to include cdr3.beta sequences in the analysis.
    - alpha_col: Column name with cdr3.alpha sequences.
    - beta_col: Column name with cdr3.beta sequences.
    - label_col: Column name with labels.
    - k: k-mer length.

    Returns:
    - X: Dense feature matrix of per-row k-mer counts (rows x unique k-mers).
    - y: Labels array, aligned with the rows of X.
    - feature_names: Unique k-mers used as features (column order of X). The case of
      the input sequences is preserved, so these names match the keys of
      kmer_count_dict.
    - kmer_count_dict: Dictionary mapping each k-mer to its total number of
      occurrences over all rows.
    - epitope_names: List of unique epitope names, in order of first appearance.

    Raises:
    - ValueError: if both include_alpha and include_beta are False, or if no k-mer at
      all can be extracted (raised by CountVectorizer for an empty vocabulary).

    Example usage:

    X, y, kmer_names, kmer_count_dict, epitope_names = create_features_matrix(
        df_cleaned, include_alpha=False, include_beta=True, alpha_col='cdr3.alpha',
        beta_col='cdr3.beta', label_col='antigen.epitope', k=3)
    """
    if not (include_alpha or include_beta):
        # Fail early with a clear message instead of sklearn's "empty vocabulary" error.
        raise ValueError("At least one of include_alpha and include_beta must be True.")

    # Filter rows where label is missing
    filtered_df = df.dropna(subset=[label_col])

    # Columns to read, in the order their k-mers are concatenated per row.
    chain_cols = []
    if include_alpha:
        chain_cols.append(alpha_col)
    if include_beta:
        chain_cols.append(beta_col)

    # One whitespace-joined "document" of k-mers per row, plus global k-mer totals.
    kmer_docs = []
    kmer_count_dict = {}

    # Iterate the selected columns directly; this avoids building a Series per row
    # as DataFrame.iterrows() does, while visiting the same values in the same order.
    for sequences in zip(*(filtered_df[col] for col in chain_cols)):
        kmers = []
        for seq in sequences:
            if pd.notna(seq):
                kmers.extend(generate_kmers(seq, k))

        # A row without any k-mer contributes an empty document (all-zero row in X).
        kmer_docs.append(' '.join(kmers))

        for kmer in kmers:
            kmer_count_dict[kmer] = kmer_count_dict.get(kmer, 0) + 1

    # Vectorize k-mer documents into a feature matrix.
    # The custom token_pattern also accepts single-character tokens (k=1).
    # lowercase=False keeps the k-mers exactly as generated; the default would
    # lower-case them, making feature_names disagree with kmer_count_dict's keys.
    vectorizer = CountVectorizer(analyzer='word', token_pattern=r"(?u)\b\w+\b", lowercase=False)
    X = vectorizer.fit_transform(kmer_docs).toarray()

    # Extract labels
    y = filtered_df[label_col].values

    # Get unique k-mer names used in the matrix
    feature_names = vectorizer.get_feature_names_out()

    # Extract unique epitope names
    epitope_names = filtered_df[label_col].unique().tolist()

    return X, y, feature_names, kmer_count_dict, epitope_names

# Build the feature matrix, labels and k-mer list from the beta chain only (3-mers).
# `df_cleaned` comes from the data-loading cell, which has to be executed first in
# the current kernel; otherwise this call fails with a NameError.
X, y, kmer_names, kmer_count_dict, epitope_names = create_features_matrix(
    df_cleaned,
    include_alpha=False,
    include_beta=True,
    alpha_col='cdr3.alpha',
    beta_col='cdr3.beta',
    label_col='antigen.epitope',
    k=3,
)


NameError: name 'df_cleaned' is not defined