In [32]:
import pandas as pd
import scipy
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from imblearn.over_sampling import SMOTE

In [3]:
# Load the dataset used to exercise the modularized preprocessing functions.
# NOTE: df_path is an empty placeholder; set it to the CSV file to load
# before running this cell.
df_path = "" # edit this to select dataset
df = pd.read_csv(df_path)

In [29]:
def remove_columns(df: pd.DataFrame, columns=("transcript_name", "gene_id", "nucleotide_seq")):
    '''
    Returns a copy of ``df`` without the given columns.

    By default the identifier columns ("transcript_name", "gene_id") and the
    raw sequence column ("nucleotide_seq") are dropped. Pass a different
    ``columns`` iterable instead of editing this function when another set of
    columns has to be removed.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    columns : iterable of str, optional
        Column labels to drop.

    Returns
    -------
    pd.DataFrame
        New frame without ``columns``.

    Raises
    ------
    KeyError
        If any of ``columns`` is missing from ``df``.
    '''
    return df.drop(columns=list(columns))

def trigram_tokenize(df: pd.DataFrame, n):
    '''
    Adds character n-gram TF-IDF features of column "nucleotide_seq" to ``df``.

    A TfidfVectorizer (analyzer='char', ngram_range=(n, n)) is fitted on the
    "nucleotide_seq" column and its output is appended as columns "s0", "s1",
    ..., where "s<i>" holds the i-th output column of the vectorizer.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a "nucleotide_seq" column of strings.
    n : int
        Size of the character n-grams (3 gives trigrams).

    Returns
    -------
    pd.DataFrame
        New frame with the original columns followed by the "s<i>" feature
        columns; rows keep the index of ``df``.
    '''
    corpus = df['nucleotide_seq']
    # NOTE(review): the vectorizer is fitted on every row passed in. When this
    # is called before the train/test split (as pipeline_all does), the IDF
    # weights are learned from test rows as well.
    tfidf = TfidfVectorizer(analyzer='char', ngram_range=(n, n)).fit_transform(corpus)

    # Build the feature frame on df's own index: a default 0..n-1 index would
    # be aligned by label when attached and produce NaN / shifted rows for any
    # df whose index is not a plain RangeIndex.
    features = pd.DataFrame(
        tfidf.toarray(),  # convert sparse matrix to dense array
        index=df.index,
        columns=[f"s{i}" for i in range(tfidf.shape[1])],
    )

    # Drop pre-existing "s<i>" columns so they are replaced rather than
    # duplicated, then attach all features in a single concat.
    base = df.drop(columns=features.columns, errors="ignore")
    return pd.concat([base, features], axis=1)

def data_split(df: pd.DataFrame, test_size=0.2, random_state=0):
    '''
    Splits data by gene into train and test sets.

    Rows sharing the same "gene_id" are kept together: every gene ends up
    either entirely in the train set or entirely in the test set.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a "label" column (target) and a "gene_id" column (groups).
    test_size : float or int, optional
        Passed to GroupShuffleSplit as ``test_size``. Defaults to 0.2.
    random_state : int, optional
        Seed passed to GroupShuffleSplit. Defaults to 0.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature frames (all columns except "label", so "gene_id" is still
        present) and label series; rows keep their original index labels.
    '''
    X = df.drop(columns=["label"])
    y = df["label"]

    # Only one split is ever consumed, so only one is requested.
    gss = GroupShuffleSplit(n_splits=1, random_state=random_state, test_size=test_size)
    train_pos, test_pos = next(gss.split(X, y, groups=X["gene_id"]))

    # split() yields row positions, not index labels, so select with .iloc;
    # .loc would pick wrong rows (or raise) when df has a non-default index.
    X_train, y_train = X.iloc[train_pos], y.iloc[train_pos]
    X_test, y_test = X.iloc[test_pos], y.iloc[test_pos]

    return X_train, X_test, y_train, y_test

def standardize_data(X_train, X_test):
    '''
    Standardizes numerical features while leaving non-numerical features unchanged.

    The scaler is fitted on ``X_train`` only and then applied to both frames.

    Parameters
    ----------
    X_train, X_test : pd.DataFrame
        Frames with the same columns.

    Returns
    -------
    X_train_scaled, X_test_scaled : pd.DataFrame
        Frames with the same column order and the same row index as the
        inputs; numeric columns are standardized, all others are passed through.
    '''
    original_order = list(X_train.columns)
    # Identify numerical and categorical columns
    numerical_cols = list(X_train.select_dtypes(include='number').columns)
    categorical_cols = list(X_train.select_dtypes(exclude='number').columns)

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', 'passthrough', categorical_cols)  # Leave categorical columns unchanged
        ])

    # The transformer emits its blocks in the order they are listed above
    # (numeric first, then passthrough), which is generally NOT the original
    # column order, so the output must be labelled with this order.
    output_order = numerical_cols + categorical_cols

    def _to_frame(values, index):
        # Label with the transformer's output order, restore the caller's
        # column order, and recover numeric dtypes lost when mixed data is
        # stacked into a single object array.
        frame = pd.DataFrame(values, columns=output_order, index=index)
        return frame[original_order].infer_objects()

    # Fit on the training data only, then reuse the fitted scaler for the test data
    X_train_scaled = _to_frame(preprocessor.fit_transform(X_train), X_train.index)
    X_test_scaled = _to_frame(preprocessor.transform(X_test), X_test.index)
    return X_train_scaled, X_test_scaled

def synthetic_oversampling(X_train, y_train):
    '''
    Oversamples the training data with SMOTE (seeded with random_state=42).

    Returns the resampled features and the resampled labels, in that order.
    '''
    oversampler = SMOTE(random_state=42)
    X_balanced, y_balanced = oversampler.fit_resample(X_train, y_train)
    return X_balanced, y_balanced


# Stringing the preprocessing steps together into a single pipeline

In [30]:
def pipeline_all(df, n=3):
    """
    Runs the full preprocessing pipeline on ``df``.

    Steps, in order: vectorize "nucleotide_seq" into character n-gram
    features, split into train/test sets by gene, drop the columns that are
    not model features, standardize using statistics of the training set,
    and oversample the training set with SMOTE.

    Parameters
    ----------
    df : pd.DataFrame
        Raw dataset as expected by trigram_tokenize and data_split.
    n : int, optional
        Character n-gram size passed to trigram_tokenize. Defaults to 3.

    Returns
    -------
    Xtrain_resampled, Xtest_scaled, ytrain_resampled, ytest
        Oversampled, scaled training features; scaled test features;
        oversampled training labels; untouched test labels.
    """
    # Vectorization
    # NOTE(review): vectorizing before the split means the vectorizer is
    # fitted on rows that later end up in the test set - confirm this is acceptable.
    tokenized_df = trigram_tokenize(df, n)

    # Train-test split; the split itself needs "gene_id", so the non-feature
    # columns are only removed afterwards.
    Xtrain, Xtest, ytrain, ytest = data_split(tokenized_df)
    Xtrain = remove_columns(Xtrain)
    Xtest = remove_columns(Xtest)

    # Scaling Xtrain and Xtest with Scaler fitted on Xtrain
    Xtrain_scaled, Xtest_scaled = standardize_data(Xtrain, Xtest)

    # SMOTE on training set only; the test set is left as-is
    Xtrain_resampled, ytrain_resampled = synthetic_oversampling(Xtrain_scaled, ytrain)

    return Xtrain_resampled, Xtest_scaled, ytrain_resampled, ytest

In [31]:
# Run the full preprocessing pipeline on the loaded dataset, then write the
# training features and labels to CSV without the index column.
# NOTE(review): X_test / y_test are returned but not written anywhere in this
# cell - confirm whether they should be saved as well.
X_train, X_test, y_train, y_test = pipeline_all(df)
X_train.to_csv("../dataset/Xtrain.csv", index = False)
y_train.to_csv("../dataset/ytrain.csv", index = False)