In [163]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import cross_val_score,cross_val_predict
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

In [164]:
import joblib
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from sklearn.linear_model import Ridge

In [238]:
import os
import pandas as pd
import numpy as np

def encode_gcPBM_data(fileprefix, 
                      data_dir="/SynologyNAS/001_Thesis_work/Prj_TF_FAMILY_SPECIFIC/RESULTS", 
                    features=("1mer",), w=0, sample_size=None):
    try:
        # Load 1mer data
        ohe_file = os.path.join(data_dir, "MAX_gcPBM_dir", f"{fileprefix}_OHE_1mer.tsv")
        ohe_df = pd.read_csv(ohe_file, header=None, sep="\t")
        
        if sample_size:
            ohe_df = ohe_df.sample(n=sample_size, random_state=42)
        
        ohe_data = {row[0]: {'aff': float(row[0].split('__')[1]), 
                             'feat': row.iloc[1:].tolist()} for _, row in ohe_df.iterrows()}
        
        print(f"1mer data loaded from {ohe_file}... \n")
    
    except FileNotFoundError as fe:
        raise FileNotFoundError(f"File {ohe_file} not found: {fe}")

    combined_data = ohe_data.copy()

    # Load additional features if specified
    if len(features) > 1:
        for feature in features[1:]:
            print(f"Will load the additional feature {feature}")
            try:
                flex_feat_file = os.path.join(data_dir, "MAX_gcPBM_dir", f"{fileprefix}_w{w}nt_{feature}.tsv")
                flex_df = pd.read_csv(flex_feat_file, sep="\t", header=None)
                
                if sample_size:
                    flex_df = flex_df[flex_df[0].isin(ohe_df[0])]

                for ids in ohe_data.keys():
                    flex_df_row = flex_df[flex_df[0] == ids]
                    if not flex_df_row.empty:
                        flex_df_feat = np.array(flex_df_row.iloc[0, 1:].tolist()) # first column is id

                        # Scale the flex features
                        min_value = np.min(flex_df_feat)
                        std_value = np.std(flex_df_feat)
                        if std_value != 0:
                            scaled_flex_feat = (flex_df_feat - min_value) / std_value
                        else:
                            scaled_flex_feat = flex_df_feat  # If std is 0, scaling is not possible

                        # Calculate interaction terms for this feature using NumPy
                        interaction_terms = scaled_flex_feat[:-1] * scaled_flex_feat[1:]

                        combined_data[ids]['feat'].extend(scaled_flex_feat.tolist() + interaction_terms.tolist())
                print(f"{feature} data loaded from {flex_feat_file}... \n")
            except FileNotFoundError as fe:
                raise FileNotFoundError(f"File {flex_feat_file} not found: {fe}")
            except Exception as e:
                raise Exception(f"Error in extending feature {feature} from {flex_feat_file}: {e}")

    # Define X, y
    X = np.array([combined_data[x]['feat'] for x in combined_data.keys()])
    y = np.array([combined_data[x]['aff'] for x in combined_data.keys()])
    
    # Scale y to the range [0, 1] using NumPy
    y_min = np.min(y)
    y_max = np.max(y)
    y = (y - y_min) / (y_max - y_min)
    
    return X, y


In [259]:
import os
import pandas as pd
import numpy as np

def encode_gcPBM_data_with_varying_len(fileprefix,
                                       data_dir="/SynologyNAS/001_Thesis_work/Prj_TF_FAMILY_SPECIFIC/RESULTS", 
                                       features=("1mer",), w=0, sample_size=None):
    try:
        # Load 1mer data
        ohe_file = os.path.join(data_dir, "gcPBM", "1_mer", f"{fileprefix}_update_1mer.tsv")
        ohe_df = pd.read_csv(ohe_file, header=None, sep="\t")
        
        if sample_size:
            ohe_df = ohe_df.sample(n=sample_size, random_state=42)
            
        ohe_data = {row[0]: {'aff': float(row[0].split('__')[1]), 
                             'feat': row.iloc[1:].tolist()} for _, row in ohe_df.iterrows()}
        
        print(f"1mer data loaded from {ohe_file}... \n")
    
    except FileNotFoundError as fe:
        raise FileNotFoundError(f"File {ohe_file} not found: {fe}")

    combined_data = ohe_data.copy()

    # Load additional features if specified
    if len(features) > 1:
        for feature in features[1:]:    
            try:
                if feature == '2mer':
                    print(f"Will load the additional feature {feature}")
                    add_df = os.path.join(data_dir, 'gcPBM', '2mer', f'{fileprefix}_update_2mer.tsv')
                
                elif feature == '3mer':
                    print(f"Will load the additional feature {feature}")
                    add_df = os.path.join(data_dir, 'gcPBM', '3mer', f'{fileprefix}_update_3mer.tsv')
                
                else:
                    print(f"Will load the additional feature {feature}")
                    add_df = os.path.join(data_dir, "gcPBM",  "DNAFlex" ,f"{fileprefix}_{feature}_{w}.tsv")
                
                flex_df = pd.read_csv(add_df, sep="\t", header=None)

                if sample_size:
                    flex_df = flex_df[flex_df[0].isin(ohe_df[0])]

                for ids in ohe_data.keys():
                    flex_df_row = flex_df[flex_df[0] == ids]
                    if not flex_df_row.empty:
                        flex_df_feat = np.array(flex_df_row.iloc[0, 1:].tolist()) # first column is id

                        # Scale the flex features
                        min_value = np.min(flex_df_feat)
                        std_value = np.std(flex_df_feat)
                        if std_value != 0:
                            scaled_flex_feat = (flex_df_feat - min_value) / std_value
                        else:
                            scaled_flex_feat = flex_df_feat  # If std is 0, scaling is not possible

                        # Calculate interaction terms for this feature using NumPy
                        interaction_terms = scaled_flex_feat[:-1] * scaled_flex_feat[1:]

                        combined_data[ids]['feat'].extend(scaled_flex_feat.tolist() + interaction_terms.tolist())
                print(f"{feature} data loaded from {add_df}... \n")
            except FileNotFoundError as fe:
                raise FileNotFoundError(f"File {add_df} not found: {fe}")
            except Exception as e:
                raise Exception(f"Error in extending feature {feature} from {add_df}: {e}")

    # Define X, y
    X = np.array([combined_data[x]['feat'] for x in combined_data.keys()])
    y = np.array([combined_data[x]['aff'] for x in combined_data.keys()])
    
    # Scale y to the range [0, 1] using NumPy
#     y_min = np.min(y)
#     y_max = np.max(y)
#     y = (y - y_min) / (y_max - y_min)
    
    return X, y

In [260]:

def nested_ridge_cv(X, y, fileid, pos, features, outer_cv_folds=10, inner_cv_folds=5):
    alphas = np.linspace(1, 1000, 2000)
    outer_cv = KFold(n_splits=outer_cv_folds, shuffle=True, random_state=42)
    outer_results = []

    for train_idx, test_idx in outer_cv.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        inner_cv = KFold(n_splits=inner_cv_folds, shuffle=True, random_state=42)
        ridge_cv_model = make_pipeline(StandardScaler(with_mean=False),
                                       RidgeCV(alphas=alphas, cv=inner_cv, scoring='r2'))
        ridge_cv_model.fit(X_train, y_train)
        best_alpha = ridge_cv_model.named_steps['ridgecv'].alpha_
        y_test_pred = ridge_cv_model.predict(X_test)
        r2_test = r2_score(y_test, y_test_pred)
        mse_test = mean_squared_error(y_test, y_test_pred)
        mae_test = mean_absolute_error(y_test, y_test_pred)
        result = {
            'fileid': fileid,
            'pos': pos,
            'features': features,
            'r2_test': r2_test,
            'mse_test': mse_test,
            'mae_test': mae_test,
            'best_alpha': best_alpha
        }
        outer_results.append(result)
    
    # Calculate the mean best alpha from outer results
    best_alpha = np.mean([res['best_alpha'] for res in outer_results])
    
    # Train the final model on the entire dataset with the best alpha
    final_model = make_pipeline(StandardScaler(with_mean=False),
                                RidgeCV(alphas=[best_alpha], cv=inner_cv, scoring='r2'))
    final_model.fit(X, y)
    
    return {
#         'model': final_model,
#         'best_alpha': best_alpha,
        'outer_results': outer_results
    }

In [263]:
from itertools import product


ohe = ["1mer"]
# all flexixbility features
# features = ['DNaseI', 'NPP', 'twistDisp', 'stiffness', 'trxDi']
# feature_sets = [('1mer', )] + [x for x in product(ohe, features)]
features = [('1mer',), 
            ('1mer', '2mer'),
            ('1mer', '2mer', '3mer'), 
            ('1mer', "DNaseI", 'twistDisp', 'NPP', 'stiffness', 'trxDi')]


results_df_f_combined = []

#### Train model with varying length of gcPBM dataset 

In [251]:
# import os
# import pandas as pd
# import numpy as np

# def encode_gcPBM_data_with_varying_len(fileprefix,
#                                        data_dir="/SynologyNAS/001_Thesis_work/Prj_TF_FAMILY_SPECIFIC/RESULTS", 
#                                        features=("1mer",), w=0, sample_size=None):
#     try:
#         # Load 1mer data
#         ohe_file = os.path.join(data_dir, "gcPBM", "1_mer", f"{fileprefix}_1mer.tsv")
#         ohe_df = pd.read_csv(ohe_file, header=None, sep="\t")
        
#         if sample_size:
#             ohe_df = ohe_df.sample(n=sample_size, random_state=42)
#         ohe_data = {row[0]: {'aff': float(row[0].split('__')[1]), 
#                              'feat': row.iloc[1:].tolist()} for _, row in ohe_df.iterrows()}
#         print(f"1mer data loaded from {ohe_file}... \n")
    
#     except FileNotFoundError as fe:
#         raise FileNotFoundError(f"File {ohe_file} not found: {fe}")

#     combined_data = ohe_data.copy()

#     # Load additional features if specified
#     if len(features) > 1:
#         for feature in features[1:]:    
#             try:
#                 if feature == '2mer':
#                     print(f"Will load the additional feature {feature}")
#                     add_df = os.path.join(data_dir, 'gcPBM', '2mer', f'{fileprefix}_2mer.tsv')
                
#                 elif feature == '3mer':
#                     print(f"Will load the additional feature {feature}")
#                     add_df = os.path.join(data_dir, 'gcPBM', '3mer', f'{fileprefix}_3mer.tsv')
                
#                 else:
#                     print(f"Will load the additional feature {feature}")
#                     add_df = os.path.join(data_dir, "gcPBM",  "DNAFlex" ,f"{fileprefix}_{feature}_{w}.tsv")
                
                
#                 flex_df = pd.read_csv(add_df, sep="\t", header=None)

#                 if sample_size:
#                     flex_df = flex_df[flex_df[0].isin(ohe_df[0])]

#                 for ids in ohe_data.keys():
#                     flex_df_row = flex_df[flex_df[0] == ids]
#                     if not flex_df_row.empty:
#                         flex_df_feat = np.array(flex_df_row.iloc[0, 1:].tolist()) # first column is id

#                         # Scale the flex features
#                         min_value = np.min(flex_df_feat)
#                         std_value = np.std(flex_df_feat)
#                         if std_value != 0:
#                             scaled_flex_feat = (flex_df_feat - min_value) / std_value
#                         else:
#                             scaled_flex_feat = flex_df_feat  # If std is 0, scaling is not possible

#                         # Calculate interaction terms for this feature using NumPy
#                         interaction_terms = scaled_flex_feat[:-1] * scaled_flex_feat[1:]

#                         combined_data[ids]['feat'].extend(scaled_flex_feat.tolist() + interaction_terms.tolist())
#                 print(f"{feature} data loaded from {flex_feat_file}... \n")
#             except FileNotFoundError as fe:
#                 raise FileNotFoundError(f"File {flex_feat_file} not found: {fe}")
#             except Exception as e:
#                 raise Exception(f"Error in extending feature {feature} from {flex_feat_file}: {e}")

#     # Define X, y
#     X = np.array([combined_data[x]['feat'] for x in combined_data.keys()])
#     y = np.array([combined_data[x]['aff'] for x in combined_data.keys()])
    
#     # Scale y to the range [0, 1] using NumPy
#     y_min = np.min(y)
#     y_max = np.max(y)
#     y = (y - y_min) / (y_max - y_min)
    
#     return X, y


In [264]:

# List to store results
gcPBM_varying_sample_size = []

# Define the varying sample sizes you want to use
sample_sizes = [8568, 4284, 2142, 1071, 536]  # 8568 means use the full dataset

for record in ['Max']:
    print(f"Running for file: {record} \n")
    
    for sample_size in sample_sizes:
        print(f"Running with sample size: {sample_size}\n")
        
        for fset in features:
            print(f"Current feature is: {fset}...\n")
            
            # Load data with specified sample size
            X, y = encode_gcPBM_data_with_varying_len(fileprefix=record, 
                                                      features=fset, 
                                                      sample_size=sample_size)
            
            # Model
            res = nested_ridge_cv(X=X, y=y, 
                                  fileid=record,
                                  pos="all", 
                                  features=fset, 
                                  outer_cv_folds=10, 
                                  inner_cv_folds=5)
            
            print(res)
            
            # Save the trained model
            f = '1mer_Flex' if "DNaseI" in fset else '1mer'
            
            gcPBM_varying_sample_size.append((res, sample_size))

Running for file: Max 

Running with sample size: 8568

Current feature is: ('1mer',)...

1mer data loaded from /SynologyNAS/001_Thesis_work/Prj_TF_FAMILY_SPECIFIC/RESULTS/gcPBM/1_mer/Max_update_1mer.tsv... 

{'outer_results': [{'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.7847093770263638, 'mse_test': 0.010003277522759549, 'mae_test': 0.0780553972795913, 'best_alpha': 24.9879939969985}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.7974549045869559, 'mse_test': 0.009458482337111159, 'mae_test': 0.07616002245244172, 'best_alpha': 15.492746373186593}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.7717934932717591, 'mse_test': 0.010568663548047232, 'mae_test': 0.08075788638418993, 'best_alpha': 21.48974487243622}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.7813269318675663, 'mse_test': 0.009791156533035207, 'mae_test': 0.0768372464285864, 'best_alpha': 20.98999499749875}, {'fileid': 'Max', 'pos': '

{'outer_results': [{'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.8074538254597505, 'mse_test': 0.010044579575540587, 'mae_test': 0.07852623265230291, 'best_alpha': 34.982991495747875}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.803753675067374, 'mse_test': 0.009110624166623494, 'mae_test': 0.07551915880296824, 'best_alpha': 14.493246623311656}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.7934744782258762, 'mse_test': 0.009528167383050078, 'mae_test': 0.07770721315793794, 'best_alpha': 7.496748374187094}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.7550012643078968, 'mse_test': 0.010594158572367656, 'mae_test': 0.08134283969622991, 'best_alpha': 21.989494747373687}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.7835943300651922, 'mse_test': 0.009637726080242989, 'mae_test': 0.07616963216473534, 'best_alpha': 14.493246623311656}, {'fileid': 'Max', 'pos': 'all', 'features':

{'outer_results': [{'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.7385960895471142, 'mse_test': 0.011452102918522552, 'mae_test': 0.08271961459456581, 'best_alpha': 20.49024512256128}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.8053475382560645, 'mse_test': 0.00902232868113344, 'mae_test': 0.07602426764738367, 'best_alpha': 16.991995997999}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.7937581627995675, 'mse_test': 0.009593838507420438, 'mae_test': 0.07568783550255176, 'best_alpha': 23.488744372186094}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.7720875355953575, 'mse_test': 0.00994771959876684, 'mae_test': 0.07979066412936084, 'best_alpha': 21.989494747373687}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.7812840823968117, 'mse_test': 0.009847784774968628, 'mae_test': 0.079263761296578, 'best_alpha': 20.98999499749875}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer

{'outer_results': [{'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.826692259463563, 'mse_test': 0.009205990145932211, 'mae_test': 0.07605491432920217, 'best_alpha': 24.48824412206103}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.8028647849010137, 'mse_test': 0.007990203872657088, 'mae_test': 0.06973090497459683, 'best_alpha': 19.990495247623812}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.7912377328159735, 'mse_test': 0.009640531384808181, 'mae_test': 0.07349235589450052, 'best_alpha': 24.9879939969985}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.8252901297940083, 'mse_test': 0.009218169015667356, 'mae_test': 0.07673652481470981, 'best_alpha': 25.987493746873437}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.7283680112246813, 'mse_test': 0.011526092960394719, 'mae_test': 0.08065150663056579, 'best_alpha': 21.48974487243622}, {'fileid': 'Max', 'pos': 'all', 'features': ('

{'outer_results': [{'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.7922731080915935, 'mse_test': 0.010603465618419794, 'mae_test': 0.07660009865901225, 'best_alpha': 23.488744372186094}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.6200322673760792, 'mse_test': 0.014203234987633364, 'mae_test': 0.09204046348264923, 'best_alpha': 11.494747373686844}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.841659689397819, 'mse_test': 0.008669196603496824, 'mae_test': 0.07136196035186365, 'best_alpha': 23.488744372186094}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.7369225070869955, 'mse_test': 0.01200439554322573, 'mae_test': 0.08071930350885055, 'best_alpha': 22.489244622311155}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.7293054600096954, 'mse_test': 0.013022767569374628, 'mae_test': 0.08187722718462946, 'best_alpha': 14.493246623311656}, {'fileid': 'Max', 'pos': 'all', 'features':

In [266]:
len(gcPBM_varying_sample_size)

20

In [267]:
t = []
for res, sam in gcPBM_varying_sample_size:
    for d in res['outer_results']:
        d['sample_size'] = sam
        t.append(d)

In [269]:
pd.DataFrame(t).to_csv("/SynologyNAS/001_Thesis_work/Prj_TF_FAMILY_SPECIFIC/RESULTS/gcPBM/Max_complexity_varying_sampleSize.csv",
                      index=False)

In [175]:
model_dir = f"/SynologyNAS/001_Thesis_work/Prj_TF_FAMILY_SPECIFIC/RESULTS/gcPBM"

In [176]:
for record in ['Max']:
    print(f"Running for file: {record} \n")
    
    
    for fset in features:
        print(f"current feature is: {fset}...\n")        
        # load data
        X, y = encode_data(fileprefix=record, features=fset)
        
        # model
        res = nested_ridge_cv(X = X, y = y, 
                              fileid = record, 
                              pos = "all", 
                              features = fset, 
                              outer_cv_folds=10, 
                              inner_cv_folds=5)
        
        print(res)
        
        # Save the trained model
        f = '1mer_Flex' if "DNaseI" in fset else '1mer'
        
        model_filename = os.path.join(model_dir, f"{record}_{f}_model.pkl")
        joblib.dump(res['model'], model_filename)
        print(f"Model saved to {model_filename}")
        
        results_df_f_combined.append(res)

Running for file: Max 

current feature is: ('1mer',)...

1mer data loaded from /SynologyNAS/001_Thesis_work/Prj_TF_FAMILY_SPECIFIC/RESULTS/MAX_gcPBM_dir/Max_OHE_1mer.tsv... 

{'model': Pipeline(steps=[('standardscaler', StandardScaler(with_mean=False)),
                ('ridgecv',
                 RidgeCV(alphas=[8.946023011505753],
                         cv=KFold(n_splits=5, random_state=42, shuffle=True),
                         scoring='r2'))]), 'best_alpha': 8.946023011505753, 'outer_results': [{'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.8384589390895953, 'mse_test': 0.005793603726410747, 'mae_test': 0.060888437630910563, 'best_alpha': 16.991995997999}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.8007202962864497, 'mse_test': 0.009900192939892293, 'mae_test': 0.07551453801992816, 'best_alpha': 9.995497748874438}, {'fileid': 'Max', 'pos': 'all', 'features': ('1mer',), 'r2_test': 0.8189166181511722, 'mse_test': 0.007444890829781595,

#### Prediction on the 'Max' SELEX-Seq data by the model build on gcPBM data

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def encode_selex_data(fileprefix, 
                      data_dir="/SynologyNAS/001_Thesis_work/Prj_TF_FAMILY_SPECIFIC/RESULTS", 
                      features=("1mer",), 
                      w=0):
    try:
        # Load 1mer data
        ohe_file = os.path.join(data_dir, "gcPBM", '1_mer', f"{fileprefix}_1mer.tsv")
        ohe_df = pd.read_csv(ohe_file, header=None, sep="\t")
        
        ohe_data = {row[0]: {'aff': float(row[0].split('__')[1]), 
                             'feat': row.iloc[1:].tolist()} for _, row in ohe_df.iterrows()}
        
        print(f"1mer data loaded from {ohe_file}... \n")
    
    except FileNotFoundError as fe:
        raise FileNotFoundError(f"File {ohe_file} not found: {fe}")

    combined_data = ohe_data.copy()

    # Load additional features if specified
    if len(features) > 1:
        for feature in features[1:]:
            print(f"Will load the additional feature {feature}")
            try:
                flex_feat_file = os.path.join(data_dir, "gcPBM",'DNAFlex', f"{fileprefix}_w{w}nt_{feature}.tsv")
                flex_df = pd.read_csv(flex_feat_file, sep="\t", header=None)

                for ids in ohe_data.keys():
                    flex_df_row = flex_df[flex_df[0] == ids]
                    if not flex_df_row.empty:
                        flex_df_feat = np.array(flex_df_row.iloc[0, 1:].tolist()) # first column is id

                        # Scale the flex features
                        min_value = np.min(flex_df_feat)
                        std_value = np.std(flex_df_feat)
                        
                        if std_value != 0:
                            scaled_flex_feat = (flex_df_feat - min_value) / std_value
                        else:
                            scaled_flex_feat = flex_df_feat  # If std is 0, scaling is not possible

                        # Calculate interaction terms for this feature using NumPy
                        interaction_terms = scaled_flex_feat[:-1] * scaled_flex_feat[1:]
                        combined_data[ids]['feat'].extend(scaled_flex_feat.tolist() + interaction_terms.tolist())
                
                print(f"{feature} data loaded from {flex_feat_file}... \n")
                
            except FileNotFoundError as fe:
                raise FileNotFoundError(f"File {flex_feat_file} not found: {fe}")
            except Exception as e:
                raise Exception(f"Error in extending feature {feature} from {flex_feat_file}: {e}")
    
    # Define X, y
    X = np.array([combined_data[x]['feat'] for x in combined_data.keys()])
    y = np.array([combined_data[x]['aff'] for x in combined_data.keys()])

    
    return X, y
    

#### Just to check perform the regression by training and testing on MAX SelexSeq dataset 

In [None]:

for fset in features:
    print(f"Running prediction for file: {record} with feature set: {fset}\n")

#     load new data from MAX selex sequencesbH
    X_new, y_new = encode_selex_data(fileprefix= "Max_SELEXSeq_normalized_10mers", features=fset)
    
    f = '1mer_Flex' if "DNaseI" in fset else '1mer'    
    res = nested_ridge_cv(X = X_new, y = y_new, 
                              fileid = record, 
                              pos = "all", 
                              features = fset, 
                              outer_cv_folds=10, 
                              inner_cv_folds=5)
    
    
#     r2 = r2_score(y_new, predictions)
    print(res)

#### Finally, load the gcPBM models and check the spearmann corr on the predicted ranks using MAX SELEX-Seq dataset

In [182]:

for fset in features:
    print(f"Running prediction for file: {record} with feature set: {fset}\n")

    X_new, y_new = encode_selex_data(fileprefix= "Max_SELEXSeq_normalized_10mers", 
                                     features=fset)
    
    f = '1mer_Flex' if "DNaseI" in fset else '1mer'

#     load models from Max gcPBM trained
    model_filename = os.path.join(model_dir, f"Max_{f}_model.pkl")
    loaded_model = joblib.load(model_filename)
    
#     loaded_model.predict()
    predictions = loaded_model.predict(X_new)
    spearman_corr, _ = spearmanr(predictions, y_new)
    
    spearman_results.append((fset, spearman_corr, r2))
    print(f"Spearman rank correlation for {record} with feature {fset}: {spearman_corr}")

    # Calculate ranks
    actual_ranks = rankdata(y_new, method='average')
    predicted_ranks = rankdata(predictions, method='average')
    
    
    results_df = pd.DataFrame({
            'Actual': y_new,
            'Predicted': predictions,
            'Actual_Rank': actual_ranks,
            'Predicted_Rank': predicted_ranks
        })
    
    results_filename = os.path.join(model_dir, f"{record}_{f}_predictions.csv")
    results_df.to_csv(results_filename, index=False)
    print(f"Predicted and actual values saved to {results_filename}")

    print(res)

Running prediction for file: Max with feature set: ('1mer',)

1mer data loaded from /SynologyNAS/001_Thesis_work/Prj_TF_FAMILY_SPECIFIC/RESULTS/gcPBM/1_mer/Max_SELEXSeq_normalized_10mers_1mer.tsv... 

Spearman rank correlation for Max with feature ('1mer',): 0.5551669263667407
Predicted and actual values saved to /SynologyNAS/001_Thesis_work/Prj_TF_FAMILY_SPECIFIC/RESULTS/gcPBM/Max_1mer_predictions.csv
{'model': Pipeline(steps=[('standardscaler', StandardScaler(with_mean=False)),
                ('ridgecv',
                 RidgeCV(alphas=[3.3488244122061035],
                         cv=KFold(n_splits=5, random_state=42, shuffle=True),
                         scoring='r2'))]), 'best_alpha': 3.3488244122061035, 'outer_results': [{'fileid': 'Max', 'pos': 'all', 'features': ('1mer', 'DNaseI', 'twistDisp', 'NPP', 'stiffness', 'trxDi'), 'r2_test': 0.9005798873503494, 'mse_test': 0.00356566146019468, 'mae_test': 0.038527166430327844, 'best_alpha': 1.9994997498749374}, {'fileid': 'Max', 'po