In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# Features computed per row (after removing NaNs): count of non-missing affinity values, mean, standard deviation, median, mode, the 5 largest values, and the 5 smallest values, for a total of 15 dimensions.
# The features on the KIBA dataset will have 3 fewer dimensions than those on the Davis dataset.



In [7]:
import pandas as pd
import numpy as np
from scipy.stats import mode

# Location of the tab-separated affinity matrix (replace with the actual file path)
file_path = 'kiba_binding_affinity_matric.txt'

# Load the table: the header row supplies the column labels and the
# first column becomes the row index (read_table splits on tabs by default)
df = pd.read_table(file_path, index_col=0)

# Define a function to calculate statistical features
def calculate_stats(row):
    """Summarise one row of the affinity matrix as a 15-element feature list.

    Parameters
    ----------
    row : pandas.Series
        Values of a single row; NaN entries are treated as missing and
        are excluded from every statistic.

    Returns
    -------
    list
        ``[count, mean, std, median, mode]`` followed by the five largest
        values (largest first) and the five smallest values (smallest
        first). Any slot that cannot be filled, because the row has no or
        too few observed values, holds NaN.
    """
    non_nan_values = row.dropna()  # Only observed values take part in the statistics
    count = len(non_nan_values)

    if count > 0:
        mean = non_nan_values.mean()
        std = non_nan_values.std()  # pandas default ddof=1, so a single value yields NaN
        median = non_nan_values.median()
        # Depending on the SciPy version, `.mode` is a scalar or an array;
        # atleast_1d lets both shapes be read the same way.
        mode_array = np.atleast_1d(mode(non_nan_values.to_numpy()).mode)
        mode_value = mode_array[0] if mode_array.size > 0 else np.nan
    else:
        mean = std = median = mode_value = np.nan

    # nlargest/nsmallest return at most 5 values, already ordered, so rows with
    # fewer than 5 observations are ranked the same way as longer rows
    # (previously they were emitted in their original column order).
    top_5 = non_nan_values.nlargest(5).tolist()
    bottom_5 = non_nan_values.nsmallest(5).tolist()

    # Pad with NaN so the feature vector always has a fixed length
    top_5 += [np.nan] * (5 - len(top_5))
    bottom_5 += [np.nan] * (5 - len(bottom_5))

    return [count, mean, std, median, mode_value] + top_5 + bottom_5

# Run the per-row summary over the whole matrix; each returned list is
# expanded into separate columns
results = df.apply(calculate_stats, axis=1, result_type='expand')

# Label the columns: five summary statistics, then the ranked extremes
columns = ['Count', 'Mean', 'Std', 'Median', 'Mode']
columns.extend(f'Top_{i+1}' for i in range(5))
columns.extend(f'Bottom_{i+1}' for i in range(5))
results.columns = columns

# Place the row labels of the input matrix first, as the Drug ID column
results.insert(0, 'Drug_ID', df.index)

# Write the table to CSV without the index column
output_file_path = 'KIBA_Drug_feature.csv'
results.to_csv(output_file_path, index=False)

# Echo the destination path as the cell output
output_file_path


'KIBA_Drug_feature.csv'

In [8]:
import pandas as pd
import numpy as np
from scipy.stats import mode

# Location of the tab-separated affinity matrix (replace with the actual file path)
file_path = 'Tran_kiba_binding_affinity.txt'

# Load the table: the header row supplies the column labels and the
# first column becomes the row index (read_table splits on tabs by default)
df = pd.read_table(file_path, index_col=0)

# Define a function to calculate statistical features
def calculate_stats(row):
    """Summarise one row of the affinity matrix as a 15-element feature list.

    Parameters
    ----------
    row : pandas.Series
        Values of a single row; NaN entries are treated as missing and
        are excluded from every statistic.

    Returns
    -------
    list
        ``[count, mean, std, median, mode]`` followed by the five largest
        values (largest first) and the five smallest values (smallest
        first). Any slot that cannot be filled, because the row has no or
        too few observed values, holds NaN.
    """
    non_nan_values = row.dropna()  # Only observed values take part in the statistics
    count = len(non_nan_values)

    if count > 0:
        mean = non_nan_values.mean()
        std = non_nan_values.std()  # pandas default ddof=1, so a single value yields NaN
        median = non_nan_values.median()
        # Depending on the SciPy version, `.mode` is a scalar or an array;
        # atleast_1d lets both shapes be read the same way.
        mode_array = np.atleast_1d(mode(non_nan_values.to_numpy()).mode)
        mode_value = mode_array[0] if mode_array.size > 0 else np.nan
    else:
        mean = std = median = mode_value = np.nan

    # nlargest/nsmallest return at most 5 values, already ordered, so rows with
    # fewer than 5 observations are ranked the same way as longer rows
    # (previously they were emitted in their original column order).
    top_5 = non_nan_values.nlargest(5).tolist()
    bottom_5 = non_nan_values.nsmallest(5).tolist()

    # Pad with NaN so the feature vector always has a fixed length
    top_5 += [np.nan] * (5 - len(top_5))
    bottom_5 += [np.nan] * (5 - len(bottom_5))

    return [count, mean, std, median, mode_value] + top_5 + bottom_5

# Run the per-row summary over the whole matrix; each returned list is
# expanded into separate columns
results = df.apply(calculate_stats, axis=1, result_type='expand')

# Label the columns: five summary statistics, then the ranked extremes
columns = ['Count', 'Mean', 'Std', 'Median', 'Mode']
columns.extend(f'Top_{i+1}' for i in range(5))
columns.extend(f'Bottom_{i+1}' for i in range(5))
results.columns = columns

# Place the row labels of the input matrix first, as the Protein ID column
results.insert(0, 'Protein_ID', df.index)

# Write the table to CSV without the index column
output_file_path = 'KIBA_Protein_feature.csv'
results.to_csv(output_file_path, index=False)

# Echo the destination path as the cell output
output_file_path


'KIBA_Protein_feature.csv'