In [1]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from itertools import combinations_with_replacement, combinations

# Seeded NumPy generator (seed 50); not referenced by the other cells shown here.
rng = np.random.default_rng(50)

# Raw table: the response column 'RIP' plus the candidate predictor columns.
data_df = pd.read_csv('R_IP_vs_Parameters.csv')

# Every unordered pair of predictor columns, a column paired with itself
# included, so both cross terms and squared terms are enumerated.
new_com_iter = combinations_with_replacement(data_df.columns.drop('RIP'), 2)
new_com = [*new_com_iter]

In [2]:
def add_cross(data:pd.DataFrame, col1:str, col2:str):
    """Return a copy of ``data`` extended with the product of two columns.

    The extra column is named ``"<col1>*<col2>"`` and holds the element-wise
    product of ``data[col1]`` and ``data[col2]``.  The frame passed in is not
    modified.  Passing the same name twice yields that column squared.
    """
    augmented = data.copy()
    product_label = "*".join((col1, col2))
    augmented[product_label] = data[col1].mul(data[col2])
    return augmented

In [3]:
def loo_cv(data:pd.DataFrame, feature_idx:tuple, feature_num:int, x_data:np.ndarray, y_data:np.ndarray):
    """Score one feature subset with leave-one-out cross-validated linear regression.

    Each sample is held out once; a ``LinearRegression`` model is fitted on the
    remaining samples and used to predict the held-out one.  The pooled
    held-out predictions are then compared with the true values.

    Parameters
    ----------
    data : pd.DataFrame
        Frame the subset was drawn from.  Used only to translate
        ``feature_idx`` into column names.
    feature_idx : sequence of int
        Positions of the selected predictors among the predictor columns of
        ``data`` (the columns left after removing 'RIP').
    feature_num : int
        Number of selected features; stored verbatim in the result.
    x_data : np.ndarray
        2-D design matrix holding only the selected predictor columns.
    y_data : np.ndarray
        Response values, one per row of ``x_data``.

    Returns
    -------
    dict
        ``'features'`` (list of column names), ``'num_features'``,
        ``'Q2'`` (``r2_score`` of the pooled held-out predictions),
        ``'RMSE'`` and ``'MAE'``.
    """
    loo = LeaveOneOut()

    y_true, y_pred = [], []
    for train_idx, test_idx in loo.split(x_data):
        X_train, X_test = x_data[train_idx], x_data[test_idx]
        y_train, y_test = y_data[train_idx], y_data[test_idx]

        model = LinearRegression()
        model.fit(X_train, y_train)
        pred = model.predict(X_test)

        # Each leave-one-out test fold holds exactly one sample.
        y_true.append(y_test[0])
        y_pred.append(pred[0])

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    q2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)

    # The feature indices refer to the predictor columns, i.e. the frame with
    # the 'RIP' response removed.  Resolve names against exactly that column
    # set so the mapping stays correct wherever 'RIP' sits in the frame.
    if 'RIP' in data.columns:
        names = data.columns.drop('RIP').to_list()
    else:
        # No 'RIP' column: keep the historical "first column is the response"
        # convention.
        names = data.columns[1:].to_list()
    feature_idxs = list(feature_idx)
    feature_names = [names[n] for n in feature_idxs]

    result = {
        'features': feature_names,
        'num_features': feature_num,
        'Q2': q2,
        'RMSE': rmse,
        'MAE': mae
    }

    return result

In [4]:
def estimate_mlr(data:pd.DataFrame):
    """Exhaustively evaluate every non-empty predictor subset with ``loo_cv``.

    All columns of ``data`` except 'RIP' are treated as predictors and 'RIP'
    as the response.  For each subset size ``k`` from 1 to the number of
    predictors, every combination of ``k`` predictor columns is scored.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a 'RIP' column; all other columns are predictors.

    Returns
    -------
    pd.DataFrame
        One row per evaluated subset, built from the dicts returned by
        ``loo_cv``.  Empty when ``data`` has no predictor columns.
    """
    X = np.array(data.drop(['RIP'], axis=1))
    Y = np.array(data['RIP'])
    n_features = X.shape[1]

    # The number of subsets grows as 2**n_features - 1, so collect the plain
    # dict records first and build the DataFrame once at the end rather than
    # rebuilding it after every subset.
    loo_results = [
        loo_cv(data, feature_idx, k, X[:, list(feature_idx)], Y)
        for k in range(1, n_features + 1)
        for feature_idx in combinations(range(n_features), k)
    ]
    return pd.DataFrame(loo_results)

In [None]:
output_file = "results_all.csv"
header_mode = 'w'  # the first write uses write mode; switched to append below

for i, com in enumerate(new_com):
    # Extend the base table with the product column for this pair.
    df_1cross = add_cross(data_df, com[0], com[1])

    # These arrays are not used again in this cell; estimate_mlr derives its
    # own from the frame it is given.
    X = np.array(df_1cross.drop(columns=['RIP']))
    Y = np.array(df_1cross['RIP'])
    n_features = X.shape[1]

    # Score every predictor subset of the augmented table.
    df_results = estimate_mlr(df_1cross)

    # Tag each row with the pair that produced the extra column.
    # NOTE(review): 'Corss 1' looks like a typo for 'Cross 1'; kept unchanged
    # because it is the header written to the CSV.
    df_results.insert(0, 'Corss 1', com[0])
    df_results.insert(1, 'Cross 2', com[1])

    # Only the first chunk carries the header row; later chunks are appended.
    df_results.to_csv(output_file, mode=header_mode, header=not i, index=False)
    header_mode = 'a'

    print(f"Processed pair ({com[0]}, {com[1]}), {i+1}/{len(new_com)}")

In [None]:
# Reload the accumulated results from disk and show the ten rows with the
# highest Q2.
output_results = pd.read_csv('results_all.csv')
output_results.sort_values('Q2', ascending=False).iloc[:10]

Unnamed: 0,Corss 1,Cross 2,features,num_features,Q2,RMSE,MAE
11734,ESOMO,BV(r=3),"['ESOMO', 'BV(r=3)', 'C-H BDE', 'ΔGCu-III', 'R...",8,0.932715,2.462391,1.906133
109805,BV(r=3),Red.,"['ESOMO', 'ω', 'BV(r=3)', 'C-H BDE', 'ΔGCu-III...",8,0.932478,2.466729,1.855935
11954,ESOMO,BV(r=3),"['BV(r=3)', 'C-H BDE', 'ΔGCu-III', 'Charge', '...",8,0.932334,2.469357,1.881119
12019,ESOMO,BV(r=3),"['ESOMO', 'ω', 'BV(r=3)', 'C-H BDE', 'ΔGCu-III...",9,0.932305,2.469883,1.919938
12119,ESOMO,BV(r=3),"['ESOMO', 'BV(r=3)', 'C-H BDE', 'ΔGCu-III', 'C...",9,0.93172,2.480526,1.889434
147153,C-H BDE,Oxi.,"['ESOMO', 'ω', 'BV(r=3)', 'C-H BDE', 'ΔGCu-III...",9,0.931686,2.481155,1.865728
110295,BV(r=3),Red.,"['ESOMO', 'ω', 'BV(r=3)', 'C-H BDE', 'ΔGCu-III...",9,0.931545,2.483701,1.870747
110014,BV(r=3),Red.,"['ESOMO', 'BV(r=3)', 'C-H BDE', 'ΔGCu-III', 'R...",8,0.931447,2.485483,1.879782
110299,BV(r=3),Red.,"['ESOMO', 'ω', 'BV(r=3)', 'C-H BDE', 'ΔGCu-III...",9,0.931234,2.489349,1.867687
109265,BV(r=3),Red.,"['ESOMO', 'BV(r=3)', 'C-H BDE', 'ΔGCu-III', 'V...",7,0.931215,2.48969,1.885125
