In [1]:
import pandas as pd
import os

# Function to find the row with the best combination of RMSE and R^2 values for each model
def find_best_row(file_paths, number_of_features,
                  model_sheets=("MLR", "DT", "RF", "KNN", "SVM")):
    """Pick, per model sheet, the file with the best test metrics.

    Every workbook in ``file_paths`` is opened and each sheet whose name is in
    ``model_sheets`` is read. Rows are kept only when their
    'Number of features' value is strictly below ``number_of_features``; among
    those rows the lowest 'Test RMSE' and the highest 'Test data R^2 score'
    are located.

    Args:
        file_paths: Iterable of paths to Excel workbooks.
        number_of_features: Exclusive upper bound applied to the
            'Number of features' column.
        model_sheets: Sheet names to evaluate; other sheets are ignored.

    Returns:
        dict mapping sheet name to a dict with keys:
            'File'           -- base name of the winning workbook,
            'Best Row Index' -- tuple ``(min-RMSE row label, max-R^2 row label)``,
            'Best RMSE'      -- the minimum 'Test RMSE',
            'Best R^2'       -- the maximum 'Test data R^2 score'.
        A sheet is absent from the result when no file has a row that passes
        the filter with usable metric values.

    Notes:
        * The two labels in 'Best Row Index' are found independently, so they
          may point at different rows.
        * An existing entry is replaced only when the candidate improves on
          BOTH metrics; when files trade off RMSE against R^2 the outcome can
          therefore depend on the order of ``file_paths``.
    """
    best_rows = {}

    for file_path in file_paths:
        file_name = os.path.basename(file_path)

        # The context manager releases the workbook handle even if parsing fails.
        with pd.ExcelFile(file_path) as excel_data:
            for sheet_name in excel_data.sheet_names:
                if sheet_name not in model_sheets:
                    continue

                df = pd.read_excel(excel_data, sheet_name=sheet_name)

                # Filter rows based on the condition:
                # "Number of features" < number_of_features
                df_filtered = df[df['Number of features'] < number_of_features]
                rmse = df_filtered['Test RMSE']
                r2 = df_filtered['Test data R^2 score']

                # idxmin/idxmax cannot pick a row from an empty or all-NaN
                # column, so a sheet with no usable candidate is skipped
                # instead of aborting the whole comparison.
                if not (rmse.notna().any() and r2.notna().any()):
                    continue

                best_row_index = (rmse.idxmin(), r2.idxmax())
                candidate = {
                    'File': file_name,
                    'Best Row Index': best_row_index,
                    'Best RMSE': rmse.loc[best_row_index[0]],
                    'Best R^2': r2.loc[best_row_index[1]],
                }

                current = best_rows.get(sheet_name)
                if current is None or (
                    candidate['Best RMSE'] < current['Best RMSE']
                    and candidate['Best R^2'] > current['Best R^2']
                ):
                    best_rows[sheet_name] = candidate

    return best_rows


In [None]:
# A549 best

In [7]:
# A549 result workbooks to compare.
file_paths = ['../Data/Quality_A549_15_.xlsx',
              '../Data/Quality_A549_28_.xlsx',
              '../Data/Quality_A549_42_.xlsx']

# find_best_row reports only the base file name, so keep a lookup back to the
# path that was actually read instead of re-assembling it from a hard-coded
# directory.
path_by_name = {os.path.basename(path): path for path in file_paths}

# Only rows with fewer than 6 features are considered.
best_rows = find_best_row(file_paths, 6)

# Displaying the best rows for each model across files
for model, row_info in best_rows.items():
    file_name = row_info['File']
    # Tuple of (min-RMSE row label, max-R^2 row label); the min-RMSE row is shown.
    row_index = row_info['Best Row Index']
    print(f"Model: {model}")
    print(f"File: {file_name}")
    print("Best Row:")
    # The stored value is an index label (from idxmin), hence .loc rather than .iloc.
    print(pd.read_excel(path_by_name[file_name], sheet_name=model).loc[row_index[0]])
    print()

Model: MLR
File: Quality_A549_15_.xlsx
Best Row:
Unnamed: 0                 20.000000
Correlation threshold       0.530000
Training data R^2 score     0.428089
Test data R^2 score         0.626633
Training RMSE               0.705997
Test RMSE                   0.633406
Number of features          3.000000
Name: 20, dtype: float64

Model: DT
File: Quality_A549_15_.xlsx
Best Row:
Unnamed: 0                 478.000000
Correlation threshold        0.500000
Training data R^2 score      0.846633
Test data R^2 score          0.715311
Training RMSE                0.365598
Test RMSE                    0.553094
Number of features           5.000000
Depth number                 4.000000
Name: 478, dtype: float64

Model: RF
File: Quality_A549_15_.xlsx
Best Row:
Unnamed: 0                 338.000000
Correlation threshold        0.500000
Training data R^2 score      0.922737
Test data R^2 score          0.743151
Training RMSE                0.259492
Test RMSE                    0.525356
Number of f

In [None]:
# BALB_3T3 best

In [12]:
# BALB_3T3 result workbooks to compare.
file_paths = ['../Data/Quality_BALB_3T3_15_.xlsx',
              '../Data/Quality_BALB_3T3_28_.xlsx',
              '../Data/Quality_BALB_3T3_42_.xlsx']

# find_best_row reports only the base file name, so keep a lookup back to the
# path that was actually read instead of re-assembling it from a hard-coded
# directory.
path_by_name = {os.path.basename(path): path for path in file_paths}

# Only rows with fewer than 7 features are considered.
best_rows = find_best_row(file_paths, 7)

# Displaying the best rows for each model across files
for model, row_info in best_rows.items():
    file_name = row_info['File']
    # Tuple of (min-RMSE row label, max-R^2 row label); the min-RMSE row is shown.
    row_index = row_info['Best Row Index']
    print(f"Model: {model}")
    print(f"File: {file_name}")
    print("Best Row:")
    # The stored value is an index label (from idxmin), hence .loc rather than .iloc.
    print(pd.read_excel(path_by_name[file_name], sheet_name=model).loc[row_index[0]])
    print()

Model: MLR
File: Quality_BALB_3T3_15_.xlsx
Best Row:
Unnamed: 0                 18.000000
Correlation threshold       0.510000
Training data R^2 score     0.506715
Test data R^2 score         0.653417
Training RMSE               0.609479
Test RMSE                   0.541461
Number of features          6.000000
Name: 18, dtype: float64

Model: DT
File: Quality_BALB_3T3_42_.xlsx
Best Row:
Unnamed: 0                 505.000000
Correlation threshold        0.510000
Training data R^2 score      0.735175
Test data R^2 score          0.686190
Training RMSE                0.453392
Test RMSE                    0.446003
Number of features           6.000000
Depth number                 3.000000
Name: 505, dtype: float64

Model: RF
File: Quality_BALB_3T3_15_.xlsx
Best Row:
Unnamed: 0                 359.000000
Correlation threshold        0.510000
Training data R^2 score      0.932487
Test data R^2 score          0.747013
Training RMSE                0.225477
Test RMSE                    0.462607

In [None]:
# LoVo best

In [13]:
# LoVo result workbooks to compare.
file_paths = ['../Data/Quality_LoVo_15_.xlsx',
              '../Data/Quality_LoVo_28_.xlsx',
              '../Data/Quality_LoVo_42_.xlsx']

# find_best_row reports only the base file name, so keep a lookup back to the
# path that was actually read instead of re-assembling it from a hard-coded
# directory.
path_by_name = {os.path.basename(path): path for path in file_paths}

# Only rows with fewer than 3 features are considered.
best_rows = find_best_row(file_paths, 3)

# Displaying the best rows for each model across files
for model, row_info in best_rows.items():
    file_name = row_info['File']
    # Tuple of (min-RMSE row label, max-R^2 row label); the min-RMSE row is shown.
    row_index = row_info['Best Row Index']
    print(f"Model: {model}")
    print(f"File: {file_name}")
    print("Best Row:")
    # The stored value is an index label (from idxmin), hence .loc rather than .iloc.
    print(pd.read_excel(path_by_name[file_name], sheet_name=model).loc[row_index[0]])
    print()

Model: MLR
File: Quality_LoVo_15_.xlsx
Best Row:
Unnamed: 0                 20.000000
Correlation threshold       0.530000
Training data R^2 score     0.439344
Test data R^2 score         0.526231
Training RMSE               0.648938
Test RMSE                   0.753451
Number of features          2.000000
Name: 20, dtype: float64

Model: DT
File: Quality_LoVo_28_.xlsx
Best Row:
Unnamed: 0                 562.000000
Correlation threshold        0.530000
Training data R^2 score      0.692501
Test data R^2 score          0.575884
Training RMSE                0.520470
Test RMSE                    0.436178
Number of features           2.000000
Depth number                 4.000000
Name: 562, dtype: float64

Model: RF
File: Quality_LoVo_28_.xlsx
Best Row:
Unnamed: 0                 396.000000
Correlation threshold        0.530000
Training data R^2 score      0.776384
Test data R^2 score          0.431995
Training RMSE                0.443838
Test RMSE                    0.504775
Number of f

In [None]:
# LoVo_Dx best

In [14]:
# LoVo_DX result workbooks to compare.
file_paths = ['../Data/Quality_LoVo_DX_15_.xlsx',
              '../Data/Quality_LoVo_DX_28_.xlsx',
              '../Data/Quality_LoVo_DX_42_.xlsx']

# find_best_row reports only the base file name, so keep a lookup back to the
# path that was actually read instead of re-assembling it from a hard-coded
# directory.
path_by_name = {os.path.basename(path): path for path in file_paths}

# Only rows with fewer than 6 features are considered.
best_rows = find_best_row(file_paths, 6)

# Displaying the best rows for each model across files
for model, row_info in best_rows.items():
    file_name = row_info['File']
    # Tuple of (min-RMSE row label, max-R^2 row label); the min-RMSE row is shown.
    row_index = row_info['Best Row Index']
    print(f"Model: {model}")
    print(f"File: {file_name}")
    print("Best Row:")
    # The stored value is an index label (from idxmin), hence .loc rather than .iloc.
    print(pd.read_excel(path_by_name[file_name], sheet_name=model).loc[row_index[0]])
    print()

Model: MLR
File: Quality_LoVo_DX_42_.xlsx
Best Row:
Unnamed: 0                 30.000000
Correlation threshold       0.630000
Training data R^2 score     0.516627
Test data R^2 score         0.527256
Training RMSE               0.668564
Test RMSE                   0.496370
Number of features          5.000000
Name: 30, dtype: float64

Model: DT
File: Quality_LoVo_DX_28_.xlsx
Best Row:
Unnamed: 0                 850.000000
Correlation threshold        0.630000
Training data R^2 score      0.999776
Test data R^2 score          0.710797
Training RMSE                0.014366
Test RMSE                    0.392539
Number of features           5.000000
Depth number                12.000000
Name: 850, dtype: float64

Model: RF
File: Quality_LoVo_DX_42_.xlsx
Best Row:
Unnamed: 0                 582.000000
Correlation threshold        0.630000
Training data R^2 score      0.923535
Test data R^2 score          0.624988
Training RMSE                0.265908
Test RMSE                    0.442094
Nu

In [None]:
# MCF-7 best

In [15]:
# MCF-7 result workbooks to compare.
file_paths = ['../Data/Quality_MCF-7_15_.xlsx',
              '../Data/Quality_MCF-7_28_.xlsx',
              '../Data/Quality_MCF-7_42_.xlsx']

# find_best_row reports only the base file name, so keep a lookup back to the
# path that was actually read instead of re-assembling it from a hard-coded
# directory.
path_by_name = {os.path.basename(path): path for path in file_paths}

# Only rows with fewer than 5 features are considered.
best_rows = find_best_row(file_paths, 5)

# Displaying the best rows for each model across files
for model, row_info in best_rows.items():
    file_name = row_info['File']
    # Tuple of (min-RMSE row label, max-R^2 row label); the min-RMSE row is shown.
    row_index = row_info['Best Row Index']
    print(f"Model: {model}")
    print(f"File: {file_name}")
    print("Best Row:")
    # The stored value is an index label (from idxmin), hence .loc rather than .iloc.
    print(pd.read_excel(path_by_name[file_name], sheet_name=model).loc[row_index[0]])
    print()

Model: MLR
File: Quality_MCF-7_15_.xlsx
Best Row:
Unnamed: 0                 17.000000
Correlation threshold       0.500000
Training data R^2 score     0.490738
Test data R^2 score         0.635529
Training RMSE               0.663951
Test RMSE                   0.650522
Number of features          4.000000
Name: 17, dtype: float64

Model: DT
File: Quality_MCF-7_15_.xlsx
Best Row:
Unnamed: 0                 476.000000
Correlation threshold        0.500000
Training data R^2 score      0.633516
Test data R^2 score          0.586802
Training RMSE                0.563239
Test RMSE                    0.692643
Number of features           4.000000
Depth number                 2.000000
Name: 476, dtype: float64

Model: RF
File: Quality_MCF-7_15_.xlsx
Best Row:
Unnamed: 0                 324.000000
Correlation threshold        0.500000
Training data R^2 score      0.819379
Test data R^2 score          0.614284
Training RMSE                0.395412
Test RMSE                    0.669213
Number o