In [3]:
import os, csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nibabel as nib
from scipy import stats
from itertools import combinations

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GroupShuffleSplit, GroupKFold
from sklearn.metrics import mean_squared_error, r2_score

# Directory that holds the input table (an empty string in this copy).
destination = ''

# Path of the CSV to analyse. No file name is appended here, so join()
# simply hands back `destination` unchanged.
dataFile = os.path.join(destination)

# Read the comma-separated table, then build the working frame with every
# missing cell replaced by 0.
data = pd.read_csv(dataFile, sep=",")
df = pd.DataFrame(data).fillna(0)

input_list = ['T1', 'T2', 'TC', 'FL']

# Cross-validation settings, named once so that the fold count, the fold
# averages and the output labels cannot drift apart.
N_SPLITS = 5
N_RANDOM_STATES = 100


def _mean_2dp(values):
    """Return the arithmetic mean of *values*, rounded to two decimals."""
    return round(sum(values) / len(values), 2)


# Loop invariants: the regression target, the grouping column used to keep
# rows with the same InstanceUID on one side of every split, and the layout
# of the per-combination result table.
target = df['CD_R']
groups = df['InstanceUID']
columns = ['input', 'Random_state', f'{N_SPLITS}fold_rsquared', 'mean_rsquared',
           'mean_slope', 'mean_intercept']

# Fit a linear regression on every non-empty subset of `input_list` and, for
# each subset, repeat a shuffled grouped K-fold evaluation once per random
# state. One CSV is written per subset, with one row per random state.
for r in range(1, len(input_list) + 1):
    for combination in combinations(input_list, r):
        input_mod = list(combination)
        L = len(input_mod)  # number of predictors, used by adjusted R^2
        data = df[input_mod]
        # Concatenated column names label both the rows and the output file.
        filename = "".join(input_mod)

        new_list = []

        for rs in range(N_RANDOM_STATES):
            # Per-fold metrics for this random state.
            all_pred = []
            all_test = []
            all_rsquared = []
            all_slope = []
            all_intercept = []
            all_r2score = []
            all_adjusted_r2 = []
            # NOTE(review): re-created for every random state, so this only
            # ever holds the single pooled value appended below.
            all_r2 = []

            # Grouped K-fold with shuffled group assignment, seeded by `rs`.
            # NOTE: per the scikit-learn docs, `shuffle`/`random_state` on
            # GroupKFold require scikit-learn >= 1.6.
            gkf = GroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=rs)

            for train_idx, test_idx in gkf.split(df, groups=groups):
                data_train, data_test = data.iloc[train_idx], data.iloc[test_idx]
                target_train, target_test = target.iloc[train_idx], target.iloc[test_idx]

                # Train on the training folds, predict the held-out fold.
                model = LinearRegression()
                model.fit(data_train, target_train)
                target_pred = model.predict(data_test)

                r2score = round(r2_score(target_test, target_pred), 2)
                all_r2score.append(r2score)

                # Pool predictions across folds for the overall fit below.
                all_pred.extend(target_pred)
                all_test.extend(target_test)

                # Adjusted R^2 with n = rows in the test fold and p = L
                # predictors. The formula is undefined unless n > p + 1, so
                # report NaN instead of dividing by zero on tiny folds.
                sample_size = len(target_test)
                dof = sample_size - L - 1
                if dof > 0:
                    adjusted_r2 = round(1 - (1 - r2score) * ((sample_size - 1) / dof), 2)
                else:
                    adjusted_r2 = np.nan
                all_adjusted_r2.append(adjusted_r2)

                # Regress predictions on observed values for this fold.
                slope, intercept, r_value, p_value, std_err = stats.linregress(target_test, target_pred)
                r_squared = round(r_value**2, 2)
                slope = round(slope, 2)
                intercept = round(intercept, 2)

                all_slope.append(slope)
                all_intercept.append(intercept)
                all_rsquared.append(r_squared)

            # Average over however many folds were actually produced rather
            # than a hard-coded 5.
            mean_slope = _mean_2dp(all_slope)
            mean_intercept = _mean_2dp(all_intercept)
            mean_rsquared = _mean_2dp(all_rsquared)
            # Computed for inspection only; not part of the CSV output.
            mean_r2score = _mean_2dp(all_r2score)
            mean_adjusted_r2 = _mean_2dp(all_adjusted_r2)

            all_test = np.array(all_test)
            all_pred = np.array(all_pred)

            # Regression line through the predictions pooled over all folds
            # (also kept for inspection; not written to the CSV).
            slope, intercept, r_value, p_value, std_err = stats.linregress(all_test, all_pred)
            line = slope * all_test + intercept
            r2 = round((r_value**2), 2)
            all_r2.append(r2)

            r_list = [filename, rs, all_rsquared, mean_rsquared, mean_slope, mean_intercept]
            new_list.append(r_list)

        new_df = pd.DataFrame(new_list, columns=columns)
        # Written to the current working directory, one file per combination.
        new_df.to_csv(f'{filename}_lrmodel_{N_RANDOM_STATES}_randomstate.csv', index=False)