In [None]:
import os, csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nibabel as nib
from scipy import stats
from itertools import combinations

from sklearn.model_selection import GroupShuffleSplit, GroupKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Root directory of the training cases; the model inputs live beneath it.
destination = '/rsrch1/ip/rmuthusivarajan/imaging/ki67automation/ki67_training_cases'

# CSV data matrix stored in the 'rf_model' sub-directory.
dataFile = os.path.join(
    destination,
    'rf_model',
    'datamatrixB.29April2020.MCSF.modeCSFWholeBrain.csv',
)

# read_csv already returns a DataFrame; every missing entry becomes 0.
df = pd.read_csv(dataFile, sep=",").fillna(0)

# Predictor matrix (double brackets keep it 2-D) and the regression target.
data = df[['ADC']]
target = df['CD_R']

# new_list collects one result row per seed; r_list holds the current row.
new_list, r_list = [], []

# Repeated grouped k-fold cross-validation of a random-forest regressor.
#
# For each seed `rs` the rows of `df` are split into folds such that all rows
# sharing an 'InstanceUID' stay in the same fold, a forest is fitted on the
# training folds, and R^2, adjusted R^2 and the slope/intercept of the
# predicted-vs-observed regression line are computed on the held-out fold.
# One summary row per seed is appended to `new_list`.
#
# NOTE(review): GroupKFold accepts `shuffle` / `random_state` only in recent
# scikit-learn releases (believed to be >= 1.6) -- confirm the installed
# version, older releases raise TypeError on these arguments.
n_splits = 5
n_features = data.shape[1]  # number of predictors, used by adjusted R^2

for rs in range(100):
    # Pooled out-of-fold values for this seed. They are not consumed inside
    # this loop but stay defined afterwards (holding the last seed's values)
    # in case code outside this view reads them.
    all_target_pred = []
    all_target_test = []
    all_data_test = []

    # Per-fold metrics rounded to 2 decimals. `all_rsquared` and `all_r2` are
    # never filled here; they are retained only because code outside this
    # view may reference the names.
    all_rsquared = []
    all_slope = []
    all_intercept = []
    all_r2score = []
    all_adjusted_r2 = []
    all_r2 = []

    # Unrounded (r2, adjusted_r2, slope, intercept) per fold, so that the
    # means are rounded once instead of averaging already-rounded numbers.
    exact_metrics = []

    # Group-aware k-fold splitter; the seed controls how groups are shuffled
    # into folds.
    gss = GroupKFold(n_splits=n_splits, shuffle=True, random_state=rs)

    # Iterate over the splits
    for train_idx, test_idx in gss.split(df, groups=df['InstanceUID']):
        data_train, data_test = data.iloc[train_idx], data.iloc[test_idx]
        target_train, target_test = target.iloc[train_idx], target.iloc[test_idx]

        # Seed the forest as well: otherwise bootstrap sampling differs on
        # every run and the row labelled `rs` cannot be reproduced.
        model = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=rs)
        model.fit(data_train, target_train)

        # Evaluate on the held-out fold
        target_pred = model.predict(data_test)

        data_test = data_test.values.flatten().tolist()
        target_test = target_test.values.flatten().tolist()
        target_pred = target_pred.flatten().tolist()

        all_target_pred.extend(target_pred)
        all_target_test.extend(target_test)
        all_data_test.extend(data_test)

        r2_exact = r2_score(target_test, target_pred)
        r2score = round(r2_exact, 2)
        all_r2score.append(r2score)

        # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n must
        # be the number of samples R^2 was computed on (this test fold), not
        # the size of the whole dataset.
        sample_size = len(target_test)
        dof = sample_size - n_features - 1
        if dof > 0:
            adjusted_exact = 1 - (1 - r2_exact) * (sample_size - 1) / dof
        else:
            # Too few test samples for the correction to be defined.
            adjusted_exact = float('nan')
        adjusted_r2 = round(adjusted_exact, 2)
        all_adjusted_r2.append(adjusted_r2)

        # Regression line of predicted on observed values.
        slope, intercept, r_value, p_value, std_err = stats.linregress(target_test, target_pred)
        exact_metrics.append((r2_exact, adjusted_exact, slope, intercept))

        slope = round(slope, 2)
        all_slope.append(slope)

        intercept = round(intercept, 2)
        all_intercept.append(intercept)

    # Average over the folds actually evaluated and round once at the end.
    fold_means = np.mean(exact_metrics, axis=0)
    mean_r2score = round(float(fold_means[0]), 2)
    mean_adjusted_r2 = round(float(fold_means[1]), 2)
    mean_slope = round(float(fold_means[2]), 2)
    mean_intercept = round(float(fold_means[3]), 2)

    # Row layout: seed, list of per-fold adjusted R^2, then the four means.
    r_list = [rs, all_adjusted_r2, mean_r2score, mean_adjusted_r2, mean_slope, mean_intercept]
    new_list.append(r_list)

# Column order matches the rows accumulated in new_list: the seed, the list of
# per-fold adjusted R^2 values, then the per-seed mean metrics.
columns = [
    'random_state',
    'adjusted_r2',
    'mean_r2',
    'mean_adjusted_r2',
    'mean_slope',
    'mean_intercept',
]

# Assemble the summary table and write it to the working directory without
# the index column.
new_df = pd.DataFrame(data=new_list, columns=columns)
new_df.to_csv('adc_RF_random_state_100.csv', index=False)