In [None]:
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import math, os, sys, math
import pandas as pd
import numpy as np
import seaborn as sns
import textwrap
from scipy import stats
import re

# Make the project's ``main_code`` directory importable.  It is expected to
# sit next to the directory the notebook is run from, i.e. inside the parent
# of the current working directory.
# os.path.dirname is used instead of splitting on '/' so the lookup also works
# with non-POSIX separators and when the working directory is one level deep.
code_dir = os.path.dirname(os.getcwd())
main_code = os.path.join(code_dir, 'main_code')

# Guard against piling up duplicate entries when the cell is re-run.
if main_code not in sys.path:
    sys.path.append(main_code)
import locations


In [None]:
# Plotting methods

def plot_distributions(start, num, df, save_path = None, disp = False):
    """Draw kernel-density plots of consecutive feature rows, split by size group.

    Rows ``start`` .. ``start + num - 1`` of ``df`` each get their own subplot
    on a three-column grid.  Within a subplot the columns of ``df`` are split
    into three groups according to the value in the LAST row of ``df``:
    -1 (red, 'Small'), 0 (grey, 'Normal') and 1 (blue, 'Large').

    Parameters
    ----------
    start : int
        Positional index of the first row to plot.
    num : int
        Number of consecutive rows to plot; must be at least 1.
    df : pandas.DataFrame
        Feature table whose last row holds the -1 / 0 / 1 group code.
        Index labels are assumed to be underscore-separated strings: the
        first four words become the figure title, the remainder the subplot
        title (TODO confirm against the naming scheme of the input CSV).
    save_path : str or None
        Directory in which ``<figure title>.png`` is written.  Nothing is
        saved when None.
    disp : bool
        Show the figure even when it is also being saved.  The figure is
        always shown when ``save_path`` is None.

    Raises
    ------
    ValueError
        If ``num`` is smaller than 1 (there would be nothing to plot and no
        row to derive the figure title from).
    """
    if num < 1:
        raise ValueError("num must be at least 1, got {!r}".format(num))

    total_row = math.ceil(num/3)
    fig_length = total_row * 1.25
    fig = plt.figure(figsize = (4, fig_length), facecolor = 'white', dpi = 300)

    # The column masks do not depend on the plotted row, so build them once.
    group_codes = df.iloc[-1, :]
    groups = [(df.loc[:, group_codes == -1], 'red', 'Small'),
              (df.loc[:, group_codes == 0], 'grey', 'Normal'),
              (df.loc[:, group_codes == 1], 'blue', 'Large')]
    font_dict = {'ha':'center', 'fontsize':5, 'fontweight':'normal'}

    for i in range(num):
        row = i + start
        ax = fig.add_subplot(total_row, 3, i+1)

        samples = [frame.iloc[row, :] for frame, _, _ in groups]

        # Number of digits of the overall mean.  abs() keeps the sign
        # character of a negative mean from being counted as a digit.
        num_size = len(str(abs(int(np.mean(pd.concat(samples))))))

        if num_size > 4:
            # Rescale very large features so the tick labels stay short.
            div = 10**num_size
            samples = [s/div for s in samples]

        for data, (_, color, lab) in zip(samples, groups):
            # ``fill`` replaces the deprecated ``shade`` keyword; ``label``
            # replaces ``legend=lab`` (``legend`` expects a bool).
            sns.kdeplot(data, color = color, alpha = 0.4, fill = True,
                        label = lab, linewidth = 0.3, ax = ax)

        # Words after the fourth form the per-feature subplot title.
        words = df.index[row].replace('_', ' ').title().split(' ')
        title = textwrap.fill(' '.join(words[4:]), 20)

        ax.set(xlabel = None, ylabel = None)
        ax.yaxis.set_visible(False)
        ax.set_title(title, fontdict = font_dict)
        ax.tick_params(axis = 'both', labelsize = 5)

    # Proxy artists for one shared, figure-level legend.
    custom_lines = [Line2D([0], [0], color='red', lw=4),
                    Line2D([0], [0], color='grey', lw=4),
                    Line2D([0], [0], color='blue', lw=4)]

    # The figure title comes from the first four words of the last plotted row.
    fig_title = ' '.join(df.index[row].replace('_', ' ').title().split(' ')[:4])
    fig.suptitle(fig_title, va = 'top', fontsize = 15)
    fig.legend(custom_lines, ['Small', 'Normal', 'Large'], loc = 'upper right',  fontsize = 3 )
    fig.tight_layout(rect=[0, 0, 0.95, 0.94])

    if save_path is not None:
        out_file = os.path.join(save_path, fig_title.replace(' ', '_') + '.png')
        fig.savefig(out_file, format = 'png', dpi = 300, bbox_inches = 'tight')
    if save_path is None or disp:
        plt.show()
    # Close this specific figure so repeated calls do not accumulate figures.
    plt.close(fig)
    
def plot_all_dist(df, save_path = None, disp = False):
    """Plot one distribution figure per feature family found in ``df``.

    Index labels are counted per family by substring ('_shape_',
    '_firstorder_', '_glcm_', '_glrlm_'; the first match in that order wins).
    ``plot_distributions`` is then called once for every non-empty family with
    a running start offset.  Because the offsets are cumulative counts, this
    relies on the rows being grouped by family, in that order, starting at
    row 0.

    ``save_path`` and ``disp`` are forwarded unchanged.
    """
    markers = ('_shape_', '_firstorder_', '_glcm_', '_glrlm_')
    counts = dict.fromkeys(markers, 0)

    for label in df.index:
        matched = next((mk for mk in markers if mk in label), None)
        if matched is not None:
            counts[matched] += 1

    offset = 0
    for mk in markers:
        size = counts[mk]
        if size:
            plot_distributions(offset, size, df, save_path, disp)
            offset += size

In [None]:
# other methods

def summary_df(df, save_name = None):
    """Return per-row summary statistics of ``df``.

    The result has one row per row of ``df`` and the columns 'Mean',
    'Std_Dev', 'Min' and 'Max', each computed across the columns of ``df``.
    When ``save_name`` is given the table is also written to that Excel file
    through the openpyxl engine.
    """
    stats_table = pd.DataFrame({
        'Mean': df.mean(axis=1),
        'Std_Dev': df.std(axis=1),
        'Min': df.min(axis=1),
        'Max': df.max(axis=1),
    })

    if save_name is None:
        return stats_table

    stats_table.to_excel(save_name, index = True, header = True, engine = 'openpyxl')
    return stats_table

In [None]:
# Bare expression: as the last statement of the cell its value (whatever the
# project's ``locations`` module returns for 'result_dir') is shown as the
# cell output.  Presumably a quick check of the configuration - nothing is
# assigned here.
locations.get_locations('result_dir')

In [None]:
# Set up directories and load the results DataFrame.
# All paths come from the project-local ``locations`` module; the names
# defined here (storage_dir, save_date, save_dir, data, ...) are used by the
# cells below.

storage_dir = locations.get_locations('result_dir')
save_date = locations.get_locations('save_date')
# The results file is named "<save_date>_home_results.csv" inside storage_dir.
csv_file = f'{save_date}_home_results.csv'
data_loc = os.path.join(storage_dir, csv_file)
save_dir = locations.get_locations('save_dir')

# First CSV column becomes the row index, first line the column header.
data = pd.read_csv(data_loc, sep=',', header = 0, index_col = 0)

In [None]:
# Structure tags: a row belongs to a structure when its index label contains
# the tag as a substring.
phrases = ['Deep_grey', 'Brain_stem', 'Left_wm', 'Left_gm', 
           'Right_wm', 'Right_gm', 'Left_cerebellum', 'Right_cerebellum']

def setup_data(data, structures = None):
    """Split ``data`` into one DataFrame per structure tag.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose index labels contain a structure tag.
    structures : sequence of str, optional
        Tags to split on.  Defaults to the module-level ``phrases`` list, so
        existing ``setup_data(data)`` calls behave as before.

    Returns
    -------
    list of pandas.DataFrame
        One frame per tag, in tag order, holding every row of ``data`` whose
        index label contains that tag (original row order preserved).  A tag
        that matches no row yields an empty frame.
    """
    if structures is None:
        structures = phrases

    dfs = []
    # Separate the table into structure-specific frames.
    for tag in structures:
        # str() lets non-string index labels simply fail to match instead of
        # raising a TypeError on the substring test.
        positions = [pos for pos, label in enumerate(data.index) if tag in str(label)]
        dfs.append(data.iloc[positions, :])

    return dfs
        
# One DataFrame per entry of ``phrases``, in the same order (dfs[0] holds the
# rows matching the first tag).
dfs = setup_data(data)

In [None]:
# Write summary statistics (mean, std-dev, min, max) to one Excel workbook,
# with a separate sheet for every structure / size-group combination.
out_dir = locations.get_locations('excel_results')
out_xcell = os.path.join(out_dir, 'cleaned_summary_statistics.xlsx')

# Size-group display name -> code stored in the last row of each frame.
size_groups = {'Small': -1, 'Normal': 0, 'Large': 1}

with pd.ExcelWriter(out_xcell) as writer:
    for p, d in zip(phrases, dfs):
        group_codes = d.iloc[-1, :]
        for name, i in size_groups.items():
            sheet_name = p + '_' + name
            # Keep only this group's columns and leave out the last two rows
            # of the frame before summarising.
            group_frame = d.loc[:, group_codes == i].iloc[:-2, :]
            res = summary_df(group_frame)
            res.to_excel(writer, sheet_name = sheet_name, index = True)
            

In [None]:
def clean_title(title, wrap = True):
    """Turn an underscore-separated feature label into a short readable title.

    Newlines are removed, the label is split on '_' and everything up to and
    including the first 'original' token is discarded (the whole label is
    kept when there is no such token).  A leading 'firstorder' token is
    abbreviated to 'FO' and the remaining tokens are joined with spaces.

    Parameters
    ----------
    title : str
        Raw label.
    wrap : bool
        When True the result is wrapped to lines of at most 20 characters.

    Returns
    -------
    str
        The cleaned title; an empty string when nothing follows 'original'
        (this case used to raise IndexError).
    """
    tokens = title.replace('\n', '').split('_')
    try:
        first_kept = tokens.index('original') + 1
    except ValueError:
        # No 'original' marker: keep the whole label.
        first_kept = 0

    kept = tokens[first_kept:]
    if kept and kept[0] == 'firstorder':
        kept[0] = 'FO'

    text = ' '.join(kept)
    return textwrap.fill(text, 20) if wrap else text

# Element-wise sum of the feature rows over all structures.
# The last two rows of every structure frame are left out, and the index is
# replaced by a positional one so that ``add`` lines rows up by position.
# This assumes every structure lists the same features in the same order
# (TODO confirm); otherwise unmatched positions become NaN.
# reset_index(drop = True) is used instead of reset_index() + drop('index'),
# which raises KeyError as soon as the index carries a name.
df_all = dfs[0].iloc[:-2, :].reset_index(drop = True)
clean_ind = [clean_title(x) for x in dfs[0].index[:-2]]

for df in dfs[1:]:
    df_all = df_all.add(df.iloc[:-2, :].reset_index(drop = True))

# Label the summed rows with the cleaned feature names of the first structure.
df_all.index = pd.Series(clean_ind)
save_path = os.path.join(out_dir, 'all_summary.xlsx')
summary_df(df_all, save_name = save_path)

In [None]:
# Output folders for the distribution figures: <storage_dir>/distribution_figures
# and its 'raw_data' sub-folder.
dist_folder = os.path.join(storage_dir , 'distribution_figures')
raw_dist_folder = os.path.join(dist_folder, 'raw_data')

# makedirs also creates the missing parent, so one call covers both folders.
os.makedirs(raw_dist_folder, exist_ok = True)

# Plot (and save) the raw distributions of the first structure only.
plot_all_dist(dfs[0], raw_dist_folder, disp = True)

# Data Cleaning
### Remove features that are essentially proxies for size

In [None]:
# Screen every feature of every structure with a one-way ANOVA across the
# Small (-1) / Normal (0) / Large (1) groups.  Features whose p-value falls
# below the threshold are recorded for deletion, all others are kept.
del_names = []
del_inds = []

keep_names = []
keep_inds = []

# Compiled once; used to cut the label down to the part after 'original'.
p = re.compile('original', re.I)

for df in dfs:
    # Group the columns by this structure's own code row (its last row), the
    # same way the plotting and summary cells do.  The previous version used
    # the last row of the full ``data`` table for every structure.
    group_codes = df.iloc[-1, :]
    # NOTE(review): 1:100 skips the first column of every group and uses at
    # most 99 columns - confirm that this sub-sampling is intended.
    small = df.loc[:, group_codes == -1].iloc[:,1:100]
    norm = df.loc[:, group_codes == 0].iloc[:,1:100]
    large = df.loc[:, group_codes == 1].iloc[:,1:100]
    for enum, i in enumerate(df.index[:-2]):
        m = p.search(i)
        if m is None:
            # No 'original' marker, so no short feature name can be derived;
            # skip the row instead of failing on m.end().
            continue
        short_name = i[m.end()+1:].strip()

        i_small = small.loc[i,:]
        i_norm = norm.loc[i,:]
        i_large = large.loc[i,:]
        f_val, p_val = stats.f_oneway(i_small, i_norm, i_large)

        # Very strict cut-off; the value 5e-15 is taken over unchanged.
        if p_val < 5e-15:
            del_names.append(short_name)
            del_inds.append(enum)
        else:
            keep_names.append(short_name)
            keep_inds.append(enum)

# De-duplicate across structures.  A name can end up in both lists when the
# verdict differs between structures.
del_names = list(set(del_names))
del_inds = list(set(del_inds))
keep_names = list(set(keep_names))
keep_inds = list(set(keep_inds))

print("Del Names:{}\nDel Inds:{}\nKeep Names:{}\nKeep Inds: {}".format(len(del_names),len(del_inds),len(keep_names),len(keep_inds)))
            
            
# Translate the short feature names selected for deletion back into full row
# labels of ``data``.  Rows whose label contains no 'original' marker are
# put in neither list.
data_rem_names = []
data_keep_names = []

p = re.compile('original', re.I)
# Set for constant-time membership tests.
del_name_lookup = set(del_names)

for i in data.index:
    m = p.search(i)
    if m is None:
        # Explicit check replaces the former bare ``except`` around m.end().
        continue
    i_short = i[m.end()+1:].strip()
    if i_short in del_name_lookup:
        data_rem_names.append(i)
    else:
        data_keep_names.append(i)

        
        
# Drop the selected features, rebuild the per-structure frames and plot them
# to check the result visually.

# clean_data: ``data`` without the rows listed in data_rem_names.
clean_data = data.drop(data_rem_names, axis = 0)
clean_dfs = setup_data(clean_data)

dist_folder = os.path.join(storage_dir , 'distribution_figures')
clean_dist_folder = os.path.join(dist_folder, 'cleaned_data')
os.makedirs(clean_dist_folder, exist_ok = True)

# Only the first structure (index 0) is plotted.
plot_all_dist(clean_dfs[0], clean_dist_folder, disp = True)

print("Following have been dropped")

# dirty_data: the complement - ``data`` without the rows listed in
# data_keep_names.  Rows that are in neither list stay in both tables.
dirty_data = data.drop(data_keep_names, axis = 0)
dirty_dfs = setup_data(dirty_data)

dirty_dist_folder = os.path.join(dist_folder, 'dirty_data')
os.makedirs(dirty_dist_folder, exist_ok = True)

plot_all_dist(dirty_dfs[0], dirty_dist_folder, disp = True)

# Write both tables to storage_dir as "<save_date>_clean_data.csv" and
# "<save_date>_dirty_data.csv".
save_date = locations.get_locations('save_date')
clean_save_name = f'{save_date}_clean_data.csv'
dirty_save_name = f'{save_date}_dirty_data.csv'

save_loc_clean = os.path.join(storage_dir, clean_save_name)
clean_data.to_csv(save_loc_clean, sep = ',', header = True, index=True)

# NOTE(review): ``save_loc_dity`` looks like a typo for ``save_loc_dirty``;
# left unchanged because it is a notebook-level name other cells may use.
save_loc_dity = os.path.join(storage_dir, dirty_save_name)
dirty_data.to_csv(save_loc_dity, sep = ',', header = True, index=True)