In [1]:
import Classification_Utils as cu
import MaxQuant_Postprocessing_Functions as mq
from os import listdir
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.externals import joblib

In [2]:
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from matplotlib.markers import MarkerStyle
from mpl_toolkits.axes_grid1.inset_locator import mark_inset, zoomed_inset_axes
from matplotlib.lines import Line2D

## Load and clean data
* One data frame for all training and test data
* One data frame for healthy vs diseased comparison

In [3]:
# Input directories. Backslashes are escaped explicitly: the previous literals
# relied on the unrecognised escape sequence "\H", which Python only tolerates
# with a warning. The resulting path strings are unchanged.
train_and_test_dir = 'F:\\High_Quality_All\\'
healthy_vs_diseased_dir = 'F:\\Healthy_VS_Diseased\\'

# Every entry found in each directory is handed to cu.combine_csvs.
train_test_file_paths = listdir(train_and_test_dir)
healthy_diseased_file_paths = listdir(healthy_vs_diseased_dir)

# One data frame for all training and test data.
train_test_df = cu.combine_csvs(train_and_test_dir, train_test_file_paths)
# Drop the row whose index label is a newline character
# (presumably a parsing artifact of the combined files -- TODO confirm).
train_test_df = train_test_df.drop(['\n'])

# One data frame for the healthy vs diseased comparison.
healthy_diseased_df = cu.combine_csvs(healthy_vs_diseased_dir, healthy_diseased_file_paths)
healthy_diseased_df = healthy_diseased_df.drop(['\n'])

# Column names are captured before any renaming; the healthy/diseased names
# still carry their original "Healthy_"/"Diseased_" prefixes at this point.
train_test_col_names = train_test_df.columns.values.tolist()
original_healthy_diseased_col_names = healthy_diseased_df.columns.values.tolist()

In [4]:
# Log2-transform the training/test intensities. The return value is not used,
# so the helper is relied on to modify the frame itself.
mq.log2_normalize(train_test_df)

# Missing values are filled with half of the smallest value left in the frame.
train_test_df_min = train_test_df.min(axis=0).min()
train_test_impute_val = train_test_df_min / 2
train_test_df = train_test_df.fillna(value=train_test_impute_val)


  df.iloc[:,:] = np.log2(df.iloc[:,:])


In [5]:
# Log2-transform the healthy/diseased intensities. The return value is not
# used, so the helper is relied on to modify the frame itself.
mq.log2_normalize(healthy_diseased_df)

# Missing values are filled with half of the smallest value left in the frame.
healthy_diseased_df_min = healthy_diseased_df.min(axis=0).min()
hd_impute_val = healthy_diseased_df_min / 2
healthy_diseased_df = healthy_diseased_df.fillna(value=hd_impute_val)


  df.iloc[:,:] = np.log2(df.iloc[:,:])


## Map each column to a corresponding label

In [6]:
## TODO: move to classification utils
def remove_prefix(col_names):
    """Strip a leading ``Diseased_`` or ``Healthy_`` marker from column names.

    Only the prefix itself is removed. Later occurrences of the same text
    inside a name are preserved (the previous ``str.replace`` based version
    deleted every occurrence, which could mangle names).

    Parameters
    ----------
    col_names : iterable of str
        Column names, with or without a condition prefix.

    Returns
    -------
    list of str
        The names in their original order, each with at most one leading
        prefix removed. Names without a known prefix are returned unchanged.
    """
    prefixes = ('Diseased_', 'Healthy_')
    new_names = []

    for name in col_names:
        for prefix in prefixes:
            if name.startswith(prefix):
                # Slice rather than replace so that only the leading marker goes.
                name = name[len(prefix):]
                break
        new_names.append(name)

    return new_names

In [7]:
# Remove "Healthy_" or "Diseased_" prefix from column names.
# The prefixed names stay available in original_healthy_diseased_col_names;
# the plotting cell further down uses them to tell diseased samples from
# healthy ones.
stripped_col_names = remove_prefix(original_healthy_diseased_col_names)
healthy_diseased_df.columns = stripped_col_names

In [8]:
# Tissue names used for the training/test data set.
train_test_tissues = [
    'Blood_Plasma',
    'Blood_Serum',
    'CSF',
    'Liver',
    'Monocyte',
    'Ovary',
    'Pancreas',
    'Substantia_Nigra',
    'Temporal_Lobe',
]
train_test_tissues_to_columns = cu.map_tissues_to_columns(
    train_test_df, train_test_tissues)

# Tissue names used for the healthy vs diseased comparison.
healthy_diseased_tissues = [
    'Blood_Plasma',
    'Liver',
    'Pancreas',
    'Substantia_Nigra',
    'CSF',
]
healthy_diseased_tissues_to_columns = cu.map_tissues_to_columns(
    healthy_diseased_df, healthy_diseased_tissues)

In [19]:
# Thresholds shared by both filtering calls below.
peptide_filter_thresholds = dict(min_samples=5, min_tissues=1, max_tissues=9)

# Training/test data: report the shape before and after filtering, then
# median-normalise (the helper's return value is not used).
print(train_test_df.shape)
train_test_df = cu.filter_peptides_by_samples_and_tissues(
    train_test_df,
    tissues=train_test_tissues,
    imputed_val=train_test_impute_val,
    **peptide_filter_thresholds
)
print(train_test_df.shape)
mq.median_normalize(train_test_df)

# Healthy vs diseased data: same steps with its own tissues and imputed value.
print(healthy_diseased_df.shape)
healthy_diseased_df = cu.filter_peptides_by_samples_and_tissues(
    healthy_diseased_df,
    tissues=healthy_diseased_tissues,
    imputed_val=hd_impute_val,
    **peptide_filter_thresholds
)
print(healthy_diseased_df.shape)
mq.median_normalize(healthy_diseased_df)

(55676, 253)
(55676, 253)
(43434, 149)
(43434, 149)


In [20]:
# Look up a label for every training/test column via the tissue -> columns map.
train_test_column_names = train_test_df.columns.tolist()
train_test_labels = cu.get_labels(
    train_test_column_names, train_test_tissues_to_columns)

# Same for the healthy vs diseased columns (prefixes already stripped).
healthy_diseased_column_names = healthy_diseased_df.columns.tolist()
healthy_diseased_labels = cu.get_labels(
    healthy_diseased_column_names, healthy_diseased_tissues_to_columns)

## PCA

### Map columns to colors, and prepare PCA data frames

In [21]:
# Output directories for the generated figures. These are distinct from the
# input directories defined when the data was loaded.
# A raw string cannot end in a single backslash, and r'...\\' yields TWO
# trailing backslashes, so the separators are escaped explicitly to get
# exactly one.
train_test_dir = 'D:\\Images\\Human_Tissues\\'
healthy_diseased_dir = 'D:\\Images\\Healthy_vs_Diseased\\'

# Map every column to a colour. The third argument equals the number of
# tissues in the corresponding list (9 and 5).
train_test_column_to_color = mq.map_colors(train_test_tissues,
                                           train_test_tissues_to_columns,
                                           9)

healthy_diseased_column_to_color = mq.map_colors(healthy_diseased_tissues,
                                                 healthy_diseased_tissues_to_columns,
                                                 5)

In [22]:
# Transpose so that each original column becomes one row (observation) for PCA.
train_test_T = train_test_df.T

# For 2 dimensional PCA: keep all components.
pca = PCA()  # create a PCA object
pca.fit(train_test_T)  # do the math
pca_data = pca.transform(train_test_T)  # get PCA coordinates for dataframe

# For 3 dimensional PCA.
# With only 3 components requested on a matrix this large, scikit-learn's
# default svd_solver='auto' may choose the randomized solver, so a fixed
# random_state is set to make the 3-D coordinates reproducible across runs.
pca_3 = PCA(n_components=3, random_state=0)
pca_3.fit(train_test_T)
pca_data_3 = pca_3.transform(train_test_T)

# Scree plots are written to the training/test image directory; the helper
# returns two values, stored as the per-component variance and the PC labels.
per_var, pca_labels = mq.make_scree_plot(pca, train_test_dir)
per_var_3, pca_labels_3 = mq.make_scree_plot(pca_3, train_test_dir, '3 Dimensional Scree')

In [23]:
# Transpose so that each original column becomes one row (observation) for PCA.
healthy_diseased_T = healthy_diseased_df.T

# For 2 dimensional PCA: fit with all components kept (fit returns the
# estimator itself), then project the same data onto the components.
healthy_diseased_pca = PCA().fit(healthy_diseased_T)
healthy_diseased_pca_data = healthy_diseased_pca.transform(healthy_diseased_T)

# Scree plot goes to the healthy vs diseased image directory.
healthy_diseased_per_var, healthy_diseased_pca_labels = mq.make_scree_plot(
    healthy_diseased_pca, healthy_diseased_dir)

### 3D PCA of Train and Test

In [24]:
# Draw the 3-component PCA of the training/test data. The helper receives the
# column names, the 3-D coordinates, the output directory, the column -> colour
# map, the scree-plot outputs for the 3-component fit, the tissue list with its
# column map, and the plot title.
mq.draw_3d_pca(train_test_column_names, pca_data_3, train_test_dir, 
               train_test_column_to_color, per_var_3, pca_labels_3, 
               train_test_tissues, train_test_tissues_to_columns, '3D PCA')

### Plot train and test data, zoomed-in on tight clustering

In [30]:
# 2-D PCA of the training/test samples. Every sample is drawn on the main axes
# and again on an inset that magnifies the tightly clustered region.
tt_color_dict = train_test_column_to_color
train_test_pca_df = pd.DataFrame(pca_data, index=train_test_col_names, columns=pca_labels)

fig = plt.figure(1)
ax = fig.add_subplot(111)

ax.set_xlabel('PC1 - {0}%'.format(per_var[0]))
ax.set_ylabel('PC2 - {0}%'.format(per_var[1]))

# One scatter call per sample so each point takes its own column's colour.
for column in train_test_pca_df.index:
    ax.scatter(train_test_pca_df.PC1.loc[column], train_test_pca_df.PC2.loc[column],
               color=tt_color_dict[column])

# Plot again in zoomed sub-plot
axins = zoomed_inset_axes(ax, 6, loc='center right')  # ax, zoom-factor, location
for column in train_test_pca_df.index:
    axins.scatter(train_test_pca_df.PC1.loc[column], train_test_pca_df.PC2.loc[column],
                  color=tt_color_dict[column])

# Axis limits of the magnified region. The y-limits are given in ascending
# order: passing them reversed (0, -230) would flip the inset's y-axis
# relative to the main plot.
x1, x2, y1, y2 = -610, -410, -230, 0
axins.set_xlim(x1, x2)  # apply the x-limits
axins.set_ylim(y1, y2)  # apply the y-limits

# Hide tick labels on the current pyplot axes.
# NOTE(review): this relies on the inset being the current axes after its
# creation -- confirm with the installed matplotlib version.
plt.yticks(visible=False)
plt.xticks(visible=False)

# Outline the magnified region on the main axes and connect it to the inset.
mark_inset(ax, axins, loc1=2, loc2=3, fc="none", ec="0.5")

# One legend patch per tissue, coloured like the first column of that tissue.
new_handles = [
    mpatches.Patch(color=tt_color_dict[train_test_tissues_to_columns[organ][0]], label=organ)
    for organ in train_test_tissues
]

lgd = ax.legend(handles=new_handles, loc=2, bbox_to_anchor=(1, 1), ncol=1)

output_path = train_test_dir + 'Zoomed PCA' + '.pdf'
# The legend sits outside the axes, so it is passed as an extra artist to be
# included in the tight bounding box.
fig.savefig(output_path, bbox_inches="tight", bbox_extra_artists=(lgd,))
fig.clf()

### Plot Healthy vs Diseased
Open (unfilled) circles represent diseased tissues; filled circles represent healthy tissues.

In [28]:
# 2-D PCA of the healthy vs diseased samples. Diseased samples are drawn as
# open circles, healthy samples as filled circles, coloured by tissue. An
# inset magnifies the tightly clustered region.
hd_color_dict = healthy_diseased_column_to_color

# Indexed by the ORIGINAL (prefixed) column names so that healthy and diseased
# samples can be told apart below.
hd_pca_df = pd.DataFrame(healthy_diseased_pca_data, index=original_healthy_diseased_col_names,
                         columns=healthy_diseased_pca_labels)

fig = plt.figure(1)
ax = fig.add_subplot(111)

# Label the axes with the explained variance of THIS PCA (previously the
# values from the training/test PCA were used by mistake).
ax.set_xlabel('PC1 - {0}%'.format(healthy_diseased_per_var[0]))
ax.set_ylabel('PC2 - {0}%'.format(healthy_diseased_per_var[1]))

# The colour map is keyed by the stripped names, the data frame by the
# prefixed ones, so both lists are walked in parallel.
for column, stripped_col in zip(original_healthy_diseased_col_names, stripped_col_names):
    fill = 'none' if column.startswith('Diseased') else hd_color_dict[stripped_col]
    ax.scatter(hd_pca_df.PC1.loc[column],
               hd_pca_df.PC2.loc[column],
               color=hd_color_dict[stripped_col], facecolors=fill)

# The file name was previously taken from a `title` variable that is never
# defined in this notebook (NameError on a fresh kernel). It is defined here,
# mirroring the file name used for the training/test zoomed plot.
title = 'Zoomed PCA'
output_path = healthy_diseased_dir + title + '.pdf'

# One legend patch per tissue, coloured like the first column of that tissue.
new_handles = [
    mpatches.Patch(color=hd_color_dict[healthy_diseased_tissues_to_columns[organ][0]], label=organ)
    for organ in healthy_diseased_tissues
]

### Append handles for open/closed circles (diseased/healthy)
# An all-white entry acts as a spacer between the tissue patches and the
# marker explanations.
blank_line = Line2D(range(1), range(1), color="white", marker='o', markerfacecolor="white")
closed_circle = Line2D(range(1), range(1), color="white", marker='o', markerfacecolor="black",
                       mec='black', label='Closed Circles: Healthy')
open_circle = Line2D(range(1), range(1), color="white", marker='o', markerfacecolor="white",
                     mec='black', label='Open Circles: Diseased')

new_handles.append(blank_line)
new_handles.append(closed_circle)
new_handles.append(open_circle)

### Zoom in on tightly clustered section
axins = zoomed_inset_axes(ax, 6, loc=1)  # axes, zoom-factor, location
for column, stripped_col in zip(original_healthy_diseased_col_names, stripped_col_names):
    fill = 'none' if column.startswith('Diseased') else hd_color_dict[stripped_col]
    axins.scatter(hd_pca_df.PC1.loc[column],
                  hd_pca_df.PC2.loc[column],
                  color=hd_color_dict[stripped_col], facecolors=fill)

x1, x2, y1, y2 = -800, -500, -400, -100  # specify the axis limits
axins.set_xlim(x1, x2)  # apply the x-limits
axins.set_ylim(y1, y2)  # apply the y-limits

# Hide tick labels on the current pyplot axes.
# NOTE(review): this relies on the inset being the current axes after its
# creation -- confirm with the installed matplotlib version.
plt.yticks(visible=False)
plt.xticks(visible=False)

# Outline the magnified region on the main axes and connect it to the inset.
mark_inset(ax, axins, loc1=2, loc2=4, fc="none", ec="0.5")

lgd = ax.legend(handles=new_handles, loc=2, bbox_to_anchor=(1, 1), ncol=1)

# The legend sits outside the axes, so it is passed as an extra artist to be
# included in the tight bounding box.
fig.savefig(output_path, bbox_inches="tight", bbox_extra_artists=(lgd,))
fig.clf()