# Mechanism of Action (MoA) EDA kernel

The definition of mechanism of action given by the [National Cancer Institute](https://www.cancer.gov/publications/dictionaries/cancer-terms/def/mechanism-of-action) is the following: In medicine, the term mechanism of action is used to describe how a drug or other substance produces an effect in the body. For example, a drug’s mechanism of action could be how it affects a specific target in a cell, such as an enzyme, or a cell function, such as cell growth. Knowing the mechanism of action of a drug may help provide information about the safety of the drug and how it affects the body. It may also help identify the right dose of a drug and which patients are most likely to respond to treatment. Also called MOA.

In this kernel:
 1. Exploration of general statistical characteristics of the training dataset.
 2. Univariate analysis of the data features.
 3. Bivariate and multivariate analysis of the features and the relation between training features and targets.
 4. Principal component analysis for the gene and cell viability features.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install seaborn==0.11.0

In [None]:
from time import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import plotly.express as px

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale, StandardScaler

%matplotlib inline

In [None]:
# Echo the version of the seaborn module that was imported as `sb`.
sb.__version__

In [None]:
# Importing the datasets
# All paths are relative to the working directory and point into the
# `lish-moa` input folder.

# Feature tables for the training and the test split.
train_feat_df = pd.read_csv('../input/lish-moa/train_features.csv')
test_feat_df = pd.read_csv('../input/lish-moa/test_features.csv')

# Target tables for the training samples: scored and non-scored targets.
scored_train_targets_df = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
nscored_train_targets_df = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')

In [None]:
# Show the first rows of the training feature table (last expression of the
# cell, so the notebook renders it).
print('Training data sample')
train_feat_df.head()

In [None]:
# Show the first rows of the scored training targets table.
print('Training targets sample')
scored_train_targets_df.head()

# Exploratory Data Analysis
## Training features

In [None]:
# Creating palettes
# Four cubehelix palettes sharing rot=-.25. They differ in the number of
# colours (3 vs 20) and in the `light` end point (.4 vs .8).
# NOTE(review): the suffixes presumably mean s/l = small/large and
# d/c = darker/lighter variant -- confirm with the author.

sb.set_style('darkgrid')                                # other styles: whitegrid, dark, white, ticks
pal_s_d = sb.cubehelix_palette(3, rot=-.25, light=.4)   # 3 colours, light=.4
pal_l_d = sb.cubehelix_palette(20, rot=-.25, light=.4)  # 20 colours, light=.4
pal_s_c = sb.cubehelix_palette(3, rot=-.25, light=.8)   # 3 colours, light=.8
pal_l_c = sb.cubehelix_palette(20, rot=-.25, light=.8)  # 20 colours, light=.8

In [None]:
# Summary statistics of the training feature table as returned by
# DataFrame.describe() with its default arguments.
print('Quick overview of the statistical description of the training dataset')
train_feat_df.describe()

In [None]:
# Per-level sample counts for each treatment descriptor, as two-column frames
# (<feature>, 'count').
# The column names are set explicitly with rename_axis / reset_index(name=...)
# instead of renaming whatever value_counts().reset_index() produces: pandas
# 2.0 changed those default names, and the former rename then created two
# columns called 'count', which breaks the bar plots below.
cp_type_count = train_feat_df.cp_type.value_counts().rename_axis('cp_type').reset_index(name='count')
cp_dose_count = train_feat_df.cp_dose.value_counts().rename_axis('cp_dose').reset_index(name='count')
cp_time_count = train_feat_df.cp_time.value_counts().rename_axis('cp_time').reset_index(name='count')

print('Exploration of the the treatment type, dose, and duration')
# One row of three bar charts, one per treatment descriptor.
plt.figure(figsize=(12,4))
plt.subplot(131)
sb.barplot(data=cp_type_count, x='cp_type', y='count', palette=pal_s_d)
plt.title('Treatment type count')
plt.subplot(132)
sb.barplot(data=cp_dose_count, x='cp_dose', y='count', palette=pal_s_d)
plt.title('Treatment dose type count')
plt.subplot(133)
sb.barplot(data=cp_time_count, x='cp_time', y='count', palette=pal_s_d)
plt.title('Treatment duration count')
plt.tight_layout()

In [None]:
# Count plot faceted by treatment duration: one panel per cp_time value,
# treatment type on the x axis and dose as the hue.
g = sb.catplot(
    data=train_feat_df,
    kind="count",
    x='cp_type',
    hue='cp_dose',
    col='cp_time',
    height=4,
    aspect=.8,
    palette=pal_s_d,
)
# y=1.1 lifts the figure title above the per-panel titles.
g.fig.suptitle('Sample count by treatment duration, type, and dose type', y=1.1, fontsize=15);

### Gene expression data

In [None]:
# Gene-expression block: every column after the first four and before the
# last 100 (the last 100 columns are used as the cell-viability block further
# down in this notebook).
# NOTE(review): this relies purely on column order; presumably the first four
# columns are sig_id plus the three cp_* descriptors -- confirm.
gene_cols = train_feat_df.columns[4:-100]
gene_data = train_feat_df[gene_cols]

In [None]:
print('Comparaision of the distribution of a sample of gene features')
plt.figure(figsize=(14,20))
# 5 x 4 grid; panel p overlays the gene columns at positions 35*p and 35*p + 35.
n_rows, n_cols, stride = 5, 4, 35
for panel in range(n_rows*n_cols):
    first = panel*stride
    plt.subplot(n_rows, n_cols, panel+1)
    sb.histplot(gene_data.iloc[:, [first, first+stride]], stat="density", element='poly', kde=True)
    # Only the left-most panel of each row keeps its y-axis label.
    if panel % n_cols:
        plt.ylabel('')

In [None]:
# Per-column summary statistics of the gene features; gene_desc has one
# column per gene and one row per statistic, and is reused by later cells.
print('Statistical description of the gene feature destributions')
gene_desc = gene_data.describe()
display(gene_desc)

In [None]:
print('Distribution of the mean and std across the gene features')
plt.figure(figsize=(11,4))
# One histogram per summary statistic, side by side.
panels = (
    ('mean', 'Distribution of mean values of gene distributions'),
    ('std', 'Distribution of std values of gene distributions'),
)
for pos, (stat_name, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, pos)
    sb.histplot(gene_desc.loc[stat_name, :])
    plt.title(panel_title)

In [None]:
print('Inspection of gene features with highest and lowest mean value')

# Row of per-gene means taken from the describe() table.
gene_means = gene_desc.loc['mean',:]
min_mean = gene_means.min()
max_mean = gene_means.max()
# Column labels of the extreme genes. They are looked up from the data so
# that the plots below always show the same genes as the tables, instead of
# relying on hard-coded column positions.
lowest_mean_name = gene_means.idxmin()
highest_mean_name = gene_means.idxmax()

lowest_mean_gene = gene_desc.T.query("mean==@min_mean")
print("Gene with lowest distribution mean")
display(lowest_mean_gene)
highest_mean_gene = gene_desc.T.query("mean==@max_mean")
print("Gene with highest distribution mean")
display(highest_mean_gene)

plt.figure(figsize=(11,4))
plt.subplot(121)
# A one-column DataFrame (not a Series) is passed so the column name shows up in the legend.
sb.histplot(gene_data[[lowest_mean_name]], stat="density", element='poly', kde=True )
plt.title("Distribution of the gene with lowest mean value")
plt.subplot(122)
sb.histplot(gene_data[[highest_mean_name]], stat="density", element='poly', kde=True, palette='Reds' )
plt.title("Distribution of the gene with highest mean value");

### Cell viability data

In [None]:
# Cell-viability block: the last 100 columns of the training feature table.
# NOTE(review): selected by position only -- confirm the column layout.
cell_via_cols = train_feat_df.columns[-100:]
cell_via_data = train_feat_df[cell_via_cols]

In [None]:
print('Comparaision of the distribution of a sample of cell viability features')
plt.figure(figsize=(14,20))
# 5 x 4 grid; panel p overlays the columns at positions 5*p and 5*p + 5, with
# the second position capped at 99.
n_rows, n_cols, stride = 5, 4, 5
last_pos = 99
for panel in range(n_rows*n_cols):
    left = panel*stride
    right = min(left+stride, last_pos)
    plt.subplot(n_rows, n_cols, panel+1)
    sb.histplot(cell_via_data.iloc[:, [left, right]], stat="density", element='poly', kde=True)
    # Only the left-most panel of each row keeps its y-axis label.
    if panel % n_cols:
        plt.ylabel('')

In [None]:
# Per-column summary statistics of the cell-viability features; cell_desc is
# reused by the cells that follow.
print('Statistical description of the cell viability feature destributions')
cell_desc = cell_via_data.describe()
display(cell_desc)

In [None]:
print('Distribution of the mean and std across the cell viability features')
plt.figure(figsize=(11,4))
# One histogram per summary statistic, side by side.
panels = (
    ('mean', 'Distribution of mean values of cell viability distributions'),
    ('std', 'Distribution of std values of cell viability distributions'),
)
for pos, (stat_name, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, pos)
    sb.histplot(cell_desc.loc[stat_name, :])
    plt.title(panel_title)

In [None]:
print('Inspection of cell viability features with highest and lowest mean value')

# Row of per-feature means taken from the describe() table.
cell_means = cell_desc.loc['mean',:]
min_mean = cell_means.min()
max_mean = cell_means.max()
# Column labels of the extreme features. They are looked up from the data so
# that the plots below always show the same features as the tables, instead
# of relying on hard-coded column positions.
lowest_mean_name = cell_means.idxmin()
highest_mean_name = cell_means.idxmax()

lowest_mean_cell = cell_desc.T.query("mean==@min_mean")
print("Cell with lowest distribution mean")
display(lowest_mean_cell)
highest_mean_cell = cell_desc.T.query("mean==@max_mean")
print("Cell with highest distribution mean")
display(highest_mean_cell)

plt.figure(figsize=(11,4))
plt.subplot(121)
# A one-column DataFrame (not a Series) is passed so the column name shows up in the legend.
sb.histplot(cell_via_data[[lowest_mean_name]], stat="density", element='poly', kde=True )
plt.title("Cell viability with lowest mean value distribution")
plt.subplot(122)
sb.histplot(cell_via_data[[highest_mean_name]], stat="density", element='poly', kde=True, palette='Reds' )
plt.title("Cell viability with highest mean value distribution");

## Scored training targets

In [None]:
# First rows of the scored training targets table.
print('Training targets data sample')
scored_train_targets_df.head()

In [None]:
# Build a table with one row per label value (0 and 1) and one column per
# target, holding how many samples carry that value.
# The first column of the targets table is skipped, as before; every other
# column is counted in a single column-wise pass. This replaces a chain of
# per-column merges keyed on an 'index' column, which
# value_counts().reset_index() no longer produces on pandas >= 2.0.
label_cols = scored_train_targets_df.columns[1:]
target_values = (
    scored_train_targets_df[label_cols]
    .apply(pd.Series.value_counts)
    .reindex([0, 1])        # fixed row order; a value that never occurs yields NaN
    .rename_axis('index')   # keep the index name the following cells expect
)

In [None]:
print('Target MoA labels sum')
# Transpose so that each row is one target and the label values 0 and 1 become
# columns; the former column names end up in a column called 'MoA'.
# NOTE: this rebinds target_values, so re-running the cell applies the
# reshaping to the already reshaped frame.
target_values = target_values.T.reset_index().rename(columns={'index':'MoA'})
target_values

In [None]:
# Summary statistics over targets of the per-label-value counts.
print('Statistical description of the MoA label sum ditributions')
target_values.describe()

In [None]:
print('MoA targets with most/least activations across samples')

# Ten targets with the largest and the smallest count in column 1.
ranked_desc = target_values.sort_values(by=[1], ascending=False)
most_active = ranked_desc.set_index('MoA').iloc[:10, 1].reset_index()

ranked_asc = target_values.sort_values(by=[1])
least_active = ranked_asc.set_index('MoA').iloc[:10, 1].reset_index()

count_label = {1: 'Activation count'}
plt.figure(figsize=(12,5))
plt.suptitle('Top 10 most/least activated MoA',fontsize=15, y=1.05)
# Left panel: most activated targets; right panel: least activated targets.
for pos, (frame, colours) in enumerate([(most_active, pal_l_d), (least_active, pal_l_c)], start=1):
    axis = plt.subplot(1, 2, pos)
    sb.barplot(y='MoA', x='Activation count', data=frame.rename(columns=count_label), palette=colours, ax=axis)
plt.tight_layout()

In [None]:
# Attach the scored targets to the training features by sample id.
# merge() returns a new frame, so train_feat_df itself is left untouched.
master_df = train_feat_df.merge(scored_train_targets_df, on='sig_id', how='left')

In [None]:
# First rows of the merged features + targets table.
master_df.head()

In [None]:
# Target columns are taken from the scored targets table (everything except
# the sample id) rather than from a hard-coded count of trailing columns.
target_names = [col for col in scored_train_targets_df.columns if col != 'sig_id']

# Boolean frame: True where a target is non-zero for a sample.
is_active = master_df[target_names] != 0
sample_activation_state = is_active.any(axis=1)   # at least one active target
sample_activation_count = is_active.sum(axis=1)   # number of active targets

# Ten samples with the most active targets, with their sample id next to the
# count. The count Series is unnamed, so its column is labelled 0 after the
# concat. Rows are looked up by index label (.loc), which matches how
# sort_values() reports them.
top_10_most_act_samples = sample_activation_count.sort_values(ascending=False).iloc[0:10]
top_10_most_act_samples = pd.concat([master_df.loc[top_10_most_act_samples.index, 'sig_id'], top_10_most_act_samples],
                                    axis=1)

In [None]:
print('Inspection of samples MoA targets')

# The bars of the two count plots are numbers of samples, so their share has
# to be taken relative to the number of samples. It is computed here instead
# of using a hard-coded constant.
n_samples = len(sample_activation_count)

plt.figure(figsize=(12,8))
# Top panel: how many samples have 0, 1, 2, ... active targets.
plt.subplot(211)
ax1 = sb.countplot(data=sample_activation_count.rename('MoA_activation_count').reset_index(),
             x='MoA_activation_count', palette=pal_l_c)
for p in ax1.patches:
    percentage ='{:.2f}%'.format(p.get_height()*100/n_samples)
    width, height =p.get_width(),p.get_height()
    # Place the label on top of the bar, slightly right of its left edge.
    x=p.get_x()+width/3.6
    y=p.get_y()+height
    ax1.annotate(percentage,(x,y), fontsize=14, color='black')
plt.title('The sample count of the total number of MoA activations')
# Bottom-left panel: samples with no active target vs. at least one.
plt.subplot(223)
ax2 = sb.countplot(data=sample_activation_state.reset_index().rename(columns={0:'Activation state'}),
             x='Activation state', palette=pal_s_d)
for p in ax2.patches:
    percentage ='{:.2f}%'.format(p.get_height()*100/n_samples)
    width, height =p.get_width(),p.get_height()
    # Place the label inside the bar, just below its top.
    x=p.get_x()+width/2.9
    y=p.get_y()+height-1000
    ax2.annotate(percentage,(x,y), fontsize=14, color='white')
plt.title('Number of samples by activation state')
# Bottom-right panel: the ten samples with the most active targets.
plt.subplot(224)
sb.barplot(data=top_10_most_act_samples.rename(columns={0:'MoA activation count'}),
             x='MoA activation count', y='sig_id', palette=pal_l_d, estimator=sum)
plt.title('Top 10 samples with highest MoA activations')
plt.tight_layout()

**Activation by treatment features**

In [None]:
# Add the per-sample number of active targets as column 'MoA_activation_count',
# aligned on the row index.
activation_counts = sample_activation_count.rename('MoA_activation_count')
master_df = master_df.merge(
    activation_counts,
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
# Compact table with the sample id, the three treatment descriptors and the
# number of active targets per sample; reused by the plots and the PCA cells.
basic_feat = master_df.loc[:,['sig_id', 'cp_type', 'cp_dose', 'cp_time', 'MoA_activation_count']]
display(basic_feat)

In [None]:
print('Relation between different features and the MoA target activations')

# The bars below are per-group sums of 'MoA_activation_count', so each share
# is relative to the overall sum. It is computed from the data instead of
# being hard-coded.
total_activations = basic_feat['MoA_activation_count'].sum()

plt.figure(figsize=(13,12))

# The count column is selected before summing so the aggregation does not run
# over the string columns as well.
plt.subplot(221)
ax = sb.barplot(data=basic_feat.groupby('cp_type')['MoA_activation_count'].sum().reset_index(),
                x='cp_type', y='MoA_activation_count', palette=pal_s_d)
for p in ax.patches:
    percentage ='{:}%'.format(p.get_height()*100/total_activations)
    width, height =p.get_width(),p.get_height()
    # Label sits just above the bar.
    x=p.get_x()+width-0.54
    y=p.get_y()+height+100
    ax.annotate(percentage,(x,y), fontsize=14)
plt.title('Total MoA activation by sample type')

plt.subplot(222)
ax2 = sb.barplot(data=basic_feat.groupby('cp_dose')['MoA_activation_count'].sum().reset_index(),
                x='cp_dose', y='MoA_activation_count', palette=pal_s_d)
for p in ax2.patches:
    percentage ='{:.0f}%'.format(p.get_height()*100/total_activations)
    width, height =p.get_width(),p.get_height()
    # Label sits inside the bar, near its top.
    x=p.get_x()+width/2.5
    y=p.get_y()+height/1.1
    ax2.annotate(percentage,(x,y), fontsize=14, color='white')
plt.title('Total MoA activation by sample dose')

plt.subplot(223)
ax3 = sb.barplot(data=basic_feat.groupby('cp_time')['MoA_activation_count'].sum().reset_index(),
                x='cp_time', y='MoA_activation_count', palette=pal_s_d)
for p in ax3.patches:
    percentage ='{:.0f}%'.format(p.get_height()*100/total_activations)
    width, height =p.get_width(),p.get_height()
    # Label sits inside the bar, near its top.
    x=p.get_x()+width/2.5
    y=p.get_y()+height/1.1
    ax3.annotate(percentage,(x,y), fontsize=14, color='white')
plt.title('Total MoA activation by sample duration')

# Last panel: sums per duration, split by dose (no error bars).
plt.subplot(224)
ax3 = sb.barplot(data=basic_feat, x='cp_time', y='MoA_activation_count', hue='cp_dose',ci=None, estimator=sum, palette=pal_s_c)
plt.title('Total MoA activation by sample duration and dose');

## PCA gene features

In [None]:
# Render the gene feature table that is fed into the PCA below.
print('Gene features')
gene_data

In [None]:
# Scaling data

# Standardise the gene features with sklearn.preprocessing.scale (default
# arguments) before running the PCA.
scaled_gene_data = scale(gene_data)

In [None]:
# A float n_components asks scikit-learn to keep as many components as are
# needed to explain that fraction of the variance (here 95%).
pca1 = PCA(0.95)
pca1.fit(scaled_gene_data)

In [None]:
# Project the scaled gene features onto the fitted components.
pca_gene = pca1.transform(scaled_gene_data)
# Explained variance per component, in percent, rounded to one decimal.
per_var = np.round(pca1.explained_variance_ratio_ * 100, decimals=1)
# Component labels PC1, PC2, ... -- one per retained component.
labels = [f'PC{rank}' for rank in range(1, len(per_var) + 1)]

In [None]:
print('Top 10 principal components for the gene features')

# Scree plot: bars plus an overlaid point/line plot of the explained variance
# of the first ten components.
ranks = list(range(1,11))
top_var = per_var[0:10]
plt.figure(figsize=(8,6))
sb.pointplot(x=ranks, y=top_var)
sb.barplot(x=ranks, y=top_var, tick_label=labels[0:10], palette=reversed(pal_l_c))
plt.xlabel('Principal Component')
plt.ylabel('Percentage of Explained Variance')
plt.title('Scree Plot of the first 10 components');

In [None]:
print('2D representation of gene features using the first couple of principal components')

# Principal-component scores as a frame, joined to the treatment descriptors
# on the row index.
transformed_gene_feats = pd.DataFrame(pca_gene, columns=labels)
gene_temp = basic_feat.merge(transformed_gene_feats, how='left', left_index=True, right_index=True)

# Four stacked PC1-vs-PC2 scatter plots that differ only in the colour/size encoding.
encodings = [
    dict(hue='cp_dose', palette=pal_s_c[0:2]),
    dict(hue='cp_type', palette=pal_s_c[0:2]),
    dict(hue='cp_time', size='cp_time', palette=pal_s_c[0:3]),
    dict(hue='MoA_activation_count', size='MoA_activation_count', palette="Spectral"),
]
plt.figure(figsize=(12,30))
for row, encoding in enumerate(encodings, start=1):
    plt.subplot(4, 1, row)
    sb.scatterplot(data=gene_temp, x='PC1', y='PC2', **encoding)
    # Axis labels carry the explained-variance percentage of each component.
    plt.xlabel('PC1 - {0}%'.format(per_var[0]))
    plt.ylabel('PC2 - {0}%'.format(per_var[1]))

In [None]:
# 3d visualization of the genes by cp_dose, cp_type, and cp_time in the new space 
# generated by the first 3 principal components

#px.scatter_3d(gene_temp, x='PC1', y='PC2', z='PC3', color='cp_dose' ,opacity=0.2)
#px.scatter_3d(gene_temp, x='PC1', y='PC2', z='PC3', color='cp_type' ,opacity=0.2)
#px.scatter_3d(gene_temp, x='PC1', y='PC2', z='PC3', color='cp_time' ,opacity=0.2)

## PCA cell features

In [None]:
# Render the cell-viability feature table that is fed into the PCA below.
print('Cell viability features')
cell_via_data

In [None]:
# Scaling data
# Standardise the cell-viability features with sklearn.preprocessing.scale
# (default arguments) before running the PCA.

scaled_cell_data = scale(cell_via_data)

In [None]:
# A float n_components asks scikit-learn to keep as many components as are
# needed to explain that fraction of the variance (here 95%).
pca2 = PCA(.95)
pca2.fit(scaled_cell_data)

In [None]:
# Project the scaled cell-viability features onto the fitted components.
pca_cell = pca2.transform(scaled_cell_data)
# Explained variance per component, in percent, rounded to one decimal.
# NOTE: per_var and labels are the same names used for the gene PCA, so they
# are overwritten here.
per_var = np.round(pca2.explained_variance_ratio_ * 100, decimals=1)
labels = [f'PC{rank}' for rank in range(1, len(per_var) + 1)]

In [None]:
print('Top 10 principal components for the cell viability features')
# Scree plot: bars plus an overlaid point/line plot of the explained variance
# of the first ten components.
ranks = list(range(1,11))
top_var = per_var[0:10]
plt.figure(figsize=(8,6))
sb.pointplot(x=ranks, y=top_var)
sb.barplot(x=ranks, y=top_var, tick_label=labels[0:10], palette=reversed(pal_l_c))
plt.xlabel('Principal Component')
plt.ylabel('Percentage of Explained Variance')
plt.title('Scree Plot of the first 10 components');

In [None]:
print('2D representation of cell viability features using the first couple of principal components')

# Principal-component scores as a frame, joined to the treatment descriptors
# on the row index.
transformed_cell_feats = pd.DataFrame(pca_cell, columns=labels)
cell_temp = basic_feat.merge(transformed_cell_feats, how='left', left_index=True, right_index=True)

# Four stacked PC1-vs-PC2 scatter plots that differ only in the colour/size encoding.
encodings = [
    dict(hue='cp_dose', palette=pal_s_c[0:2]),
    dict(hue='cp_type', palette=pal_s_c[0:2]),
    dict(hue='cp_time', size='cp_time', palette=pal_s_c[0:3]),
    dict(hue='MoA_activation_count', size='MoA_activation_count', palette="Spectral"),
]
plt.figure(figsize=(12,30))
for row, encoding in enumerate(encodings, start=1):
    plt.subplot(4, 1, row)
    sb.scatterplot(data=cell_temp, x='PC1', y='PC2', **encoding)
    # Axis labels carry the explained-variance percentage of each component.
    plt.xlabel('PC1 - {0}%'.format(per_var[0]))
    plt.ylabel('PC2 - {0}%'.format(per_var[1]))

In [None]:
# 3d visualization of the cell viability by cp_dose, cp_type, and cp_time in the new space 
# generated by the first 3 principal components

#px.scatter_3d(cell_temp, x='PC1', y='PC2', z='PC3', color='cp_dose' ,opacity=0.2)
#px.scatter_3d(cell_temp, x='PC1', y='PC2', z='PC3', color='cp_type' ,opacity=0.2)
#px.scatter_3d(cell_temp, x='PC1', y='PC2', z='PC3', color='cp_time' ,opacity=0.2)