# Expression Quality Control (Part 2)

This is a template notebook for performing the final quality control on your organism's expression data. This requires a curated metadata sheet.

## Setup 

In [1]:
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from os import path
from scipy import stats
from tqdm.notebook import tqdm

In [2]:
# Apply seaborn's 'ticks' style to the figures drawn later in this notebook
sns.set_style(style='ticks')

### Inputs

In [3]:

# Absolute location of the project checkout. The previous
# path.join('..','data',...,'/Users/...') calls resolved to exactly these
# absolute paths, because os.path.join discards every component that precedes
# an absolute one; the relative prefix was therefore misleading dead text.
# Edit project_dir to run this notebook on another machine.
project_dir = '/Users/louxuwen/Desktop/Documents/GitHub/BENG212_S_aureus'
qc_dir = path.join(project_dir,'3_quality control')

logTPM_file = path.join(project_dir,'2_process_data','log_tpm.csv') # Enter log-TPM filename here
all_metadata_file = path.join(qc_dir,'NCTC8325_all.tsv') # Enter full metadata filename here
metadata_file = path.join(qc_dir,'NCTC8325_part1_curated.tsv') # Enter curated metadata filename here


### Load expression data

In [4]:
# Read the log-TPM table (first CSV column becomes the row index) and
# replace any missing values with 0
DF_log_tpm = pd.read_csv(logTPM_file,index_col=0)
DF_log_tpm = DF_log_tpm.fillna(0)

# Rows are counted as genes and columns as samples
print('Number of genes:',len(DF_log_tpm.index))
print('Number of samples:',len(DF_log_tpm.columns))
DF_log_tpm.head()

Number of genes: 2767
Number of samples: 337


Unnamed: 0_level_0,DRX300641,DRX300642,DRX300643,ERX1222798,ERX1222799,ERX1222800,ERX2826862,ERX2826863,ERX2826864,ERX2826865,...,SRX9634010,SRX9634011,SRX9634012,SRX9634015,SRX9634016,SRX9634017,SRX9634018,SRX9634019,SRX9634020,SRX965931
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAOUHSC_00001,8.07596,7.913237,8.103539,8.23165,7.480144,7.833791,8.829448,8.93638,8.989139,8.942718,...,8.813741,8.461149,9.068514,8.446301,8.447756,8.781591,8.768944,8.52081,8.565999,8.558046
SAOUHSC_00002,8.491571,8.599078,8.401215,9.433116,8.726581,9.329891,8.962073,9.160966,9.290196,8.992331,...,9.0936,8.482311,8.937456,8.529349,8.782375,9.033129,9.01811,8.161739,8.052105,8.127758
SAOUHSC_00003,9.304385,8.849633,8.676897,7.504135,8.411847,8.858987,8.74796,8.976711,8.626191,8.742656,...,8.714124,8.020151,8.725003,8.142546,7.577228,7.810381,7.797447,7.668054,7.790997,7.671018
SAOUHSC_00004,9.119361,9.325414,8.971591,9.298017,9.538484,9.230754,8.412644,8.391423,8.605931,8.362605,...,9.841963,9.298584,9.323726,9.137359,8.794212,9.041146,9.083412,9.373125,9.467331,9.312853
SAOUHSC_00005,9.548104,9.573234,9.433046,9.791319,10.199322,9.799053,8.949354,8.885628,9.152409,8.849877,...,8.902671,8.890497,9.024617,8.65829,8.178211,8.63521,8.599545,9.773655,9.854817,9.969962


### Load metadata

In [5]:
# Curated metadata sheet: tab-separated, first column used as the row index
DF_metadata = pd.read_csv(metadata_file,sep='\t',index_col=0)
print('Number of samples with curated metadata:',len(DF_metadata.index))
DF_metadata.head()

Number of samples with curated metadata: 121


Unnamed: 0,Run,SortedStrain,project,condition,reference_condition,BioProject,TaxID,ReleaseDate,LoadDate,spots,...,dbgap_study_accession,Consent,RunHash,ReadHash,R1,R2,passed_fastqc,passed_reads_mapped_to_CDS,passed_global_correlation,skip
ERX2826862,DRR084259,HG001,p574312,574312_0,574312_0,PRJDB5479,1280,2019-01-18 16:30:20,2019-01-18 16:31:22,24073797,...,,public,69797B56C7226BE4D8C02A8AD3E91EEA,A522A281177112883E453D297DEA609F,,,True,True,True,
ERX2826863,DRR084260,HG001,p574312,574312_11,574312_0,PRJDB5479,1280,2019-01-18 16:30:20,2019-01-18 16:32:13,20841672,...,,public,B29BCF8E2E6CA207B96DE183FED55240,5FB98589C299944F785BC229F99525F0,,,True,True,True,
ERX2826864,DRR084261,HG001,p574312,574312_12,574312_0,PRJDB5479,1280,2019-01-18 16:30:20,2019-01-18 16:31:52,35553662,...,,public,82B66356ACA2779980ACC613173CC80C,F6BC964232E4CF153E5EC2238BAB44F0,,,True,True,True,
ERX2826865,DRR084262,HG001,p574312,574312_13,574312_0,PRJDB5479,1280,2019-01-18 16:30:20,2019-01-18 16:31:45,15933822,...,,public,1B57268C1A3178A404EC23FB71AE127A,1B3B69EC14A454C7D3E14C0D29CE8B5A,,,True,True,True,
ERX2826866,DRR084263,HG001,p574312,574312_14,574312_0,PRJDB5479,1280,2019-01-18 16:30:20,2019-01-18 16:31:52,30638434,...,,public,81163E8436CB2F2F282F0F589D5BAA12,7116DBC2E59DB778C591F0D3E11DE036,,,True,True,True,


In [6]:
# Full metadata sheet (tab-separated, first column as index); the QC flags
# computed in this notebook are added to it as new columns
DF_metadata_all = pd.read_csv(all_metadata_file,sep='\t',index_col=0)

## Remove samples due to poor metadata

After curation, some samples either did not have enough replicates or metadata to warrant inclusion in this database.

In [7]:
# A sample is skipped only when its `skip` cell is filled in with a truthy
# value; blank (NaN) cells mean "keep". The mask is built as an explicit
# boolean Series so that `~` is always a logical NOT: the previous
# `~col.fillna(False)` relied on pandas silently downcasting the filled column
# to bool, which is deprecated, and `~` on an object/float column is either a
# bitwise NOT of Python bools or a TypeError.
skip_mask = DF_metadata['skip'].notna() & DF_metadata['skip'].astype(bool)

DF_metadata_passed_step4 = DF_metadata[~skip_mask].copy()
print('New number of samples with curated metadata:',DF_metadata_passed_step4.shape[0])
DF_metadata_passed_step4.head()

New number of samples with curated metadata: 121


Unnamed: 0,Run,SortedStrain,project,condition,reference_condition,BioProject,TaxID,ReleaseDate,LoadDate,spots,...,dbgap_study_accession,Consent,RunHash,ReadHash,R1,R2,passed_fastqc,passed_reads_mapped_to_CDS,passed_global_correlation,skip
ERX2826862,DRR084259,HG001,p574312,574312_0,574312_0,PRJDB5479,1280,2019-01-18 16:30:20,2019-01-18 16:31:22,24073797,...,,public,69797B56C7226BE4D8C02A8AD3E91EEA,A522A281177112883E453D297DEA609F,,,True,True,True,
ERX2826863,DRR084260,HG001,p574312,574312_11,574312_0,PRJDB5479,1280,2019-01-18 16:30:20,2019-01-18 16:32:13,20841672,...,,public,B29BCF8E2E6CA207B96DE183FED55240,5FB98589C299944F785BC229F99525F0,,,True,True,True,
ERX2826864,DRR084261,HG001,p574312,574312_12,574312_0,PRJDB5479,1280,2019-01-18 16:30:20,2019-01-18 16:31:52,35553662,...,,public,82B66356ACA2779980ACC613173CC80C,F6BC964232E4CF153E5EC2238BAB44F0,,,True,True,True,
ERX2826865,DRR084262,HG001,p574312,574312_13,574312_0,PRJDB5479,1280,2019-01-18 16:30:20,2019-01-18 16:31:45,15933822,...,,public,1B57268C1A3178A404EC23FB71AE127A,1B3B69EC14A454C7D3E14C0D29CE8B5A,,,True,True,True,
ERX2826866,DRR084263,HG001,p574312,574312_14,574312_0,PRJDB5479,1280,2019-01-18 16:30:20,2019-01-18 16:31:52,30638434,...,,public,81163E8436CB2F2F282F0F589D5BAA12,7116DBC2E59DB778C591F0D3E11DE036,,,True,True,True,


### Check curation
Since manual curation is error-prone, we want to make sure that all samples have labels for their project and condition. In addition, there should only be one reference condition in each project, and it should be in the project itself.

Any samples that fail these checks will be printed below.

In [8]:
# Every curated sample must carry both a project and a condition label
assert DF_metadata_passed_step4.project.notnull().all()
assert DF_metadata_passed_step4.condition.notnull().all()

for name,group in DF_metadata_passed_step4.groupby('project'):
    ref_cond = group.reference_condition.unique()

    # Ensure that there is only one reference condition per project
    # (fixed: the project name used to be inside the string literal, so the
    # message printed the text ", name" instead of the offending project)
    if len(ref_cond) != 1:
        print('Multiple reference conditions for:', name)

    # Ensure the reference condition is in fact in the project.
    # When several reference conditions were found, only the first is checked.
    ref_cond = ref_cond[0]
    if ref_cond not in group.condition.tolist():
        print('Reference condition not in project:', name)

Next, make a new column called ``full_name`` that gives every experimental condition a unique, human-readable identifier.

In [9]:
# Label every sample as "<project>:<condition>"
DF_metadata_passed_step4['full_name'] = DF_metadata_passed_step4.project.str.cat(
    DF_metadata_passed_step4.condition,
    sep=':',
)

### Remove samples with only one replicate

First, find sample names that have at least two replicates.

In [None]:
# Number of samples sharing each full_name, i.e. replicates per condition
counts = DF_metadata_passed_step4['full_name'].value_counts()

# Conditions backed by two or more samples
keep_samples = counts.loc[counts.ge(2)].index
print(keep_samples[:5])

Keep only the samples whose condition has at least two replicates.

In [None]:
# Keep only samples whose condition has at least two replicates.
# .copy() makes the result an independent frame: a column is assigned to it
# later in the notebook, which would otherwise raise SettingWithCopyWarning
# because a boolean-filtered frame is treated as a slice of its parent.
DF_metadata_passed_step4 = DF_metadata_passed_step4[DF_metadata_passed_step4.full_name.isin(keep_samples)].copy()
print('New number of samples with curated metadata:',DF_metadata_passed_step4.shape[0])
DF_metadata_passed_step4.head()

### Save this information to the full metadata dataframe

In [None]:
# Flag, in the full metadata sheet, the samples that are still present in the
# curated sheet at this point
curated_ids = DF_metadata_passed_step4.index
DF_metadata_all['passed_curation'] = DF_metadata_all.index.isin(curated_ids)

## Check correlations between replicates

### Remove failed data from log_tpm files

In [None]:
# Restrict the expression table to the columns (samples) listed in the curated
# metadata index, in that order
DF_log_tpm = DF_log_tpm.loc[:,DF_metadata_passed_step4.index]

### Compute Pearson R Score

Biological replicates should have a Pearson R correlation of at least 0.95. For conditions with more than two replicates, each replicate must have R >= 0.95 with at least one other replicate of the same condition, or it will be dropped. The correlation threshold can be changed below:

In [None]:
# Pearson R threshold used by the replicate check below: a sample whose best
# correlation with another replicate is below this value is flagged as dissimilar
rcutoff = 0.95

The following code computes correlations between all samples and collects correlations between replicates and non-replicates.

In [None]:
# Pearson R for every unordered pair of samples, keyed by (sample1, sample2):
#   rep_corrs  - pairs sharing the same full_name (replicates)
#   rand_corrs - all other pairs
rep_corrs = {}
rand_corrs = {}

# Condition label of each sample, looked up once here instead of through two
# .loc calls per pair inside the loop
full_names = DF_metadata_passed_step4['full_name'].to_dict()

# n*(n-1)/2 pairs; integer division keeps the progress-bar total an int
# (true division produced a float such as 7260.0)
n_curated = len(DF_metadata_passed_step4)
num_comparisons = n_curated*(n_curated-1)//2

for exp1,exp2 in tqdm(itertools.combinations(DF_metadata_passed_step4.index,2),total=num_comparisons):
    # pearsonr returns (r, p-value); only r is kept
    r_value = stats.pearsonr(DF_log_tpm[exp1],DF_log_tpm[exp2])[0]
    if full_names[exp1] == full_names[exp2]:
        rep_corrs[(exp1,exp2)] = r_value
    else:
        rand_corrs[(exp1,exp2)] = r_value

The correlations can be plotted as a histogram, with replicate pairs and pairs from different conditions shown on separate y-axes.

In [None]:
fig,ax = plt.subplots(figsize=(5,5))
# Second y-axis so the replicate histogram gets its own count scale
ax2 = ax.twinx()

# Binning and styling shared by both histograms
hist_opts = dict(bins=50,range=(0.2,1),alpha=0.8,linewidth=0)
ax2.hist(rep_corrs.values(),color='green',**hist_opts)
ax.hist(rand_corrs.values(),color='blue',**hist_opts)

ax.set_title('Pearson R correlation between experiments',fontsize=14)
ax.set_xlabel('Pearson R correlation',fontsize=14)
ax.set_ylabel('Different Conditions',fontsize=14)
ax2.set_ylabel('Known Replicates',fontsize=14)

# Median correlation over all replicate pairs
med_corr = np.median(list(rep_corrs.values()))
print('Median Pearson R between replicates: {:.2f}'.format(med_corr))

Remove samples without any high-correlation replicates

In [None]:
dissimilar = []
for _,replicates in DF_metadata_passed_step4.groupby('full_name'):
    # Pairwise correlations among the samples of one condition. Subtracting
    # the identity matrix takes 1 off the diagonal, so a sample's correlation
    # with itself does not win the column-wise maximum.
    corr_matrix = DF_log_tpm[replicates.index].corr()
    best_match = (corr_matrix - np.identity(len(replicates))).max()

    # Samples whose best correlation with another replicate is below the cutoff
    dissimilar.extend(best_match[best_match<rcutoff].index)

# Save this information in both the original metadata dataframe and the new metadata dataframe
DF_metadata_all['passed_replicate_correlations'] = ~DF_metadata_all.index.isin(dissimilar)
DF_metadata_passed_step4['passed_replicate_correlations'] = ~DF_metadata_passed_step4.index.isin(dissimilar)

In [None]:
# Keep only the samples flagged as having passed the replicate-correlation check
passed_mask = DF_metadata_passed_step4['passed_replicate_correlations']
DF_metadata_final = DF_metadata_passed_step4.loc[passed_mask]
print('# Samples that passed replicate correlations:',DF_metadata_final.shape[0])

## Check that reference conditions still exist
If a reference condition was removed due to poor replicate correlations, a new reference condition needs to be defined.

Again, any samples that fail these checks will be printed below.

In [None]:
# Report projects whose reference condition is gone or that have a single
# condition left. (The unused `project_exprs = []` copied from the
# normalization cell was removed; that cell creates its own list.)
for name,group in DF_metadata_final.groupby('project'):

    # Get reference condition (taken from the first sample of the project)
    ref_cond = group.reference_condition.iloc[0]

    # Ensure the reference condition is still in the project
    if ref_cond not in group.condition.tolist():
        print('Reference condition missing from:', name)

    # Check that each project has at least two conditions (a reference and at least one test condition)
    if len(group.condition.unique()) <= 1:
        print('Only one condition in:', name)

If necessary, choose a new reference condition for each failed project in the curated metadata sheet and re-run the notebook.

## Normalize dataset to reference conditions

In [11]:
# Final metadata = curated samples that also passed the replicate-correlation
# check. Copying DF_metadata_passed_step4 unconditionally (as before) put the
# samples flagged as dissimilar back into the final dataset whenever the
# notebook was run top to bottom.
if 'passed_replicate_correlations' in DF_metadata_passed_step4.columns:
    DF_metadata_final = DF_metadata_passed_step4[DF_metadata_passed_step4['passed_replicate_correlations']].copy()
else:
    # The replicate-correlation cells were not run in this session, so there
    # is no flag to filter on; keep every curated sample (previous behaviour).
    print('Warning: replicate correlations were not checked; keeping all curated samples')
    DF_metadata_final = DF_metadata_passed_step4.copy()

In [12]:
# Expression columns for exactly the samples listed in the final metadata index
DF_log_tpm_final = DF_log_tpm.loc[:,DF_metadata_final.index]

In [13]:
# For each project, subtract the mean expression of its reference-condition
# samples from every sample of that project (row-wise, gene by gene).
project_exprs = []
for name,group in DF_metadata_final.groupby('project'):

    # Get reference condition (taken from the first sample of the project)
    ref_cond = group.reference_condition.iloc[0]

    # Get reference condition sample ids
    ref_samples = group[group.condition == ref_cond].index

    # Without reference samples the mean below is all-NaN and the whole
    # project would be silently written out as NaN; stop instead.
    if len(ref_samples) == 0:
        raise ValueError('No samples of reference condition {!r} in project {!r}'.format(ref_cond,name))

    # Get reference condition expression
    ref_expr = DF_log_tpm_final[ref_samples].mean(axis=1)

    # Subtract reference expression from project
    project_exprs.append(DF_log_tpm_final[group.index].sub(ref_expr,axis=0))

# Columns end up grouped by project, in groupby order
DF_log_tpm_norm = pd.concat(project_exprs,axis=1)

## Save final datasets

In [14]:
# Absolute output directory. The previous path.join('..','data',...,'/Users/...')
# calls resolved to exactly these absolute paths, because os.path.join discards
# every component that precedes an absolute one. Edit output_dir to write
# the results somewhere else.
output_dir = '/Users/louxuwen/Desktop/Documents/GitHub/BENG212_S_aureus/3_quality control'

logTPM_qc_file = path.join(output_dir,'log_tpm.csv')
logTPM_norm_file = path.join(output_dir,'log_tpm_norm.csv')
final_metadata_file = path.join(output_dir,'metadata.tsv')
final_metadata_all_file = path.join(output_dir,'metadata_qc_part2_all.tsv')

# Expression tables are written as CSV, metadata sheets as tab-separated files
DF_log_tpm_final.to_csv(logTPM_qc_file)
DF_log_tpm_norm.to_csv(logTPM_norm_file)
DF_metadata_final.to_csv(final_metadata_file, sep='\t')
DF_metadata_all.to_csv(final_metadata_all_file, sep='\t')