In [None]:
# Script to predict whether a cell line will differentiate
# Takes as input:
# i) expression data (log2 values, not z-scores)
# ii) per-gene expression means and population standard deviations
# iii) logistic regression coefficients and intercepts
#
# Writes out the per-regression predictions and a per-accession summary

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os.path
import math
from sklearn.linear_model import LogisticRegression

In [None]:
# Setup: input files and run parameters

# Expression values per gene and cell line.
#Not z-score file! (z-scores are computed further down from the means / stds file)
expression_data_file = 'retention_group_1_2_log2.tsv.gz'

# Per-gene mean and standard deviation used for the z-score conversion
expression_mean_std_file = 'salmon.merged.gene_tpm.log2_tmp_plus_1.retention_group_filtered.means_stds.tsv.gz'

# Logistic regression coefficients and intercepts
coefficients_file = 'classifier_coefficients.tsv.gz'

#Number of logistic regressions to perform per sample
number_of_log_reg = 1000

# Seed for the random choice of regressions made for each sample.
# pandas' .sample() draws from NumPy's global random state when it is not given
# a random_state, so seeding here makes a Restart & Run All reproducible.
# Set to None to get a different random selection on every run.
random_seed = 1
np.random.seed(random_seed)


In [None]:
# Error flag
# Starts at 0 (no error). The gene-filtering cell sets it to 1 when a gene
# needed by the classifier has missing expression values, and the final cell
# prints a different closing message when it is non-zero.
error_flag = 0

In [None]:
# Import data
# The three inputs are tab-separated tables; a short size summary is printed
# after each one is loaded.

# Expression values
print(f'Reading in expression file: {expression_data_file}')
expression_data = pd.read_csv(expression_data_file, sep='\t')
# NOTE(review): the "- 2" presumably discounts two non-sample columns
# (gene_id plus one other) - confirm against the file layout.
print(f'\tNumber of cell lines: {len(expression_data.columns) - 2}')
print(f'\tNumber of genes: {len(expression_data)}')
print()

# Per-gene mean / standard deviation
print(f'Reading in gene mean / std expression results file: {expression_mean_std_file}')
expression_mean_std_data = pd.read_csv(expression_mean_std_file, sep='\t')
print(f'\tNumber of genes: {len(expression_mean_std_data)}')
print()

# Coefficients and intercepts (the count below is of distinct gene_id values)
print(f'Reading in logistic regression coefficients and intercepts: {coefficients_file}')
coefficients_data = pd.read_csv(coefficients_file, sep='\t')
print(f'\tNumber of unique coefficients + intercept: {len(coefficients_data["gene_id"].drop_duplicates())}')

In [None]:
# Filter only relevant genes for expression data
# (check all relevant genes are present in the input)

# Distinct gene ids that have a coefficient, in order of first appearance.
# The intercept is excluded by its identifier rather than by position, so the
# result does not depend on the INTERCEPT row being listed first.
coefficient_gene_ids = coefficients_data['gene_id'].drop_duplicates()
relevant_genes = (coefficient_gene_ids
                      .loc[coefficient_gene_ids != 'INTERCEPT']
                      .to_frame()
                 )

# Left merge keeps one row per relevant gene (more if the expression file
# repeats a gene_id); genes absent from the expression file end up as NaN.
expression_data = pd.merge(relevant_genes, expression_data,
                            how='left',
                            on='gene_id'
                           )

# Columns from the third onwards are checked for missing values
has_missing_values = expression_data.iloc[:, 2:].isnull().any(axis=1)

if has_missing_values.any():
    missing_gene_ids = expression_data.loc[has_missing_values, 'gene_id'].astype(str)
    print('Coefficient needed but not in expression file - this should not happen!')
    print(f'\tGenes affected ({len(missing_gene_ids)}): {", ".join(missing_gene_ids)}')
    error_flag = 1

In [None]:
# Convert expression data to z-scores:
#   z = (expression - expression_mean) / expression_pop_std, gene by gene

# Right merge: every row of the (already filtered) expression table is kept
# and gets the matching mean / std columns placed in front of it.
merged_expression = (expression_mean_std_data
                        .merge(expression_data, how='right', on='gene_id')
                        .reset_index(drop=True)
                    )

# Pulled out now because only the value columns survive the next step
z_score_gene_ids = merged_expression['gene_id']
expression_pop_std = merged_expression['expression_pop_std']

# Everything from the fifth column onwards is treated as expression values.
# NOTE(review): positional - presumably the first four columns are gene_id,
# mean, std and one annotation column; confirm against the input files.
centred_expression = (merged_expression
                        .iloc[:, 4:]
                        .sub(merged_expression['expression_mean'], axis=0)
                     )

# Scale by the population standard deviation, row by row
scaled_expression = centred_expression.div(expression_pop_std, axis=0)

# Put the gene ids back as the first column. The rename only has an effect on
# a column labelled 0; it is retained so behaviour is unchanged.
expression_data_z_scores = (pd.concat([z_score_gene_ids, scaled_expression], axis=1)
                              .rename({0: 'gene_id'}, axis=1)
                           )

# Tidy up: drop the inputs and intermediates that are no longer needed
del expression_data, expression_pop_std, z_score_gene_ids
del merged_expression, centred_expression, scaled_expression


In [None]:
#Create log_reg_id for selecting a different set of coefficients and intercepts each time
# One id per (iteration, accession) pair found in the coefficients table
coefficients_data['log_reg_id'] = coefficients_data['iteration'].astype(str) + '_' + coefficients_data['accession'].astype(str)
coefficients_data = coefficients_data[['log_reg_id', 'iteration', 'accession', 'gene_id', 'coefficient']]

# Number of distinct regressions available to sample from
# (log_reg_id is built from strings, so it never contains missing values)
total_number_log_reg_performed = coefficients_data['log_reg_id'].nunique()

# Sampling is without replacement, so the requested number cannot exceed the
# number available; cap it and tell the user when that happens.
if number_of_log_reg <= total_number_log_reg_performed:
    print(f'User specified number of logistic regressions to perform: {number_of_log_reg}')
else:
    print(f'User specified number of logistic regressions to perform ({number_of_log_reg}) exceeds maximum possible ({total_number_log_reg_performed})')
    number_of_log_reg = total_number_log_reg_performed
    print(f'Setting number of logistic regressions to perform to: {number_of_log_reg}')

In [None]:
# Perform logistic regression
# For every accession (every column after gene_id), a random subset of the
# available regressions is applied and one probability per regression is
# recorded:  p = 1 / (1 + e^(-t)),  t = sum(coefficient * z-score) + intercept
accessions = expression_data_z_scores.columns.to_series()[1:]

# --- Work that does not depend on the accession is done once, up front ---

# Pool of regression ids to sample from
unique_log_reg_ids = coefficients_data.loc[:, ['log_reg_id']].drop_duplicates()

# Intercept per regression (first INTERCEPT row if there are several)
is_intercept = coefficients_data['gene_id'] == 'INTERCEPT'
intercept_by_log_reg_id = (coefficients_data
                              .loc[is_intercept]
                              .drop_duplicates(subset='log_reg_id')
                              .set_index('log_reg_id')
                              .loc[:, 'coefficient']
                          )

# Gene coefficients per regression, so the inner loop is a dictionary lookup
# instead of a scan over the whole coefficients table
gene_coefficients = coefficients_data.loc[~is_intercept]
no_gene_coefficients = gene_coefficients.iloc[0:0]   # used if a regression has an intercept only
coefficients_by_log_reg_id = {
    log_reg_id: coefficients
    for log_reg_id, coefficients in gene_coefficients.groupby('log_reg_id', sort=False)
}

# Results are collected as plain records and turned into a DataFrame once at
# the end; concatenating inside the loop copies the whole table every time.
result_records = []
accessions_processed = 0

for accession in accessions:

    if accessions_processed % 25 == 0:
        print(f'{accessions_processed} accessions processed')

    # A fresh random selection of regressions for each accession. No
    # random_state is passed, so this draws from NumPy's global random state.
    subsample_log_reg_ids_to_use = (unique_log_reg_ids
                                        .sample(n=number_of_log_reg)
                                        .reset_index(drop=True)
                                   )

    # z-scores of this accession only (same for every regression below)
    accession_z_scores = expression_data_z_scores.loc[:, ['gene_id', accession]]

    for log_reg_id_to_use in subsample_log_reg_ids_to_use['log_reg_id']:

        # Raises KeyError if this regression has no INTERCEPT row
        intercept = intercept_by_log_reg_id.loc[log_reg_id_to_use]

        coefficients_data_of_interest = coefficients_by_log_reg_id.get(log_reg_id_to_use,
                                                                       no_gene_coefficients)

        # Combine with z-score data; the accession's z-scores become the last column
        accession_expression_zscore = pd.merge(coefficients_data_of_interest,
                                               accession_z_scores,
                                               how='left',
                                               on='gene_id'
                                              )

        # Perform logistic regression calculation
        #P(t) = 1 / (1 + e^(-t))
        # .sum() skips NaN, so a gene without a z-score contributes nothing
        scores = accession_expression_zscore['coefficient'] * accession_expression_zscore.iloc[:, -1]
        t = scores.sum() + intercept
        p = 1 / (1 + (math.e ** (-t)))

        result_records.append({'accession': accession,
                               'log_reg_id': log_reg_id_to_use,
                               'p': p})

    accessions_processed = accessions_processed + 1

# Explicit columns keep the expected layout even when nothing was processed
logistic_regression_results = pd.DataFrame(result_records,
                                           columns=['accession', 'log_reg_id', 'p'])

In [None]:
# Summarise results
# One row per accession: mean of p, population standard deviation of p
# (ddof=0), and a 0/1 call derived from the mean.

p_by_accession = logistic_regression_results[['accession', 'p']].groupby('accession')

logistic_regression_means = (p_by_accession
                                .mean()
                                .reset_index()
                                .rename(columns={'p': 'p_average'})
                            )

logistic_regression_stdp = (p_by_accession
                                .std(ddof=0)
                                .reset_index()
                                .rename(columns={'p': 'p_stdp'})
                           )

del p_by_accession

logistic_regression_summary = logistic_regression_means.merge(logistic_regression_stdp,
                                                              how='left',
                                                              on='accession')

# differentiated = 1 when the mean probability is above 0.5, otherwise 0
mean_above_threshold = logistic_regression_summary['p_average'] > 0.5
logistic_regression_summary['differentiated'] = np.where(mean_above_threshold, 1, 0)
del mean_above_threshold
                                                         

In [None]:
# Plot results
# Each point is one accession: standard deviation of its predicted p (x)
# against the mean of its predicted p (y).

fig, ax = plt.subplots(figsize=(7, 7))

# Calling sns.color_palette("dark") on its own only returns a palette and
# changes nothing, so the colour is passed to the plot explicitly instead.
sns.scatterplot(x="p_stdp",
                y="p_average",
                data=logistic_regression_summary,
                color=sns.color_palette("dark")[0],
                ax=ax)

ax.set_title('Classifier p-values vs standard deviation')
ax.set_xlabel('Standard deviation predicted p value')
ax.set_ylabel('Mean predicted p value')

# Dashed line at the 0.5 cut-off applied to p_average in the summary
ax.axhline(0.5, color='r', linestyle='--')
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)

plt.show()

In [None]:
# Write out results
# Both output names are built from the expression file name with its
# '.tsv.gz' ending removed; files are written to the current directory as
# gzip-compressed, tab-separated tables.
outfile_stem = os.path.basename(expression_data_file).replace('.tsv.gz', '')

# Every individual prediction (one row per accession and regression)
outfile = f'{outfile_stem}.predicted.tsv.gz'
print(f'Writing results to: {outfile}')
logistic_regression_results.to_csv(outfile, index=False, compression='gzip', sep="\t")

# Per-accession summary (mean, standard deviation, differentiated call)
outfile = f'{outfile_stem}.predicted_summary.tsv.gz'
print(f'Writing prediction summary to: {outfile}')
logistic_regression_summary.to_csv(outfile, index=False, compression='gzip', sep="\t")

In [None]:
# Done unless error

In [None]:
# Closing message: error_flag is still 0 unless an earlier check set it
if error_flag == 0:
    print('Done')
else:
    print('Finished, but with errors!')