# Run Logistic Regression Classifier Model to Predict Expression Status

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import math

## Setup (edit as required)

In [None]:
# User-editable configuration for this run (edit as required)

# Input: tab-separated expression table loaded in the QC section
expression_datafile = 'classifier_input.tsv.gz'

# Inputs: coefficient tables for the single model and for the collated
# (prediction stability) models
coefficents_file = 'logistic_regression_coefficients.tsv.gz'
collated_coefficients_file = 'collated_logistic_regression_coefficients.tsv.gz'

# Set in paper (per the original note).
# NOTE(review): no other cell visible here reads this value - confirm whether
# it is still needed.
differentiation_threshold = 0.2

## Data overview and QC

In [None]:
# Load the tab-separated expression table, then report how many distinct
# accessions, cell lines and transcripts it contains
expression_data = pd.read_csv(expression_datafile, sep="\t")
print(f"Reading in: {expression_datafile}")

# nunique() skips missing values, giving the same tally as
# drop_duplicates().count()
for label, column in (("accessions", "Accession"),
                      ("cell lines", "Cell_line"),
                      ("transcripts", "target_id")):
    print(f"Number of different {label}: {expression_data[column].nunique()}")

In [None]:
# Histogram (100 bins) of every log10_tpm value in the expression table
log10_tpm_values = expression_data['log10_tpm']

plt.figure(figsize=(8, 8))
plt.hist(log10_tpm_values, bins=100)
plt.xlabel('Log10(tpm)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Empirical cumulative distribution of log10_tpm, one curve per accession;
# the legend is switched off
ecdf_options = dict(x="log10_tpm", hue="Accession", legend=False)

plt.figure(figsize=(8, 8))
plot = sns.ecdfplot(data=expression_data, **ecdf_options)

In [None]:
# The logistic regression model is already established, so the expression
# DataFrame may be reduced to just the columns used from here on
columns_to_keep = ['Accession', 'target_id', 'Cell_line', 'log10_tpm']
expression_data = expression_data.loc[:, columns_to_keep]

In [None]:
# Create a simplified file in a user-friendly format for analysis in other
# tools (e.g. R, Excel): one row per transcript, one column per sample
# (Accession + "_" + Cell_line), holding the log10_tpm values.
# NOTE(review): the original comment labels the values Log10(TPM+1); no
# transformation is applied in this cell, so confirm against the input file.

# assign() returns a new DataFrame, so expression_data itself does not gain a
# 'Cell_Sample' column (a plain `=` would only alias it and the extra column
# would then be carried through every later merge).
data_for_external_analysis = (expression_data
        .assign(Cell_Sample=expression_data['Accession'] + "_" + expression_data['Cell_line'])
        .loc[:, ['Cell_Sample', 'target_id', 'log10_tpm']]
        .pivot(index="target_id", columns='Cell_Sample', values='log10_tpm')
    )

# The index is not written out (index=False below), so the transcript IDs are
# copied into a leading column.  The header stays 'Cell_Sample' so the output
# layout is unchanged, even though the column holds target_id values.
data_for_external_analysis.insert(0, 'Cell_Sample', data_for_external_analysis.index)


#Write out the result
external_analysis_file = 'external_analysis_data_log10_tpm.tsv.gz'
print("Writing results to: " + external_analysis_file)
data_for_external_analysis.to_csv(external_analysis_file, index=False, compression='gzip', sep="\t")

# Release the wide table once it has been written
del data_for_external_analysis

## Run Logistic Regression Model

In [None]:
# Load the single-model coefficient table (tab-separated)
print(f"Reading in coefficients file: {coefficents_file}")
coefficients = pd.read_csv(coefficents_file, sep="\t")

In [None]:
# Extract intercept
# The intercept row is located by its 'INTERCEPT' label rather than assumed to
# be row 0.  When it is row 0 the outcome is the same as before.
is_intercept_row = coefficients['target_id'] == 'INTERCEPT'
n_intercept_rows = int(is_intercept_row.sum())

if n_intercept_rows != 1:
    # Fail loudly: carrying on would treat a transcript coefficient as the
    # intercept (and discard it), silently corrupting every prediction.
    raise ValueError(
        "Intercept not found in " + coefficents_file
        + " (expected exactly one INTERCEPT row, found " + str(n_intercept_rows) + ")"
    )

intercept = coefficients.loc[is_intercept_row, 'coefficient'].iloc[0]

# Keep only the per-transcript coefficient rows
coefficients = coefficients[~is_intercept_row]


In [None]:
# Merge coefficients data with expression data, after checking that every
# transcript with a coefficient is present in the expression data
coefficient_transcripts = coefficients['target_id'].drop_duplicates()
expression_transcripts = expression_data['target_id'].drop_duplicates()

not_found_coefficients = coefficient_transcripts[~coefficient_transcripts.isin(expression_transcripts)]

if not_found_coefficients.empty:
    print("Good news: all logistic regression coefficients found in expression data")
    expression_data_key_transcripts = pd.merge(expression_data, coefficients, how="inner", on="target_id")
else:
    print("Warning: coefficients missing in input expression data:")
    print(not_found_coefficients)
    # No merged table is built in this case, so stop here instead of letting
    # the next cell fail with a NameError (or, in an interactive session,
    # silently reuse a merged table left over from an earlier run).
    raise ValueError(
        str(not_found_coefficients.size)
        + " transcript(s) with coefficients are missing from the expression data"
    )

In [None]:
# Calculate P(differentiated) for each accession

# Calculate z-scores using the mean and standard deviation columns merged in
# alongside the pre-computed logistic regression coefficients
expression_data_key_transcripts['z_score'] = (
    (expression_data_key_transcripts['log10_tpm'] - expression_data_key_transcripts['target_mean_log10_tpm'])
    / expression_data_key_transcripts['target_StdDev_log10_tpm']
)

# Multiply z-score by coefficient
expression_data_key_transcripts['weighting'] = (
    expression_data_key_transcripts['z_score'] * expression_data_key_transcripts['coefficient']
)

# Sum weightings per accession; reset_index() turns the group key back into an
# 'Accession' column
weightings = (expression_data_key_transcripts
              .groupby(by='Accession')['weighting']
              .sum()
              .reset_index()
             )

# Add intercept value
weightings["weighting_plus_intercept"] = weightings["weighting"] + intercept

# Calculate probability
# P(t) = 1 / (1 + e^(-t))
# np.exp works on the whole column and saturates to inf for very negative t
# (giving a probability of 0), whereas math.exp raises OverflowError there.
with np.errstate(over='ignore'):
    weightings["LogReg_p(differentiated)"] = 1 / (1 + np.exp(-weightings["weighting_plus_intercept"]))

# Call a sample differentiated (1) when its probability is at least 0.5.
# NOTE(review): the cut-off is hard-coded; differentiation_threshold from the
# setup cell is not used here - confirm which is intended.
weightings["Differentiated"] = (weightings["LogReg_p(differentiated)"] >= 0.5).astype(int)

# Keep the reporting columns, highest probability first
results = (weightings
           .loc[:, ["Accession", "LogReg_p(differentiated)", "Differentiated"]]
           .sort_values(by="LogReg_p(differentiated)", axis=0, ascending=False)
          )

# Drop the intermediates that are no longer needed
del weightings
del expression_data_key_transcripts

In [None]:
# Bar chart of p(differentiated) for every accession, in the order of `results`
plt.figure(figsize=(10, 10))
sns.set_theme(style="whitegrid")
ax = sns.barplot(data=results,
                 x="Accession",
                 y="LogReg_p(differentiated)",
                 color="teal")

plt.xticks(rotation=90)
plt.ylim(0, 1)
# Dashed reference line at p = 0.5
plt.axhline(y=0.5, color='black', linestyle='--')

# Hide the x tick labels once there are more than 50 accessions
if len(results) > 50:
    plt.tick_params(labelbottom=False)

plt.show()

In [None]:
# Save the per-accession predictions as a gzipped, tab-separated file
results_file = "run_classification_results.tsv.gz"
print(f"Writing results to: {results_file}")
results.to_csv(results_file, sep="\t", index=False, compression='gzip')

## Prediction Stability

The build_logistic_regression_classifier.ipynb Jupyter Notebook performs multiple logistic regressions on subsets of the training data to determine the stability of a prediction.  The resulting coefficients from these regressions are written to an output file.  Here, the coefficients (and intercepts) from those multiple logistic regressions are read in and used to run multiple logistic regression classifications on the new datasets of interest.

In [None]:
# Load the collated coefficient table (tab-separated)
print(f"Reading in multiple coefficients file: {collated_coefficients_file}")
collated_coefficients = pd.read_csv(collated_coefficients_file, sep="\t")

In [None]:
# Collect each distinct value of the logistic_regression_iteration column
iteration_column = collated_coefficients['logistic_regression_iteration']
logistic_regression_iteration_ids = iteration_column.drop_duplicates()

In [None]:
# Split the collated table into its INTERCEPT rows and the remaining
# (per-transcript) coefficient rows
collated_intercept_mask = collated_coefficients['target_id'] == 'INTERCEPT'
intercepts = collated_coefficients[collated_intercept_mask]
collated_coefficients = collated_coefficients[~collated_intercept_mask]

In [None]:
# Merge collated coefficients with expression data
# Doing this once is most likely more efficient than performing a merge 1000s of times
# NOTE: expression_data is overwritten with the merged table, so this cell is
# not safe to re-run without re-running the cells that build expression_data.
collated_coefficient_transcripts = collated_coefficients['target_id'].drop_duplicates()
expression_transcripts = expression_data['target_id'].drop_duplicates()

not_found_coefficients = collated_coefficient_transcripts[
    ~collated_coefficient_transcripts.isin(expression_transcripts)
]

if not_found_coefficients.empty:
    print("Good news: all collated logistic regression coefficients found in expression data")
    expression_data = pd.merge(expression_data, collated_coefficients, how="inner", on="target_id")
else:
    print("Warning: coefficients missing in input expression data:")
    print(not_found_coefficients)
    # Without the merge, expression_data has no logistic_regression_iteration
    # column and the stability loop cannot run, so stop with a clear error.
    raise ValueError(
        str(not_found_coefficients.size)
        + " transcript(s) with collated coefficients are missing from the expression data"
    )


In [None]:
# Calculate P(differentiated) for each accession for each logistic regression iteration
# Per-iteration tables are gathered in a list and concatenated once at the end
# (DataFrame.append was removed in pandas 2.0 and re-copies the data each call).
per_iteration_results = []

for i in logistic_regression_iteration_ids:
    expression_data_of_interest = (expression_data
                                        .query("logistic_regression_iteration == @i")
                                        .reset_index(drop=True)
     )

    # Use this iteration's own intercept, not the single-model `intercept`.
    # Presumably the INTERCEPT rows store their value in the 'coefficient'
    # column, as in the single-model coefficients file - verify.
    iteration_intercepts = intercepts.loc[
        intercepts['logistic_regression_iteration'] == i, 'coefficient'
    ]
    if iteration_intercepts.shape[0] != 1:
        raise ValueError(
            "Expected exactly one INTERCEPT row for logistic regression iteration "
            + str(i) + ", found " + str(iteration_intercepts.shape[0])
        )
    iteration_intercept = iteration_intercepts.iloc[0]

    # Calculate z-scores using the mean and standard deviation columns merged
    # in alongside this iteration's coefficients
    expression_data_of_interest['z_score'] = (
        (expression_data_of_interest['log10_tpm'] - expression_data_of_interest['target_mean_log10_tpm'])
        / expression_data_of_interest['target_StdDev_log10_tpm']
    )

    # Multiply z-score by coefficient
    expression_data_of_interest['weighting'] = (
        expression_data_of_interest['z_score'] * expression_data_of_interest['coefficient']
    )

    # Sum weightings per accession
    weightings = (expression_data_of_interest
                  .groupby(by='Accession')['weighting']
                  .sum()
                  .reset_index()
                 )

    # Add intercept value
    weightings["weighting_plus_intercept"] = weightings["weighting"] + iteration_intercept

    # Calculate probability
    # P(t) = 1 / (1 + e^(-t))
    # np.exp saturates to inf for very negative t (probability 0), whereas
    # math.exp raises OverflowError there.
    with np.errstate(over='ignore'):
        weightings["LogReg_p(differentiated)"] = 1 / (1 + np.exp(-weightings["weighting_plus_intercept"]))

    weightings["Differentiated"] = weightings["LogReg_p(differentiated)"] >= 0.5

    stability_results = (weightings
                         .loc[:, ["Accession", "LogReg_p(differentiated)", "Differentiated"]]
                         .sort_values(by='Accession', axis=0, ascending=False)
                        )
    del weightings

    per_iteration_results.append(stability_results)

# With no iterations at all, fall back to an empty DataFrame
if per_iteration_results:
    collated_results = pd.concat(per_iteration_results, ignore_index=True)
else:
    collated_results = pd.DataFrame()

In [None]:
# Reduce the per-iteration calls to one row per accession: the mean of the
# 0/1 'Differentiated' calls, stored as 'Mean_Differentiated'
collated_results = (collated_results
        .assign(Mean_Differentiated=lambda frame: frame['Differentiated'].astype(int))
        .groupby(by='Accession', as_index=False)[['Mean_Differentiated']]
        .mean()
    )

# Attach that column to the single-model results from earlier (left join on
# Accession, so every row of `results` is kept)
collated_results = pd.merge(results, collated_results, how='left', on='Accession')

In [None]:
# Plot the stability results: the single-model probability (x) against the
# mean of the per-iteration differentiated calls (y), one point per accession
plt.figure(figsize=(14, 10))

plt.scatter(x=collated_results['LogReg_p(differentiated)'],
            y=collated_results['Mean_Differentiated'],
           )

# Dashed red reference lines at 0.5 on both axes
plt.axhline(y=0.5, color='r', linestyle='--')
plt.axvline(x=0.5, color='r', linestyle='--')

# Pad the 0-1 range slightly so points at the extremes are not clipped
plt.xlim(-0.1, 1.1)
plt.ylim(-0.1, 1.1)

plt.xlabel('Logistic Regression p(differentiated)')
plt.ylabel('Proportion Differentiated')    # label spelling corrected
plt.show()

In [None]:
# Save the combined single-model / stability table as a gzipped,
# tab-separated file
stability_analysis_file = 'run_classification_stability_results.tsv.gz'
print(f"Writing results to: {stability_analysis_file}")
collated_results.to_csv(stability_analysis_file, sep="\t", index=False, compression='gzip')