In [None]:
# Run Logistic Regression Classifier Model to Predict Expression Status

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import math

## Setup (edit as required)

In [None]:
# Setup (edit as required)

# Tab-separated expression table (gzip-compressed) loaded by the "Read in data" cell.
expression_datafile = "input.tsv.gz"

# Tab-separated table holding the pre-computed logistic regression coefficients.
coefficents_file = "logistic_regression_coefficients.tsv"

# Set in paper.
# NOTE(review): this value is not referenced by any other cell visible in this
# notebook; the classification step compares against a literal 0.5 — confirm
# which cut-off is intended.
differentiation_threshold = 0.2

In [None]:
## Data overview and QC

In [None]:
# Read in data, then report how many distinct values each key column contains
expression_data = pd.read_csv(expression_datafile, sep="\t")
print(f"Reading in: {expression_datafile}")
print(f"Number of different accessions: {expression_data['Accession'].nunique()}")
print(f"Number of different cell lines: {expression_data['Cell_line'].nunique()}")
print(f"Number of different transcripts: {expression_data['target_id'].nunique()}")

In [None]:
# Log10 tpm histogram: distribution of log10_tpm over every row of the expression table
fig, ax = plt.subplots(figsize=(8, 8))
ax.hist(expression_data["log10_tpm"], bins=100)
ax.set_xlabel("Log10(tpm)")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Empirical cumulative distribution of log10_tpm, one curve per accession.
# The legend is suppressed.
plt.figure(figsize=(8, 8))
plot = sns.ecdfplot(
    data=expression_data,
    x="log10_tpm",
    hue="Accession",
    legend=False,
)

In [None]:
# Since we are using an already established logistic regression model, the expression
# DataFrame may be simplified to only include relevant data
relevant_columns = ["Accession", "target_id", "Cell_line", "log10_tpm"]
expression_data = expression_data[relevant_columns]
del relevant_columns

In [None]:
# Create a simplified file in a user-friendly format for analysis in other tools (e.g. R, Excel)

# Values written are the log10_tpm column (described by the author as Log10(TPM+1)).
# The sample label is built with .assign() on a new frame so that expression_data
# itself is left untouched (a plain `= expression_data` would only alias it and the
# new 'Cell_Sample' column would leak into the frame used by the model cells below).
data_for_external_analysis = (
    expression_data
    .assign(Cell_Sample=lambda df: df['Accession'] + "_" + df['Cell_line'])
    .loc[:, ['Cell_Sample', 'target_id', 'log10_tpm']]
    # Wide layout: one row per transcript, one column per Accession_Cell_line sample.
    # pivot() raises if a (target_id, Cell_Sample) pair occurs more than once.
    .pivot(index="target_id", columns='Cell_Sample', values='log10_tpm')
)

# Expose the transcript ids (the pivot index) as the first column, because the file
# is written with index=False. The column header is kept as 'Cell_Sample' so the
# output file stays identical to what this cell has always produced.
data_for_external_analysis.insert(0, 'Cell_Sample', data_for_external_analysis.index)

# Write out the result
external_analysis_file = 'external_analysis_data_log10_tpm.tsv.gz'
print("Writing results to: " + external_analysis_file)
data_for_external_analysis.to_csv(external_analysis_file, index=False, compression='gzip', sep="\t")

# Unbind the wide table; it is not needed by the rest of the notebook
del data_for_external_analysis

## Run Logistic Regression Model

In [None]:
# Import coefficients: load the tab-separated logistic regression coefficient table
print(f"Reading in coefficients file: {coefficents_file}")
coefficients = pd.read_csv(coefficents_file, sep="\t")

In [None]:
# Extract intercept
# The coefficients table is expected to carry the model intercept in its first row
# (index label 0), marked by 'INTERCEPT' in the 'target_id' column.
# Fail fast when it is absent: continuing would treat a transcript coefficient as the
# intercept and silently produce wrong probabilities. The `0 not in index` test also
# covers re-running this cell after the intercept row has already been removed.
if 0 not in coefficients.index or coefficients.loc[0, 'target_id'] != 'INTERCEPT':
    raise ValueError("Intercept not found in " + coefficents_file)
intercept = coefficients.loc[0, 'coefficient']

# Remove the intercept row so only per-transcript coefficients remain
coefficients = coefficients.drop(index=0)


In [None]:
# Merge Coefficients data with expression data
coefficient_transcripts = coefficients['target_id'].drop_duplicates()
expression_transcripts = expression_data['target_id'].drop_duplicates()

# Transcripts the model has coefficients for but which are absent from the expression data
not_found_coefficients = coefficient_transcripts[
    ~coefficient_transcripts.isin(expression_transcripts)
]

if not_found_coefficients.empty:
    print("Good news: all logistic regression coefficients found in expression data")
    # Drop any model columns left over from a previous run of this cell first, so that
    # re-running does not produce suffixed duplicates (e.g. coefficient_x / coefficient_y).
    model_columns = coefficients.columns.difference(['target_id'])
    expression_data = pd.merge(
        expression_data.drop(columns=model_columns, errors='ignore'),
        coefficients,
        how="inner",
        on="target_id",
    )
    del model_columns
else:
    print("Warning: coefficients missing in input expression data:")
    print(not_found_coefficients)
    # Stop here with an explicit message: without the merge the model columns do not
    # exist, so the probability cell would otherwise fail with an unrelated KeyError.
    raise ValueError(
        "Cannot apply the logistic regression model: "
        + str(not_found_coefficients.size)
        + " coefficient transcript(s) missing from the expression data"
    )

    

In [None]:
# Calculate P(differentiated) for each accession

# Calculate Z-scores using the mean and stdev from the pre-computed logistic regression model
expression_data['z_score'] = (
    (expression_data['log10_tpm'] - expression_data['target_mean_log10_tpm'])
    / expression_data['target_StdDev_log10_tpm']
)

# Multiply z-score by coefficients
expression_data['weighting'] = expression_data['z_score'] * expression_data['coefficient']

# Sum weightings per accession (Accession is kept as a regular column)
weightings = expression_data.groupby('Accession', as_index=False)['weighting'].sum()

# Add intercept value to obtain the linear predictor t
weightings["weighting_plus_intercept"] = weightings["weighting"] + intercept

# Calculate probability with the logistic function
# P(t) = 1 / (1 + e^(-t))
# The exponent must be -t; computing exp(1 - t) instead shifts every score by one
# unit and under-estimates the probability. np.exp is vectorised and returns inf
# (giving p = 0) for very large exponents rather than raising OverflowError.
weightings["p(differentiated)"] = 1 / (1 + np.exp(-weightings["weighting_plus_intercept"]))

# NOTE(review): the cut-off is the literal 0.5; the setup cell defines
# differentiation_threshold = 0.2 but it is not used here — confirm which is intended.
weightings["Differentiated"] = weightings["p(differentiated)"] >= 0.5

# Keep only the reporting columns, most probably differentiated first
results = (
    weightings
    .loc[:, ["Accession", "p(differentiated)", "Differentiated"]]
    .sort_values(by="p(differentiated)", axis=0, ascending=False)
)
del weightings

In [None]:
# Plot results: one bar per accession showing p(differentiated),
# with a dashed reference line drawn at 0.5
plt.figure(figsize=(10, 10))
# The theme is applied after the figure is created, before the axes are drawn
sns.set_theme(style="whitegrid")
ax = sns.barplot(
    data=results,
    x="Accession",
    y="p(differentiated)",
    color="teal",
)
ax.set_ylim(0, 1)
# Rotate the accession labels so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=90)
ax.axhline(y=0.5, color="black", linestyle="--")
plt.show()