# Logistic Regression Classifier of Differentiation in Cell Lines using Expression Data

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix

## Setup (edit as required)

In [None]:
# Setup (edit as required)

# Tab-separated expression table that the notebook reads in.
expression_datafile = 'classifier_input.tsv.gz'

# Tuple of all retention groups to include. A one-element tuple needs the
# trailing comma, otherwise Python reads the parentheses as plain grouping.
retention_groups_to_process = (0,)

# Rows are kept only when their target_max_log10_tpm is at least this value.
expression_threshold = 2.75

# A Diff_efficiency above this value is labelled as differentiated (set in paper).
differentiation_threshold = 0.2

## Data overview

In [None]:
# Load the tab-separated expression table and report how much data it holds
expression_data = pd.read_csv(expression_datafile, sep="\t")
print(f"Reading in: {expression_datafile}")

for description, column in (
    ("accessions", "Accession"),
    ("cell lines", "Cell_line"),
    ("transcripts", "target_id"),
):
    # nunique() skips missing values, exactly as drop_duplicates().count() does
    print(f"Number of different {description}: {expression_data[column].nunique()}")

In [None]:
# Histogram of the log10_tpm column over every row loaded so far
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(expression_data['log10_tpm'], bins=100)
ax.set_xlabel('Log10(tpm)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Empirical cumulative distribution of log10_tpm, one curve per accession.
# The legend is switched off.
ecdf_options = dict(x="log10_tpm", hue="Accession", legend=False)
plot = sns.ecdfplot(data=expression_data, **ecdf_options)

In [None]:
# Z-score overview: histogram of the z_score column over every row loaded so far
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(expression_data['z_score'], bins=100)
ax.set(xlabel='Z-score', ylabel='Frequency')
plt.show()

In [None]:
# Filter the table in two steps:
#   i)  keep rows whose Retention_group is one of the configured groups
#   ii) keep rows whose target_max_log10_tpm reaches the expression threshold
boolean_series = expression_data['Retention_group'].isin(retention_groups_to_process)
expression_data = expression_data.loc[boolean_series]

reaches_threshold = expression_data['target_max_log10_tpm'] >= expression_threshold
expression_data = (
    expression_data
    .loc[reaches_threshold]
    # Ordered by accession, then transcript: useful when re-shaping
    .sort_values(by=['Accession', 'target_id'])
)

print("Analysis using:")
for description, column in (
    ("accessions", "Accession"),
    ("cell lines", "Cell_line"),
    ("transcripts", "target_id"),
):
    # nunique() skips missing values, exactly as drop_duplicates().count() does
    print(f"Number of different {description}: {expression_data[column].nunique()}")

In [None]:
# Log10 histogram after filtering: same plot as before, on the rows that remain
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(expression_data['log10_tpm'], bins=100)
ax.set_xlabel('Log10(tpm)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Cumulative distribution after filtering: one log10_tpm ECDF curve per
# accession, drawn without a legend
plot = sns.ecdfplot(
    data=expression_data,
    x="log10_tpm",
    hue="Accession",
    legend=False,
)

In [None]:
# Z-score histogram after filtering
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(expression_data['z_score'], bins=100)
ax.set(xlabel='Z-score', ylabel='Frequency')
plt.show()

In [None]:
# Add a 0/1 column: 1 where the row's Diff_efficiency is strictly above the
# differentiation threshold, 0 otherwise (missing values compare False -> 0)
exceeds_threshold = expression_data['Diff_efficiency'] > differentiation_threshold
expression_data['Differentiated'] = exceeds_threshold.astype(int)

In [None]:
# Create simplified files in a user-friendly format for analysis in other
# tools (e.g. R, Excel): one wide table per measure, with one row per
# transcript (target_id) and one column per "<Accession>_<Cell_line>" sample.


def _write_wide_table(long_table, value_column, output_file):
    """Pivot one measure into a transcript-by-sample table and write it out.

    Args:
        long_table: DataFrame with 'Accession', 'Cell_line', 'target_id' and
            ``value_column`` columns, one row per (sample, transcript).
        value_column: Name of the column whose values fill the wide table.
        output_file: Path of the gzip-compressed, tab-separated file to write.

    Raises:
        ValueError: Raised by ``DataFrame.pivot`` when a (target_id, sample)
            pair occurs more than once.
    """
    # assign() works on a copy of the selected columns, so the caller's table
    # does not gain a 'Cell_Sample' column as a side effect.
    wide_table = (
        long_table
        .loc[:, ['target_id', value_column]]
        .assign(Cell_Sample=long_table['Accession'] + "_" + long_table['Cell_line'])
        .pivot(index="target_id", columns='Cell_Sample', values=value_column)
    )

    # The transcript ids go out as the first column, under the header
    # 'Cell_Sample'; the index itself is not written.
    wide_table.insert(0, 'Cell_Sample', wide_table.index)

    #Write out the result
    print("Writing results to: " + output_file)
    wide_table.to_csv(output_file, index=False, compression='gzip', sep="\t")


for value_column, external_analysis_file in (
    ('log10_tpm', 'external_analysis_data_log10_tpm.tsv.gz'),    #Log10(TPM+1)
    ('z_score', 'external_analysis_data_z_score.tsv.gz'),        #Z-scores
):
    _write_wide_table(expression_data, value_column, external_analysis_file)

## Logistic Regression

In [None]:
# Class distribution, counted over distinct (Accession, Differentiated) pairs
# rather than over every transcript row
accession_classes = expression_data[['Accession', 'Differentiated']].drop_duplicates()

fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=accession_classes, x='Differentiated', ax=ax)
ax.set_title('Class distribution: 0=Undifferentiated, 1=Differentiated')
plt.show()

In [None]:
#Re-shape the expression data to a format usable by ML tools:
#one row per accession, one column per transcript, z-scores as the values
z_score_matrix = (expression_data
        .loc[:, ['Accession', 'target_id', 'z_score']]
        .pivot(index="Accession", columns='target_id', values='z_score')
    )
X = z_score_matrix.to_numpy()

#Determine the differentiation labels in a numpy format.
#The labels are looked up by accession in the row order of the pivoted matrix,
#so row i of X and element i of y always describe the same accession instead
#of relying on expression_data having been sorted beforehand. reindex() raises
#if an accession carries more than one distinct label.
label_by_accession = (expression_data
        .loc[:, ['Accession', 'Differentiated']]
        .drop_duplicates()
        .set_index('Accession')
        .loc[:, 'Differentiated']
    )
y = label_by_accession.reindex(z_score_matrix.index).to_numpy()

In [None]:
# Split the data: 30% of the rows of X (with their labels) are held out for
# testing. No random_state is passed, so the partition differs from run to
# run; add e.g. random_state=42 to the call for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
)

In [None]:
#Fit an L1-penalised logistic regression (liblinear solver) on the training split
lreg = LogisticRegression(penalty='l1', solver='liblinear', max_iter=100)
lreg.fit(X_train, y_train)

#True labels and hard class calls for the held-out samples
expected = y_test
predicted = lreg.predict(X_test)

#Keep only the second predict_proba column: the probability of the second
#entry of lreg.classes_ (label 1 when both classes occur in the training split)
predicted_probs = lreg.predict_proba(X_test)[:, 1]

In [None]:
# Predictions on the training split, computed once and used for both scores
train_predicted = lreg.predict(X_train)

# Accuracy on the training and test splits
print('Training set accuracy: ', accuracy_score(y_train, train_predicted))
acc = accuracy_score(expected, predicted)
print('Test set accuracy: ', acc)

# Cohen's Kappa score on the training and test splits
print('Training set Kappa: ', cohen_kappa_score(y_train, train_predicted))
cka = cohen_kappa_score(expected, predicted)
print('Test set Kappa: ', cka)

In [None]:
# Generate a ROC AUC plot.
# roc_curve sweeps a decision threshold over continuous scores, so it is given
# the predicted probability of the positive class. Passing the hard 0/1 calls
# would collapse the curve to a single operating point and distort the AUC.
fpr, tpr, threshold = metrics.roc_curve(y_test, predicted_probs)
roc_auc = metrics.auc(fpr, tpr)

plt.figure(figsize=(7,5))
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, label = 'AUC = %0.2f' % roc_auc)
# Diagonal: the curve expected from a classifier that guesses at random
plt.plot([0, 1], [0, 1],'r--', label = 'Random chance')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc = 'lower right')
plt.show()

In [None]:
# Generate a PR plot.
# precision_recall_curve evaluates precision and recall at every threshold of
# a continuous score, so it is given the predicted probability of the positive
# class rather than the hard 0/1 calls (which give only one operating point).
precision, recall, _ = metrics.precision_recall_curve(y_test, predicted_probs)

plt.figure(figsize=(7,5))
plt.step(recall, precision, where='post')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.show()

In [None]:
#Confusion matrix
# labels=[0, 1] pins the matrix to 2x2 in the order Undifferentiated (0),
# Differentiated (1), even when one class is absent from the test split, so
# the two tick labels set below always match the rows and columns.
cm = confusion_matrix(expected, predicted, labels=[0, 1])

plt.figure(figsize=(7,5))
ax = plt.subplot()
sns.heatmap(cm, annot=True, ax = ax, cmap=plt.cm.Blues, fmt='g')

# labels, title and ticks (rows are true labels, columns are predicted labels)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['Undifferentiated', 'Differentiated'])
ax.yaxis.set_ticklabels(['Undifferentiated', 'Differentiated'])

plt.show()

In [None]:
# Scatter of predicted probability (y) against the true class (x) for each
# test sample, coloured by whether the hard class call matched the true class
correct_prediction = predicted == expected

fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(expected, predicted_probs, c=correct_prediction)
ax.set_xlabel('Classification')
ax.set_ylabel('Probability')
plt.show()

In [None]:
# Plot the test samples in order of increasing predicted probability,
# coloured by their true class.
ranked_predictions = (
    pd.DataFrame({"Predicted": predicted_probs, "Differentiated": expected})
    .sort_values(by="Predicted")
    # Discard the pre-sort row labels. Without this, attaching a 0..n-1
    # counter aligns on the old labels and silently undoes the sort.
    .reset_index(drop=True)
)
# After the reset the index is 0..n-1, i.e. the rank of each sample
ranked_predictions.insert(0, "Counter", ranked_predictions.index)

plt.figure(figsize=(14,10))
plt.scatter(x=ranked_predictions['Counter'],
            y=ranked_predictions['Predicted'],
            c=ranked_predictions['Differentiated'])
# The x position is only a rank, so the tick marks are hidden
plt.xticks([])
# Reference lines at probability 1/2 (dashed) and at 1/3 and 2/3 (dotted)
plt.axhline(y=0.5, color='r', linestyle='--')
plt.axhline(y=1/3, color='r', linestyle=':')
plt.axhline(y=2/3, color='r', linestyle=':')
plt.ylabel('Probability')
plt.show()