# Logistic Regression Classifier of Differentiation in Cell Lines using Expression Data

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

In [None]:
# Run configuration -- edit these values before executing the rest of the notebook.

# Tab-separated input table read by pandas (see the data-import cell)
expression_datafile = 'classifier_input.tsv.gz'

# Retention groups whose rows are kept; a one-element tuple needs the trailing comma
retention_groups_to_process = (0,)

# Rows are kept only when target_max_log10_tpm >= this value
expression_threshold = 2.75

# Diff_efficiency above this value is labelled as differentiated (value set in paper)
differentiation_threshold = 0.2

## Data import, QC and summarisation

In [None]:
# Load the tab-separated expression table, then report how many distinct
# accessions, cell lines and transcripts it contains.
expression_data = pd.read_csv(expression_datafile, sep="\t")
print("Reading in: " + expression_datafile)

for label, column in (
    ("Number of different accessions: ", 'Accession'),
    ("Number of different cell lines: ", 'Cell_line'),
    ("Number of different transcripts: ", 'target_id'),
):
    # nunique() ignores missing values, as drop_duplicates().count() does
    print(label + str(expression_data[column].nunique()))

In [None]:
# Histogram (100 bins) of the log10_tpm column over every row of the input
fig, ax = plt.subplots(figsize=(8, 8))
ax.hist(expression_data['log10_tpm'], bins=100)
ax.set_xlabel('Log10(tpm)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Empirical cumulative distribution of log10_tpm, one curve per accession.
# The legend is suppressed (one entry per accession would be drawn otherwise).
plt.figure(figsize=(8, 8))
ecdf_options = dict(x="log10_tpm", hue="Accession", legend=False)
plot = sns.ecdfplot(data=expression_data, **ecdf_options)

In [None]:
# Histogram (100 bins) of the z_score column over every row of the input
fig, ax = plt.subplots(figsize=(8, 8))
ax.hist(expression_data['z_score'], bins=100)
ax.set_xlabel('Z-score')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Restrict the table to (i) the configured retention groups and
# (ii) rows whose target_max_log10_tpm reaches the expression threshold.
boolean_series = expression_data['Retention_group'].isin(retention_groups_to_process)
expression_data = expression_data.loc[boolean_series]

passes_threshold = expression_data['target_max_log10_tpm'] >= expression_threshold
expression_data = expression_data.loc[passes_threshold]

# A stable (Accession, target_id) ordering is relied upon when re-shaping later
expression_data = expression_data.sort_values(by=['Accession', 'target_id'])

print("Analysis using:")
for label, column in (
    ("Number of different accessions: ", 'Accession'),
    ("Number of different cell lines: ", 'Cell_line'),
    ("Number of different transcripts: ", 'target_id'),
):
    print(label + str(expression_data[column].nunique()))

In [None]:
# Same log10_tpm histogram as before, now on the filtered table
fig, ax = plt.subplots(figsize=(8, 8))
ax.hist(expression_data['log10_tpm'], bins=100)
ax.set_xlabel('Log10(tpm)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Per-accession cumulative distribution of log10_tpm on the filtered table
# (legend suppressed, as in the unfiltered version of this plot).
plt.figure(figsize=(8, 8))
ecdf_options = dict(x="log10_tpm", hue="Accession", legend=False)
plot = sns.ecdfplot(data=expression_data, **ecdf_options)

In [None]:
# Z-score histogram (100 bins) on the filtered table
fig, ax = plt.subplots(figsize=(8, 8))
ax.hist(expression_data['z_score'], bins=100)
ax.set_xlabel('Z-score')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Binary class label: 1 when Diff_efficiency is strictly above the
# differentiation threshold, otherwise 0 (missing values compare False -> 0).
expression_data['Differentiated'] = (
    expression_data['Diff_efficiency'] > differentiation_threshold
).astype(int)

In [None]:
# Export wide-format tables (one row per transcript, one column per sample)
# for use in other tools (e.g. R, Excel): one file for log10_tpm, one for z_score.

# NOTE: this adds the Cell_Sample column to expression_data itself.
expression_data['Cell_Sample'] = expression_data['Accession'] + "_" + expression_data['Cell_line']

for value_column, external_analysis_file in (
    ('log10_tpm', 'external_analysis_data_log10_tpm.tsv.gz'),
    ('z_score', 'external_analysis_data_z_score.tsv.gz'),
):
    wide_table = expression_data.pivot(index="target_id", columns='Cell_Sample', values=value_column)

    # The index is not written (index=False), so copy the transcript IDs into
    # the first column. The column header is 'Cell_Sample' even though the
    # values are target_id values.
    wide_table.insert(0, 'Cell_Sample', wide_table.index)

    #Write out the result
    print("Writing results to: " + external_analysis_file)
    wide_table.to_csv(external_analysis_file, index=False, compression='gzip', sep="\t")

# Release the temporary wide table; external_analysis_file keeps the last name written
del wide_table, value_column

## Logistic Regression

In [None]:
# Bar chart of how many accessions fall into each class.
# The long table has one row per (accession, transcript), so reduce it to one
# row per (Accession, Differentiated) pair before counting.
class_per_accession = expression_data[['Accession', 'Differentiated']].drop_duplicates()

plt.figure(figsize=(8, 8))
sns.countplot(data=class_per_accession, x='Differentiated')
plt.title('Class distribution: 0=Undifferentiated, 1=Differentiated')
plt.show()

In [None]:
#Re-shape the expression data to a format usable by ML tools
#This creates a standardised datastructure / naming convention where:
# X: input parameters (one row per accession, one column per target_id, values = z_score)
# y: target (expected) results (i.e. 0/1 flag of whether accession is differentiated)
#
# pivot() returns rows sorted by Accession and columns sorted by target_id;
# sort_index(axis=1) makes the column order explicit.
X = (expression_data
        .loc[:, ['Accession', 'target_id', 'z_score']]
        .pivot(index="Accession", columns='target_id', values='z_score')
        .sort_index(axis=1)
        .to_numpy()
    )

#Determine the differentiation labels in a numpy format.
#Sort explicitly by Accession so that y[i] always describes the same accession
#as row i of X, instead of relying on expression_data having been sorted earlier.
y = (expression_data
        .loc[:, ['Accession', 'Differentiated']]
        .drop_duplicates()
        .sort_values(by=['Accession'])
        .loc[:, 'Differentiated']
        .to_numpy()
    )

### Leave-one-out Cross-Validation

In [None]:
# Splitter for leave-one-out cross validation; it is passed as cv= to the
# cross_val_score / cross_val_predict calls on the logistic regression below.
cross_validation = LeaveOneOut()

In [None]:
# L1-penalised logistic regression (liblinear solver), scored once per
# cross-validation split.
lreg = LogisticRegression(penalty='l1', solver='liblinear', max_iter=100)
scores = cross_val_score(estimator=lreg, X=X, y=y, cv=cross_validation)

In [None]:
# Summarise the per-split scores as mean and standard deviation
mean_accuracy = scores.mean()
accuracy_spread = scores.std()

print("Logistic regression results:")
print("%0.2f accuracy with a standard deviation of %0.2f" % (mean_accuracy, accuracy_spread))

In [None]:
# Out-of-fold predictions for every row of X:
#   predicted                 -> hard class labels
#   predictions_probabilities -> second column of predict_proba (class 1)
loo_options = dict(cv=cross_validation)

predicted = cross_val_predict(lreg, X, y, **loo_options)
class_probabilities = cross_val_predict(lreg, X, y, method="predict_proba", **loo_options)
predictions_probabilities = class_probabilities[:, 1]

In [None]:
# Confusion matrix of the cross-validated predictions, drawn as a heatmap
cm = confusion_matrix(y, predicted)
class_names = ['Undifferentiated', 'Differentiated']

fig, ax = plt.subplots(figsize=(10.5, 7.5))
sns.heatmap(cm, annot=True, fmt='g', cmap=plt.cm.Blues, ax=ax)

# Axis labels, title and per-class tick labels
ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

plt.show()

In [None]:
# Agreement between true labels and cross-validated predictions:
# plain accuracy and Cohen's Kappa.
acc = accuracy_score(y, predicted)
cka = cohen_kappa_score(y, predicted)

print('Accuracy: ', acc)
print("Cohen's Kappa: ", cka)

In [None]:
# Generate a ROC AUC plot.
# roc_curve needs a continuous score per sample to sweep the decision
# threshold, so pass the cross-validated class-1 probabilities. Passing the
# already-thresholded 0/1 predictions would collapse the curve to a single
# operating point and give a misleading AUC.
fpr, tpr, threshold = metrics.roc_curve(y, predictions_probabilities)
roc_auc = metrics.auc(fpr, tpr)

plt.figure(figsize=(8,8))
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, label = 'AUC = %0.2f' % roc_auc)
plt.plot([0, 1], [0, 1],'r--', label = 'Random chance')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc = 'lower right')
plt.show()

In [None]:
# Generate a precision-recall plot.
# As with the ROC curve, the curve is traced by sweeping a threshold over a
# continuous score, so use the cross-validated class-1 probabilities rather
# than the hard 0/1 predictions.
precision, recall, _ = metrics.precision_recall_curve(y, predictions_probabilities)

plt.figure(figsize=(8, 8))
plt.step(recall, precision, where='post')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.show()

In [None]:
#Plot differentiation efficiency against the cross-validated p(differentiated)
plt.figure(figsize=(14, 10))

# One row per accession, explicitly sorted by Accession so that row i lines up
# with predictions_probabilities[i] (the rows of X come from a pivot indexed
# by Accession, which is returned in sorted order).
real_differentiation_scores = (expression_data
                                .loc[:, ['Accession', 'Diff_efficiency', 'Differentiated']]
                                .drop_duplicates()
                                .sort_values(by=['Accession'])
                              )

plt.scatter(x=real_differentiation_scores['Diff_efficiency'],
            y=predictions_probabilities,
            c=real_differentiation_scores['Differentiated']
           )
# Horizontal line: the 0.5 probability level.
# Vertical line: the configured labelling threshold, so the plot stays correct
# when differentiation_threshold is edited in the setup cell.
plt.axhline(y=0.5, color='r', linestyle='--')
plt.axvline(x=differentiation_threshold, color='r', linestyle='--')
plt.xlabel('Real Differentiation Score')
plt.ylabel('Classifier: p(differentiated)')
plt.show()

### Logistic regression using all datasets as training data

In [None]:
# Fit one L1-penalised logistic regression on the complete X / y
# (no hold-out: every accession that survived filtering is training data).
lreg = LogisticRegression(penalty='l1', solver='liblinear', max_iter=100)
lreg.fit(X, y)

In [None]:
# Assemble a per-accession results table (metadata + predicted class +
# predicted probability from the model fitted on all data) and write it out.
metadata_columns = ['Accession', 'Cell_line', 'Diff_efficiency', 'Differentiated', 'Retention_group']
output_results = (expression_data[metadata_columns]
                  .drop_duplicates()
                  .reset_index(drop=True)
                 )

# Hard class labels and the second predict_proba column, as named columns
predicted = pd.Series(lreg.predict(X), name="LogReg_Differentiated")
predicted_proba = pd.Series(lreg.predict_proba(X)[:, 1], name="LogReg_p(differentiated)")

# Column-wise concat pairs rows by position (all three use a 0..n-1 index)
output_results = pd.concat([output_results, predicted, predicted_proba], axis=1)

output_file = 'classification_results_trained_on_all_data.tsv.gz'
print("Writing results to: " + output_file)
output_results.to_csv(output_file, sep="\t", compression='gzip', index=False)

del metadata_columns
del predicted, predicted_proba, output_file

In [None]:
# Build a table of the transcripts the fitted model weights positively,
# together with their summary statistics.

# The columns of X come from a pivot on target_id, which returns them in sorted
# order, so list the transcript IDs in that same sorted order before pairing
# them positionally with the coefficients.
influential_target_ids = (expression_data
                            .loc[:, 'target_id']
                            .drop_duplicates()
                            .sort_values()
                            .reset_index(drop=True)
                         )

influential_coefficients = pd.Series(lreg.coef_[0], name="coefficient")

# Positional pairing is only meaningful when there is one coefficient per transcript
if len(influential_target_ids) != len(influential_coefficients):
    raise ValueError(
        "Number of transcripts (%d) does not match number of model coefficients (%d)"
        % (len(influential_target_ids), len(influential_coefficients))
    )

influential_coefficients = pd.concat([influential_target_ids, influential_coefficients], axis=1)

# NOTE(review): only strictly positive coefficients are kept; negative
# (non-zero) coefficients are dropped from the table -- confirm this is intended.
influential_coefficients = influential_coefficients[influential_coefficients.coefficient > 0]

# Reduce the long table to its distinct per-transcript statistics BEFORE merging,
# rather than merging against every row and de-duplicating afterwards.
influential_coefficients = pd.merge(
    influential_coefficients,
    expression_data
        .loc[:, ['target_id', 'target_mean_log10_tpm', 'target_StdDev_log10_tpm']]
        .drop_duplicates(),
    how="left",
    on="target_id",
)
influential_coefficients = (influential_coefficients
    .loc[:, ['target_id', 'target_mean_log10_tpm', 'target_StdDev_log10_tpm', 'coefficient']]
    .drop_duplicates()
    )

del(influential_target_ids)

In [None]:
# Put the model intercept in the first row of the coefficients table.
# The two statistics columns do not apply to the intercept and hold the text 'NA'.
intercept_row = pd.DataFrame(
    [
        {
            'target_id': 'INTERCEPT',
            'target_mean_log10_tpm': 'NA',
            'target_StdDev_log10_tpm': 'NA',
            'coefficient': lreg.intercept_[0],
        }
    ],
    index=[1],
)

# ignore_index renumbers the rows, so the intercept ends up as row 0
influential_coefficients = pd.concat([intercept_row, influential_coefficients], ignore_index=True)

In [None]:
# Save the coefficients table (intercept row first) as a gzipped TSV
coefficents_file = 'logistic_regression_coefficients.tsv.gz'
print("Writing results to: " + coefficents_file)
influential_coefficients.to_csv(coefficents_file, sep="\t", compression='gzip', index=False)

In [None]:
# Confusion matrix of the model's predictions on the same X it is asked to
# predict here (i.e. lreg.predict(X) against y), drawn as a heatmap
cm = confusion_matrix(y, lreg.predict(X))
class_names = ['Undifferentiated', 'Differentiated']

fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(cm, annot=True, fmt='g', cmap=plt.cm.Blues, ax=ax)

# Axis labels, title and per-class tick labels
ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

plt.show()

### Model stability

In [None]:
# To provide an assessment of the stability of the model, logistic regression is
# re-run on random train/test splits of the accessions. In each iteration the
# accessions are shuffled and dealt into number_of_groups groups; each group is
# used once as the test set while the remaining groups form the training set.
# In what proportion of the runs is an accession classified as differentiated?
#
# With the values below this is a reduced run: 5 iterations x 2 groups
# (i.e. half of the accessions are held out in every fit).
number_of_interations = 5
number_of_groups = 2

# Build the accession x transcript matrix and the per-accession labels ONCE;
# every fold only selects rows from them. Rows are sorted by Accession and
# columns by target_id, identically for train and test.
all_features = (expression_data
                    .pivot(index="Accession", columns='target_id', values='z_score')
                    .sort_index(axis=1)
               )
all_labels = (expression_data
                    .loc[:, ['Accession', 'Differentiated']]
                    .drop_duplicates()
                    .set_index('Accession')
                    .loc[all_features.index, 'Differentiated']
             )

fold_results = []    # one small DataFrame per fold, concatenated after the loops
shuffle_accessions = expression_data.loc[:, ["Accession"]].drop_duplicates()

for i in (range(number_of_interations)):
    print(f"Iteration {i + 1}")

    # Shuffle the accession list; reset_index so positions run 0..n-1 again
    shuffle_accessions = (shuffle_accessions
                            .loc[:, ["Accession"]]
                            .sample(frac=1)
                            .reset_index(drop=True)
                         )

    # Recycle 0..number_of_groups-1 down the shuffled list to allocate each accession to a group
    shuffle_accessions["Group"] = np.resize(np.arange(number_of_groups), len(shuffle_accessions.index))

    #Create training and test NumPy arrays
    for test_group in range(number_of_groups):
        train_accessions = shuffle_accessions[shuffle_accessions['Group'] != test_group]
        test_accessions = shuffle_accessions[shuffle_accessions['Group'] == test_group]

        in_train = all_features.index.isin(train_accessions['Accession'])
        in_test = all_features.index.isin(test_accessions['Accession'])

        X_train = all_features.loc[in_train].to_numpy()
        y_train = all_labels.loc[in_train].to_numpy()
        X_test = all_features.loc[in_test].to_numpy()
        y_test = all_labels.loc[in_test].to_numpy()

        lreg = LogisticRegression(solver='liblinear', max_iter=100, penalty='l1')
        lreg.fit(X_train, y_train)

        # Predictions follow the row order of X_test (sorted by Accession), so
        # take the accession names from the same selection. Pairing them with
        # the shuffled accession order would attach predictions to the wrong
        # accessions.
        results = pd.DataFrame({
            "Accession": all_features.index[in_test],
            "Differentiated": lreg.predict(X_test),
        })
        fold_results.append(results)

# DataFrame.append was removed in pandas 2.0; collect and concatenate instead
if fold_results:
    results_collated = pd.concat(fold_results, ignore_index=True)
else:
    results_collated = pd.DataFrame(columns=["Accession", "Differentiated"])

# Proportion of runs in which each accession was predicted as differentiated
results_collated["Differentiated"] = results_collated["Differentiated"].astype(int)
results_collated = results_collated.groupby(by="Accession").mean()
results_collated = results_collated.rename(columns={"Differentiated" : "Mean_Differentiated"})

# Illustrative confusion matrix for the LAST train/test split only
lreg = LogisticRegression(solver='liblinear', max_iter=100, penalty='l1')
lreg.fit(X_train, y_train)

actual = y_test
predicted = lreg.predict(X_test)

#Confusion matrix (labels fixed so it is always 2x2, even if the last test
#split happens to contain a single class)
cm = confusion_matrix(actual, predicted, labels=[0, 1])

plt.figure(figsize=(7, 5))
ax = plt.subplot()
sns.heatmap(cm, annot=True, ax = ax, cmap=plt.cm.Blues, fmt='g')

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['Undifferentiated', 'Differentiated'])
ax.yaxis.set_ticklabels(['Undifferentiated', 'Differentiated'])

plt.show()

In [None]:
# To provide an assessment of the stability of the model, logistic regression is
# re-run using ~90% of the accessions as training data and ~10% as test data.
# In each of the 100 iterations the accessions are shuffled and dealt into 10
# groups, each of which is used once as the test set (100 * 10 = 1000 fits), so
# every accession is classified once per iteration. In what proportion of the
# iterations is an accession classified as differentiated?
number_of_interations = 100
number_of_groups = 10

# Build the accession x transcript matrix and the per-accession labels ONCE;
# every fold only selects rows from them. Rows are sorted by Accession and
# columns by target_id, identically for train and test.
all_features = (expression_data
                    .pivot(index="Accession", columns='target_id', values='z_score')
                    .sort_index(axis=1)
               )
all_labels = (expression_data
                    .loc[:, ['Accession', 'Differentiated']]
                    .drop_duplicates()
                    .set_index('Accession')
                    .loc[all_features.index, 'Differentiated']
             )

fold_results = []    # one small DataFrame per fold, concatenated after the loops

#List of accessions
shuffled_accessions = expression_data['Accession'].drop_duplicates().reset_index(drop=True)

for i in (range(number_of_interations)):
    if((i + 1) % 5 == 0):
        print(f"Iteration {i + 1}")

    shuffled_accessions = (shuffled_accessions
                            .sample(frac=1)
                            .reset_index(drop=True)    #Required, else identical accessions chosen each time
                           )

    # Divide accessions into approximately equally sized groups, 1 of which is the test group.
    # The length is taken from shuffled_accessions (the list built in this cell).
    accession_groups = np.array(range(number_of_groups))
    accession_groups = pd.DataFrame(np.resize(accession_groups, len(shuffled_accessions.index)), columns=["Group"])

    #Identify train/test data
    for test_group in range(number_of_groups):
        train_accessions = shuffled_accessions.loc[accession_groups["Group"] != test_group]
        test_accessions = shuffled_accessions.loc[accession_groups["Group"] == test_group]

        in_train = all_features.index.isin(train_accessions)
        in_test = all_features.index.isin(test_accessions)

        #Convert to NumPy format for logistic regression classifier
        X_train = all_features.loc[in_train].to_numpy()
        y_train = all_labels.loc[in_train].to_numpy()
        X_test = all_features.loc[in_test].to_numpy()
        y_test = all_labels.loc[in_test].to_numpy()

        lreg = LogisticRegression(solver='liblinear', max_iter=100, penalty='l1')
        lreg.fit(X_train, y_train)

        # Predictions follow the row order of X_test (sorted by Accession), so
        # the accession names are taken from the same selection.
        results = pd.DataFrame({
            "Accession": all_features.index[in_test],
            "Differentiated": lreg.predict(X_test),
        })
        fold_results.append(results)

# DataFrame.append was removed in pandas 2.0; collect and concatenate instead
if fold_results:
    results_collated = pd.concat(fold_results, ignore_index=True)
else:
    results_collated = pd.DataFrame(columns=["Accession", "Differentiated"])

# Proportion of iterations in which each accession was predicted as differentiated
results_collated["Differentiated"] = results_collated["Differentiated"].astype(int)
results_collated = results_collated.groupby(by="Accession").mean()
results_collated = results_collated.rename(columns={"Differentiated" : "Mean_Differentiated"})

In [None]:
# Plot the p(differentiated) from the model trained on all data against the
# stability score (proportion of resampled runs predicting "differentiated").
# Inner merge: only accessions present in both tables are plotted.
pdiff_vs_stability = pd.merge(output_results, results_collated, how="inner", on="Accession")

plt.figure(figsize=(14, 10))

# Points are coloured by the true class label
plt.scatter(x=pdiff_vs_stability['LogReg_p(differentiated)'],
            y=pdiff_vs_stability['Mean_Differentiated'],
            c=pdiff_vs_stability['Differentiated']
           )
plt.axhline(y=0.5, color='r', linestyle='--')
plt.axvline(x=0.5, color='r', linestyle='--')
# Both quantities lie in [0, 1]; pad the limits so edge points are not clipped
plt.xlim(-0.1, 1.1)
plt.ylim(-0.1, 1.1)
plt.xlabel('Logistic Regression p(differentiated)')
plt.ylabel('Proportion Differentiated')    # axis label spelling corrected
plt.show()