In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os.path

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
# Input files (both are read as tab-separated tables in the next cell)
# Classifier results
results_file = 'classifier_results.tsv.gz'
# Dataset metadata
metadata_file = 'dataset_summary.tsv'

# Plot styling: make "colorblind" the default seaborn palette for all figures
sns.set_palette("colorblind")

In [None]:
# Load the classifier results table
print(f"Reading in classifier results file: {results_file}")
full_results = pd.read_csv(results_file, sep="\t")
print()

# Load the metadata table and report how many rows (accessions) it holds
print(f"Reading in metadata: {metadata_file}")
metadata = pd.read_csv(metadata_file, sep="\t")
print(f'Metadata number of Accessions: {len(metadata)}')
print()

In [None]:
# Summarise the predicted p-values per accession: mean and standard deviation

def _flatten_accession_index(per_accession, value_column):
    """Turn a per-accession statistic into a flat, mergeable frame.

    per_accession: frame indexed by Accession with a single
        'Predicted_p_value' column (the output of a groupby reduction).
    value_column: name to give that statistic column.

    Returns a frame with a fresh 0..n-1 index and the columns
    [value_column, 'Accession'].
    """
    flattened = per_accession.rename(columns={'Predicted_p_value': value_column})
    flattened['Accession'] = flattened.index
    return flattened.reset_index(drop=True)


grouped_results = full_results[['Accession', 'Predicted_p_value']].groupby('Accession')

mean_p_values = _flatten_accession_index(grouped_results.mean(),
                                         'Mean_predicted_p_value')

# ddof=0 -> population standard deviation (divide by N, not N - 1)
std_p_values = _flatten_accession_index(grouped_results.std(ddof=0),
                                        'STD_predicted_p_value')

# One row per accession with its expected label, taken from the rows of
# iteration 1, then joined to both statistics. The merges use the only
# shared column ('Accession') as the key.
first_iteration = full_results.loc[full_results['Iteration'] == 1,
                                   ['Accession', 'Expected']]

results = (first_iteration
               .merge(mean_p_values, how='inner')
               .merge(std_p_values, how='inner')
          )

# The groupby object is no longer needed
del grouped_results

In [None]:
# Determine the number of TP, TN, FP, FN
# An accession is predicted positive (1) when its mean p-value is strictly
# above the threshold, otherwise negative (0).
p_value_threshold = 0.5
results['Predicted'] = np.where(results['Mean_predicted_p_value'] > p_value_threshold, 1, 0)

# Label every row in one vectorised pass instead of a row-by-row loop.
# This also guarantees the 'Accuracy' column exists when `results` is empty
# (the loop version never created it, breaking later column selection).
# As before, any Expected value other than 0 is treated as a positive.
expected_negative = results['Expected'] == 0
predicted_negative = results['Predicted'] == 0
results['Accuracy'] = np.where(
    expected_negative,
    np.where(predicted_negative, 'TN', 'FP'),
    np.where(predicted_negative, 'FN', 'TP'),
)
          

In [None]:
#Confusion matrix
# Pin the label order so the matrix is always 2x2 (row/column 0 = class 0,
# row/column 1 = class 1), even when one class is missing from the data.
# Without this the matrix can shrink and no longer match the two tick labels.
# NOTE(review): assumes Expected only holds 0/1 - confirm against the input.
cm = confusion_matrix(results['Expected'], results['Predicted'], labels=[0, 1])

fig, ax = plt.subplots(figsize=(7, 5))
# fmt='g' prints the counts as plain numbers rather than scientific notation
sns.heatmap(cm, annot=True, ax=ax, cmap=plt.cm.Blues, fmt='g')

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['Undifferentiated', 'Differentiated'])
ax.yaxis.set_ticklabels(['Undifferentiated', 'Differentiated'])

plt.show()

In [None]:
# Summary metrics comparing the expected labels with the thresholded predictions
y_true = results['Expected']
y_pred = results['Predicted']

# Accuracy
acc = accuracy_score(y_true, y_pred)
print('Accuracy: ', acc)
print()

# Cohen's Kappa, followed by an interpretation guide
cka = cohen_kappa_score(y_true, y_pred)
print("Cohen's Kappa: ", cka)
print()
for kappa_guide_line in (
    'Cohen suggested the Kappa result be interpreted as follows:',
    'values ≤ 0: no agreement',
    '0.01 – 0.20: none to slight',
    '0.21 – 0.40: fair',
    '0.41 – 0.60: moderate',
    '0.61 – 0.80: substantial',
    '0.81 – 1.00: almost perfect',
):
    print(kappa_guide_line)
print()

# F1 score, followed by a reminder of how it is derived
f1 = f1_score(y_true, y_pred)
print(f'F1 score: {f1}')
print()
for f1_guide_line in (
    'F1 = 2 * (precision * recall) / (precision + recall)',
    'A model will obtain a high F1 score if both Precision and Recall are high',
    'A model will obtain a low F1 score if both Precision and Recall are low',
    'A model will obtain a medium F1 score if one of Precision and Recall is low and the other is high',
):
    print(f1_guide_line)
print()

In [None]:
# Plot a precision recall curve
column_names = ['Precision', 'Recall']

# Candidate thresholds: every distinct mean p-value, in ascending order.
p_value_thresholds = (results
                        .loc[:, 'Mean_predicted_p_value']
                        .drop_duplicates()
                        .sort_values()
                        .reset_index(drop=True)
                        .iloc[0:-1]    #Remove last value since nothing larger than this
                     )

# Collect one [precision, recall] row per threshold and build the frame once,
# rather than growing a DataFrame with pd.concat inside the loop.
# The loop variable is deliberately not named `p_value_threshold`, so the
# classification cut-off defined in the earlier cell is not overwritten.
precision_recall_rows = []
for candidate_threshold in p_value_thresholds:
    candidate_prediction = np.where(
        results['Mean_predicted_p_value'] > candidate_threshold, 1, 0)
    precision_recall_rows.append([
        precision_score(results['Expected'], candidate_prediction),
        recall_score(results['Expected'], candidate_prediction),
    ])

precision_recall = pd.DataFrame(precision_recall_rows, columns=column_names)

# Plot results twice: once on fixed 0-1.05 axes, once with autoscaled axes.
# estimator=None / sort=False draw every (recall, precision) point in
# threshold order. The lineplot default would average all precision values
# sharing a recall and add a confidence band, which distorts the curve.
for plot_title, use_fixed_axes in (('Precision-Recall Curve', True),
                                   ('Precision-Recall Curve (autoscale)', False)):
    fig, ax = plt.subplots(figsize=(7, 7))
    sns.lineplot(data=precision_recall, x='Recall', y='Precision',
                 estimator=None, sort=False, ax=ax)
    ax.set_title(plot_title)
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    if use_fixed_axes:
        ax.set_xlim(0, 1.05)
        ax.set_ylim(0, 1.05)
    plt.show()

In [None]:
#Determine recall value with 100% precision
# best_recall stays None when no threshold reaches a precision of 1. The
# summary line is only printed when a value exists; previously this case
# raised a NameError or silently reused a stale value from an earlier run.
best_recall = None
if precision_recall['Precision'].max() == 1:
    best_recall = (precision_recall
                       .query('Precision == 1')
                       .loc[:, 'Recall']
                       .max()
                  )
    print(f'100% Precision with Recall of {round(best_recall * 100, 1)}%')
else:
    print('Precision never reaches 1')

In [None]:
# Attach the metadata to every result row (left join on Accession, so rows
# without metadata are kept), then keep only the summary columns in a fixed order.
summary_columns = [
    'Accession',
    'Cell_line',
    'Diff_efficiency',
    'Jerber_model_score',
    'Mean_predicted_p_value',
    'STD_predicted_p_value',
    'Expected',
    'Predicted',
    'Accuracy',
]
results = results.merge(metadata, how='left', on='Accession')[summary_columns]

In [None]:
#Plot scatterplot of mean predicted p-value vs differentiation efficiency,
#coloured by outcome (TP / TN / FP / FN)
fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(x="Diff_efficiency",
                y="Mean_predicted_p_value",
                hue='Accuracy',
                data=results,
                ax=ax)

# Title typo fixed ("differetiation" -> "differentiation")
ax.set_title('Classifier p-values vs differentiation efficiency')
ax.set_xlabel('Differentiation Efficiency')
ax.set_ylabel('Mean predicted p value')
# Horizontal guide at 0.5, the same value used as the p-value cut-off above
ax.axhline(0.5, color='r', linestyle='--')
# NOTE(review): 0.2 is presumably the differentiation-efficiency cut-off
# separating the two expected classes - confirm
ax.axvline(0.2, color='r', linestyle='--')
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)

# Put the legend out of the figure
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

In [None]:
# Compare with Jerber results
# The former bare `sns.color_palette("dark")` call was removed: it only
# returns a palette and its result was discarded, so it never affected the
# plot. The rendered colours are unchanged.
fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(x="Jerber_model_score",
                y="Mean_predicted_p_value",
                hue='Accuracy',
                data=results,
                ax=ax)

ax.set_title('Classifier p-values vs Jerber Model Score')
ax.set_xlabel('Jerber Model Score')
ax.set_ylabel('Mean predicted p value')
# Horizontal guide at 0.5, the same value used as the p-value cut-off above
ax.axhline(0.5, color='r', linestyle='--')
# NOTE(review): vertical guide at 0.2 is copied from the efficiency plot -
# confirm it is the right cut-off for the Jerber model score
ax.axvline(0.2, color='r', linestyle='--')
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)

# Put the legend out of the figure
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

In [None]:
# Show standard deviation

#Plot scatterplot of mean predicted p-value vs its standard deviation,
#coloured by outcome (TP / TN / FP / FN)
# The former bare `sns.color_palette("dark")` call was removed: it only
# returns a palette and its result was discarded, so it never affected the
# plot. The rendered colours are unchanged.
fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(x="STD_predicted_p_value",
                y="Mean_predicted_p_value",
                hue='Accuracy',
                data=results,
                ax=ax)

ax.set_title('Classifier p-values vs standard deviation')
ax.set_xlabel('Standard deviation predicted p value')
ax.set_ylabel('Mean predicted p value')
# Horizontal guide at 0.5, the same value used as the p-value cut-off above
ax.axhline(0.5, color='r', linestyle='--')
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)

# Put the legend out of the figure
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

In [None]:
# Save the combined per-accession table as a gzip-compressed, tab-separated
# file without the DataFrame index
outfile = 'classifier_analysis_summary_results.tsv.gz'
print(f'Writing results to: {outfile}')
results.to_csv(
    outfile,
    sep="\t",
    index=False,
    compression='gzip',
)

In [None]:
# Signal that every cell above ran to completion
print("Done")