In [None]:
# Classifier script
# Takes as input:
#                 i) z-scores of expression
#                 ii) metadata
#                 iii) list of genes and correlation coefficients

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os.path

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix

In [None]:
# Configuration
# -------------
# All tunable values for the classifier run live in this cell.

# Genes are kept only when their correlation q-value is at or below this cut-off
fdr_threshold = 0.05

# How many times the training sets are re-drawn and the classifier re-fitted
classifier_iterations = 20

# Sample metadata table
metadata_file = 'dataset_summary.tsv'

# Gene / correlation-coefficient table
correlation_file = 'salmon.merged.gene_tpm.log2_tmp_plus_1.retention_group_filtered.correlation.tsv.gz'

# Expression z-scores
# (alternative input: 'salmon.merged.gene_tpm.log2_tmp_plus_1.downsized.z_scores.tsv.gz')
expression_data_file = 'salmon.merged.gene_tpm.log2_tmp_plus_1.retention_group_filtered.z_scores.tsv.gz'

In [None]:
# Load the three tab-separated input tables and report their sizes

# --- Metadata ---
print("Reading in metadata: " + metadata_file)
metadata = pd.read_csv(metadata_file, sep="\t")
print(f'Metadata number of Accessions: {len(metadata)}')
print()

# --- Expression z-scores ---
# The first two columns are not counted as cell lines in the summary below.
print("Reading in expression file: " + expression_data_file)
expression_data = pd.read_csv(expression_data_file, sep="\t")
print(f'Number of cell lines: {len(expression_data.columns) - 2}')
print(f'Number of genes: {len(expression_data)}')
print()

# --- Gene correlations ---
print("Reading in correlation file: " + correlation_file)
correlation_data = pd.read_csv(correlation_file, sep="\t")
print(f'Number of genes in correlation file genes: {len(correlation_data)}')

In [None]:
# Histogram of all z-scores in the expression table
# (every column from the third onwards, pooled into one flat array)
zscore_matrix = expression_data.iloc[:, 2:].to_numpy()
plot_data = zscore_matrix.flatten()

fig, ax = plt.subplots(figsize=(7, 7))
ax.hist(plot_data, bins=100)
ax.set_xlabel('z-score')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Empirical cumulative distribution of the z-scores
# (drawn from every column from the third onwards; legend suppressed)
fig, ax = plt.subplots(figsize=(7, 7))
plot = sns.ecdfplot(data=expression_data.iloc[:, 2:], legend=False, ax=ax)
ax.set_xlabel('z-score')
plt.show()

In [None]:
# Restrict the expression table to genes passing the FDR cut-off
print(f'Filtering by FDR threshold ({fdr_threshold})')

# Gene ids whose q-value is at or below the threshold
passes_fdr = correlation_data['q'] <= fdr_threshold
correlation_data = correlation_data.loc[passes_fdr, 'target_gene_id']

print(f'Number of correlated genes identified: {len(correlation_data)}')

# Keep only the expression rows for those genes and renumber the rows from 0
boolean_to_select = expression_data['gene_id'].isin(correlation_data)
expression_data = expression_data.loc[boolean_to_select].reset_index(drop=True)

print(f'Number of genes selected from expression file: {len(expression_data)}')

In [None]:
# Histogram of the z-scores that remain after gene filtering
# (every column from the third onwards, pooled into one flat array)
zscore_matrix = expression_data.iloc[:, 2:].to_numpy()
plot_data = zscore_matrix.flatten()

fig, ax = plt.subplots(figsize=(7, 7))
ax.hist(plot_data, bins=100)
ax.set_xlabel('z-score')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Empirical cumulative distribution of the z-scores after gene filtering
# (drawn from every column from the third onwards; legend suppressed)
fig, ax = plt.subplots(figsize=(7, 7))
plot = sns.ecdfplot(data=expression_data.iloc[:, 2:], legend=False, ax=ax)
ax.set_xlabel('z-score')
plt.show()

In [None]:
# Edit metadata
# Keeps the three columns used downstream, restricts the table to accessions
# that have a column in the expression table, and adds a binary 'Success' label.

# Differentiation counts as a success when Diff_efficiency >= this value
# (inclusive). Rows where the comparison is False - including missing values,
# if any - are labelled 0.
success_threshold = 0.2

# Expression columns from the third onwards are named by accession
accessions_with_expression = expression_data.columns[2:]

# Built as a single chain that returns a new frame, so the 'Success' column is
# never assigned onto a filtered slice (avoids pandas' SettingWithCopyWarning).
metadata = (metadata
                .loc[:, ['Accession', 'Cell_line', 'Diff_efficiency']]
                .loc[lambda df: df['Accession'].isin(accessions_with_expression)]
                .assign(Success=lambda df: np.where(
                    df['Diff_efficiency'] >= success_threshold, 1, 0))
                .reset_index(drop=True)
           )

In [None]:
# Make a dataframe of the accession to predict and its associated
# success/failure training datasets.
#
# For every iteration and every accession in the metadata, one balanced
# training set is drawn:
#   * accessions from the same cell line as the accession of interest are
#     excluded;
#   * each remaining cell line contributes at most one accession, chosen at
#     random (shuffle, then keep the first row per cell line);
#   * the success and failure sets are cut to the same size.
#
# NOTE: the shuffling is not seeded, so the training sets differ between runs.
#
# Result: `training_datasets` with columns
#   training_accession, accession_of_interest, status_training_accession, Iteration

# Loop-invariant lookups, computed once instead of on every pass
cell_line_by_accession = (metadata
                              .drop_duplicates(subset='Accession')  # first match wins
                              .set_index('Accession')
                              .loc[:, 'Cell_line']
                         )
candidates_by_status = {1: metadata.query('Success == 1'),
                        0: metadata.query('Success == 0')}

# Collect the per-accession frames and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
training_frames = []

for iteration in range(1, classifier_iterations + 1):

    if iteration % 5 == 0:
        print(f'Iteration: {iteration}')

    for accession_of_interest in metadata.loc[:, 'Accession']:

        # When training the classifier, do NOT include accessions from the
        # same cell line in the training set.
        cell_line_ignore = cell_line_by_accession.loc[accession_of_interest]

        # Shuffle so the same accession is not picked for a cell line every
        # time, then keep one accession per cell line.
        shuffled_by_status = {
            status: (candidates
                         .query('Cell_line != @cell_line_ignore')
                         .sample(frac=1)   # Shuffle
                         .drop_duplicates(subset='Cell_line')
                    )
            for status, candidates in candidates_by_status.items()
        }

        # Make the training datasets equal size
        dataset_size = min(len(shuffled_by_status[1]), len(shuffled_by_status[0]))

        # Success rows first, then failure rows
        for status in (1, 0):
            training_frames.append(
                shuffled_by_status[status]
                    .head(dataset_size)
                    .loc[:, ['Accession']]
                    .rename(columns={'Accession': 'training_accession'})
                    .assign(accession_of_interest=accession_of_interest,
                            status_training_accession=status,
                            Iteration=iteration)
            )

# pd.concat raises on an empty list, so fall back to an empty frame
if training_frames:
    training_datasets = pd.concat(training_frames, ignore_index=True)
else:
    training_datasets = pd.DataFrame()

In [None]:
# Run the logistic regression
#
# For every iteration and every accession column in the expression table, fit
# an L1-penalised logistic regression on that accession's own training set
# (as listed in `training_datasets`) and predict the held-out accession.
#
# Results:
#   logistic_regression_all_results - one row per (iteration, accession) with
#       the expected label and the predicted probability of class 1
#   coefficients - intercept plus every non-zero gene coefficient of each
#       fitted model (columns: iteration, accession, gene_id, coefficient)

result_columns = ['Iteration', 'Accession', 'Expected', 'Predicted_p_value']
coefficient_columns = ['iteration', 'accession', 'gene_id', 'coefficient']

# Loop-invariant lookups, computed once
gene_ids = expression_data.loc[:, 'gene_id'].to_numpy()
success_by_accession = (metadata
                            .drop_duplicates(subset='Accession')  # first match wins
                            .set_index('Accession')
                            .loc[:, 'Success']
                       )

# Accumulate in lists and build the DataFrames once after the loop
result_records = []
coefficient_frames = []   # Stores coefficients for later use

for iteration in range(1, classifier_iterations + 1):
    if iteration % 5 == 0:
        print(f'Iteration: {iteration}')

    training_sets_this_iteration = training_datasets.query('Iteration == @iteration')

    for target_accession in expression_data.columns.to_list()[2:]:

        # Known outcome of the accession being predicted.
        # Raises KeyError if the accession has no metadata row.
        target_success = success_by_accession.loc[target_accession]

        # Training accessions drawn for THIS target accession. The filter must
        # use the current target; any other accession's training set may
        # contain samples from the target's own cell line.
        training_data_accessions = (training_sets_this_iteration
                                        .query('accession_of_interest == @target_accession')
                                        .loc[:, 'training_accession']
                                        .drop_duplicates()
                                        .to_list()
                                   )

        # Feature matrix: one row per training accession, one column per gene.
        # Kept as float so the z-scores are not truncated and the training
        # features have the same scale as the target vector below.
        training_expression = (expression_data
                                   .loc[:, training_data_accessions]
                                   .to_numpy()
                                   .astype('float')
                                   .transpose()
                              )

        # Labels in the same order as the rows of training_expression
        training_success = (success_by_accession
                                .loc[training_data_accessions]
                                .to_numpy()
                                .astype('int')
                           )

        # Single-sample feature matrix for the accession to predict
        target_expression = (expression_data
                                 .loc[:, target_accession]
                                 .to_numpy()
                                 .reshape(1, -1)
                            )

        # Run logistic regression
        lreg = LogisticRegression(solver='liblinear', max_iter=100, penalty='l1')
        lreg.fit(training_expression, training_success)

        # Probability assigned to class 1 for the single target sample
        predicted_prob = lreg.predict_proba(target_expression)[0, 1]

        result_records.append({'Iteration': iteration,
                               'Accession': target_accession,
                               'Expected': target_success,
                               'Predicted_p_value': predicted_prob})

        # Store intercept and coefficients (intercept row first)
        intercept_current = pd.DataFrame({'iteration': [iteration],
                                          'accession': [target_accession],
                                          'gene_id': ['INTERCEPT'],
                                          'coefficient': [lreg.intercept_[0]]})

        coeffs_current = pd.DataFrame({'iteration': iteration,
                                       'accession': target_accession,
                                       'gene_id': gene_ids,
                                       'coefficient': lreg.coef_[0]})
        coeffs_current = coeffs_current.query('coefficient != 0')  # Remove uninformative coefficients

        coefficient_frames.extend([intercept_current, coeffs_current])

logistic_regression_all_results = pd.DataFrame(result_records, columns=result_columns)

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns
if coefficient_frames:
    coefficients = (pd.concat(coefficient_frames, ignore_index=True)
                        .loc[:, coefficient_columns])
else:
    coefficients = pd.DataFrame(columns=coefficient_columns)

In [None]:
# Save the predictions and the model coefficients as gzip-compressed,
# tab-separated files (no index column)
tsv_gz_options = {'index': False, 'compression': 'gzip', 'sep': "\t"}

outfile = 'classifier_results.tsv.gz'
print("Writing results to: " + outfile)
logistic_regression_all_results.to_csv(outfile, **tsv_gz_options)

outfile = 'classifier_coefficients.tsv.gz'
print("Writing coefficients to: " + outfile)
coefficients.to_csv(outfile, **tsv_gz_options)

In [None]:
# Record which accessions were used to train the classifier for each
# accession of interest and iteration (gzip-compressed TSV, no index column)
outfile = 'classifier_training_log.tsv.gz'
print("Writing a log of the datasets used in training the logistic regression classifier: " + outfile)
training_datasets.to_csv(outfile, sep="\t", compression='gzip', index=False)

In [None]:
# Completion marker: printed once every cell above has run
print('Done')