In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the transcript lookup table (used further down to find the longest
# transcript per gene) and expose each row's index label as an explicit
# 'ID' column.
lookups_file = 'parsed_gtf.tsv.gz'
lookups = (
    pd.read_csv(lookups_file, sep="\t")
    .assign(ID=lambda table: table.index)
)

In [3]:
# Re-attach the full lookup table to its own 'ID' column. The join keeps
# every row; its visible effect is that 'ID' becomes the first column.
lookups_all = lookups['ID'].to_frame()
lookups_all = lookups_all.merge(lookups, on='ID')

In [4]:
# Set expression threshold
# Rows are kept by the filtering step only when their `target_max_log10_tpm`
# value is greater than or equal to this number.
expression_threshold = 1

In [5]:
# Load the expression table and report how many distinct accessions,
# cell lines and transcripts it contains.
expression_datafile = 'classifier_input.tsv.gz'
expression_data = pd.read_csv(expression_datafile, sep="\t")
print("Reading in: " + expression_datafile)
for message, column in (
    ("Number of different accessions: ", 'Accession'),
    ("Number of different cell lines: ", 'Cell_line'),
    ("Number of different transcripts: ", 'target_id'),
):
    # nunique() ignores missing values, like drop_duplicates().count()
    print(message + str(expression_data[column].nunique()))

Reading in: classifier_input.tsv.gz
Number of different accessions: 328
Number of different cell lines: 143
Number of different transcripts: 194360


In [6]:
# Keep only rows whose target_max_log10_tpm reaches the expression threshold,
# ordered by accession then transcript (useful when re-shaping). The previous
# row labels are retained as an 'index' column by reset_index().
meets_threshold = expression_data['target_max_log10_tpm'] >= expression_threshold
expression_data_filtered = (
    expression_data
    .loc[meets_threshold]
    .sort_values(by=['Accession', 'target_id'])
    .reset_index()
)

print("Analysis using:")
for message, column in (
    ("Number of different accessions: ", 'Accession'),
    ("Number of different cell lines: ", 'Cell_line'),
    ("Number of different transcripts: ", 'target_id'),
):
    # nunique() ignores missing values, like drop_duplicates().count()
    print(message + str(expression_data_filtered[column].nunique()))

Analysis using:
Number of different accessions: 328
Number of different cell lines: 143
Number of different transcripts: 35629


In [7]:
# Process all data

In [8]:
# Attach gene annotations to the filtered expression rows.
# Everything after the first "." in target_id is removed before joining
# against the lookup table (presumably a version suffix - TODO confirm).
edited_transcript_id_names = (
    expression_data_filtered['target_id']
    .str.split(".", n=1)
    .str[0]
)

expression_data_filtered2 = expression_data_filtered.copy()
# The trimmed ids go in at position 1 and then take over the 'target_id' name.
expression_data_filtered2.insert(1, "target_id2", edited_transcript_id_names)
expression_data_filtered2 = (
    expression_data_filtered2
    .drop(columns='target_id')
    .rename(columns={"target_id2": "target_id"})
    .merge(lookups_all, on='target_id', how='inner')   # rows without a lookup entry are dropped
)

print("Analysis using:")
for message, column in (
    ("Number of different accessions: ", 'Accession'),
    ("Number of different cell lines: ", 'Cell_line'),
    ("Number of different transcripts: ", 'target_id'),
):
    # nunique() ignores missing values, like drop_duplicates().count()
    print(message + str(expression_data_filtered2[column].nunique()))

Analysis using:
Number of different accessions: 328
Number of different cell lines: 143
Number of different transcripts: 34011


In [9]:
# Build a wide log10_tpm table: one row per transcript, one column per
# sample, with the lookup annotation columns in front.
# (The original note describes the values as Log10(TPM+1).)
data_for_external_analysis = expression_data_filtered2.assign(
    # Sample label is "<Cell_line>_<Accession>"
    Cell_Sample=expression_data_filtered2['Cell_line'] + "_" + expression_data_filtered2['Accession']
)

# Long -> wide, then turn target_id back into an ordinary column for the join.
data_for_external_analysis = (
    data_for_external_analysis[['Cell_Sample', 'target_id', 'log10_tpm']]
    .pivot(index="target_id", columns='Cell_Sample', values='log10_tpm')
    .reset_index()
)

data_for_external_analysis = (
    pd.merge(lookups_all, data_for_external_analysis, on='target_id', how='right')
    .reset_index()
    .drop(columns=['ID', 'index'])                       # helper columns, not wanted in the output
    .rename(columns={'target_id': 'transcript_id'})
    # Within each gene name the longest transcript comes first.
    .sort_values(by=['gene_name', 'transcript_length'], ascending=[True, False])
)


In [10]:
# Save the table covering every transcript as a gzip-compressed TSV
# (row index is not written).
outfile = 'all_transcript_genes_log10_tpm.tsv.gz'
print("Writing results to: " + outfile)
data_for_external_analysis.to_csv(
    outfile,
    sep="\t",
    compression='gzip',
    index=False,
)

Writing results to: all_transcript_genes_log10_tpm.tsv.gz


In [11]:
# Reduce the table to one row per gene_id: the first row seen for each gene.
# With the existing ordering (gene_name, then transcript_length descending)
# that first row is the longest transcript.
data_for_external_analysis = data_for_external_analysis.reset_index(drop=True)

first_row_of_gene = ~data_for_external_analysis['gene_id'].duplicated()
rows_to_select = data_for_external_analysis.index[first_row_of_gene]

data_for_external_analysis = (
    data_for_external_analysis
    .loc[rows_to_select]
    .sort_values(by=['gene_name', 'transcript_length'], ascending=[True, False])
)

In [12]:
# Save the one-row-per-gene table as a gzip-compressed TSV
# (row index is not written).
outfile = 'longest_transcript_genes_log10_tpm.tsv.gz'
print("Writing results to: " + outfile)
data_for_external_analysis.to_csv(
    outfile,
    sep="\t",
    compression='gzip',
    index=False,
)

Writing results to: longest_transcript_genes_log10_tpm.tsv.gz


In [13]:
# Create a differentiation score table

In [14]:
# One row per distinct (Cell_Sample, Diff_efficiency, Jerber_model_score)
# combination, taken from the unfiltered expression table.
# Cell_Sample is "<Cell_line>_<Accession>".
cell_line_diff_scores = pd.DataFrame({
    'Cell_Sample': expression_data['Cell_line'] + "_" + expression_data['Accession'],
    'Diff_efficiency': expression_data['Diff_efficiency'],
    'Jerber_model_score': expression_data['Jerber_model_score'],
}).drop_duplicates()

In [15]:
# Make sure order is the same as data outfile
# Sample columns are picked by name (every column of the data table that is
# a known Cell_Sample) instead of by a hard-coded column offset, so adding or
# removing an annotation column in the lookup table cannot shift the selection.
scored_samples = set(cell_line_diff_scores['Cell_Sample'])
sample_columns = [
    column for column in data_for_external_analysis.columns
    if column in scored_samples
]
if not sample_columns:
    raise ValueError("No Cell_Sample columns found in data_for_external_analysis")

cell_line_order = pd.DataFrame({'Cell_Sample': sample_columns})
# The left join keeps the data file's column order. validate='one_to_one'
# raises if a sample has more than one score row, which would otherwise add
# extra columns and break the alignment with the data file.
cell_line_diff_scores = pd.merge(
    cell_line_order,
    cell_line_diff_scores,
    on='Cell_Sample',
    how='left',
    validate='one_to_one',
)
# Transpose so that samples run across columns, as in the data file.
cell_line_diff_scores = cell_line_diff_scores.transpose()


In [16]:
# Save the transposed score table as a plain TSV. The row labels are written
# as the first field of each line; the header row is omitted.
outfile = 'cell_line_sample_diff_scores.tsv'
print("Writing results to: " + outfile)
cell_line_diff_scores.to_csv(
    outfile,
    header=False,
    sep="\t",
)

Writing results to: cell_line_sample_diff_scores.tsv
