# Match Transcripts to Genes
Jupyter Notebook that takes a file of data which includes a 'target_id' column (corresponding to transcript IDs) and a transcript-gene lookup file (columns: 'target_id' and 'gene_id'), and then generates a new file which contains the input data and a new column listing the appropriate gene names.

In [None]:
import pandas as pd

## Setup (edit as required)

In [None]:
# Setup (edit as required)
# Path of the tab-separated data file; it must contain a 'target_id' column.
input_data_file = "input_data.tsv.gz"
# Path of the tab-separated lookup file; it is matched on 'target_id'.
transcript_gene_lookup_file = "transcript_gene_lookup.tsv.gz"

## Read in results

In [None]:
# Read in results
# Announce each file *before* reading it, so that if a read fails the last
# message printed identifies the file that caused the problem.
print("Reading in: " + input_data_file)
input_data = pd.read_csv(input_data_file, sep="\t")

print("Reading in: " + transcript_gene_lookup_file)
transcript_gene_lookup = pd.read_csv(transcript_gene_lookup_file, sep="\t")

In [None]:
# Check for duplicate transcript IDs ('target_id') in the lookup file
# A gene may have many associated transcripts, but a transcript should not
# be associated with more than one gene.
# A 'target_id' listed more than once in the lookup would also make the
# left merge further down emit one output row per matching lookup row.
lookup_transcript_ids = transcript_gene_lookup['target_id']
transcript_count = len(lookup_transcript_ids)
transcript_unique_count = len(lookup_transcript_ids.drop_duplicates())

if transcript_count != transcript_unique_count:
    # The check is on transcript IDs, so the warning names them explicitly.
    print("Warning: Identical transcript IDs (target_id) present multiple times in " + transcript_gene_lookup_file)

## Processing

In [None]:
# Report any transcript IDs of the input data that have no entry in the lookup.
# Boolean mask: True where the row's 'target_id' exists in the lookup file.
found_in_lookup = input_data['target_id'].isin(transcript_gene_lookup['target_id'])

# Invert the mask: True where the row's 'target_id' is missing from the lookup.
not_found_lookups = ~found_in_lookup

if not not_found_lookups.any():
    print('Good news: all trancript IDs in ' + input_data_file + ' found in ' + transcript_gene_lookup_file)
else:
    print('Warning: not all trancript IDs in ' + input_data_file + ' found in ' + transcript_gene_lookup_file + ':')
    # From here on the name holds the unmatched IDs themselves, not the mask.
    not_found_lookups = input_data.loc[not_found_lookups, 'target_id']
    print(not_found_lookups)
    

In [None]:
# Merge data
input_data_file = pd.merge(input_data, transcript_gene_lookup, on="target_id", how='left')

In [None]:
# Write out the results
outfile = "merged_by_transcript_gene_ids.tsv.gz"
print("Writing results to: " + outfile)
input_data_file.to_csv(outfile, index=False, compression='gzip', sep="\t")