## Step 1:
Let the program know where to find your tools file.

In [1]:
import sys, os
sys.path.append(os.path.join(os.path.realpath(".."), "Tools"))

## Step 2:
Load up all of the necessary packages

In [2]:
from plastid import BAMGenomeArray,GTF2_TranscriptAssembler,Transcript
import numpy as np
import random
import pandas as pd
from plastid.plotting.plots import *
import utilities as utils
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from Bio import SeqIO
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
import math

%matplotlib inline

## Step 3:
Define the paths to our reference genome and annotation files as well as the path to our data for the Autism Spectrum Disorder (ASD) genes identified by Sfari, the top 200 genes whose regulation is altered by FMRP as determined by RiboDiff, The list of genes and their associated TPM values (used to select for genes with sufficient TPM), and the list of primary transcript isoforms provided by APRIS.

In [3]:
reference_path = "/home/keeganfl/Desktop/Work_Fall_2021/Protocol_test/genome/dmel/"
data_path = "/home/keeganfl/Desktop/measuring_binding_domains/data/"
counts_path = "/home/keeganfl/Desktop/measuring_binding_domains/Extra_tables/TE_tables/"

## Step 4: 
load the transcript files so that we can find the sequence of each gene

In [4]:
# Loading up the transcrip[t information.
transcripts = list(GTF2_TranscriptAssembler(open(reference_path + "Drosophila_melanogaster.BDGP6.32.103.gtf"),return_type=Transcript))

## Step 5:
Create a table that lists all genes and their associated lengths. 

In [5]:
lengths_cds = []
gene_name = []
transcript_id = []
gene_id = []
for tran in transcripts:
    try:
        tran.attr["gene_name"]
    except:
        continue
    try:
        lengths_cds.append(tran.cds_end - tran.cds_start)
    except:
        continue
    gene_name.append(tran.attr["gene_name"])
    gene_id.append(tran.attr["gene_id"])
    transcript_id.append(tran.attr["transcript_id"])
all_genes = pd.DataFrame(list(zip(gene_name, gene_id, transcript_id, lengths_cds)))
all_genes.columns = ["gene_name", "gene_id","transcript_id", "lengths_cds"]

## Step 6:
Load up the list of APRIS genes. This list will include the transcript IDs of all of the primary isoforms, and will be used to select a single transcript for all genes.

In [6]:
# Load up the Apris genes. 
prin_trans = pd.read_csv(data_path + "apris_fly_data.txt", names = ["gene_name", "???", "gene_id", "rank"])

# Remove the extra decimal point from the gene ids so that the gene IDs are the same. 
gene_ids = [] 
for i in prin_trans.gene_id:
    gene_ids.append(i.split('.',1)[0])
prin_trans.gene_id = gene_ids

## Step 7:
Merge the list of APRIS genes with the list of all gene lengths in order to remove the non-primary isoforms. 

In [7]:
# Merge the count table with the table of APRIS genes
prime_genes = pd.merge(all_genes, prin_trans, how="left", left_on="transcript_id", right_on="gene_id", indicator = True)

# Only keep counts with a transcript ID that matched one of the APRIS primary IDs.
prime_genes = prime_genes.loc[prime_genes._merge == "both"].copy()

# Drop any duplicate genes (duplicates may remain if multiple transcripts have identical cds regions).
prime_genes.drop_duplicates(subset ="gene_name_x",keep = "first", inplace = True) 

# Remove unnecesary columns. 
prime_genes.drop(columns = ["gene_name_y", "???", "rank", "_merge"], inplace = True)

## Step 8:
Load up a list containing all of the autism spectrum genes and a seperate list containing top 200 genes with significantly altered gene expression. 

In [8]:
asd_gene_names = pd.read_csv(data_path + "asd_genes_fly.csv", names = ["gene_name"])
top_200 = pd.read_csv(data_path+ "dmel_top_200.csv")

## Step 9:
Extract the long genes from our list of genes and then filter out all of the autism genes from this list. 

In [80]:
# Define long genes as genes with a cds region above 2000 base pairs. 
long_genes = prime_genes.query("lengths_cds > 5400")

In [81]:
# Merge the long genes with the asd_gene_names 
temp_df = pd.merge(long_genes, asd_gene_names, how="left", right_on="gene_name", left_on="gene_name_x", indicator = True)
# reset the indices.
temp_df = temp_df.reset_index(drop=True)
# Remove genes that match to the asd_gene_names dataframe. 
long_non_asd_genes = temp_df.loc[temp_df._merge == "left_only"].copy()
long_non_asd_genes.drop(columns = ["gene_name"], inplace = True)

## Step 10:
merge the long genes list with a list of TPM genes and the select only those genes that have a TPM above 1. This is done to ensure that rarely expressed genes do not interfere with our analysis.

In [82]:
tpm_table = pd.read_csv(counts_path + "TE_fly.csv")

In [83]:
# Merge the tables
temp_df = pd.merge(long_non_asd_genes, tpm_table, how = "inner", left_on="gene_id_x", right_on="Entry")

# Drop the many uneccessary columns 
temp_df = temp_df.drop(['Entry', 'RbCtlR1', 'RbCtlR2', 'RbCtlR3', 'RbCtlR4', 'RbTrtR1',
       'RbTrtR2', 'RbTrtR3', 'RbTrtR4', 'RbTrtR5', 'RbTrtR6', 'RbTrtR7',
       'RnaCtlR1', 'RnaCtlR2', 'RnaCtlR3', 'RnaTrtR1', 'RnaTrtR2', 'RnaTrtR3',
       'RnaTrtR4', 'RnaTrtR5', 'RnaTrtR6', 'RbCtlR1_tpm', 'RbCtlR2_tpm',
       'RbCtlR3_tpm', 'RbCtlR4_tpm', 'RbTrtR1_tpm', 'RbTrtR2_tpm',
       'RbTrtR3_tpm', 'RbTrtR4_tpm', 'RbTrtR5_tpm', 'RbTrtR6_tpm',
       'RbTrtR7_tpm', 'RnaCtlR1_tpm', 'RnaCtlR2_tpm', 'RnaCtlR3_tpm',
       'RnaTrtR1_tpm', 'RnaTrtR2_tpm', 'RnaTrtR3_tpm', 'RnaTrtR4_tpm',
       'RnaTrtR5_tpm', 'RnaTrtR6_tpm', 'Ctl_TE', 'Trt_TE', 'FldChng_TE'], axis = 1)

# Filter by the average TPM
LSEnA_genes = temp_df.query("RbCtl_avg_tpm > 1 & RnaCtl_avg_tpm > 1")

## Step 11: 
Randomly select from our long, sfficiently expressed, non-ASD genes. The number of genes selected should be equal to the number of asd genes that we are using. 

In [84]:
random.seed(35)
rn_genes = pd.DataFrame(random.sample(list(LSEnA_genes.gene_name_x),len(asd_gene_names.gene_name)))

In [85]:
rn_genes.columns = ["gene_ID"]

## Step 12:
Merge the table of random long genes and ASD genes with the top 200 FMRP regulated genes using merge. This will allow us to see how many matches there are between the different tables. 

In [86]:
# Merge the tables of random long genes and ASD genes with the top 200 translationally altered genes. 
top_200_rn = pd.merge(LSEnA_genes, top_200, how="inner", right_on = "gene_ID", left_on = "gene_name_x")
top_200_asd = pd.merge(asd_gene_names, top_200, how = "inner", right_on = "gene_ID", left_on = "gene_name")

# Drop any duplicates that may have come into 
top_200_rn.drop_duplicates(inplace = True)
top_200_asd.drop_duplicates(inplace = True)

In [87]:
len(top_200_asd.gene_name)

4

In [88]:
len(top_200_rn.gene_ID)

39

## Step 13:
Perform Fisher's exact test to determine if the probability of a translationally altered FMRP gene being an ASD gene is significantly different from it being a long gene. 

In [89]:
table = [[len(top_200_asd.gene_name), len(asd_gene_names.gene_name)],[len(top_200_rn.gene_ID), len(LSEnA_genes.gene_name_x)]]

In [90]:
table

[[4, 158], [39, 256]]

In [91]:
stats.fisher_exact(table)

(0.166179811749432, 7.832442446726226e-05)