## Step 1:
Let the program know where to find your tools file.

In [15]:
import sys, os
sys.path.append(os.path.join(os.path.realpath(".."), "Tools"))

## Step 2:
Load up all of the necessary packages

In [16]:
from plastid import BAMGenomeArray,GTF2_TranscriptAssembler,Transcript
import numpy as np
import random
import pandas as pd
from plastid.plotting.plots import *
import utilities as utils
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from Bio import SeqIO
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
import math

%matplotlib inline

## Step 3:
Define the paths to our reference genome and annotation files as well as the path to our data for the Autism Spectrum Disorder (ASD) genes identified by Sfari, the top 200 genes whose regulation is altered by FMRP as determined by RiboDiff, The list of genes and their associated TPM values (used to select for genes with sufficient TPM), and the list of primary transcript isoforms provided by APRIS.

In [17]:
reference_path = "/home/keeganfl/Desktop/Work_Fall_2021/Protocol_test/genome/dmel/"
data_path = "/home/keeganfl/Desktop/measuring_binding_domains/data/"

## Step 4: 
load the transcript files so that we can find the sequence of each gene

In [18]:
# Loading up the transcrip[t information.
transcripts = list(GTF2_TranscriptAssembler(open(reference_path + "Drosophila_melanogaster.BDGP6.32.103.gtf"),return_type=Transcript))

KeyboardInterrupt: 

## Step 5:
Create a table that lists all genes and their associated lengths. 

In [None]:
lengths_cds = []
gene_name = []
gene_id = []
for tran in transcripts:
    try:
        tran.attr["gene_name"]
    except:
        continue
    try:
        lengths_cds.append(tran.cds_end - tran.cds_start)
    except:
        continue
    gene_name.append(tran.attr["gene_name"])
    gene_id.append(tran.attr["transcript_id"])
all_genes = pd.DataFrame(list(zip(gene_name, gene_id, lengths_cds)))
all_genes.columns = ["gene_name", "gene_id", "lengths_cds"]

## Step 6:
Load up the list of APRIS genes. This list will include the transcript IDs of all of the primary isoforms, and will be used to select a single transcript for all genes.

In [6]:
# Load up the Apris genes. 
prin_trans = pd.read_csv(data_path + "apris_fly_data.txt", names = ["gene_name", "???", "gene_id", "rank"])

# Remove the extra decimal point from the gene ids so that the gene IDs are the same. 
gene_ids = [] 
for i in prin_trans.gene_id:
    gene_ids.append(i.split('.',1)[0])
prin_trans.gene_id = gene_ids

## Step 7:
Merge the list of APRIS genes with the list of all gene lengths in order to remove the non-primary isoforms. 

In [7]:
# Merge the count table with the table of APRIS genes
prime_genes = pd.merge(all_genes, prin_trans, how="left", on="gene_id", indicator = True)

# Only keep counts with a transcript ID that matched one of the APRIS primary IDs.
prime_genes = prime_genes.loc[prime_genes._merge == "both"].copy()

# Drop any duplicate genes (duplicates may remain if multiple transcripts have identical cds regions).
prime_genes.drop_duplicates(subset ="gene_name_x",keep = "first", inplace = True) 

# Remove unnecesary columns. 
prime_genes.drop(columns = ["gene_name_y", "???", "rank", "_merge"], inplace = True)

## Step 8:
Load up a list containing all of the autism spectrum genes and a seperate list containing top 200 genes with significantly altered gene expression. 

In [8]:
asd_gene_names = pd.read_csv(data_path + "asd_genes_fly.csv", names = ["gene_name"])
top_200 = pd.read_csv(data_path+ "dmel_top_200.csv")

## Step 7:
Extract the long genes from our list of genes and then filter out all of the autism genes from this list. 

In [9]:
# Define long genes as genes with a cds region above 2000 base pairs. 
long_genes = prime_genes.query("lengths_cds > 3000")

In [10]:
# Merge the long genes with the asd_gene_names 
temp_df = pd.merge(long_genes, asd_gene_names, how="left", right_on="gene_name", left_on="gene_name_x", indicator = True)
# reset the indices.
temp_df = temp_df.reset_index(drop=True)
# Remove genes that match to the asd_gene_names dataframe. 
long_non_asd_genes = temp_df.loc[temp_df._merge == "left_only"].copy()

In [11]:
random.seed(35)
rn_genes = pd.DataFrame(random.sample(list(long_non_asd_genes.gene_name_x),len(asd_gene_names.gene_name)))
rn_genes.columns = ["gene_ID"]

## Step 8:
As a beginning step compare the total number of ASD genes vs the total number of random long genes that match to the top 200. 

In [12]:
# Merge the tables of random long genes and ASD genes with the top 200 translationally altered genes. 
top_200_rn = pd.merge(rn_genes, top_200, how="inner", on = "gene_ID")
top_200_asd = pd.merge(asd_gene_names, top_200, how = "inner", right_on = "gene_ID", left_on = "gene_name")

# Drop any duplicates that may have come into 
top_200_rn.drop_duplicates(inplace = True)
top_200_asd.drop_duplicates(inplace = True)

## Step 9:
Perform Fisher's exact test to determine if the probability of a translationally altered FMRP gene being an ASD gene is significantly different from it being a long gene. 

In [13]:
table = [[len(top_200_asd.gene_name), len(top_200.gene_ID)],[len(top_200_rn.gene_ID), len(top_200.gene_ID)]]
stats.fisher_exact(table)

(0.4444444444444444, 0.2596399601776505)

In [14]:
top_200_asd

Unnamed: 0,gene_name,gene_ID,Fold_change,padj,CDS_length_bps
0,Dys,Dys,0.62,4.46e-10,9381.0
1,Fmr1,Fmr1,1.82,2.03e-05,2052.0
2,Nf1,Nf1,0.31,0.0,8292.0
3,Prosap,Prosap,0.56,0.0,5418.0
