In [1]:
## Import Libraries
import pandas as pd
import numpy as np
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud


## Show every row and the untruncated contents of each cell when displaying pandas dataframes
for display_option in ("display.max_rows", "display.max_colwidth"):
    pd.set_option(display_option, None)

In [2]:
def _extract_gtf_attribute(attributes, key):
    '''
    Return, for every attribute string, the text that follows the first `key "`
    up to (but not including) the next double quote.

    Rows in which the key does not occur yield NaN, including the case where
    no row at all contains the key.
    '''
    return attributes.str.extract(key + ' "([^"]*)', expand=False)


def parse_df_columns(df, is_ref=True, is_transcript=False, is_prot=False):
    '''
    function name: parse_df_columns

    purpose: parse the aggregate attribute column ("other") of a gtf/gff dataframe
             into separate columns and drop the columns that are no longer needed

    input:
        df: dataframe holding the "raw" gtf/gff; must contain the columns
            "other", "dot_1" and "dot_2"
        is_ref: if True add "gene_id", "gene_name" and "gene_biotype";
                if False add "gene_id", "transcript_id" and "exon_number"
        is_transcript: (only used when is_ref is True) also add "transcript_id"
                       and "transcript_biotype"
        is_prot: (only used when is_ref and is_transcript are True) also add "ccds_id"

    output: the same dataframe object, modified in place: the new columns are added,
            "other", "dot_1" and "dot_2" are dropped, and every null entry is NaN.
            An attribute that is missing from a row (or from every row) gives NaN
            instead of raising.
    '''

    attributes = df["other"]

    ## Get gene ids: the value of the first attribute in the string
    ## (n=1 avoids building one column per attribute just to keep the first piece)
    first_attribute = attributes.str.split('";', n=1).str[0]
    df["gene_id"] = first_attribute.str.extract('([^ "]*$)', expand=False)

    if is_ref:

        ## Get gene names
        df["gene_name"] = _extract_gtf_attribute(attributes, "gene_name")

        ## Get gene biotype
        df["gene_biotype"] = _extract_gtf_attribute(attributes, "gene_biotype")

        ## If is transcript get transcript id and transcript biotype
        if is_transcript:
            df["transcript_id"] = _extract_gtf_attribute(attributes, "transcript_id")
            df["transcript_biotype"] = _extract_gtf_attribute(attributes, "transcript_biotype")

            ## If is prot get the CCDS id
            if is_prot:
                df["ccds_id"] = _extract_gtf_attribute(attributes, "ccds_id")

    else:

        ## Get transcript ids
        df["transcript_id"] = _extract_gtf_attribute(attributes, "transcript_id")

        ## Get exon number
        df["exon_number"] = _extract_gtf_attribute(attributes, "exon_number")

    ## Drop "other" column together with the two placeholder columns
    df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    ## Normalise every null entry to NaN (np.nan: the np.NaN alias was removed in NumPy 2.0).
    ## Columns without nulls are skipped so their dtype is left untouched.
    for col in df.columns:
        null_mask = df[col].isnull()
        if null_mask.any():
            df.loc[null_mask, col] = np.nan

    return df

In [3]:
## Load the original reference GTF: tab-separated, no header row, "#" starts a comment
gtf_column_names = ["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"]
original_ref = pd.read_csv(
    "../../../../references/bernardo/Homo_sapiens.GRCh38.107_ERCC.gtf",
    sep="\t",
    header=None,
    names=gtf_column_names,
    comment="#",
    low_memory=False,
)

In [4]:
## Keep only the "gene" entries of the reference and parse their gene names and ids
## (a copy is passed because parse_df_columns modifies its input in place)
orig_ref = parse_df_columns(
    original_ref[original_ref["type"].eq("gene")].copy(),
    is_ref=True,
)

In [5]:
## Gene id and gene name of the two PSEN genes, taken from the parsed reference
is_psen_gene = orig_ref["gene_name"].isin(["PSEN1", "PSEN2"])
orig_ref_PSEN = orig_ref.loc[is_psen_gene, ["gene_id", "gene_name"]]

In [6]:
## Clean up AD risk gene annotation and save it
## (AD_genes.csv is read as a tab-separated file)
ad_names = pd.read_csv("../../../../references/bernardo/AD_genes.csv", sep="\t")

## Add the PSEN genes first and only then drop missing values and duplicates, so that
## PSEN1/PSEN2 are not listed twice when the AD gene list already contains them
ad_names = pd.concat([ad_names["gene_name"], orig_ref_PSEN["gene_name"]]).dropna().drop_duplicates()

## Attach the reference gene ids; gene names absent from the reference are dropped (inner merge)
ad_risk_gene_annotation = pd.merge(ad_names, orig_ref[["gene_name", "gene_id"]], on="gene_name")

ad_risk_gene_annotation.to_csv("../../../../references/bernardo/AD_gwas_genes.tsv", index=False, sep="\t")

FileNotFoundError: [Errno 2] No such file or directory: '../../../../references/bernardo/AD_genes.csv'

In [8]:
## Read the disease relevant genes (the file's own header row is replaced by the names below)
## and append the AD risk genes so that all AD GWAS loci are included; exact duplicate rows are removed
disease_relevant_genes = (
    pd.concat([
        pd.read_csv(
            "../../../../references/bernardo/disease_relevant_gene_names.csv",
            names=["gene_id", "gene_name"],
            header=0,
        ),
        ad_risk_gene_annotation,
    ])
    .drop_duplicates()
    .reset_index(drop=True)
)

## Keep only the (gene_name, gene_id) pairs that also exist in the ENSEMBL 107 reference,
## which discards mislabeled genes
disease_relevant_genes_clean = pd.merge(
    disease_relevant_genes,
    orig_ref[["gene_name", "gene_id"]],
    how="inner",
    on=["gene_name", "gene_id"],
)

## Write the cleaned table as a tab-separated file
disease_relevant_genes_clean.to_csv(
    "../../../../references/bernardo/medically_relevant_genes.tsv",
    index=False,
    sep="\t",
)