In [1]:
## Import Libraries
import pandas as pd
import numpy as np
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud
import csv
from supervenn import supervenn
import os
from scipy import stats
from biomart import BiomartServer
import io
from matplotlib import rcParams

## Lift pandas display limits: print every row and never truncate cell contents
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [2]:
def describe_column_across_dfs(dfs, labels, column):
    """
    Summarise one column across several dataframes, side by side.

    Calls ``describe()`` on ``column`` in every dataframe and lines the results
    up as one output column per label, with the statistic names in a leading
    ``Statistic`` column.

    Parameters:
    dfs (list of pd.DataFrame): List of dataframes to describe.
    labels (list of str): One label per dataframe, used as that dataframe's
        output column name. Labels must be unique and must not be "Statistic".
    column (str): The column name to describe.

    Returns:
    pd.DataFrame: One row per statistic; a ``Statistic`` column followed by one
        column per label, in the order the labels were given.

    Raises:
    ValueError: If ``dfs`` and ``labels`` differ in length, if a label is
        repeated or equals "Statistic", or if ``column`` is missing from any
        dataframe.
    """
    if len(dfs) != len(labels):
        raise ValueError("The number of dataframes must match the number of labels")

    # Labels become dict keys / column names, so a repeated label would silently
    # overwrite the statistics of an earlier dataframe.
    if len(set(labels)) != len(labels):
        raise ValueError("Labels must be unique; a repeated label would overwrite an earlier dataframe's statistics")

    # The statistic names are emitted in a column called 'Statistic'; a label with
    # the same name would yield two identically named columns.
    if "Statistic" in labels:
        raise ValueError("'Statistic' is reserved for the statistic-name column and cannot be used as a label")

    desc_dict = {}
    for df, label in zip(dfs, labels):
        if column not in df.columns:
            raise ValueError(f"Column '{column}' not found in dataframe labeled '{label}'")
        desc_dict[label] = df[column].describe()

    # Name the index before resetting it instead of renaming the implicit 'index'
    # column afterwards, so that a label such as 'index' cannot collide with it.
    return pd.DataFrame(desc_dict).rename_axis("Statistic").reset_index()

In [3]:
## Read the processed meta-analysis results (tab-separated) for the temporal and frontal lobes
df_temporal = pd.read_csv(
    "../../data/processed_data/temporal_lobe/temporal_lobe_meta-analysis_processed_results.tsv",
    sep="\t",
)
df_frontal = pd.read_csv(
    "../../data/processed_data/frontal_lobe/frontal_lobe_meta-analysis_processed_results.tsv",
    sep="\t",
)

In [4]:
## Keep only the annotation columns and the heterogeneity statistics
## (the same column selection is applied to both lobes)
heterogeneity_columns = [
    'chr',
    'gene_id',
    'gene_symbol',
    'regulation',
    'heterogeneity_I_squared',
    'heterogeneity_Chi_squared',
    'heterogeneity_degrees_of_freedom_squared',
    'heterogeneity_P-value',
]

df_temporal = df_temporal[heterogeneity_columns]
df_frontal = df_frontal[heterogeneity_columns]

In [5]:
## Pair each lobe's dataframe with the label used for its column in the summary table
labelled_frames = {
    "temporal_lobe_i_squared_statistics": df_temporal,
    "frontal_lobe_i_squared_statistics": df_frontal,
}
df_list = list(labelled_frames.values())
df_labels = list(labelled_frames.keys())

In [6]:
## Summarise the I-squared column of every dataframe in a single table
describe_df = describe_column_across_dfs(
    dfs=df_list,
    labels=df_labels,
    column="heterogeneity_I_squared",
)

In [7]:
## Save the I-squared summary table as a tab-separated file
heterogeneity_stats_path = "../../data/processed_data/heterogeneity/heterogeneity_i_squared_statistics.tsv"

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(heterogeneity_stats_path), exist_ok=True)

describe_df.to_csv(heterogeneity_stats_path, sep="\t", index=False)

In [8]:
## Display the combined I-squared summary table
describe_df

Unnamed: 0,Statistic,temporal_lobe_i_squared_statistics,frontal_lobe_i_squared_statistics
0,count,29492.0,31378.0
1,mean,25.229089,9.794442
2,std,34.317103,21.841493
3,min,0.0,0.0
4,25%,0.0,0.0
5,50%,0.0,0.0
6,75%,58.2,0.0
7,max,98.7,93.9
