In [1]:

import pandas as pd
import plotly.graph_objects as go
from scipy.stats import pearsonr

In [3]:

def split_string_with_dgrp(df, is_sex = True):
    """
    This function takes a dataframe as input and splits the 'genotype' column into two separate lists: DGRP and sex.
    
    Parameters:
    df (DataFrame): The input dataframe containing the 'genotype' and 'sex' columns.
    
    Returns:
    DGRP (list): A list of DGRP values extracted from the 'genotype' column.
    sex (list): A list of sex values extracted from the 'sex' column.
    """
    DGRP=[]
    sex = []
    for i in range(len(df)):
        if 'dgrp' in df.iloc[i, 1]:
            DGRP.append(df.iloc[i, 1])
            if is_sex:
                s= df.iloc[i,2]
                sex.append(s)
    return DGRP,sex

def merge_data(brain, behav, DGRP, sex=None, is_b2 = False):
    """
    This function merges the brain and behavior data based on the DGRP and sex values.
    
    Parameters:
    brain (DataFrame): The brain data dataframe.
    behav (DataFrame): The behavior data dataframe.
    DGRP (list): A list of DGRP values.
    sex (list): A list of sex values.
    
    Returns:
    merged_df (DataFrame): The merged dataframe containing the brain and behavior data.
    """
    if is_b2:
        data = behav[behav['genotype'].isin(DGRP)]
    else:
        data = behav[behav['genotype'].isin(DGRP) & behav["head_scanned"]==True]
    data['genotype'] = data['genotype'].apply(lambda x: 'DGRP_0' + x.split('dgrp')[1] if len(x.split('dgrp')[1]) == 2 else 'DGRP_' + x.split('dgrp')[1])
    brain['DGRP'] = brain['DGRP'].apply(lambda x: 'DGRP_0' + x if len(x) == 2 else 'DGRP_'+ x)

    
    data.rename(columns={'genotype': 'DGRP'}, inplace=True)


    if sex == None:
        merged_df = pd.merge(brain, data, on='DGRP')
    else:
        merged_df = pd.merge(brain, data, on=['DGRP', 'sex'])
    
    return merged_df

def calculate_pvalues(df):
    """
    This function calculates the p-values for the correlation matrix of a dataframe.
    
    Parameters:
    df (DataFrame): The input dataframe.
    
    Returns:
    pvalues (DataFrame): The dataframe containing the p-values for the correlation matrix.
    """
    dfcols = pd.DataFrame(columns=df.columns)
    pvalues = dfcols.transpose().join(dfcols, how='outer')
    for r in df.columns:
        for c in df.columns:
            tmp = df[df[r].notnull() & df[c].notnull()]
            pvalues[r][c] = round(pearsonr(tmp[r], tmp[c])[1], 4)
    return pvalues

In [4]:

behav = pd.read_csv("/Users/skumar/Documents/PhD/BrainAnalysis/Behavior/summary.csv")
brain = pd.read_csv("/Users/skumar/Project/PHD_work/GWAS/dataset/vol_hratio.csv", sep=",")
behav_2 = pd.read_csv("/Users/skumar/Documents/PhD/BrainAnalysis/Behavior/brain_behavior/pair_compare_sd.csv")


In [5]:
behav_2

Unnamed: 0,contrast,genotype,SE,df,t.ratio,p.value,genotype.1,estimate
0,1,dgrp100,0.043461,0.024539,4010.099248,1.771116,1.000000,noShock dgrp100 - shock dgrp100
1,2,dgrp105,0.039224,0.024557,4011.498474,1.597244,1.000000,noShock dgrp105 - shock dgrp105
2,3,dgrp153,0.089734,0.024994,4020.961623,3.590250,0.575007,noShock dgrp153 - shock dgrp153
3,4,dgrp189,0.049654,0.024802,4015.881690,2.001970,1.000000,noShock dgrp189 - shock dgrp189
4,5,dgrp195,0.019827,0.025522,4047.876561,0.776860,1.000000,noShock dgrp195 - shock dgrp195
...,...,...,...,...,...,...,...,...
63,64,dgrp897,0.038767,0.025193,4016.290394,1.538795,1.000000,noShock dgrp897 - shock dgrp897
64,65,dgrp91,-0.000734,0.024579,4010.650642,-0.029876,1.000000,noShock dgrp91 - shock dgrp91
65,66,dgrp911,0.001255,0.024591,4011.532001,0.051029,1.000000,noShock dgrp911 - shock dgrp911
66,67,dgrp913,0.031141,0.024804,4015.248662,1.255484,1.000000,noShock dgrp913 - shock dgrp913


In [6]:
DGRP,sex = split_string_with_dgrp(behav)
merged_df = merge_data(brain, behav, DGRP, sex)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['genotype'] = data['genotype'].apply(lambda x: 'DGRP_0' + x.split('dgrp')[1] if len(x.split('dgrp')[1]) == 2 else 'DGRP_' + x.split('dgrp')[1])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.rename(columns={'genotype': 'DGRP'}, inplace=True)


In [7]:
merged_df

Unnamed: 0,DGRP,sex,abs_volume,h_ratio,id,exp_type,shock_color,shock_arm,treatment,replicates,...,post_incorrect_choices,post_analysis_time,group,group_sex,folder,experiment_date,experiment_by,head_scanned,abs_dev,abs_dev_sex
0,DGRP_737,female,0.015415,0.014041,2,operant_place,blue,M,shock,1,...,,,dgrp737_shock,dgrp737_shock_female,operant_place/20230707/csv/dgrp737_shock/female,20230707,riddha,True,0.207584,0.196769
1,DGRP_732,female,0.015346,0.014382,40,operant_place,blue,L,shock,2,...,,,dgrp732_shock,dgrp732_shock_female,operant_place/20230621/csv/dgrp732_shock/female,20230621,riddha,True,0.164331,0.167038
2,DGRP_354,female,0.018734,0.016683,40,operant_place,green,L,shock,1,...,,,dgrp354_shock,dgrp354_shock_female,operant_place/20230621/csv/dgrp354_shock/female,20230621,riddha,True,0.017099,0.00243
3,DGRP_786,female,0.018789,0.017888,2,operant_place,blue,M,shock,1,...,,,dgrp786_shock,dgrp786_shock_female,operant_place/20230705/csv/dgrp786_shock/female,20230705,riddha,True,0.148576,0.154391
4,DGRP_732,male,0.016771,0.018079,48,operant_place,blue,R,shock,2,...,,,dgrp732_shock,dgrp732_shock_male,operant_place/20230621/csv/dgrp732_shock/male,20230621,riddha,True,0.140436,0.137686
5,DGRP_309,female,0.017028,0.018391,1,operant_place,blue,L,shock,1,...,,,dgrp309_shock,dgrp309_shock_female,operant_place/20230703/csv/dgrp309_shock/female,20230703,riddha,True,0.053567,0.057445
6,DGRP_437,female,0.019896,0.019245,2,operant_place,blue,M,shock,1,...,,,dgrp437_shock,dgrp437_shock_female,operant_place/20230705/csv/dgrp437_shock/female,20230705,riddha,True,0.157227,0.158835
7,DGRP_850,male,0.021624,0.019396,9,operant_place,green,R,shock,1,...,,,dgrp850_shock,dgrp850_shock_male,operant_place/20230703/csv/dgrp850_shock/male,20230703,riddha,True,0.072065,0.06944
8,DGRP_031,male,0.019653,0.019432,48,operant_place,green,R,shock,2,...,,,dgrp31_shock,dgrp31_shock_male,operant_place/20230621/csv/dgrp31_shock/male,20230621,riddha,True,0.062256,0.066204
9,DGRP_737,male,0.018805,0.019528,10,operant_place,blue,L,shock,1,...,,,dgrp737_shock,dgrp737_shock_male,operant_place/20230707/csv/dgrp737_shock/male,20230707,riddha,True,0.080066,0.06908


In [10]:

DGRP, sex = split_string_with_dgrp(behav_2, is_sex=False)
DGRP

['dgrp100',
 'dgrp105',
 'dgrp153',
 'dgrp189',
 'dgrp195',
 'dgrp208',
 'dgrp217',
 'dgrp227',
 'dgrp228',
 'dgrp26',
 'dgrp303',
 'dgrp304',
 'dgrp309',
 'dgrp31',
 'dgrp313',
 'dgrp318',
 'dgrp319',
 'dgrp321',
 'dgrp354',
 'dgrp362',
 'dgrp373',
 'dgrp382',
 'dgrp385',
 'dgrp386',
 'dgrp390',
 'dgrp391',
 'dgrp395',
 'dgrp405',
 'dgrp437',
 'dgrp440',
 'dgrp486',
 'dgrp517',
 'dgrp551',
 'dgrp559',
 'dgrp563',
 'dgrp584',
 'dgrp59',
 'dgrp595',
 'dgrp627',
 'dgrp705',
 'dgrp721',
 'dgrp732',
 'dgrp737',
 'dgrp738',
 'dgrp75',
 'dgrp765',
 'dgrp776',
 'dgrp786',
 'dgrp787',
 'dgrp790',
 'dgrp801',
 'dgrp802',
 'dgrp805',
 'dgrp808',
 'dgrp818',
 'dgrp820',
 'dgrp822',
 'dgrp850',
 'dgrp853',
 'dgrp855',
 'dgrp859',
 'dgrp879',
 'dgrp892',
 'dgrp897',
 'dgrp91',
 'dgrp911',
 'dgrp913',
 'dgrp93']

In [14]:
data = behav_2[behav_2['genotype'].isin(DGRP)]

data

Unnamed: 0,contrast,genotype,SE,df,t.ratio,p.value,genotype.1,estimate
0,1,dgrp100,0.043461,0.024539,4010.099248,1.771116,1.000000,noShock dgrp100 - shock dgrp100
1,2,dgrp105,0.039224,0.024557,4011.498474,1.597244,1.000000,noShock dgrp105 - shock dgrp105
2,3,dgrp153,0.089734,0.024994,4020.961623,3.590250,0.575007,noShock dgrp153 - shock dgrp153
3,4,dgrp189,0.049654,0.024802,4015.881690,2.001970,1.000000,noShock dgrp189 - shock dgrp189
4,5,dgrp195,0.019827,0.025522,4047.876561,0.776860,1.000000,noShock dgrp195 - shock dgrp195
...,...,...,...,...,...,...,...,...
63,64,dgrp897,0.038767,0.025193,4016.290394,1.538795,1.000000,noShock dgrp897 - shock dgrp897
64,65,dgrp91,-0.000734,0.024579,4010.650642,-0.029876,1.000000,noShock dgrp91 - shock dgrp91
65,66,dgrp911,0.001255,0.024591,4011.532001,0.051029,1.000000,noShock dgrp911 - shock dgrp911
66,67,dgrp913,0.031141,0.024804,4015.248662,1.255484,1.000000,noShock dgrp913 - shock dgrp913


In [15]:

data['genotype'] = data['genotype'].apply(lambda x: 'DGRP_0' + x.split('dgrp')[1] if len(x.split('dgrp')[1]) == 2 else 'DGRP_' + x.split('dgrp')[1])
data

Unnamed: 0,contrast,genotype,SE,df,t.ratio,p.value,genotype.1,estimate
0,1,DGRP_100,0.043461,0.024539,4010.099248,1.771116,1.000000,noShock dgrp100 - shock dgrp100
1,2,DGRP_105,0.039224,0.024557,4011.498474,1.597244,1.000000,noShock dgrp105 - shock dgrp105
2,3,DGRP_153,0.089734,0.024994,4020.961623,3.590250,0.575007,noShock dgrp153 - shock dgrp153
3,4,DGRP_189,0.049654,0.024802,4015.881690,2.001970,1.000000,noShock dgrp189 - shock dgrp189
4,5,DGRP_195,0.019827,0.025522,4047.876561,0.776860,1.000000,noShock dgrp195 - shock dgrp195
...,...,...,...,...,...,...,...,...
63,64,DGRP_897,0.038767,0.025193,4016.290394,1.538795,1.000000,noShock dgrp897 - shock dgrp897
64,65,DGRP_091,-0.000734,0.024579,4010.650642,-0.029876,1.000000,noShock dgrp91 - shock dgrp91
65,66,DGRP_911,0.001255,0.024591,4011.532001,0.051029,1.000000,noShock dgrp911 - shock dgrp911
66,67,DGRP_913,0.031141,0.024804,4015.248662,1.255484,1.000000,noShock dgrp913 - shock dgrp913


Unnamed: 0,DGRP,sex,abs_volume,h_ratio
0,DGRP_DGRP_228,female,0.010634,0.009527
1,DGRP_DGRP_332,female,0.015221,0.012052
2,DGRP_DGRP_228,male,0.010962,0.012418
3,DGRP_DGRP_390,female,0.015531,0.012578
4,DGRP_DGRP_332,male,0.012916,0.013225
...,...,...,...,...
120,DGRP_DGRP_093,female,0.020374,0.031324
121,DGRP_DGRP_790,male,0.031894,0.031896
122,DGRP_DGRP_026,female,0.019917,0.032023
123,DGRP_DGRP_195,female,0.019900,0.032727


In [None]:
 
data.rename(columns={'genotype': 'DGRP'}, inplace=True)



In [None]:

    
merged_df = pd.merge(brain, data, on='DGRP')

In [9]:
merged_df

Unnamed: 0,DGRP,sex,abs_volume,h_ratio,id,exp_type,shock_color,shock_arm,treatment,replicates,...,head_scanned,abs_dev,abs_dev_sex,contrast,SE,df,t.ratio,p.value,genotype.1,estimate


In [None]:

correlation_matrix = merged_df[["abs_volume","h_ratio","activity","correct_choices","SE"]].corr()
"""
fig = px.imshow(correlation_matrix)
fig.show()

fig = px.imshow(calculate_pvalues(merged_df[["abs_volume","h_ratio","activity","correct_choices","frac_time_on_shocked"]]))
fig.show()
"""

p_values = calculate_pvalues(merged_df[["abs_volume", "h_ratio", "activity", "correct_choices", "SE"]])

fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values,
    x=["Absolute volume (std)", "H-ratio", "Activity", "Correct number of choices", "Place learning ability"],
    y=["Absolute volume (std)", "H-ratio", "Activity", "Correct number of choices", "Place learning ability"],
    colorscale="Blues",
    colorbar=dict(title="Correlation Coefficient")
))

names = ["Absolute volume (std)", "H-ratio", "Activity", "Correct number of choices", "Place learning ability"]
annotations = []
for i, row in enumerate(correlation_matrix.values):
    for j, value in enumerate(row):
        annotations.append(
            dict(
                x=names[j],
                y=names[i],
                text=f"p-value: {p_values.iloc[i, j]:.3f}",
                showarrow=False,
                font=dict(color="white" if abs(value) > 0.5 else "black")
            )
        )

fig.update_layout(
    title="Correlation Coefficient and p-values",
    annotations=annotations,
    xaxis=dict(title="Variable"),
    yaxis=dict(title="Variable"),
)

fig.show()