# __Reviewer feedback analysis 3__

- Country topical enrichment statistics

## ___Set up___

In [1]:
# For topic enrichment per country p-value multiple testing correction
from statsmodels.stats.multitest import multipletests

# For umap
import pickle, umap
import numpy as np
import pandas as pd
import umap.plot
import matplotlib.pyplot as plt
from pathlib import Path
from matplotlib import colors

# for animated gif
import imageio

# for movie
import cv2

from tqdm import tqdm

proj_dir = Path.home() / "projects/plant_sci_hist"

## ___Statistical significance of topical enrichment___

Reviewer 1:

"For Figure 5F it is not clear which of the changes are statistically significant, particularly if controlling for testing of multiple hypotheses."

Response: We have conduct statistical tests on these but did not report the results. They are now incorporated and we modified the figure to indicate significant differences after multiple testing corrections.


### Multiple testing correction

In [10]:
# stat is already generated, but not multiple-testing corrected

# Get the p-values
dir75     = proj_dir / "7_countries/7_5_country_over_time"
pval_file = dir75 / "country_top10_toc_pval.csv"

pval_df = pd.read_csv(pval_file, index_col=[0])
pval_df.head(2)

Unnamed: 0_level_0,toc_name,CHN,USA,JPN,DEU,FRA,GBR,IND,ESP,ITA,AUS
toc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,allergen | pollen | ige | allergenic,4.535153e-42,1.6095740000000002e-17,0.885126,1.095708e-12,0.026466,7e-06,0.7769514,1.318535e-47,6.467812e-08,0.321514
1,medium | callus | regeneration | culture | som...,3.583564e-38,1.828608e-05,0.538833,4.9877970000000005e-18,0.209628,0.018076,5.853805e-186,0.006176955,0.007353872,3e-06


In [32]:
countries = pval_df.columns[1:]
countries

Index(['CHN', 'USA', 'JPN', 'DEU', 'FRA', 'GBR', 'IND', 'ESP', 'ITA', 'AUS'], dtype='object')

In [33]:
# correct for each country
pval_corrected = []
for country in countries:
  pval = pval_df[country].values
  pval_corrected.append(multipletests(pval, method='fdr_bh')[1])

len(pval_corrected)

10

In [34]:
# Turn into np array than transpose
pval_corrected_arr = np.array(pval_corrected).T

# Convert to dataframe
pval_corrected_df = pd.DataFrame(pval_corrected_arr, columns=list(countries),
                                 index=pval_df.index)
print(pval_corrected_df.shape)

# output to csv
pval_corrected_df.to_csv(dir75 / "country_top10_toc_pval_corrected.csv")

(90, 10)


### Get corrected p-values for Fig 5F

In [35]:
# country order
c_order = ["JPN", "GBR", "DEU", "FRA", "USA", "AUS", "ESP", "ITA", "CHN", "IND"]

# topic order
t_order = [1, 69, 30, 9, 23, 28, 75, 86, 83, 21, 0, 54, 51]

In [42]:
# [[pval_country1], [pval_country2], ...]]
pvals_list = []
for c in c_order:
  pvals = []
  for t in t_order:
    pvals.append(pval_corrected_df.loc[t, c])
  pvals_list.append(pvals)


In [44]:
# Convert to np array
pvals_arr = np.array(pvals_list).T

# Convert to dataframe
pvals_df = pd.DataFrame(pvals_arr, columns=c_order, index=t_order)

# output to csv
pvals_df.to_csv(dir75 / "country_top10_toc_pval_corrected_Fig5F.csv")

In [45]:
pvals_df.head(5)

Unnamed: 0,JPN,GBR,DEU,FRA,USA,AUS,ESP,ITA,CHN,IND
1,0.563895,0.02324108,1.603221e-17,0.248244,2.420217e-05,8.656122e-06,0.01010774,0.01121777,7.167128e-38,2.634212e-184
69,5.984807e-152,1.680876e-73,1.956934e-30,2.0800039999999998e-24,1.110435e-218,2.532725e-22,2.6927629999999997e-36,0.8805576,3.0444509999999996e-203,0.6775499
30,1.693636e-40,0.08094176,0.1119947,0.07084776,4.823607e-07,0.2079011,0.2241177,0.02950271,2.599642e-09,0.1119393
9,1.079546e-14,7.847829e-10,1.054466e-08,4.100327e-06,2.153816e-20,0.9293587,0.03920159,0.5861011,7.847940000000001e-162,1.830068e-07
23,1.8103400000000002e-39,4.011821e-37,2.8895940000000003e-43,1.463937e-12,1.8898900000000002e-62,2.058462e-08,6.329757e-18,7.114919e-07,0.0,1.602068e-21


## ___Statistical significance of species enrichment___

Although this is not mentioned in the review, thought it makes sense to also get stats and modify Figure 4 on species enrichment.

The p-values are already generated in Dataset S10. Need to add a sheet with corrected p-values.

### Set up

In [7]:
# stat is already generated, but not multiple-testing corrected

# Get the p-values
dir53     = proj_dir / "5_species_over_time/5_3_sp_topic_time"
pval_file = dir53 / "table_df_toc_FET_top5_genera.xlsx"

pval_df_gly = pd.read_excel(pval_file, sheet_name="Glycine", index_col=[0])
pval_df_tri = pd.read_excel(pval_file, sheet_name="Triticum", index_col=[0])
pval_df_ory = pd.read_excel(pval_file, sheet_name="Oryza", index_col=[0])
pval_df_ara = pd.read_excel(pval_file, sheet_name="Arabidopsis", index_col=[0])
pval_df_nic = pd.read_excel(pval_file, sheet_name="Nicotiana", index_col=[0])

In [9]:
pval_df = pd.DataFrame((pval_df_gly["Pvalue"],
                        pval_df_ory["Pvalue"],
                        pval_df_tri["Pvalue"],
                        pval_df_ara["Pvalue"],
                        pval_df_nic["Pvalue"])).T

species         = ["Glycine", "Triticum", "Oryza", "Arabidopsis", "Nicotiana"]
pval_df.columns = species

pval_df.head(2)

Unnamed: 0_level_0,Glycine,Triticum,Oryza,Arabidopsis,Nicotiana
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.734667,0.025205,0.017289,6.732208999999999e-38,0.00427
1,0.000158,0.023164,0.456883,2.526232e-122,0.016604


### Multiple testing correction

In [10]:
# correct for each country
pval_corrected = []
for sp in species:
  pval = pval_df[sp].values
  pval_corrected.append(multipletests(pval, method='fdr_bh')[1])

len(pval_corrected)

5

In [11]:
# Turn into np array than transpose
pval_corrected_arr = np.array(pval_corrected).T

# Convert to dataframe
pval_corrected_df = pd.DataFrame(pval_corrected_arr, 
                                 columns=list(species),
                                 index=pval_df.index)
print(pval_corrected_df.shape)

# output to csv
pval_corrected_df.to_csv(dir53 / "species_top5_toc_pval_corrected.csv")

(90, 5)


### Get corrected p-values for Fig 5F

In [12]:
# country order
s_order = ["Arabidopsis", "Oryza", "Nicotiana", "Triticum", "Glycine"]

# topic order copied from illustrator
t_order_rev = [71,9,57,76,54,75,5,69,70,77,61,12,3,11,23,30,10,33,28,29,27,21,26]
t_order = t_order_rev[::-1]

t_order

[26,
 21,
 27,
 29,
 28,
 33,
 10,
 30,
 23,
 11,
 3,
 12,
 61,
 77,
 70,
 69,
 5,
 75,
 54,
 76,
 57,
 9,
 71]

In [13]:
# [[pval_sp1], [pval_sp2], ...]]
pvals_list = []
for s in s_order:
  pvals = []
  for t in t_order:
    pvals.append(pval_corrected_df.loc[t, s])
  pvals_list.append(pvals)


In [15]:
# Convert to np array
pvals_arr = np.array(pvals_list).T

# Convert to dataframe
pvals_df = pd.DataFrame(pvals_arr, columns=s_order, index=t_order)

# output to csv
pvals_df.to_csv(dir53 / "species_top10_toc_pval_corrected_Fig5F.csv")