# Import needed modules

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format
import matplotlib.pyplot as plt
from scipy import stats
import scikit_posthocs as posthocs
from statsmodels.stats.anova import AnovaRM
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multitest import multipletests
import itertools
import seaborn as sns

# Read *HCV_Data_Germany_ngseminar* dataset

It contains data of patients with HCV and healthy people (blood donor).

[Dataset](http://archive.ics.uci.edu/ml/datasets/HCV+data) was downloaded from UCI [Machine Learning Repository](http://archive.ics.uci.edu/ml/datasets/HCV+data).

It was slightly modified by me for this seminar :).

It contains following variables:
* **Category** (nominal): 
    * Blood Donor
    * Fibrosis
    * Cirrhosis - very advanced, late fibrosis
    * Hepatitis - inflammation of liver tissue.
* **Healthy** (nominal, that's what I added to have nominal variables with 2 groups):
    * YES - for Category: Blood Donor
    * NO - for all other cases
* **Age** (continuous, but here it is rounded to integers)
* **Sex** (nominal): 
    * m - man
    * f - woman
* **ALB** (continuous) - albumin. Albumins are globular, water-soluble proteins, commonly found in blood plasma.
* **ALP** (continuous) - alkaline phosphatase; dephosphorylating enzyme, elevated levels of ALP are often found in patients with liver disease.
* **ALT** (continuous) - alanine amino-transferase, alanine transaminase; enzyme involved in amino acids metabolism. Elevated ALT indicates health problems, often with liver (hepatitis, liver damage, bile duct problems).
* **AST** (continuous) - aspartate amino-transferase, aspartate transaminase; enzyme involved in amino acids metabolism. Used to assess liver function, also together with ALT as AST/ALT ratio.
* **BIL** (continuous) - bilirubin; metabolite produced as a result of breakdown of aged or abnormal red blood cells. It is responsible for yellow discoloration in jaundice. Elevated in patients with hepatitis.
* **CHE** (continuous) - choline esterase; an enzyme. One of its types is produced mainly in liver and it may be lower in patients with liver disease.
* **CHOL** (continuous) - cholesterol.
* **CREA** (continuous) - creatinine; metabolite, produced by muscle metabolism, used as marker of kidney function.
* **GGT** (continuous) – gamma-glutamyl transferase; an enzyme involved, among others, in xenobiotic detoxification. GGT is elevated in patients with liver disease (or after excess alcohol consumption).
* **PROT** (continuous) – total protein.

We will use only some of them, but explanation for all doesn't harm.

In [2]:
# Load the German HCV dataset; the first CSV column is used as the row index.
hcvDataG = pd.read_csv(
    'HCV_Data_Germany_ngseminar.csv',
    header=0,
    index_col=0,
)
display(hcvDataG)

Unnamed: 0_level_0,Category,Healthy,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,Blood Donor,YES,32,m,38.5000,52.5000,7.7000,22.1000,7.5000,6.9300,3.2300,106.0000,12.1000,69.0000
2,Blood Donor,YES,32,m,38.5000,70.3000,18.0000,24.7000,3.9000,11.1700,4.8000,74.0000,15.6000,76.5000
3,Blood Donor,YES,32,m,46.9000,74.7000,36.2000,52.6000,6.1000,8.8400,5.2000,86.0000,33.2000,79.3000
4,Blood Donor,YES,32,m,43.2000,52.0000,30.6000,22.6000,18.9000,7.3300,4.7400,80.0000,33.8000,75.7000
5,Blood Donor,YES,32,m,39.2000,74.1000,32.6000,24.8000,9.6000,9.1500,4.3200,76.0000,29.9000,68.7000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610,Cirrhosis,NO,59,f,39.0000,51.3000,19.6000,285.8000,40.0000,5.7700,4.5100,136.1000,101.1000,70.5000
612,Cirrhosis,NO,64,f,24.0000,102.8000,2.9000,44.4000,20.0000,1.5400,3.0200,63.0000,35.9000,71.3000
613,Cirrhosis,NO,64,f,29.0000,87.3000,3.5000,99.0000,48.0000,1.6600,3.6300,66.7000,64.2000,82.0000
614,Cirrhosis,NO,46,f,33.0000,,39.0000,62.0000,20.0000,3.5600,4.2000,52.0000,50.0000,71.0000


**Print unique categories**

In [3]:
# List every distinct value of the Category column, one per line (sorted by np.unique).
for category in np.unique(hcvDataG['Category']):
    print(category)

Blood Donor
Cirrhosis
Fibrosis
Hepatitis


**Print columns' names and check if variables are correctly listed above**

In [4]:
# Print every column name of the German dataset on its own line.
print(*hcvDataG.columns, sep='\n')

Category
Healthy
Age
Sex
ALB
ALP
ALT
AST
BIL
CHE
CHOL
CREA
GGT
PROT


# Read another dataset: *HCV_Data_Egypt_ngseminar*

It contains data of patients with HCV from Egypt.

[This dataset](http://archive.ics.uci.edu/ml/datasets/Hepatitis+C+Virus+%28HCV%29+for+Egyptian+patients) was downloaded from UCI [Machine Learning Repository](http://archive.ics.uci.edu/ml/datasets/HCV+data).

And it was also somewhat modified by me for this seminar, so please use the provided version rather than the original one :).

It contains more variables than the first (German) dataset:
* **Age** (continuous, but here it is again rounded to integers)
* **Gender** (nominal, and yes, it is the same as Sex in previous example, but in medicine you're lucky, when one thing has only 2 names): 
    * M - man
    * F - woman
* **Fever** - just presence (YES) or absence (NO) of fever. Originally, it was coded as 1 for NO and 2 for YES, but I thought that was too much for an introduction to statistics ;).
* **Nausea/Vomiting** - works like fever variable and the same applies to a few next variables.
* **Headache**
* **Diarrhea**
* **Fatigue & generalized bone ache**
* **Jaundice**
* **Epigastric pain**
* **WBC** - white blood cells.
* **RBC** - red blood cells.
* **Plat** - platelets.
* **AST 1** - aspartate transaminase, measured at week 1.
* **ALT** -  alanine amino-transferase:
    * **ALT 1** - measured in week 1 (probably some time around the beginning of treatment).
    * **ALT 4** - at week 4.
    * **ALT 12** - at week 12.
    * **ALT 24** - at week 24.
    * **ALT 36** - at week 36.
    * **ALT 48** - at week 48.
* **RNA** - amount of viral RNA from HCV:
    * **RNA Base** - at baseline, before treatment.
    * **RNA 4** - at week 4.
    * **RNA 12** - at week 12.
    * **RNA EOT** - at the end of treatment.
* **RNA EF** - RNA Elongation Factor; a protein that plays important role in translation and has also some other function. It can be also exploited by viruses for their replication.
* **Baseline histological Grading** - there are a few systems of histologic grading for hepatitis; they can produce scores in ranges like 0-22 or 0-18. They are complex and pathologist needs to assess many features of tissue, like presence of inflammation, necrosis and fibrosis.
* **Baseline histological staging** - it describes level of liver fibrosis, the higher the score the more advanced fibrosis:
    * **0** - no fibrosis, but also no such case in our dataset.
    * **1** - mild fibrosis.
    * **2** - moderate fibrosis.
    * **3** - severe fibrosis.
    * **4** - cirrhosis (very advanced fibrosis).

In [5]:
# Load the Egyptian HCV dataset; the first CSV column is used as the row index.
hcvDataE = pd.read_csv(
    'HCV_Data_Egypt_ngseminar.csv',
    header=0,
    index_col=0,
)
hcvDataE

Unnamed: 0_level_0,Age,Gender,BMI,Fever,Nausea/Vomiting,Headache,Diarrhea,Fatigue & generalized bone ache,Jaundice,Epigastric pain,...,ALT 36,ALT 48,ALT after 24 w,RNA Base,RNA 4,RNA 12,RNA EOT,RNA EF,Baseline histological Grading,Baseline histological staging
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,56,M,35,YES,NO,NO,NO,YES,YES,YES,...,5,5,5,655330,634536,288194,5,5,13,2
2,46,M,29,NO,YES,YES,NO,YES,YES,NO,...,57,123,44,40620,538635,637056,336804,31085,4,2
3,57,M,33,YES,YES,YES,YES,NO,NO,NO,...,5,5,5,571148,661346,5,735945,558829,4,4
4,49,F,33,NO,YES,NO,YES,NO,YES,NO,...,48,77,33,1041941,449939,585688,744463,582301,10,3
5,59,M,32,NO,NO,YES,NO,YES,YES,YES,...,94,90,30,660410,738756,3731527,338946,242861,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1381,44,M,29,NO,YES,YES,YES,NO,NO,NO,...,63,44,45,387795,55938,5,5,5,15,4
1382,55,M,34,NO,YES,YES,NO,NO,NO,NO,...,97,64,41,481378,152961,393339,73574,236273,10,2
1383,42,M,26,YES,YES,NO,NO,NO,YES,NO,...,87,39,24,612664,572756,806109,343719,160457,6,2
1384,52,M,29,YES,NO,NO,YES,YES,YES,NO,...,48,81,43,139872,76161,515730,2460,696074,15,3


In [6]:
# Print every column name of the Egyptian dataset on its own line.
print(*hcvDataE.columns, sep='\n')

Age
Gender
BMI
Fever
Nausea/Vomiting
Headache
Diarrhea
Fatigue & generalized bone ache
Jaundice
Epigastric pain
WBC
RBC
HGB
Plat
AST 1
ALT 1
ALT 4
ALT 12
ALT 24
ALT 36
ALT 48
ALT after 24 w
RNA Base
RNA 4
RNA 12
RNA EOT
RNA EF
Baseline histological Grading
Baseline histological staging


# Descriptive statistics

**We'll start with Germany dataset**

**Check how many cases and variables are in our dataset**

In [None]:
# Rows are cases (patients), columns are variables.
nrows = len(hcvDataG.index)
ncolumns = len(hcvDataG.columns)
print('Number of cases (rows): {0:d}'.format(nrows))
print('Number of variables (columns): {0:d}'.format(ncolumns))

**Calculate descriptive statistics for age**

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of Age for the whole dataset.
hcvDataG['Age'].describe()

**Repeat calculation dividing patients by gender**

In [None]:
# Summary statistics of Age computed separately for each sex.
hcvDataG.groupby('Sex')['Age'].describe()

**Now describe 2 variables at a time: Age and CHOL, dividing patients by gender**

In [None]:
# Summary statistics of Age and CHOL computed separately for each sex.
hcvDataG.groupby('Sex')[['Age', 'CHOL']].describe()

## Distributions

**Examine visually distribution of CHOL**

In [None]:
# Histogram of CHOL for all patients, drawn on an explicit axes without grid lines.
fig, ax = plt.subplots(figsize=(7, 5))
hcvDataG.hist(column='CHOL', ax=ax)
ax.grid(False)

Now divide patients by sex

In [None]:
# One CHOL histogram per sex, side by side.
hcvDataG.hist(column='CHOL', by='Sex', figsize=(14, 5))

**Let's repeat the same with BIL**

In [None]:
# Histogram of BIL for all patients, drawn on an explicit axes without grid lines.
fig, ax = plt.subplots(figsize=(7, 5))
hcvDataG.hist(column='BIL', ax=ax)
ax.grid(False)

In [None]:
# One BIL histogram per sex, side by side.
hcvDataG.hist(column='BIL', by='Sex', figsize=(14, 5))

**Try to do the same with ALB, dividing patients by Category**

In [None]:
# ALB histograms for each Category arranged in a 2x2 grid; assigning to `_` hides the axes repr.
_ = hcvDataG.hist(
    column='ALB',
    by='Category',
    layout=(2, 2),
    figsize=(10, 10),
)

# Statistical tests for continuous variables

## Normality of distribution

**Check if age of patients is normally distributed in our dataset**

In [None]:
# Shapiro-Wilk normality test of Age over all patients.
ageValues = hcvDataG['Age']
s, p = stats.shapiro(ageValues)

print('test statistics: {0:.4f}'.format(s))
print('p-value: {0:.4f}'.format(p))

**Repeat the test grouping patients by gender**

In [None]:
# Shapiro-Wilk normality test of Age, run separately for each sex.
hcvDataG_by_gender = hcvDataG.groupby('Sex')

# Iterating a groupby yields (group name, group frame) pairs.
for groupName, groupData in hcvDataG_by_gender:
    s, p = stats.shapiro(groupData.loc[:, 'Age'])

    print('Group: {0:s}'.format(groupName))
    print('  test statistics: {0:.4f}'.format(s))
    # Label fixed: it previously read "tp-value" because of a stray "t".
    print('  p-value: {0:.4f}'.format(p))
    print()

**Now let's check ALB, ALP, CHOL and CREA, dividing patients by sex**

In [None]:
# Shapiro-Wilk normality test for several lab variables, split by sex.
# NOTE: despite its name, `hcvDataG_by_stage` groups the data by 'Sex'.
hcvDataG_by_stage = hcvDataG.groupby('Sex')
variablesForAnalysis = ['ALB', 'ALP', 'CHOL', 'CREA']

for variable in variablesForAnalysis:
    print(variable)
    for groupName, groupData in hcvDataG_by_stage:
        # Missing values are removed before testing.
        observed = groupData[variable].dropna()
        s, p = stats.shapiro(observed)

        print('  Group: {0:s}'.format(str(groupName)))
        print('    test statistics: {0:.4f}'.format(s))
        print('    p-value: {0:.4f}'.format(p))

    print()

## Difference between 2 independent groups

**Compare ALP between healthy and ill people. Who has higher ALP?**

First, check the assumption for t-test:
* normality of distribution in both groups
* homogeneity of variance (basically equal variance in both groups)

In [None]:
# Shapiro-Wilk normality test of ALP within the healthy and the ill group.
hcvDataG_by_health = hcvDataG.groupby('Healthy')

for groupName, groupData in hcvDataG_by_health:
    # Missing ALP values are removed before testing.
    alpValues = groupData['ALP'].dropna()
    s, p = stats.shapiro(alpValues)

    print('Group: {0:s}'.format(groupName))
    print('  test statistics: {0:.4f}'.format(s))
    print('  p-value: {0:.4f}'.format(p))
    print()

Let's try logarithm of ALP

In [None]:
# Same normality check as before, but on the natural logarithm of ALP.
hcvDataG_by_health = hcvDataG.groupby('Healthy')

for groupName, groupData in hcvDataG_by_health:
    # Drop missing values first, then log-transform.
    logAlpValues = np.log(groupData['ALP'].dropna())
    s, p = stats.shapiro(logAlpValues)

    print('Group: {0:s}'.format(groupName))
    print('  test statistics: {0:.4f}'.format(s))
    print('  p-value: {0:.4f}'.format(p))
    print()

Now it's fine, we can move to the next assumption - equal variances.

But before, let's add log ALP as a new variable to our data frame, we will need it.

In [None]:
# Store the natural logarithm of ALP as a new column; later cells read 'log ALP' from hcvDataG.
hcvDataG['log ALP'] = np.log(hcvDataG['ALP'])

In [None]:
# Bartlett's test for equal variances of log ALP between healthy and ill people.
hcvDataG_by_health = hcvDataG.groupby('Healthy')

# Per-group log ALP values with missing entries removed (reused by later cells).
hcvDataG_healthy_ALP = hcvDataG_by_health.get_group('YES')['log ALP'].dropna()
hcvDataG_ill_ALP = hcvDataG_by_health.get_group('NO')['log ALP'].dropna()

s, p = stats.bartlett(hcvDataG_healthy_ALP, hcvDataG_ill_ALP)

print('test statistics: {0:.4f}'.format(s))
print('p-value: {0:.4f}'.format(p))
print()

The second assumption is not met, so let's check the difference with the Mann-Whitney test.

In [None]:
# Two-sided Mann-Whitney U test on log ALP (healthy vs ill).
s, p = stats.mannwhitneyu(
    hcvDataG_healthy_ALP,
    hcvDataG_ill_ALP,
    use_continuity=True,
    alternative='two-sided',
)

print('Mann-Whitney test for ALP in healthy and ill people')
print('  test statistics: {0:.4f}'.format(s))
print('  p-value: {0:.4f}'.format(p))

Would the conclusion change if we didn't use log-transform?

In [None]:
# Same Mann-Whitney U test, this time on the untransformed ALP values.
hcvDataG_healthy_ALP_noLog = hcvDataG_by_health.get_group('YES')['ALP'].dropna()
hcvDataG_ill_ALP_noLog = hcvDataG_by_health.get_group('NO')['ALP'].dropna()

s, p = stats.mannwhitneyu(
    hcvDataG_healthy_ALP_noLog,
    hcvDataG_ill_ALP_noLog,
    use_continuity=True,
    alternative='two-sided',
)

print('Mann-Whitney test for ALP in healthy and ill people')
print('  test statistics: {0:.4f}'.format(s))
print('  p-value: {0:.4f}'.format(p))

**It's exactly the same. Why?**

**And finally, let's see who has higher ALP.**

In [None]:
# Box plot of ALP for healthy ('YES') and ill ('NO') people.
fig, ax = plt.subplots(figsize=(7, 5))
hcvDataG.boxplot(column='ALP', by='Healthy', ax=ax, grid=False)

**Would the conclusion change if we used t-test instead?**

In [None]:
# Independent-samples t-test on log ALP (healthy vs ill), for comparison with
# the Mann-Whitney result obtained earlier.
s, p = stats.ttest_ind(hcvDataG_healthy_ALP, hcvDataG_ill_ALP, nan_policy='omit')

# Heading fixed: this cell runs a t-test, not a Mann-Whitney test.
print('t-test for log ALP in healthy and ill people')
print('  test statistics: {0:.4f}'.format(s))
print('  p-value: {0:.4f}'.format(p))

In [None]:
# Independent-samples t-test on the untransformed ALP values (healthy vs ill).
s, p = stats.ttest_ind(hcvDataG_healthy_ALP_noLog, hcvDataG_ill_ALP_noLog, nan_policy='omit')

# Heading fixed: this cell runs a t-test, not a Mann-Whitney test.
print('t-test for ALP (no log-transform) in healthy and ill people')
print('  test statistics: {0:.4f}'.format(s))
print('  p-value: {0:.4f}'.format(p))

## Differences between 3 or more independent groups

**Check if patients with different 'Category' have different ALP**

Start by verification of assumptions for ANOVA

In [None]:
# Shapiro-Wilk normality test of the analysed variable within each Category.
# `varName` selects the variable and is reused by the following cells.
hcvDataG_by_cat = hcvDataG.groupby('Category')
varName = 'ALP'

for groupName, groupData in hcvDataG_by_cat:
    # Missing values are removed before testing.
    observed = groupData[varName].dropna()
    s, p = stats.shapiro(observed)

    print('Group: {0:s}'.format(groupName))
    print('  test statistics: {0:.4f}'.format(s))
    print('  p-value: {0:.4f}'.format(p))
    print()

Normality assumption is not met, so let's use Kruskal-Wallis test to compare the groups.

In [None]:
# Kruskal-Wallis test of `varName` across the four categories.
# The per-category series keep their missing values; kruskal omits them.
hcvDonors, hcvCirrhosis, hcvFibrosis, hcvHepatitis = (
    hcvDataG_by_cat.get_group(category).loc[:, varName]
    for category in ('Blood Donor', 'Cirrhosis', 'Fibrosis', 'Hepatitis')
)

s, p = stats.kruskal(
    hcvDonors,
    hcvCirrhosis,
    hcvFibrosis,
    hcvHepatitis,
    nan_policy='omit',
)

print('test statistics: {0:.4f}'.format(s))
print('p-value: {0:.4f}'.format(p))

Which pairs of groups differ?

In [None]:
# Dunn's post-hoc test for pairwise differences between categories.
# Rows with a missing value of `varName` are dropped explicitly so the post-hoc
# test uses the same observations as the Kruskal-Wallis test (nan_policy='omit').
post_hoc_results = posthocs.posthoc_dunn(
    hcvDataG.dropna(subset=[varName]),
    val_col=varName,
    group_col='Category',
)
post_hoc_results

In [None]:
# Print the post-hoc p-value for every unordered pair of categories.
group_names = np.unique(hcvDataG['Category']).tolist()

for first, second in itertools.combinations(group_names, r=2):
    print('{0:s} vs {1:s}: p={2:.4f}'.format(first, second, post_hoc_results.loc[first, second]))

Finally, let's look at the differences

In [None]:
# Box plot of the analysed variable for each Category.
fig, ax = plt.subplots(figsize=(7, 5))
hcvDataG.boxplot(column=varName, by='Category', ax=ax, grid=False)
# Optional zoom of the y-axis: ax.set_ylim(15, 65)

**What would happen with ANOVA applied to the task?**

In [None]:
# One-way ANOVA on the same four groups; f_oneway receives NaN-free samples.
s, p = stats.f_oneway(*[
    sample.dropna()
    for sample in (hcvDonors, hcvCirrhosis, hcvFibrosis, hcvHepatitis)
])

print('test statistics: {0:.4f}'.format(s))
print('p-value: {0:.4f}'.format(p))

In [None]:
# Tukey's post-hoc test for pairwise differences between categories.
# Rows with a missing value of `varName` are dropped explicitly so the post-hoc
# test uses the same observations as the ANOVA (which is fed `.dropna()` samples).
post_hoc_results = posthocs.posthoc_tukey(
    hcvDataG.dropna(subset=[varName]),
    val_col=varName,
    group_col='Category',
)
post_hoc_results

In [None]:
# Print the post-hoc p-value for every unordered pair of categories.
group_names = np.unique(hcvDataG['Category']).tolist()

for first, second in itertools.combinations(group_names, r=2):
    print('{0:s} vs {1:s}: p={2:.4f}'.format(first, second, post_hoc_results.loc[first, second]))

## Differences between 2 dependent groups

**Now, we will move to the second (Egyptian) dataset.**

**Check if the amount of viral RNA changed between the baseline (RNA Base) and the end of treatment (RNA EOT)**

Start by checking the assumptions of the parametric test (t-test for related/dependent samples).

In [None]:
# Keep only patients that have both RNA measurements (any missing value drops the row).
varNames = ['RNA Base', 'RNA EOT']
hcvRNA = hcvDataE[varNames].dropna(axis='index', how='any')

In [None]:
# Shapiro-Wilk normality test for each RNA measurement.
# The loop variable is deliberately not called `varName`: that notebook-level
# name selects the variable analysed in the independent-groups cells and would
# otherwise be silently overwritten here.
for rnaVarName in varNames:
    s, p = stats.shapiro(hcvRNA.loc[:, rnaVarName])

    print('Group: {0:s}'.format(rnaVarName))
    print('  test statistics: {0:.4f}'.format(s))
    print('  p-value: {0:.4f}'.format(p))
    print()

At this point, you can check if logarithmic transformation of values makes distribution normal (but here it doesn't).

To analyze the changes in viral RNA, we will use non-parametric test: Wilcoxon test.

In [None]:
# Wilcoxon signed-rank test on the paired baseline / end-of-treatment RNA values.
rnaBaseline = hcvRNA['RNA Base']
rnaEndOfTreatment = hcvRNA['RNA EOT']
s, p = stats.wilcoxon(rnaBaseline, rnaEndOfTreatment)

print('test statistics: {0:.4f}'.format(s))
print('p-value: {0:.4f}'.format(p))

The amount of viral RNA changes during treatment, so try to visualize it.

In [None]:
# Box plots of the RNA measurements listed in `varNames`.
fig, ax = plt.subplots(figsize=(7, 5))
hcvRNA.boxplot(column=varNames, ax=ax, grid=False)

**What if we used t-test instead?**

In [None]:
# Paired (related-samples) t-test on baseline vs end-of-treatment RNA.
rnaBaseline = hcvRNA['RNA Base']
rnaEndOfTreatment = hcvRNA['RNA EOT']
s, p = stats.ttest_rel(rnaBaseline, rnaEndOfTreatment)

print('test statistics: {0:.4f}'.format(s))
print('p-value: {0:.4f}'.format(p))

## Differences between 3 or more related measurements

**Analyse changes in the amount of viral RNA along the treatment**

We checked normality of the distribution at baseline and at the end of treatment, so we know that assumptions for ANOVA are not met. We can move directly to the Friedman test.

In [None]:
# Keep only patients with a complete series of the four RNA measurements.
varNames = ['RNA Base', 'RNA 4', 'RNA 12', 'RNA EOT']
hcvRNA = hcvDataE[varNames].dropna(axis='index', how='any')

In [None]:
# Friedman test across the four repeated RNA measurements.
rnaSeries = [hcvRNA[column] for column in ('RNA Base', 'RNA 4', 'RNA 12', 'RNA EOT')]
s, p = stats.friedmanchisquare(*rnaSeries)

print('test statistics: {0:.4f}'.format(s))
print('p-value: {0:.4f}'.format(p))

In [None]:
# Nemenyi post-hoc test following the Friedman test.
# Only the measurement columns are passed: a later cell in this notebook adds a
# 'Patient' column to hcvRNA, and on a re-run that column must not be treated
# as an additional measurement.
post_hoc_results = posthocs.posthoc_nemenyi_friedman(hcvRNA.loc[:, varNames], y_col=varNames)
post_hoc_results.loc[varNames, varNames]

In [None]:
# Box plots of all four RNA measurements; the y-axis is capped at 1.5 million.
fig, ax = plt.subplots(figsize=(7, 5))
_ = hcvRNA.boxplot(column=varNames, ax=ax, grid=False)
_ = ax.set_ylim(top=15 * 10**5)

In [None]:
# Reshape the wide RNA table to long format: one row per (Patient, Timepoint).
# The patient ID is taken from the index of a derived frame instead of being
# written into hcvRNA, so hcvRNA keeps holding only the measurement columns and
# the cells that use it stay re-runnable.
hcvRNAmelted = (
    hcvRNA.loc[:, varNames]
    .rename_axis('Patient')
    .reset_index()
    .melt(id_vars='Patient', value_vars=varNames, var_name='Timepoint', value_name='RNA')
)
hcvRNAmelted

In [None]:
# Plot the RNA trajectories of a small random subset of patients.
# A seeded generator keeps the selection (and therefore the figure) the same
# on every "Restart & Run All".
N_PATIENTS_TO_PLOT = 10
rng = np.random.default_rng(42)

patientIDs = np.unique(hcvRNAmelted['Patient'])
selectedPatientIds = rng.choice(patientIDs, size=N_PATIENTS_TO_PLOT, replace=False)

# Vectorised membership test (also avoids shadowing the built-in `id`).
selectedRows = hcvRNAmelted['Patient'].isin(selectedPatientIds)

hcvRNAmelted_some_patients = hcvRNAmelted.loc[selectedRows, :]

ax = sns.pointplot(data=hcvRNAmelted_some_patients, x='Timepoint', y='RNA', hue='Patient', palette='colorblind')

# Hide the per-patient legend. The legend is removed after plotting, and only
# if one exists, so the cell does not fail when no legend was created.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

**Would the conclusion change if we applied ANOVA for repeated measurements?**

In [None]:
# Repeated-measures ANOVA: RNA as the dependent variable, Timepoint as the
# within-subject factor, Patient as the subject identifier.
anova_rm_rna = AnovaRM(
    hcvRNAmelted,
    depvar='RNA',
    subject='Patient',
    within=['Timepoint'],
)
anova_rm_rna_res = anova_rm_rna.fit()

print(anova_rm_rna_res)

In [None]:
# Tukey HSD pairwise comparison of RNA between timepoints.
post_hoc_res = pairwise_tukeyhsd(
    hcvRNAmelted['RNA'],
    groups=hcvRNAmelted['Timepoint'],
)
post_hoc_res.summary()

# Correlations

**Check if laboratory results at different timepoints are correlated**

TODO: choose only some of them!!!

In [None]:
# Laboratory variables of the German dataset used for the correlation analysis.
varNames = ['ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']
hcvLab = hcvDataG[varNames]

In [None]:
# Spearman rank-correlation matrix of the laboratory variables.
spearmanCorr = hcvLab.corr(method='spearman')
spearmanCorr

In [None]:
# Pearson correlation matrix of the laboratory variables.
pearsonCorr = hcvLab.corr(method='pearson')
pearsonCorr

In [None]:
# Display the laboratory subset used for the correlation analysis.
hcvLab

**If you want also p-values, you need scipy**

In [None]:
# Spearman correlations with p-values; missing values are omitted.
# `r` holds the coefficients and `p` the p-values, both shown as labelled tables.
r, p = stats.spearmanr(hcvLab, nan_policy='omit')

for matrix in (r, p):
    display(pd.DataFrame(matrix, index=varNames, columns=varNames))

But the same doesn't work with Pearson correlation...

In [None]:
# Pairwise Pearson correlations with p-values.
# pearsonr is called once per pair of variables, on the rows where both
# variables are present (pairwise deletion of missing values).
#
# The coefficient matrix starts as an identity matrix so the diagonal reads
# r = 1 (a variable against itself) instead of the misleading 0 it had before;
# the p-value diagonal stays 0.
r = pd.DataFrame(np.eye(len(varNames)), columns=varNames, index=varNames)
p = pd.DataFrame(np.zeros([len(varNames), len(varNames)]), columns=varNames, index=varNames)

for name1, name2 in itertools.combinations(varNames, r=2):
    hcvLabCurrent = hcvLab.loc[:, [name1, name2]].dropna()
    cr, cp = stats.pearsonr(hcvLabCurrent.loc[:, name1], hcvLabCurrent.loc[:, name2])
    # Fill both triangles: the matrices are symmetric.
    r.loc[name1, name2] = r.loc[name2, name1] = cr
    p.loc[name1, name2] = p.loc[name2, name1] = cp

display(r)

display(p)

Let's visualize the correlation between CHOL and PROT

In [None]:
# Scatter plot of PROT against CHOL.
hcvLab.plot(kind='scatter', x='CHOL', y='PROT')

Let's look at all correlations at once

In [None]:
# Scatter-plot matrix of all laboratory variables; assigning to `_` hides the returned object's repr.
_ = sns.pairplot(hcvLab)

In [None]:
# Print the built-in documentation of sns.pairplot to look up its options.
help(sns.pairplot)

Repeat this type of analysis for viral RNA variables in Egyptian dataset. Don't forget the plots.

Can you tell what is wrong in that analysis?

# Statistical tests for nominal variables

**Using Egyptian dataset, check if Nausea/Vomiting is equally common in patients with and without Epigastric pain**

In [None]:
# Show the column index of the Egyptian dataset to look up the exact variable names.
hcvDataE.columns

In [None]:
# Contingency table: rows = Epigastric pain, columns = Nausea/Vomiting.
table1 = pd.crosstab(
    index=hcvDataE['Epigastric pain'],
    columns=hcvDataE['Nausea/Vomiting'],
)
table1

In [None]:
# Row percentages: every row of the table sums to 100.
table1_percent_rows = table1.div(table1.sum(axis=1), axis=0) * 100
table1_percent_rows

In [None]:
# Column percentages: every column of the table sums to 100.
table1_percent_cols = table1.div(table1.sum(axis=0), axis=1) * 100
table1_percent_cols

In [None]:
# Chi-square test and Fisher's exact test on the same 2x2 table.
# Prints the chi-square p-value, then Fisher's p-value, then a blank line.
s, p, _, _ = stats.chi2_contingency(table1)
OR, pf = stats.fisher_exact(table1)
print(p, pf, '', sep='\n')

**Now, check if presence of Epigastric pain is related to Baseline histological staging**

In [None]:
# Contingency table: rows = Baseline histological staging, columns = Epigastric pain,
# followed by a chi-square test of independence (p-value printed, then a blank line).
table2 = pd.crosstab(
    index=hcvDataE['Baseline histological staging'],
    columns=hcvDataE['Epigastric pain'],
)
display(table2)
s, p, _, _ = stats.chi2_contingency(table2)
print(p, end='\n\n')

In [None]:
# Row percentages: every row of the table sums to 100.
table2_percent_rows = table2.div(table2.sum(axis=1), axis=0) * 100
table2_percent_rows

In [None]:
# Column percentages: every column of the table sums to 100.
table2_percent_cols = table2.div(table2.sum(axis=0), axis=1) * 100
table2_percent_cols

# Multiple testing

In [None]:
# Load the rats dataset: semicolon-separated, two header rows (giving two-level
# column labels), and the first column used as the row index.
ratsData = pd.read_csv(
    'rats_ngseminar.csv',
    sep=';',
    header=[0, 1],
    index_col=0,
)

In [None]:
# Show the loaded rats dataset.
display(ratsData)

In [None]:
# Group the COLUMNS (axis=1) by the 'Group' level of the column labels; later
# cells fetch the per-group column blocks from this object with get_group().
# NOTE(review): groupby(..., axis=1) appears to be deprecated in recent pandas
# releases — confirm against the pandas version used for the seminar.
ratsDataGrouped = ratsData.groupby(level=['Group'], axis=1)
# Apply np.mean with axis=1 to every column group — presumably yielding one
# mean per row and group; TODO confirm against the displayed result.
ratsDataResults = ratsDataGrouped.apply(np.mean, axis=1) 
display(ratsDataResults)

In [None]:
# One-way ANOVA per row across the four experimental groups.
# Each variable holds the column block of the group it is named after
# (previously `het` and `ko` were assigned from each other's group).
healthy = ratsDataGrouped.get_group('healthy')
het = ratsDataGrouped.get_group('het')
ko = ratsDataGrouped.get_group('ko')
sarcoma = ratsDataGrouped.get_group('sarcoma')

# axis=1 runs the test along the columns, i.e. one F-test per row.
# The public `stats.f_oneway` is called instead of the deprecated
# `stats.stats` namespace.
s, p = stats.f_oneway(healthy, ko, het, sarcoma, axis=1)

ratsDataResults.loc[:, 'p'] = p

display(ratsDataResults.sort_values(by='p'))

**How many p-values below 0.05 do we have?**

In [None]:
# Count the uncorrected p-values below 0.05 and express them as a share of all rows.
sigP = ratsDataResults.loc[:, 'p'].lt(0.05).sum()
sigP_percent = sigP / len(ratsDataResults.index) * 100

print('Number of p-values below 0.05: {0:d}'.format(sigP))
print('It is {0:.1f}% of all genes.'.format(sigP_percent))

**Let's apply FDR and FWER correction**

In [None]:
# Benjamini-Hochberg FDR correction; element 1 of the result is stored as the 'FDR' column.
fdr = multipletests(
    ratsDataResults.loc[:, 'p'],
    alpha=0.05,
    method='fdr_bh',
)
ratsDataResults['FDR'] = fdr[1]

In [None]:
# Bonferroni (FWER) correction; element 1 of the result is stored as the 'FWER' column.
fwer = multipletests(
    ratsDataResults.loc[:, 'p'],
    alpha=0.05,
    method='bonferroni',
)
ratsDataResults['FWER'] = fwer[1]

In [None]:
# Count how many corrected p-values stay below 0.05 for each correction method.
totalGenes = len(ratsDataResults.index)

sigFDR = ratsDataResults.loc[:, 'FDR'].lt(0.05).sum()
sigFDR_percent = sigFDR / totalGenes * 100

print('Number of FDR-corrected p-values below 0.05: {0:d}'.format(sigFDR))
print('It is {0:.1f}% of all genes.'.format(sigFDR_percent))

sigFWER = ratsDataResults.loc[:, 'FWER'].lt(0.05).sum()
sigFWER_percent = sigFWER / totalGenes * 100

print()
print('Number of FWER-corrected p-values below 0.05: {0:d}'.format(sigFWER))
print('It is {0:.1f}% of all genes.'.format(sigFWER_percent))