In [26]:
import pandas as pd
from scipy.stats import ttest_ind, pearsonr

In [27]:
# Load the dataset from its Excel workbook into a DataFrame.
xlsx_path = 'NewCorr/2g_new.xlsx'
df = pd.read_excel(xlsx_path)

In [28]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Location,Infections,deaths,mask,Masks,Total Floor Space,Residential Settlement (% of area of floor space),Industrial and Commercial Settlement (%),"Sports, leisure, recreation (%)",Other (%),...,Water Bodies (%),Total Population,Female,Male,(Female) % of Population,(Male) % of Population,(Female) % of Foreigner_s Population,(Male) % of Foreigner_s Population,% of Total Population,Inhabitants per km2
0,Ahrweiler,19256,38,No,0,78703,3.206993,0.998691,1.644156,1.288388,...,1.617473,130479,66135,64344,50.686317,49.313683,46.391909,53.608091,10.079017,165.788477
1,Altenkirchen,23304,38,No,0,64238,6.149008,1.62832,0.899779,1.460195,...,0.891995,129087,65144,63943,50.46519,49.53481,46.036202,53.963798,8.00313,200.949965
2,Alzey-Worms,26284,64,No,0,58807,3.45197,1.226045,1.81611,1.947047,...,1.591647,130715,65751,64964,50.301037,49.698963,45.440613,54.559387,9.983552,222.278045
3,Bad Dürkheim,23045,43,No,0,59464,3.713171,1.072918,2.097067,1.439526,...,0.662586,133004,67901,65103,51.051848,48.948152,47.748691,52.251309,8.616282,223.672048
4,Bad Kreuznach,36662,83,No,0,86389,3.283983,1.3254,1.862506,1.842827,...,0.971188,158746,81197,77549,51.149005,48.850995,48.61389,51.38611,11.02075,183.756662


In [29]:
# Plain Python list of the DataFrame's column labels, in their original order.
columns = df.columns.tolist()

In [30]:
# Display the full list of column labels.
columns

['Location',
 'Infections',
 'deaths',
 'mask',
 'Masks',
 'Total Floor Space',
 'Residential Settlement (% of area of floor space)',
 'Industrial and Commercial Settlement (%)',
 'Sports, leisure, recreation (%)',
 'Other (%)',
 'Traffic/Transport (%)',
 'Agriculture  (%)',
 'Forest (%)',
 'Others (%)',
 'Water Bodies (%)',
 'Total Population',
 'Female',
 'Male',
 '(Female) % of Population',
 '(Male) % of Population',
 '(Female) % of Foreigner_s Population',
 '(Male) % of Foreigner_s Population',
 '% of Total Population',
 'Inhabitants per km2']

In [31]:
# Disregard the Location and mask columns as they are non-numeric
# the mask column has also been stored as a boolean value in the Masks column
#
# The list is rebuilt by filtering rather than calling list.remove(): remove()
# raises ValueError when the name is already gone, so the cell would fail if it
# were executed a second time. Filtering makes the cell safe to re-run.

non_numeric_columns = ("Location", "mask")
columns = [name for name in columns if name not in non_numeric_columns]

In [32]:
# Display the column labels that remain for the pairwise tests.
columns

['Infections',
 'deaths',
 'Masks',
 'Total Floor Space',
 'Residential Settlement (% of area of floor space)',
 'Industrial and Commercial Settlement (%)',
 'Sports, leisure, recreation (%)',
 'Other (%)',
 'Traffic/Transport (%)',
 'Agriculture  (%)',
 'Forest (%)',
 'Others (%)',
 'Water Bodies (%)',
 'Total Population',
 'Female',
 'Male',
 '(Female) % of Population',
 '(Male) % of Population',
 '(Female) % of Foreigner_s Population',
 '(Male) % of Foreigner_s Population',
 '% of Total Population',
 'Inhabitants per km2']

In [33]:
# Result WITH basic interpretations
#
# For every unordered pair of columns in `columns`, this cell prints the result
# of an independent two-sample t-test (ttest_ind) and a Pearson correlation
# (pearsonr), followed by a short reading of each value against the 0.05
# significance level and the 0.3 / 0.7 correlation-strength thresholds.

print("______" * 10)
print("\n")
print("Dataset: 2g")
print("\n")
print("______" * 10)
print("\n")

# t-test and r-test for each pair of columns
# we iterate through every column and test the hypothesis against each pair

for i in range(len(columns)):
    for j in range(i + 1, len(columns)):
        col1 = columns[i]
        col2 = columns[j]
        print(f"t-test and r-test for {col1} and {col2}")
        print("****" * 10)

        # Calculating the t-statistic and p-value for it
        t, p = ttest_ind(df[col1], df[col2])

        # Calculating the r-value and its significant p-value
        r, p_r = pearsonr(df[col1], df[col2])

        # The p-values have been considered for up to 5 decimal places as we
        # need it to interpret it against the level of significance (0.05)

        print(f"t-statistic: {t:.2f}")
        print(f"p-value: {p:.5f}")
        print(f"r-value: {r:.2f}")
        print(f"p-value (r-test): {p_r:.5f}")

        #Adding basic interpretations for each variable
        print("\nInterpretations:")
        print("......" * 10)

        # A nan statistic compares False against every threshold, so without
        # the explicit checks below it would fall through to the "else"
        # branches and be reported as "not significant" / "weak or no
        # correlation". pearsonr yields nan for the 'Masks' column in this
        # dataset (see the printed r-value), so undefined results are reported
        # as undefined instead.

        # 1. Significance of the difference in means (t-test)
        if pd.isna(p):
            print("1. p-value (t) is undefined (nan) :- the t-test could not be computed for these two groups.")
        elif p < 0.05:
            print(f"1. p-value (t) {p:.5f} < 0.05 :- there is a statistically significant difference between the two groups.")
        else:
            print(f"1. p-value (t) {p:.5f} > 0.05 :- there is not a statistically significant difference between the two groups.")

        # 2. Strength of the correlation; thresholds apply to |r|, the printed
        #    value keeps its sign
        if pd.isna(r):
            print("2. r-value is undefined (nan) :- the correlation could not be computed (e.g. one of the two columns is constant).")
        elif abs(r) >= 0.7:
            print(f"2. {r:.2f} >= 0.7 :- strong positive or negative correlation between the two variables.")
        elif abs(r) >= 0.3:
            print(f"2. {r:.2f} >= 0.3 :- moderate positive or negative correlation between the two variables.")
        else:
            print(f"2. {r:.2f} < 0.3 :- weak or no correlation between the two variables.")

        # 3. Significance of the correlation
        if pd.isna(p_r):
            print("3. p-value (r) is undefined (nan) :- the significance of the correlation cannot be assessed.")
        elif p_r < 0.05:
            print(f"3. p-value (r) {p_r:.5f} < 0.05 :- there is a statistically significant correlation between the two variables.")
        else:
            print(f"3. p-value (r) {p_r:.5f} > 0.05 :-there is not a statistically significant correlation between the two variables.")
        print("\n")

____________________________________________________________


Dataset: 2g


____________________________________________________________


t-test and r-test for Infections and deaths
****************************************
t-statistic: 12.44
p-value: 0.00000
r-value: 0.34
p-value (r-test): 0.04541

Interpretations:
............................................................
1. p-value (t) 0.00000 < 0.05 :- there is a statistically significant difference between the two groups.
2. 0.34 >= 0.3 :- moderate positive or negative correlation between the two variables.
3. p-value (r) 0.04541 < 0.05 :- there is a statistically significant correlation between the two variables.


t-test and r-test for Infections and Masks
****************************************
t-statistic: 12.46
p-value: 0.00000
r-value: nan
p-value (r-test): nan

Interpretations:
............................................................
1. p-value (t) 0.00000 < 0.05 :- there is a statistically significant difference be



t-statistic: -9.02
p-value: 0.00000
r-value: 0.91
p-value (r-test): 0.00000

Interpretations:
............................................................
1. p-value (t) 0.00000 < 0.05 :- there is a statistically significant difference between the two groups.
2. 0.91 >= 0.7 :- strong positive or negative correlation between the two variables.
3. p-value (r) 0.00000 < 0.05 :- there is a statistically significant correlation between the two variables.


t-test and r-test for Other (%) and Agriculture  (%)
****************************************
t-statistic: -16.06
p-value: 0.00000
r-value: -0.29
p-value (r-test): 0.08902

Interpretations:
............................................................
1. p-value (t) 0.00000 < 0.05 :- there is a statistically significant difference between the two groups.
2. -0.29 < 0.3 :- weak or no correlation between the two variables.
3. p-value (r) 0.08902 > 0.05 :-there is not a statistically significant correlation between the two variables.


t-test

In [34]:
# Result WITHOUT basic interpretation
#
# For every unordered pair of columns in `columns`, print the raw t-test and
# Pearson correlation numbers only.

banner = "______" * 10
print(banner, "\n", "Dataset: 2g", "\n", banner, "\n", sep="\n")

# Each column is paired only with the columns that come after it in the list,
# so every unordered pair is visited exactly once.
for i, col1 in enumerate(columns):
    for col2 in columns[i + 1:]:
        print(f"t-test and r-test for {col1} and {col2}", "****" * 10, sep="\n")

        # Independent two-sample t-test: statistic and its p-value
        t, p = ttest_ind(df[col1], df[col2])

        # Pearson correlation coefficient and its p-value
        r, p_r = pearsonr(df[col1], df[col2])

        # p-values are printed to 5 decimal places so they can be read
        # against the 0.05 level of significance
        print(
            f"t-statistic: {t:.2f}",
            f"p-value: {p:.5f}",
            f"r-value: {r:.2f}",
            f"p-value (r-test): {p_r:.5f}",
            "\n",
            sep="\n",
        )

____________________________________________________________


Dataset: 2g


____________________________________________________________


t-test and r-test for Infections and deaths
****************************************
t-statistic: 12.44
p-value: 0.00000
r-value: 0.34
p-value (r-test): 0.04541


t-test and r-test for Infections and Masks
****************************************
t-statistic: 12.46
p-value: 0.00000
r-value: nan
p-value (r-test): nan


t-test and r-test for Infections and Total Floor Space
****************************************
t-statistic: -4.89
p-value: 0.00001
r-value: 0.29
p-value (r-test): 0.08821


t-test and r-test for Infections and Residential Settlement (% of area of floor space)
****************************************
t-statistic: 12.45
p-value: 0.00000
r-value: -0.04
p-value (r-test): 0.83027


t-test and r-test for Infections and Industrial and Commercial Settlement (%)
****************************************
t-statistic: 12.46
p-value: 0.00000
r-va

t-statistic: -192.89
p-value: 0.00000
r-value: -0.05
p-value (r-test): 0.78679


t-test and r-test for Others (%) and (Male) % of Foreigner_s Population
****************************************
t-statistic: -218.14
p-value: 0.00000
r-value: 0.05
p-value (r-test): 0.78679


t-test and r-test for Others (%) and % of Total Population
****************************************
t-statistic: -12.35
p-value: 0.00000
r-value: 0.13
p-value (r-test): 0.43549


t-test and r-test for Others (%) and Inhabitants per km2
****************************************
t-statistic: -5.18
p-value: 0.00000
r-value: 0.07
p-value (r-test): 0.67684


t-test and r-test for Water Bodies (%) and Total Population
****************************************
t-statistic: -13.24
p-value: 0.00000
r-value: 0.11
p-value (r-test): 0.53595


t-test and r-test for Water Bodies (%) and Female
****************************************
t-statistic: -13.21
p-value: 0.00000
r-value: 0.11
p-value (r-test): 0.52905


t-test and r-test for