# Mann Whitney U test

In [9]:
# import modules
import pandas as pd
import numpy as np
import os

from scipy.stats import mannwhitneyu

This notebook contains the code to perform the Mann-Whitney U test on the energy consumption measurements. We start with the Adult set, after which we proceed to the Student Performance set. All cells should have sufficient comments and documentation to be understood; if not, feel free to reach out!

## Adult data set

### First for $k$-anonymity

In [10]:
# Settings shared with the cells further down: the ML models whose energy logs
# are read, the k values used for k-anonymity, and the data set being analysed.
models = ['knn','logreg','nn']
ks = [3,10,27]
data = 'Adult'

# Read every model's energy log exactly once -- the file does not depend on k,
# so there is no need to re-read it for each k value.
logs = {}
for model in models:
    log = pd.read_csv('Energy/Energy_' + str(model) + '.csv')
    log.columns = ['label','timestamp','duration','pkg','dram','socket']
    logs[model] = log

# energy[k] pools, over all models, the scaled pkg + dram totals of the rows
# whose label is '<data>_<k>'. `values` keeps the original per-(k, model) lists.
energy = {}
values = []
for k in ks:
    energy[k] = []
    for model in models:
        log = logs[model]
        rows = log.loc[log['label'] == str(data) + '_' + str(k)]

        # Build new Series instead of scaling columns of the filtered frame in
        # place, which is the pattern pandas warns about (SettingWithCopy).
        # NOTE(review): the 10**-6 factor presumably converts microjoules to
        # joules -- confirm against the tool that produced the logs.
        total = rows['pkg'] * (10**-6) + rows['dram'] * (10**-6)

        values.append(total.to_list())
        energy[k].extend(total.to_list())

# combine ML methods in one list per k (looked up by k, not by list position)
k_3 = energy[3]
k_10 = energy[10]
k_27 = energy[27]

# print lengths and sum to check
print(len(k_3), len(k_10), len(k_27))
print(sum(k_3), sum(k_10), sum(k_27))

30 30 30
2423.1678420000003 2162.003201 1827.5290659999996


### Then for synthetic data

In [11]:
# Per-model lists of scaled pkg + dram totals for the synthetic runs of the
# data set currently selected in `data`.
values = []

for model in models:
    log = pd.read_csv('../Synthetic_data/Energy/Energy_' + str(model) + '.csv')
    log.columns = ['label','timestamp','duration','pkg','dram','socket']

    # synthetic runs are labelled with the bare data set name (no '_<k>' suffix)
    rows = log.loc[log['label'] == str(data)]

    # Build new Series instead of scaling columns of the filtered frame in
    # place, which is the pattern pandas warns about (SettingWithCopy).
    # NOTE(review): the 10**-6 factor presumably converts microjoules to
    # joules -- confirm against the tool that produced the logs.
    total = rows['pkg'] * (10**-6) + rows['dram'] * (10**-6)
    values.append(total.to_list())

# combine ML methods in one list (works for any number of models)
synth = [value for per_model in values for value in per_model]

# print length and sum to check
print(len(synth), sum(synth))

60 10896.045939


### Finally, the Mann-Whitney U test itself for the Adult set

In [21]:
# Guard against stale kernel state: k_3, k_10, k_27 and synth are reused (and
# overwritten) by the Student Performance cells further down, so running this
# cell after them would silently test the wrong data set.
# NOTE(review): the saved execution counts (this cell: 21, Student test: 20)
# and the identical saved outputs of both test cells suggest that is what
# happened -- re-run with "Restart Kernel -> Run All" before using the numbers.
# The guard catches the common case only; it cannot detect a stale `synth` on
# its own.
if data != 'Adult':
    raise RuntimeError(
        f"Expected the Adult samples but `data` is {data!r}; "
        "re-run the Adult loading cells before this cell."
    )

# One-sided tests: alternative="greater" asks whether the first sample is
# stochastically greater than the second. method="exact" compares U against
# its exact null distribution (no tie correction is applied).
k3_vs_k10 = mannwhitneyu(k_3, k_10, method="exact", alternative="greater")
k3_vs_k27 = mannwhitneyu(k_3, k_27, method="exact", alternative="greater")
k3_vs_synth = mannwhitneyu(k_3, synth, method="exact", alternative="greater")

k10_vs_k27 = mannwhitneyu(k_10, k_27, method="exact", alternative="greater")
k10_vs_synth = mannwhitneyu(k_10, synth, method="exact", alternative="greater")

k27_vs_synth = mannwhitneyu(k_27, synth, method="exact", alternative="greater")

# Report the results grouped by the first sample of each comparison.
print("\nk=3 vs 10, 27, synth")
print(k3_vs_k10)
print(k3_vs_k27)
print(k3_vs_synth)

print("\nk=10 vs 27, synth")
print(k10_vs_k27)
print(k10_vs_synth)

print("\nk=27 vs synth")
print(k27_vs_synth)


k=3 vs 10, 27, synth
MannwhitneyuResult(statistic=648.0, pvalue=0.001517341923053091)
MannwhitneyuResult(statistic=662.0, pvalue=0.0007216102699375835)
MannwhitneyuResult(statistic=413.0, pvalue=0.9999917646288415)

k=10 vs 27, synth
MannwhitneyuResult(statistic=609.0, pvalue=0.009165849344885305)
MannwhitneyuResult(statistic=402.0, pvalue=0.9999949254716267)

k=27 vs synth
MannwhitneyuResult(statistic=402.0, pvalue=0.9999949254716267)


## Student Performance set

### First for $k$-anonymity

In [13]:
# Switch the analysis to the Student Performance data set. `models` and `ks`
# are reused from the Adult section.
data = 'Student'

# Read every model's energy log exactly once -- the file does not depend on k,
# so there is no need to re-read it for each k value.
logs = {}
for model in models:
    log = pd.read_csv('Energy/Energy_' + str(model) + '.csv')
    log.columns = ['label','timestamp','duration','pkg','dram','socket']
    logs[model] = log

# energy[k] pools, over all models, the scaled pkg + dram totals of the rows
# whose label is '<data>_<k>'. `values` keeps the original per-(k, model) lists.
energy = {}
values = []
for k in ks:
    energy[k] = []
    for model in models:
        log = logs[model]
        rows = log.loc[log['label'] == str(data) + '_' + str(k)]

        # Build new Series instead of scaling columns of the filtered frame in
        # place, which is the pattern pandas warns about (SettingWithCopy).
        # NOTE(review): the 10**-6 factor presumably converts microjoules to
        # joules -- confirm against the tool that produced the logs.
        total = rows['pkg'] * (10**-6) + rows['dram'] * (10**-6)

        values.append(total.to_list())
        energy[k].extend(total.to_list())

# combine ML methods in one list per k (looked up by k, not by list position)
k_3 = energy[3]
k_10 = energy[10]
k_27 = energy[27]

# print lengths and sum to check
print(len(k_3), len(k_10), len(k_27))
print(sum(k_3), sum(k_10), sum(k_27))

30 30 30
487.68262599999997 459.49464299999994 464.17840199999995


### Then for synthetic data

In [14]:
# Per-model lists of scaled pkg + dram totals for the synthetic runs of the
# data set currently selected in `data`.
values = []

for model in models:
    log = pd.read_csv('../Synthetic_data/Energy/Energy_' + str(model) + '.csv')
    log.columns = ['label','timestamp','duration','pkg','dram','socket']

    # synthetic runs are labelled with the bare data set name (no '_<k>' suffix)
    rows = log.loc[log['label'] == str(data)]

    # Build new Series instead of scaling columns of the filtered frame in
    # place, which is the pattern pandas warns about (SettingWithCopy).
    # NOTE(review): the 10**-6 factor presumably converts microjoules to
    # joules -- confirm against the tool that produced the logs.
    total = rows['pkg'] * (10**-6) + rows['dram'] * (10**-6)
    values.append(total.to_list())

# combine ML methods in one list (works for any number of models)
synth = [value for per_model in values for value in per_model]

# print length and sum to check
print(len(synth), sum(synth))

60 1291.451949


### Finally, the Mann-Whitney U test itself for the Student Performance set

In [20]:
# Guard against stale kernel state: k_3, k_10, k_27 and synth are also used by
# the Adult section above, so this cell must only run after the Student
# loading cells have rebuilt them.
# NOTE(review): the saved output of this cell is identical to the saved output
# of the Adult test cell, so at least one of the two is stale -- re-run with
# "Restart Kernel -> Run All" before using the numbers.
# The guard catches the common case only; it cannot detect a stale `synth` on
# its own.
if data != 'Student':
    raise RuntimeError(
        f"Expected the Student samples but `data` is {data!r}; "
        "re-run the Student loading cells before this cell."
    )

# One-sided tests: alternative="greater" asks whether the first sample is
# stochastically greater than the second. method="exact" compares U against
# its exact null distribution (no tie correction is applied).
k3_vs_k10 = mannwhitneyu(k_3, k_10, method="exact", alternative="greater")
k3_vs_k27 = mannwhitneyu(k_3, k_27, method="exact", alternative="greater")
k3_vs_synth = mannwhitneyu(k_3, synth, method="exact", alternative="greater")

k10_vs_k27 = mannwhitneyu(k_10, k_27, method="exact", alternative="greater")
k10_vs_synth = mannwhitneyu(k_10, synth, method="exact", alternative="greater")

k27_vs_synth = mannwhitneyu(k_27, synth, method="exact", alternative="greater")

# Report the results grouped by the first sample of each comparison.
print("\nk=3 vs 10, 27, synth")
print(k3_vs_k10)
print(k3_vs_k27)
print(k3_vs_synth)

print("\nk=10 vs 27, synth")
print(k10_vs_k27)
print(k10_vs_synth)

print("\nk=27 vs synth")
print(k27_vs_synth)


k=3 vs 10, 27, synth
MannwhitneyuResult(statistic=648.0, pvalue=0.001517341923053091)
MannwhitneyuResult(statistic=662.0, pvalue=0.0007216102699375835)
MannwhitneyuResult(statistic=413.0, pvalue=0.9999917646288415)

k=10 vs 27, synth
MannwhitneyuResult(statistic=609.0, pvalue=0.009165849344885305)
MannwhitneyuResult(statistic=402.0, pvalue=0.9999949254716267)

k=27 vs synth
MannwhitneyuResult(statistic=402.0, pvalue=0.9999949254716267)
