# Welch T test

see https://en.wikipedia.org/wiki/Welch%27s_t-test

- (H0) the configurations with yes values and the configs with other values have the same linux kernel size average
- (HA) There is a significant difference between the averages

Accept H0 => this feature should not have any influence on the kernel size
Rejecting H0 => this feature could change the kernel size

### Imports

In [1]:
import pandas as pd
import numpy as np
import tuxml
import scipy.stats

In [2]:
df = tuxml.load_dataset()

### Test if the average size of the linux kernel is bigger with or without yes values for a given feature

In [3]:
name_feature = "CC_OPTIMIZE_FOR_SIZE"
vmlinux_values =  df['vmlinux']
alpha = 0.05

def compute_wtest(name_feature, alpha = 0.05):
    
    # input:
    # the name of the feature
    # output:
    # True if the average of vmlinux with yes values of the feature is significatively 
    # different of the average with no/module values
    # False otherwise
    
    feature_values = df[name_feature]
    
    yes_values = vmlinux_values[feature_values==1]
    no_mod_values = vmlinux_values[feature_values!=1]

    return scipy.stats.ttest_ind(yes_values, no_mod_values)[1] < alpha

def compute_advanced_wtest(name_feature, alpha = 0.05):
    
    # input:
    # the name of the feature
    # output:
    # 2 booleans: first
    # True if the average of vmlinux with yes values of the feature is significatively 
    # different of the average with no/module values
    # False otherwise
    # second:
    # true if yes values have a bigger size than others values
    # else false
    
    feature_values = df[name_feature]
    
    yes_values = vmlinux_values[feature_values==1]
    no_mod_values = vmlinux_values[feature_values!=1]

    yes_avg = np.mean(yes_values)
    no_mod_avg = np.mean(no_mod_values)
    
    return (scipy.stats.ttest_ind(yes_values, no_mod_values)[1] < alpha, yes_avg > no_mod_avg)

print("For", name_feature, 
      ", the average of vmlinux with yes values is significatively (", alpha*100, 
      "% error) different of the average with no/module values :",
      compute_wtest(name_feature, alpha))

print("For", name_feature, 
      " the average of vmlinux with yes values is significatively (", alpha*100, 
      "% error) superior to the average with no/module values :",
      compute_advanced_wtest(name_feature)[0] and compute_advanced_wtest(name_feature)[1])


For CC_OPTIMIZE_FOR_SIZE , the average of vmlinux with yes values is significatively ( 5.0 % error) different of the average with no/module values : True
For CC_OPTIMIZE_FOR_SIZE  the average of vmlinux with yes values is significatively ( 5.0 % error) superior to the average with no/module values : False


if the test rejects

In [4]:
quanti = ["vmlinux", "GZIP-bzImage", "GZIP-vmlinux", "GZIP", "BZIP2-bzImage", 
              "BZIP2-vmlinux", "BZIP2", "LZMA-bzImage", "LZMA-vmlinux", "LZMA", "XZ-bzImage", "XZ-vmlinux", "XZ", 
              "LZO-bzImage", "LZO-vmlinux", "LZO", "LZ4-bzImage", "LZ4-vmlinux", "LZ4", 
         'cid', 'nbno', 'nbyes', 'nbmodule', 'nbyesmodule']

features = df.columns

alpha = 1e-120 # we choose a very small value of alpha so we get a small sample of features
# with 0.05, we keep more than 8000 configurations

different = [] # are the average different?
supe = [] # do the yes values have a bigger vmlinux than the no/module

for f in features:
    if f not in quanti:
        d, s = compute_advanced_wtest(f, alpha)
        different.append(d)
        supe.append(s)
    else:
        different.append('na')
        supe.append('na')

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [5]:
res = pd.DataFrame({'features' : features, "influent_feature" : different, "yes_bigger_size" : supe})
len(np.where(res["influent_feature"]==True)[0])

1388

In [6]:
sample = res[res["influent_feature"]==True]
sample[['features', 'yes_bigger_size']].to_csv("welch_test_output.csv")
sample

Unnamed: 0,features,influent_feature,yes_bigger_size
1,OPENVSWITCH,True,True
7,NFC_HCI,True,True
11,NET_MPLS_GSO,True,True
19,NFC_MEI_PHY,True,True
20,INTEL_MEI,True,True
25,NFC_SIM,True,True
32,VSOCKETS,True,True
36,VMWARE_VMCI_VSOCKETS,True,True
41,VMWARE_VMCI,True,True
45,VIRTIO_VSOCKETS,True,True
