# Compute correlation between variables
Date: 28/02/2023

In [14]:
import pandas as pd # for data manipulation 
import numpy as np
import os, sys, glob, math, pickle
import scipy.stats as stats
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn import preprocessing

# This function helps to calculate probability distribution, which goes into BBN (note, can handle up to 2 parents)
def cpt_probs(df, child, parents):
    try:
        # dependencies_arr = [pd.Categorical(df[parent],categories=df[parent].cat.categories.tolist()) for parent in parents]
        dependencies_arr = [df[parent] for parent in parents]
        # cpt = pd.crosstab(dependencies_arr, df[child], rownames=parents, colnames=[child], margins=False, normalize='index', dropna=False).sort_index().to_numpy().reshape(-1).tolist()
        cpt = pd.crosstab(dependencies_arr, df[child], rownames=parents, colnames=[child], margins=False, normalize='index', dropna=False).sort_index()
        return cpt
    except Exception as err:
        print(err)
        return None 

def cpt_probs_freq(df, child, parents):
    try:
        # dependencies_arr = [pd.Categorical(df[parent],categories=df[parent].cat.categories.tolist()) for parent in parents]
        dependencies_arr = [df[parent] for parent in parents]
        # cpt = pd.crosstab(dependencies_arr, df[child], rownames=parents, colnames=[child], margins=False, normalize='index', dropna=False).sort_index().to_numpy().reshape(-1).tolist()
        cpt = pd.crosstab(dependencies_arr, df[child], rownames=parents, colnames=[child], margins=False, dropna=False).sort_index()
        return cpt
    except Exception as err:
        print(err)
        return None 

# Load classes_df for later parts (if previous part not run)
classes_df = pd.read_hdf("/media/research-student/One Touch/FANET Datasets/Dataset_NP10000_BPSK_6-5Mbps/classes_df_downlink.h5", 'Downlink')

In [2]:
df = pd.read_hdf("/home/research-student/omnetpp_sim_results/Test_Dataset_BPSK_6-5Mbps/Taguchi_Test_Cases_downlink.h5", "Downlink")
sinr_bins = np.load("/home/research-student/omnet-fanet/cpt/Downlink/sinr_bins_dl.npy")
ber_bins = np.load("/home/research-student/omnet-fanet/cpt/Downlink/ber_bins_dl.npy")
delay_bins = np.load("/home/research-student/omnet-fanet/cpt/Downlink/delay_bins_dl.npy")
queueing_time_bins = np.load("/home/research-student/omnet-fanet/cpt/Downlink/queueing_time_bins_dl.npy")
throughput_bins = np.load("/home/research-student/omnet-fanet/cpt/Downlink/throughput_bins_dl.npy")
jitter_bins = np.load("/home/research-student/omnet-fanet/cpt/Downlink/jitter_bins_dl.npy")
delay_threshold = 0.04

df = df[df['U2G_SINR'].notna()] # Filter out rows with missing crucial information
classes_df = pd.DataFrame() # Created an empty df to store classes data to reduce size of df that need to work with
# First, discretise the values to classes
h_dist_labels = ['vs','s','m','l','vl']
height_labels = ['vs','s','m','l','vl']
num_members_labels = ['vs','s','m','l','vl']
sending_interval_labels = ['vs','s','m','l','vl']
pkt_size_labels = ['vs','s','m','l','vl']
sinr_labels = ['vs','s','m','l','vl']
delay_labels = ['vs','s','m','l','vl']
throughput_labels = ['s','m','l']
queueing_labels = ['s','m','l']
ber_labels = ['vs','s','m','l','vl']
jitter_labels = ['s','m','l']

# Independent vars
classes_df["H_Dist_Class"] = pd.cut(df.U2G_H_Dist, [0,100,200,300,400,501], right=False, include_lowest=True, labels=h_dist_labels)
classes_df["Height_Class"] = pd.cut(df.Height, [1,25,49,73,97,121], right=False, include_lowest=True, labels=height_labels)
classes_df["Num_Members_Class"] = pd.cut(df.Num_Members, [2,8,16,24,32,40], right=False, include_lowest=True, labels=num_members_labels)
classes_df["Sending_Interval_Class"] = pd.cut(df.Mean_Sending_Interval, [40,232,424,616,808,1000], right=False, include_lowest=True, labels=sending_interval_labels)
classes_df["Packet_Size_Class"] = pd.cut(df.Bytes, [24,248,472,696,920,1144], right=False, include_lowest=True, labels=pkt_size_labels)
# Second layer
classes_df["SINR_Class"] = pd.cut(df.U2G_SINR, sinr_bins, right=False, include_lowest=True, labels=sinr_labels)
classes_df["Delay_Class"] = pd.cut(df.Delay, delay_bins, right=False, include_lowest=True, labels=delay_labels)
classes_df["Throughput_Class"] = pd.cut(df.Throughput, throughput_bins, right=False, include_lowest=True, labels=throughput_labels)
classes_df["Queueing_Time_Class"] = pd.cut(df.Queueing_Time, queueing_time_bins, right=False, include_lowest=True, labels=queueing_labels)
classes_df["BER_Class"] = pd.cut(df.U2G_BER, ber_bins, right=False, include_lowest=True, labels=ber_labels)
classes_df["Jitter_Class"] = pd.cut(df.Jitter, jitter_bins, right=False, include_lowest=True, labels=jitter_labels)
# Output vars
classes_df["Reliable"] = (df["Packet_State"] == "Reliable")
classes_df["Delay_Exceeded"] = (df["Delay"] >= delay_threshold)
classes_df["Incorrectly_Received"] = df["Incorrectly_Received"]
classes_df["Queue_Overflow"] = df["Queue_Overflow"]

## Numeric Label Encoding

In [15]:
label_encoder = preprocessing.LabelEncoder()

sinr_label = label_encoder.fit_transform(classes_df["SINR_Class"])
h_dist_label = label_encoder.fit_transform(classes_df["H_Dist_Class"])
height_label = label_encoder.fit_transform(classes_df["Height_Class"])
num_members_label = label_encoder.fit_transform(classes_df["Num_Members_Class"])
sending_interval_label = label_encoder.fit_transform(classes_df["Sending_Interval_Class"])
packet_size_label = label_encoder.fit_transform(classes_df["Packet_Size_Class"])
ber_label = label_encoder.fit_transform(classes_df["BER_Class"])
delay_label = label_encoder.fit_transform(classes_df["Delay_Class"])
incr_rcvd_label = label_encoder.fit_transform(classes_df["Incorrectly_Received"])
delay_excd_label = label_encoder.fit_transform(classes_df["Delay_Exceeded"])
q_overflow_label = label_encoder.fit_transform(classes_df["Queue_Overflow"])
reliability_label = label_encoder.fit_transform(classes_df["Reliable"])

## SINR Parents Correlation

In [4]:
mi_hdist_sinr = mutual_info_classif(h_dist_label.reshape(-1,1), sinr_label.reshape(-1,1))
rho_hdist_sinr = stats.spearmanr(h_dist_label, sinr_label)
print("H_Dist to SINR - MI: {}, Spearman: {}".format(mi_hdist_sinr, rho_hdist_sinr))
mi_height_sinr = mutual_info_classif(height_label.reshape(-1,1), sinr_label.reshape(-1,1))
rho_height_sinr = stats.spearmanr(height_label, sinr_label)
print("Height to SINR - MI: {}, Spearman: {}".format(mi_height_sinr, rho_height_sinr))
mi_num_members_sinr = mutual_info_classif(num_members_label.reshape(-1,1), sinr_label.reshape(-1,1))
rho_num_members_sinr = stats.spearmanr(num_members_label, sinr_label)
print("num_members to SINR - MI: {}, Spearman: {}".format(mi_num_members_sinr, rho_num_members_sinr))
mi_sending_interval_sinr = mutual_info_classif(sending_interval_label.reshape(-1,1), sinr_label.reshape(-1,1))
rho_sending_interval_sinr = stats.spearmanr(sending_interval_label, sinr_label)
print("sending_interval to SINR - MI: {}, Spearman: {}".format(mi_sending_interval_sinr, rho_sending_interval_sinr))
mi_packet_size_sinr = mutual_info_classif(packet_size_label.reshape(-1,1), sinr_label.reshape(-1,1))
rho_packet_size_sinr = stats.spearmanr(packet_size_label, sinr_label)
print("packet_size to SINR - MI: {}, Spearman: {}".format(mi_packet_size_sinr, rho_packet_size_sinr))

  y = column_or_1d(y, warn=True)


H_Dist - MI: [0.18276803], Spearman: SpearmanrResult(correlation=0.13268623971959298, pvalue=0.0)


  y = column_or_1d(y, warn=True)


Height - MI: [0.03522527], Spearman: SpearmanrResult(correlation=-0.008745487945455052, pvalue=2.083896190466109e-25)


  y = column_or_1d(y, warn=True)


num_members - MI: [0.02762724], Spearman: SpearmanrResult(correlation=-0.013280269195818253, pvalue=2.3302788492987965e-56)


  y = column_or_1d(y, warn=True)


sending_interval - MI: [0.02764631], Spearman: SpearmanrResult(correlation=0.009657167547077418, pvalue=1.2819112752507408e-30)


  y = column_or_1d(y, warn=True)


packet_size - MI: [0.02765075], Spearman: SpearmanrResult(correlation=-0.0008612132801843746, pvalue=0.30500882502533744)


  y = column_or_1d(y, warn=True)


packet_size - MI: [0.66961138], Spearman: SpearmanrResult(correlation=0.11642402538876358, pvalue=0.0)


## BER Parents Correlation

In [9]:
mi_sinr_ber = mutual_info_classif(sinr_label.reshape(-1,1), ber_label.reshape(-1,1))
rho_sinr_ber = stats.spearmanr(sinr_label, ber_label)
print("SINR to BER - MI: {}, Spearman: {}".format(mi_sinr_ber, rho_sinr_ber))

  y = column_or_1d(y, warn=True)


BER to SINR - MI: [0.46439216], Spearman: SpearmanrResult(correlation=0.11642402538876358, pvalue=0.0)


## Delay Parents Correlation

In [13]:
mi_ber_delay = mutual_info_classif(ber_label.reshape(-1,1), delay_label.reshape(-1,1))
rho_ber_delay = stats.spearmanr(ber_label, delay_label)
print("BER to Delay - MI: {}, Spearman: {}".format(mi_ber_delay, rho_ber_delay))
mi_num_members_delay = mutual_info_classif(num_members_label.reshape(-1,1), delay_label.reshape(-1,1))
rho_num_members_delay = stats.spearmanr(num_members_label, delay_label)
print("No. UAVs to Delay - MI: {}, Spearman: {}".format(mi_num_members_delay, rho_num_members_delay))
mi_packet_size_delay = mutual_info_classif(packet_size_label.reshape(-1,1), delay_label.reshape(-1,1))
rho_packet_size_delay = stats.spearmanr(packet_size_label, delay_label)
print("Packet Size to Delay - MI: {}, Spearman: {}".format(mi_packet_size_delay, rho_packet_size_delay))
mi_sending_interval_delay = mutual_info_classif(sending_interval_label.reshape(-1,1), delay_label.reshape(-1,1))
rho_sending_interval_delay = stats.spearmanr(sending_interval_label, delay_label)
print("Sending Interval to Delay - MI: {}, Spearman: {}".format(mi_sending_interval_delay, rho_sending_interval_delay))

  y = column_or_1d(y, warn=True)


SINR to Delay - MI: [0.09227339], Spearman: SpearmanrResult(correlation=-0.31017654001589723, pvalue=0.0)


  y = column_or_1d(y, warn=True)


No. UAVs to Delay - MI: [0.03464009], Spearman: SpearmanrResult(correlation=-0.035806616387445554, pvalue=0.0)


  y = column_or_1d(y, warn=True)


Packet Size to Delay - MI: [0.04769863], Spearman: SpearmanrResult(correlation=0.047114495826077726, pvalue=0.0)


  y = column_or_1d(y, warn=True)


Sending Interval to Delay - MI: [0.03313827], Spearman: SpearmanrResult(correlation=0.0027678860846986984, pvalue=0.0009782556900898795)


## Incorrect Rcvd Parents Correlation

In [None]:
mi_ber_incr_rcvd = mutual_info_classif(ber_label.reshape(-1,1), incr_rcvd_label.reshape(-1,1))
rho_ber_incr_rcvd = stats.spearmanr(ber_label, incr_rcvd_label)
print("BER to Incorrect Rcvd - MI: {}, Spearman: {}".format(mi_ber_incr_rcvd, rho_ber_incr_rcvd))
mi_delay_incr_rcvd = mutual_info_classif(delay_label.reshape(-1,1), incr_rcvd_label.reshape(-1,1))
rho_delay_incr_rcvd = stats.spearmanr(delay_label, incr_rcvd_label)
print("Delay to Incorrect Rcvd - MI: {}, Spearman: {}".format(mi_delay_incr_rcvd, rho_delay_incr_rcvd))

## Delay Exceeded Parents Correlation

In [None]:
mi_ber_delay_excd = mutual_info_classif(ber_label.reshape(-1,1), delay_excd_label.reshape(-1,1))
rho_ber_delay_excd = stats.spearmanr(ber_label, delay_excd_label)
print("BER to Delay Excd - MI: {}, Spearman: {}".format(mi_ber_delay_excd, rho_ber_delay_excd))
mi_delay_delay_excd = mutual_info_classif(delay_label.reshape(-1,1), delay_excd_label.reshape(-1,1))
rho_delay_delay_excd = stats.spearmanr(delay_label, delay_excd_label)
print("Delay to Delay Excd - MI: {}, Spearman: {}".format(mi_delay_delay_excd, rho_delay_delay_excd))

## Queue Overflow Parents Correlation

In [None]:
mi_ber_q_overflow = mutual_info_classif(ber_label.reshape(-1,1), q_overflow_label.reshape(-1,1))
rho_ber_q_overflow = stats.spearmanr(ber_label, q_overflow_label)
print("BER to Q Overflow - MI: {}, Spearman: {}".format(mi_ber_q_overflow, rho_ber_q_overflow))
mi_delay_q_overflow = mutual_info_classif(delay_label.reshape(-1,1), q_overflow_label.reshape(-1,1))
rho_delay_q_overflow = stats.spearmanr(delay_label, q_overflow_label)
print("Delay to Q Overflow - MI: {}, Spearman: {}".format(mi_delay_q_overflow, rho_delay_q_overflow))
mi_num_members_q_overflow = mutual_info_classif(num_members_label.reshape(-1,1), q_overflow_label.reshape(-1,1))
rho_num_members_q_overflow = stats.spearmanr(num_members_label, q_overflow_label)
print("No. UAVs to Q Overflow - MI: {}, Spearman: {}".format(mi_num_members_q_overflow, rho_num_members_q_overflow))
mi_packet_size_q_overflow = mutual_info_classif(packet_size_label.reshape(-1,1), q_overflow_label.reshape(-1,1))
rho_packet_size_q_overflow = stats.spearmanr(packet_size_label, q_overflow_label)
print("Packet Size to Q Overflow - MI: {}, Spearman: {}".format(mi_packet_size_q_overflow, rho_packet_size_q_overflow))
mi_sending_interval_q_overflow = mutual_info_classif(sending_interval_label.reshape(-1,1), q_overflow_label.reshape(-1,1))
rho_sending_interval_q_overflow = stats.spearmanr(sending_interval_label, q_overflow_label)
print("Sending Interval to Q Overflow - MI: {}, Spearman: {}".format(mi_sending_interval_q_overflow, rho_sending_interval_q_overflow))

## Extras

In [None]:
print(mutual_info_regression(df.U2G_H_Dist.values.reshape(-1,1), df.U2G_SINR.values.reshape(-1,1)))

In [None]:
stats.chi2_contingency(pd.crosstab(classes_df["Sending_Interval_Class"], classes_df["SINR_Class"]))