In [2]:
import math
import pandas as pd
import cmath
from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, auc
from random import randint
import matplotlib.pyplot as plt
from IPython.display import Image,display
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.decomposition import PCA
import warnings
# Silence every warning for the rest of the notebook (library deprecation
# warnings are hidden as well).
warnings.filterwarnings("ignore")
# Figure text is rendered through LaTeX ("text.usetex") using a sans-serif
# Helvetica font.
# NOTE(review): presumably requires a working LaTeX installation on the machine
# running the notebook - confirm before sharing.
plt.rcParams.update({
    "text.usetex": True,
    "font.family": "sans-serif",
    "font.sans-serif": ["Helvetica"]})


In [2]:
# Signed indexes -28..-1 and 1..28 (0 is skipped). They are used throughout the
# notebook to build the "Ampl{j}" / "CSI{j}" column names.
interestedIndexes = list(range(-28,0)) + list(range(1,29)) #non null columns

# Function to classify
def classify(df,ylabel="MuStdAmplPaper",gt="Label",plot_roc=True):
    """Sweep a decision threshold over ``df[ylabel]`` and return the ROC AUC.

    A sample is predicted positive (1) when its score is >= the threshold and
    negative (0) otherwise. The threshold starts at the column minimum and is
    increased in 1000 equal steps up to the column maximum; each step adds one
    (FPR, TPR) point to the curve.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the score column ``ylabel`` and the ground-truth column ``gt``.
    ylabel : str
        Name of the score column that is thresholded.
    gt : str
        Name of the ground-truth column (expected to hold 0/1 labels, since the
        predictions are 0/1).
    plot_roc : bool
        When True the ROC curve is drawn with matplotlib.

    Returns
    -------
    float
        Area under the (FPR, TPR) curve as computed by ``sklearn.metrics.auc``.

    Raises
    ------
    ValueError
        If the score column is empty, all-NaN or constant, because no threshold
        sweep is possible (the original code looped forever on a constant column).
    """
    Y = df[gt]
    scores = df[ylabel]
    num_iter = 1000

    # Column extremes never change during the sweep: compute them once.
    score_min = scores.min()
    score_max = scores.max()
    step = (score_max - score_min) / num_iter
    # "not step > 0" also catches NaN (empty or all-NaN column).
    if not step > 0:
        raise ValueError(
            f"cannot sweep a threshold over column {ylabel!r}: it is empty or constant")

    tpr = []
    fpr = []
    thr_list = []
    thr = score_min
    while thr <= score_max:
        # Vectorised prediction: 1 when the score reaches the threshold, else 0.
        Y_pred = (scores >= thr).astype(int)
        # labels=[0, 1] forces a 2x2 matrix even when only one class is present,
        # so the four-way unpacking below cannot fail.
        tn, fp, fn, tp = confusion_matrix(Y, Y_pred, labels=[0, 1]).ravel()
        # True/false positive rates for this threshold.
        tpr.append(tp/(tp+fn))
        fpr.append(fp/(fp+tn))
        thr_list.append(thr)
        thr += step

    if plot_roc:
        plt.figure(figsize=(3,3),dpi=220)
        plt.plot(fpr, tpr)
        # Diagonal reference line (chance level).
        plt.plot([0, 1], [0, 1], color = 'green')
        plt.xlim(-0.05, 1.05)
        plt.ylim(-0.05, 1.05)
        plt.grid()
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC curve")
        plt.show()

    return auc(fpr, tpr)
    

def checkGT(predicted_true,predicted_false,bounds=None,tolerance=None):
    """Count TP/TN/FP/FN for predicted event times against ground-truth times.

    A time ``t`` matches the ground truth when ``abs(t - l) <= tolerance`` for
    at least one ground-truth time ``l``.

    Parameters
    ----------
    predicted_true : iterable of numbers
        Times predicted as events. Matching ones count as true positives,
        the others as false positives.
    predicted_false : iterable of numbers
        Times predicted as non-events. Matching ones count as false negatives,
        the others as true negatives.
    bounds : iterable of numbers, optional
        Ground-truth event times. Defaults to the notebook-level ``lower_bounds``.
    tolerance : number, optional
        Maximum distance to a ground-truth time. Defaults to the notebook-level ``w2``.

    Returns
    -------
    tuple
        ``(tp, tn, fp, fn)``.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if bounds is None:
        bounds = lower_bounds
    if tolerance is None:
        tolerance = w2

    def near_event(t):
        # True as soon as one ground-truth time is within the tolerance.
        return any(abs(t-l) <= tolerance for l in bounds)

    tp,tn,fp,fn = [0,0,0,0]

    for t in predicted_true:
        if near_event(t):
            tp = tp+1  # predicted event that matches the ground truth
        else:
            fp = fp+1  # predicted event with no ground-truth time nearby

    for t in predicted_false:
        if near_event(t):
            fn = fn+1  # missed event: ground truth nearby but predicted negative
        else:
            tn = tn+1  # correctly rejected

    return tp,tn,fp,fn
    
def classify_passage(dataframe, ycol="MuStdAmplPaper",gt="Label",plot_roc=True):
    """Detect local peaks of ``ycol`` and return the ROC AUC of a peak-height threshold.

    Steps:
      1. Keep every interior row whose value is >= the previous row and > the
         next row (a local maximum). The first and last rows are never peaks.
      2. Sweep a threshold ``tau`` from the smallest peak value towards the
         largest one in 1000 equal steps. Peaks with value >= tau are predicted
         events, the others predicted non-events.
      3. Score each split with ``checkGT`` and accumulate (FPR, TPR) points.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a "Time" column and the ``ycol`` column. Rows are compared
        with their positional neighbours.
    ycol : str
        Name of the feature column whose peaks are classified.
    gt : str
        Unused; kept so existing calls keep working.
    plot_roc : bool
        When True the ROC curve is drawn with matplotlib.

    Returns
    -------
    float
        Area under the (FPR, TPR) curve as computed by ``sklearn.metrics.auc``.

    Raises
    ------
    ValueError
        If no local peak exists in ``ycol``.
    """
    values = dataframe[ycol].to_numpy()
    times = dataframe["Time"].to_numpy()

    # Vectorised peak detection on positional neighbours. This replaces the
    # row-by-row DataFrame.append (removed in pandas 2.0, and quadratic).
    if len(values) >= 3:
        centre = values[1:-1]
        is_peak = (centre >= values[:-2]) & (centre > values[2:])
        peak_values = centre[is_peak]
        peak_times = times[1:-1][is_peak]
    else:
        peak_values = values[:0]
        peak_times = times[:0]

    if len(peak_values) == 0:
        raise ValueError(f"no local peak found in column {ycol!r}")

    # Threshold sweep (skipping multiple mispredictions, as in the original note).
    tau = peak_values.min()
    peak_max = peak_values.max()  # loop invariant, computed once
    num_iter = 1000
    step = (peak_max - tau) / num_iter
    tpr = []
    fpr = []
    while tau < peak_max:
        ttrues = peak_times[peak_values >= tau].tolist()
        tfalses = peak_times[peak_values < tau].tolist()
        tp,tn,fp,fn = checkGT(ttrues,tfalses)
        tpr.append(tp/(tp+fn))
        fpr.append(fp/(fp+tn))

        tau = tau+step

    if(plot_roc):
        # Plot the roc curve
        plt.figure(figsize=(3,3),dpi=220)
        plt.plot(fpr, tpr)
        # Diagonal reference line (chance level).
        plt.plot([0, 1], [0, 1], color = 'green')
        plt.xlim(-0.05, 1.05)
        plt.ylim(-0.05, 1.05)
        plt.grid()
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC curve")
        plt.show()

    return auc(fpr, tpr)
    

def extractWindowedFeatures(data,column_indexes = [],w2=3):
    """Compute the per-window feature "MuStdAmplPaper" from raw amplitude columns.

    Rows are bucketed into windows of ``w2`` time units
    (``floor(Timestamp / w2) * w2``). For each window the standard deviation of
    every requested ``Ampl{j}`` column is computed, and those standard
    deviations are averaged across columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "Timestamp" and the ``Ampl{j}`` columns for every ``j`` in
        ``column_indexes``. Side effect (kept from the original): a "TimeWindow"
        column is written into ``data``.
    column_indexes : list
        Indexes ``j`` selecting the ``Ampl{j}`` columns.
    w2 : number
        Window length, in the same unit as "Timestamp".

    Returns
    -------
    pandas.DataFrame
        One row per window with columns "Time" (window start, ascending) and
        "MuStdAmplPaper".
    """
    ampl_cols = [f"Ampl{j}" for j in column_indexes]

    data["TimeWindow"] = np.floor(data["Timestamp"] / w2)*w2

    # Vertical std: only the requested columns are aggregated, so unrelated or
    # non-numeric columns cannot break (or slow down) the reduction.
    window_std = data[ampl_cols + ["TimeWindow"]].groupby(by="TimeWindow").std()

    featuredDf = pd.DataFrame()
    # Take the window labels from the grouped result itself: groupby sorts its
    # keys, whereas unique() keeps order of appearance, so the two only agree
    # when the timestamps are already sorted.
    featuredDf["Time"] = window_std.index.to_numpy()
    # Horizontal mean: average the per-column stds into a single column.
    featuredDf["MuStdAmplPaper"] = window_std[ampl_cols].mean(axis=1).to_numpy()
    return featuredDf

def extractWindowedOptimized(dataStd,column_indexes = interestedIndexes,w2=3):
    """Build the "MuStdAmplPaper" feature from an already-windowed std table.

    ``dataStd`` must provide a "Time" column and the ``Ampl{j}`` columns for
    every ``j`` in ``column_indexes``. The result has one "Time" entry per
    distinct time value and, next to it, the row-wise mean of the selected
    amplitude columns. ``w2`` is accepted for signature parity with
    ``extractWindowedFeatures`` but is not used.
    """
    result = pd.DataFrame()
    result["Time"] = dataStd["Time"].unique()

    # Horizontal mean: collapse the selected columns into a single one.
    selected = [f"Ampl{j}" for j in column_indexes]
    row_means = dataStd[selected].mean(axis=1)
    result["MuStdAmplPaper"] = row_means.reset_index(drop=True)
    return result

def filterData(df,w1=3,lambda1=3):
    """Return a copy of ``df`` with amplitude outliers replaced by the previous row's value.

    For every row except the one with index label 0, the function takes the rows
    whose "Timestamp" lies in ``(Timestamp - w1, Timestamp]``, computes the mean
    and standard deviation of each ``Ampl{j}`` column over that window, and
    replaces any value further than ``lambda1`` standard deviations from the
    mean with the value held by ``prev_row`` for that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Timestamp" and the ``Ampl{j}`` columns for j in -28..-1
        and 1..28. It is not modified.
    w1 : number
        Length of the trailing time window, in "Timestamp" units.
    lambda1 : number
        Outlier threshold, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        The filtered copy.
    """
    data = df.copy() #clone and return a copy
    # Local column set: shadows the notebook-level variable of the same name, so
    # callers cannot change which columns are filtered.
    interestedIndexes = list(range(-28,0)) + list(range(1,29))
    #w1 = 5 #best=2
    #lambda1 = 3 #best=4
    col_list = [f"Ampl{j}" for j in interestedIndexes]

    for index, row in data.iterrows():
        # The row labelled 0 is never filtered; it only seeds prev_row.
        # NOTE(review): assumes the first row carries index label 0 - confirm.
        if index == 0:
            prev_row = row
            continue

        # Trailing window (row['Timestamp'] - w1, row['Timestamp']]. It is read
        # from `data`, so replacements already written by data.at for earlier
        # rows are part of these statistics.
        subDf = data.loc[(data["Timestamp"]<=row['Timestamp']) & (data["Timestamp"]> row['Timestamp'] - w1),col_list]
        means = subDf.mean(axis=0)
        stds = subDf.std(axis=0)

        for c in col_list: 
            # A NaN std makes the comparison False, so such values are left untouched.
            if (abs(row[c] - means[c]) / stds[c]) > lambda1:
                data.at[index,c] = prev_row[c]
                #row[c] = prev_row[c]

        # prev_row is the row object yielded by iterrows.
        # NOTE(review): presumably it holds the values from before any
        # replacement above - confirm that this is intended.
        prev_row = row
    return data

def substituteDf(data,t1,t2,duration,ftr="MuStdAmplPaper"):
    """Overwrite, in place, the ``ftr`` values of one time span with those of another.

    Rows with ``t1 < Time < t1 + duration`` receive, in order, the ``ftr``
    values of the rows with ``t2 < Time < t2 + duration`` (both bounds
    exclusive). Both spans must select the same number of rows. Returns None.
    """
    timeline = data["Time"]
    source_rows = (timeline > t2) & (timeline < t2+duration)
    replacement = data.loc[source_rows, ftr].values
    target_rows = (timeline > t1) & (timeline < t1+duration)
    data.loc[target_rows, ftr] = replacement

    
#use as follows: cleanDf = clean_passage_features(featuredDf)
def clean_passage_features(data):
    """Return a copy of ``data`` in which seven fixed time spans are patched.

    Each patch copies the "MuStdAmplPaper" values of a source span over a target
    span of the same duration (see ``substituteDf``). All offsets are relative
    to ``t4 = 1254.0`` and are applied in the listed order. The input frame is
    left untouched.
    """
    t4 = 1254.0
    # (target offset, source offset, duration)
    patches = [
        (35, 215, 30),   #clean [30-60]
        (95, 575, 25),   #clean [90-120]
        (139, 129, 8),   #clean [120-150]
        (249, 159, 10),  #clean [240-270]
        (275, 515, 30),  #clean [270-300]
        (395, 305, 30),  #clean [390-420]
        (455, 245, 10),  #clean [450-480]
    ]

    cleanDf = data.copy()
    for target_offset, source_offset, duration in patches:
        substituteDf(cleanDf, t4+target_offset, t4+source_offset, duration)
    return cleanDf
    

## Analysis on task2 (passage)

In [73]:
# Load the outlier-filtered passage recording.
filteredDf = pd.read_csv("csv/filteredPassage.csv")
#display(filteredDf)
# NOTE: w1 and lambda1 are not passed to filterData anywhere in this notebook;
# every visible call uses that function's own defaults.
w1=5 #for filtering
# w2 is the window length and is also read as a global by checkGT, where it is
# the matching tolerance around each ground-truth time.
w2=3 #for windows
lambda1=3
#for ground truth
t2,t3,t4 = [570.0, 873.0, 1254.0] #separation times for gt
# Ground-truth event times, built as four groups: gt1 is absolute, while
# gt2/gt3/gt4 are offsets from t2, t3 and t4 respectively.
gt1 = [120,180,240,300,390,540]
gt2 = [t2+i for i in range(60,300,30)]
gt3 = [t3+30,t3+57] + [t3+i for i in range(90,390,30)]
gt4 = [t4+i for i in range(30,630,30)]
# lower_bounds is read as a global by checkGT.
lower_bounds = gt1+gt2+gt3+gt4
# upper_bounds is not referenced by any other cell shown in this notebook.
upper_bounds = [l + 1 for l in lower_bounds]


In [74]:
# Baseline: windowed feature over all columns in interestedIndexes, then passage AUC.
# Bucket every sample into a window of w2 time units (label = window start).
filteredDf["TimeWindow"] = np.floor(filteredDf["Timestamp"] / w2)*w2
# Per-window standard deviation of every column; the two bookkeeping columns are dropped.
dataStd = filteredDf.groupby(by="TimeWindow",as_index=True).std().drop(["Timestamp","Frame_num"],axis=1)
# Expose the window label as a regular column, as extractWindowedOptimized expects.
dataStd["Time"] = dataStd.index
featuredDf = extractWindowedOptimized(dataStd,column_indexes = interestedIndexes,w2=w2)
data_auc = classify_passage(featuredDf,plot_roc=False)
print(data_auc)

0.9752415458937198


### Sort columns by AUC (greedy forward selection)

In [None]:
# Number of columns to rank; read as a global by recursiveColumnSorter as its stop condition.
max_cols = len(interestedIndexes)

# Same windowed-std table as in the baseline cell, recomputed here; it is read
# as a global by recursiveColumnSorter.
filteredDf["TimeWindow"] = np.floor(filteredDf["Timestamp"] / w2)*w2
dataStd = filteredDf.groupby(by="TimeWindow",as_index=True).std().drop(["Timestamp","Frame_num"],axis=1)
dataStd["Time"] = dataStd.index

# One row per selection step (number of selected columns, best AUC); filled by recursiveColumnSorter.
overallDf = pd.DataFrame(columns=["Num_cols","AUC"])


def recursiveColumnSorter(current_list,expansion_list):
    """Rank columns by greedy forward selection on the passage AUC.

    At every step each remaining candidate is tentatively added to the columns
    selected so far, the windowed feature is rebuilt and cleaned, and the
    passage AUC is computed. The candidate with the highest AUC (the first one
    on ties) is moved from ``expansion_list`` to ``current_list``. One row
    (number of selected columns, best AUC) is appended to the notebook-level
    ``overallDf`` per step.

    The name is kept for compatibility; the implementation is iterative.

    Parameters
    ----------
    current_list : list
        Columns already selected. Extended in place and returned.
    expansion_list : list
        Candidate columns. Selected entries are removed from it in place, so
        pass a copy if the caller still needs the original list.

    Returns
    -------
    list
        ``current_list``, in selection order.

    Notes
    -----
    Reads the globals ``dataStd``, ``w2`` and ``max_cols`` and rebinds the
    global ``overallDf``. Selection stops once ``max_cols`` columns are selected
    or no candidate is left.
    """
    global overallDf

    while expansion_list:
        best_col = None
        best_auc = None
        for c in expansion_list:
            featuredDf = extractWindowedOptimized(dataStd,column_indexes = current_list + [c],w2=w2)
            cleanDf = clean_passage_features(featuredDf)
            data_auc = classify_passage(cleanDf,plot_roc=False)
            print("Ok")
            # Strict ">" keeps the first candidate that reaches the maximum.
            if best_auc is None or data_auc > best_auc:
                best_col, best_auc = c, data_auc

        expansion_list.remove(best_col)
        current_list.append(best_col)

        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        step_row = pd.DataFrame([[float(len(current_list)), best_auc]], columns=["Num_cols","AUC"])
        overallDf = pd.concat([overallDf, step_row], ignore_index=True)

        if len(current_list) >= max_cols:
            break

    return current_list

start_list = []
# Pass a copy: the sorter removes every selected column from its second
# argument in place, so handing over interestedIndexes itself would leave that
# global empty for all later cells.
full_list = list(interestedIndexes)
col_sorted = recursiveColumnSorter(start_list,full_list)

In [6]:
# Show the best AUC reached at each selection step, then the columns in selection order.
# NOTE(review): the saved output lists 10 steps but 56 selected columns, so the
# two outputs presumably come from different runs - re-run to confirm.
display(overallDf)
col_sorted

Unnamed: 0,Num_cols,AUC
0,1.0,1.0
1,2.0,1.0
2,3.0,1.0
3,4.0,1.0
4,5.0,1.0
5,6.0,1.0
6,7.0,1.0
7,8.0,1.0
8,9.0,1.0
9,10.0,1.0


[-27,
 -7,
 -26,
 -3,
 -25,
 -24,
 -23,
 -22,
 -28,
 -20,
 -21,
 28,
 -19,
 -9,
 -18,
 -17,
 -15,
 -14,
 -1,
 -16,
 -13,
 -12,
 -11,
 -10,
 -8,
 -6,
 -5,
 -4,
 2,
 -2,
 1,
 3,
 4,
 5,
 20,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 22,
 21,
 23,
 24,
 25,
 26,
 27]

In [7]:
# Persist the per-step AUC table produced by the column sorter.
overallDf.to_csv("csv/cols_auc_passage_CLEAN.csv")

### ------------------------------------------
#### Analysis on compression (task2)


In [74]:

# ATC: compression applied to the extracted features
def compressFeatured(data,ftr="MuStdAmplPaper",num_bits=16):
    """Quantise one feature column to ``num_bits`` bits.

    Returns a copy of ``data`` in which ``ftr`` is min-max scaled to
    ``[0, 2**num_bits - 1]`` using the column's own minimum and maximum, then
    rounded to the nearest level. The input frame is not modified.
    """
    compressedData = data.copy()

    # PCM-style quantisation: normalise to [0, 1], stretch to the top level, round.
    column = compressedData[ftr]
    lowest = min(column)
    highest = max(column)
    top_level = math.pow(2,num_bits)-1
    normalised = (column - lowest) / (highest - lowest)
    compressedData[ftr] = (normalised * top_level).apply(np.round)
    return compressedData

# CTA: compression applied to the original dataset
def compressAmplPhases(data,num_bits=16,column_indexes = [1],scale="global"):
    """Quantise the selected ``Ampl{j}`` columns to ``num_bits`` bits.

    Returns a copy of ``data`` in which every selected amplitude column is
    min-max scaled to ``[0, 2**num_bits - 1]`` and rounded. With
    ``scale="global"`` a single minimum/maximum taken across all selected
    columns is used; with any other value each column uses its own
    minimum/maximum. Only amplitude columns are touched.
    """
    compressedData = data.copy()
    ftr_list = [f"Ampl{j}" for j in column_indexes]

    # Per-column extremes of the selected amplitude columns.
    col_max = compressedData.max(axis=0)[ftr_list]
    col_min = compressedData.min(axis=0)[ftr_list]
    top_level = math.pow(2,num_bits)-1
    use_global = scale=="global"

    for ftr in ftr_list:
        if use_global:
            # One range shared by all selected columns.
            low, high = min(col_min), max(col_max)
        else:
            # Each column is scaled on its own range.
            low, high = col_min[ftr], col_max[ftr]
        compressedData[ftr] = (((compressedData[ftr] - low) / (high - low)) * top_level).apply(np.round)
    return compressedData

def complex_real(complex_value):
    """Return the real part of ``complex_value`` after converting it with ``complex()``."""
    parsed = complex(complex_value)
    return parsed.real

def complex_imag(complex_value):
    """Return the imaginary part of ``complex_value`` after converting it with ``complex()``."""
    parsed = complex(complex_value)
    return parsed.imag

def complex_rebuild(real,imag):
    """Combine a real and an imaginary part into the value ``real + 1j*imag``."""
    imaginary_term = 1j*imag
    return real + imaginary_term

# CTA: compression applied to the original dataset (real and imaginary CSI parts)
def compressXY(data,num_bits=16,column_indexes = [1],scale="global"):
    """Quantise the real and imaginary parts of the selected CSI columns.

    For every ``j`` in ``column_indexes`` the column ``CSI{j}`` is split into
    ``X{j}`` (real part) and ``Y{j}`` (imaginary part). Both are min-max scaled
    to ``[0, 2**num_bits - 1]`` and rounded. ``CSI{j}`` is then rebuilt as
    ``X + 1j*Y`` from the quantised levels, and ``Ampl{j}`` / ``Phase{j}`` are
    recomputed from the rebuilt value.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``CSI{j}`` column for every selected ``j``. Values may be
        anything accepted by ``complex()`` (complex numbers or strings such as
        "3+4j"). The frame is not modified.
    num_bits : int
        Number of quantisation bits.
    column_indexes : list
        Indexes ``j`` of the CSI columns to process.
    scale : str
        "global" uses one min/max over all X columns and one over all Y columns;
        any other value scales each column on its own min/max.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the ``X{j}``, ``Y{j}``, ``CSI{j}``, ``Ampl{j}``
        and ``Phase{j}`` columns written.
    """
    compressedData = data.copy()
    column_indexes = list(column_indexes)
    if not column_indexes:
        return compressedData

    x_cols = [f"X{j}" for j in column_indexes]
    y_cols = [f"Y{j}" for j in column_indexes]
    levels = math.pow(2,num_bits)-1

    # Split every CSI column into X (real) and Y (imaginary); each value is parsed once.
    for j in column_indexes:
        csi = np.array([complex(v) for v in compressedData[f"CSI{j}"]], dtype=complex)
        compressedData[f"X{j}"] = csi.real
        compressedData[f"Y{j}"] = csi.imag

    # Extremes are taken on the X/Y columns only, so other columns are never reduced.
    x_min = compressedData[x_cols].min(axis=0)
    x_max = compressedData[x_cols].max(axis=0)
    y_min = compressedData[y_cols].min(axis=0)
    y_max = compressedData[y_cols].max(axis=0)

    use_global = scale=="global"
    if use_global:
        # One shared range for all X columns and one for all Y columns.
        x_lo, x_hi = x_min.min(), x_max.max()
        y_lo, y_hi = y_min.min(), y_max.max()

    for j in column_indexes:
        x_name, y_name = f"X{j}", f"Y{j}"
        if not use_global:
            # Min and max are per column.
            x_lo, x_hi = x_min[x_name], x_max[x_name]
            y_lo, y_hi = y_min[y_name], y_max[y_name]

        x_quant = (((compressedData[x_name] - x_lo) / (x_hi - x_lo)) * levels).round()
        y_quant = (((compressedData[y_name] - y_lo) / (y_hi - y_lo)) * levels).round()
        compressedData[x_name] = x_quant
        compressedData[y_name] = y_quant

        # Rebuild the CSI from the quantised levels with vectorised complex
        # arithmetic (replaces a row-wise DataFrame.apply over the whole frame).
        rebuilt = x_quant.to_numpy(dtype=float) + 1j*y_quant.to_numpy(dtype=float)
        compressedData[f"CSI{j}"] = rebuilt

        # Recompute amplitude and phase from the rebuilt CSI.
        compressedData[f"Ampl{j}"] = np.abs(rebuilt)
        compressedData[f"Phase{j}"] = np.angle(rebuilt)

    return compressedData

In [87]:
# Load the raw recording, its outlier-filtered version, and the feature tables
# previously extracted from each of them.
passage = pd.read_csv("csv/passage.csv")
filteredPassage = pd.read_csv("csv/filteredPassage.csv")
featuresPassage = pd.read_csv("csv/featuresPassage.csv")
filteredFeaturesPassage = pd.read_csv("csv/filteredFeaturesPassage.csv")
#featuredDf = extractWindowedFeatures(filteredDf,column_indexes = interestedIndexes,w2=w2)
# Reference AUC without compression; the compression experiments print it for comparison.
orig_auc = classify_passage(filteredFeaturesPassage,plot_roc=False)
print(orig_auc)

0.9752415458937198


In [None]:
# compress ampl and phases AFTER outlier filtering (STEP 3)
# Note: compressAmplPhases only quantises the Ampl{j} columns.

filteredPassage = pd.read_csv("csv/filteredPassage.csv")
print("original_auc:",orig_auc)
# For each bit depth from 2 to 32: quantise the filtered amplitudes, rebuild the
# windowed feature, and print the resulting passage AUC.
for n in range(2,33):
    compressedPassage = compressAmplPhases(filteredPassage,num_bits=n,column_indexes = interestedIndexes,scale="global")
    featuredCompressed = extractWindowedFeatures(compressedPassage,column_indexes = interestedIndexes,w2=w2)
    print(n,classify_passage(featuredCompressed,plot_roc=False))

In [90]:
# compress ampl and phases BEFORE outlier filtering (STEP 2)
# Note: compressAmplPhases only quantises the Ampl{j} columns.

passage = pd.read_csv("csv/passage.csv")
print("original_auc:",orig_auc)
# For each bit depth from 2 to 19: quantise the raw amplitudes, filter outliers,
# rebuild the windowed feature, and print the resulting passage AUC.
for n in range(2,20):
    #compress amplitude
    compressedPassageUnfiltered = compressAmplPhases(passage,num_bits=n,column_indexes = interestedIndexes,scale="global")
    #filter (called with filterData's own defaults, not the notebook-level w1/lambda1)
    compressedPassageFiltered = filterData(compressedPassageUnfiltered) 
    #compute features
    featuredCompressed = extractWindowedFeatures(compressedPassageFiltered,column_indexes = interestedIndexes,w2=w2)
    #classify
    print(n,classify_passage(featuredCompressed,plot_roc=False))

original_auc: 0.9752415458937198
2 0.08333333333333331
3 0.6200284090909091
4 0.5607235142118863
5 0.9651820431247791
6 0.9696921612186606
7 0.9675523349436392
8 0.9667374959136974
9 0.9699053534457261
10 0.9695048309178744
11 0.9697476436606872
12 0.9695345988977343
13 0.9695345988977343
14 0.9697476436606872
15 0.9695345988977343
16 0.9697476436606872
17 0.9695345988977343
18 0.9697476436606872
19 0.9697476436606872


In [92]:
# compress X and Y global on UNFILTERED, then filter! (STEP 1)

passage = pd.read_csv("csv/passage.csv")
#display(filteredDf)
# NOTE: w1 and lambda1 are set here but are not passed to filterData below,
# which therefore runs with its own defaults.
w1=5 #for filtering
w2=3 #for windows
lambda1=3
# Re-create the column index list in this cell.
# NOTE(review): presumably needed because the column-sorter cell empties the
# shared list when it is run first - confirm.
interestedIndexes = list(range(-28,0)) + list(range(1,29)) #non null columns


print("original_auc:",orig_auc)
# For each bit depth from 3 to 19: quantise the real/imaginary CSI parts of the
# raw recording, filter outliers on the recomputed amplitudes, rebuild the
# windowed feature, and print the resulting passage AUC.
for n in range(3,20):
    compressed_XY = compressXY(passage,num_bits=n,column_indexes = interestedIndexes,scale="global")
    compressed_filtered = filterData(compressed_XY)
    featuredCompressed = extractWindowedFeatures(compressed_filtered,column_indexes = interestedIndexes,w2=w2)
    print(n,classify_passage(featuredCompressed,plot_roc=False))
    

original_auc: 0.9752415458937198
3 0.36815476190476193
4 0.6465359237536656
5 0.8905109489051095
6 0.9640947968638632
7 0.9710144927536233
8 0.9705745341614906
9 0.9707061362935554
10 0.9708603145235893
11 0.970284641851736
12 0.9694693094629157
13 0.9704192546583852
14 0.970128245229903
15 0.970128245229903
16 0.970128245229903
17 0.970128245229903
18 0.970128245229903
19 0.970128245229903
