In [None]:
import csv
from random import shuffle
import tensorflow as tf
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import KFold
import numpy as np

# Cleaning and extraction of data

### global functions
We tried these first on our model but unfortunately (and unsurprisingly) the extracted data isn't correlated with the zone of the user.

In [None]:
def total_time(data_sample):
    """Return the total time of one user request conversation.

    The value is taken from the time column (index 1) of the last row of
    ``data_sample`` and converted to ``float``.
    """
    last_row = data_sample[-1]
    timestamp = last_row[1]
    return float(timestamp)

def total_len(data_sample):
    """Return the total number of packets exchanged, i.e. the row count."""
    packet_count = len(data_sample)
    return packet_count

def percentage_tcp(data_sample):
    """Return the share of rows whose protocol column (index 4) is 'TCP'.

    Despite the name, the result is a fraction between 0 and 1, not a
    value scaled to 100. An empty sample raises ``ZeroDivisionError``.
    """
    tcp_rows = 0
    for row in data_sample:
        if row[4] == 'TCP':
            tcp_rows += 1
    return tcp_rows / len(data_sample)

def percentage_tls(data_sample):
    """Return the share of rows whose protocol column (index 4) is 'TLSv1.2'.

    Despite the name, the result is a fraction between 0 and 1, not a
    value scaled to 100. An empty sample raises ``ZeroDivisionError``.
    """
    tls_rows = 0
    for row in data_sample:
        if row[4] == 'TLSv1.2':
            tls_rows += 1
    return tls_rows / len(data_sample)

def avg_response_time(data_sample):
    """Return the mean gap between the timestamps of consecutive packets.

    Timestamps are read from column index 1 of every row and converted to
    ``float``; the gaps are computed in row order.

    Returns:
        The average of ``t[i + 1] - t[i]`` over the sample, or ``0.0`` when
        the sample holds fewer than two rows (there is no gap to measure,
        and dividing by ``len - 1`` would otherwise fail).
    """
    times = [float(row[1]) for row in data_sample]
    if len(times) < 2:
        return 0.0

    gaps = [later - earlier for earlier, later in zip(times, times[1:])]
    return sum(gaps) / len(gaps)
    

After trying these basic functions, we ran some tests on the data to see, among other things, what noise we could remove.

In [None]:
# In this cell we look at the IP addresses: we collect every distinct source
# address, every distinct destination address and every distinct
# (source, destination) pair found across all sample files.
set_source = set()
set_dest = set()
set_pairs = set()

for zone_number in range(1, 101):
    for client_number in range(1, 11):
        sample_name = '{}_sample_{}.csv'.format(zone_number, client_number)

        with open(sample_name, newline='') as sample_csv:
            # The first row is skipped (presumably the CSV header).
            sample = list(csv.reader(sample_csv))[1:]

        # Column 2 holds the source address, column 3 the destination.
        for row in sample:
            source, dest = row[2], row[3]
            set_source.add(source)
            set_dest.add(dest)
            set_pairs.add((source, dest))
    

In [None]:
# Display every distinct (source, destination) address pair held in set_pairs.
print(set_pairs)

We saw that only a very small set of IP addresses is used, so we tried to find which ones could be more important/interesting than the others. We saw that the most important (as well as the most used) are the communications between:

- '10.0.2.15' and '54.93.77.70'
- '10.0.2.15' and '62.210.85.178'

What is interesting to notice is that there is no communication at all between '62.210.85.178' and '54.93.77.70'. Also, there are generally more messages between '10.0.2.15' and '62.210.85.178': we could use this to find out whether it is due to the impact of the zone.

In [None]:
def ip_10_54(sample_data):
    """Return all communication between '10.0.2.15' and '54.93.77.70'.

    A row is kept when its source (index 2) and destination (index 3) are
    these two addresses, in either direction. Every returned row is a copy
    of the original row without its first column.
    """
    local, remote = '10.0.2.15', '54.93.77.70'

    selected = []
    for row in sample_data:
        source = row[2]
        if source == local:
            matches = row[3] == remote
        elif source == remote:
            matches = row[3] == local
        else:
            matches = False

        if matches:
            selected.append(row[1:])
    return selected

def ip_10_62(sample_data):
    """Return all communication between '10.0.2.15' and '62.210.85.178'.

    A row is kept when its source (index 2) and destination (index 3) are
    these two addresses, in either direction. Every returned row is a copy
    of the original row without its first column.
    """
    local, remote = '10.0.2.15', '62.210.85.178'

    selected = []
    for row in sample_data:
        source = row[2]
        if source == local:
            matches = row[3] == remote
        elif source == remote:
            matches = row[3] == local
        else:
            matches = False

        if matches:
            selected.append(row[1:])
    return selected

What we found out is that there are two types of messages in a communication: TCP messages and TLS messages. So we now want to separate them and see if we can do something with them. TLS messages always carry the "application data" information, which means that they are messages from the application, carrying the user's request and the server's answer. We found out that there are more TCP messages than TLS messages, so we will try to use this information as a feature as well.

In [None]:
def remove_tls(sample_data):
    """Return the rows of ``sample_data`` that are not TLSv1.2 messages.

    ``sample_data`` should be the output of ``ip_10_54`` or ``ip_10_62``,
    where the protocol sits at index 3. Rows are returned in their original
    order and are not copied.
    """
    kept = []
    for row in sample_data:
        if row[3] == 'TLSv1.2':
            continue
        kept.append(row)
    return kept

def remove_tcp(sample_data):
    """Return the rows of ``sample_data`` that are not TCP messages.

    ``sample_data`` should be the output of ``ip_10_54`` or ``ip_10_62``,
    where the protocol sits at index 3. Rows are returned in their original
    order and are not copied.
    """
    def is_not_tcp(row):
        return row[3] != 'TCP'

    return list(filter(is_not_tcp, sample_data))

There is another column we need to clean and extract features from: the information column of TCP messages. The first thing we do with it is to delete all samples concerning TCP retransmissions and duplicate ACKs: we are sure they are not relevant to our problem.

In [None]:
def remove_tcp_ret(sample_data):
    """Return TCP messages without retransmissions and duplicate ACKs.

    ``sample_data`` should be the output of ``remove_tls``; each row is
    ``[time, src, dest, prot, len, info]``. A row is dropped when the first
    whitespace-separated token of its info string (index 5) is ``'[TCP'``.
    Kept rows are returned unchanged and in order.

    An info string with no tokens raises ``IndexError``.
    """
    kept = []
    for row in sample_data:
        first_token = row[5].split()[0]
        if first_token != '[TCP':
            kept.append(row)
    return kept

In [None]:
def info_to_list(info):
    """Parse the info string of a TCP message into a list of integers.

    Examples of the strings handled:

    - '35978  >  9001 [ACK] Seq=99468683 Ack=5554741 Win=65535 Len=0'
    - '35978  >  9001 [ACK] Seq=99479995 Ack=5555813 Win=65535 Len=2920
      [TCP segment of a reassembled PDU]'
    - '9001  >  34692 [PSH, ACK] Seq=1073 Ack=2101 Win=65535 Len=1448
      [TCP segment of a reassembled PDU]'

    The ``Seq=``, ``Ack=``, ``Win=`` and ``Len=`` fields are located by
    name rather than by position, so the number of flags inside the square
    brackets does not matter. (Positional parsing previously read the
    window size of '[PSH, ACK]' messages from the ``Ack=`` token.)

    Args:
        info: the info column of one TCP row.

    Returns:
        ``[start, end, seq_number, ack_number, win_len, len_seg]`` where
        ``start`` and ``end`` are the numbers on either side of the '>'
        token. ``len_seg`` is the ``Len=`` value.

    Raises:
        IndexError: if the string has fewer than three tokens.
        ValueError: if a number cannot be parsed or one of the four named
            fields is missing.
    """
    tokens = info.split()

    # tokens[1] is the '>' separator between the two endpoints.
    start = int(tokens[0])
    end = int(tokens[2])

    wanted = ('Seq', 'Ack', 'Win', 'Len')
    fields = {}
    for token in tokens[3:]:
        key, separator, value = token.partition('=')
        # Keep only the first occurrence of each field of interest.
        if separator and key in wanted and key not in fields:
            fields[key] = int(value)

    missing = [key for key in wanted if key not in fields]
    if missing:
        raise ValueError(
            'missing field(s) {} in TCP info: {!r}'.format(missing, info))

    return [start, end] + [fields[key] for key in wanted]
    
    

Now the features we can extract from the TCP information column are:

- average of the window size
- the frequency of reassembled PDU messages

We thought that seq_number isn't pertinent for our analysis of the data, nor are the start and the end.

In [None]:
def avg_window_size(sample_data):
    """Return the average TCP window size of a list of TCP rows.

    Args:
        sample_data: rows shaped like ``[time, src_ip, dst_ip, length,
            start, end, seq_number, ack_number, win_len, len_seg]``; the
            window size is read from index 8.

    Returns:
        The mean of the window sizes, or ``0.0`` when ``sample_data`` is
        empty (instead of failing on a division by zero).
    """
    windows = [row[8] for row in sample_data]
    if not windows:
        return 0.0
    return sum(windows) / len(windows)
    
def avg_pdu_ass(sample_data):
    """Return the ratio of reassembled-PDU rows to all other rows.

    A row counts as a reassembled PDU segment when its ``len_seg`` value
    (index 9) is not 0. For example, three plain messages followed by one
    reassembled segment give ``1/3``.

    Args:
        sample_data: rows shaped like ``[time, src_ip, dst_ip, length,
            start, end, seq_number, ack_number, win_len, len_seg]``.

    Returns:
        ``reassembled / plain``. When there is no reassembled segment at
        all, including when ``sample_data`` is empty, the result is ``0.0``.

    Raises:
        ZeroDivisionError: if every row is a reassembled segment, since the
            ratio is then undefined.
    """
    plain = 0
    reassembled = 0
    for row in sample_data:
        if row[9] == 0:
            plain += 1
        else:
            reassembled += 1

    # Nothing reassembled means a frequency of zero, whatever the number of
    # plain messages (this also covers the empty sample).
    if reassembled == 0:
        return 0.0
    return reassembled / plain
    
    

# Creating the data and the model
Now we will assemble all the functions defined in the previous part and train our model on the result.

In [None]:
#data creation
#
# Every sample file becomes one row of `data` with, in this order:
#   - total time, total number of packets
#   - fraction 10_62 packets / total, fraction 10_54 packets / total
#   - avg window size and avg pdu_assemblage for 10_62
#   - avg window size and avg pdu_assemblage for 10_54
#   - total tcp packets 10_62, total tcp packets 10_54
#   - total tls packets 10_62, total tls packets 10_54
#   - fraction tls packets 10_62 / total packets 10_62
#   - fraction tls packets 10_54 / total packets 10_54
#   - the label: zone_number - 1


def _pair_features(packets):
    # Features of the traffic exchanged with one remote address; `packets`
    # is the output of ip_10_62 or ip_10_54.
    # Returns (avg window, avg pdu ratio, tcp count, tls count, tls fraction).
    tcp_rows = remove_tcp_ret(remove_tls(packets))
    # Replace the info string by its parsed fields, dropping the protocol.
    tcp_rows = [row[:3] + [row[4]] + info_to_list(row[5]) for row in tcp_rows]

    tls_count = len(remove_tcp(packets))

    return (avg_window_size(tcp_rows), avg_pdu_ass(tcp_rows),
            len(tcp_rows), tls_count, tls_count / len(packets))


data = []

for zone_number in range(1, 101):
    for client_number in range(1, 11):
        sample_name = '{}_sample_{}.csv'.format(zone_number, client_number)

        with open(sample_name, newline='') as sample_csv:
            # The first row is skipped (presumably the CSV header).
            sample = list(csv.reader(sample_csv))[1:]

        tot_len = total_len(sample)

        list_10_62 = ip_10_62(sample)
        list_10_54 = ip_10_54(sample)

        (avg_wdw_10_62, avg_pdu_10_62, tot_tcp_10_62_len,
         tot_tls_10_62_len, frac_tls_10_62) = _pair_features(list_10_62)
        (avg_wdw_10_54, avg_pdu_10_54, tot_tcp_10_54_len,
         tot_tls_10_54_len, frac_tls_10_54) = _pair_features(list_10_54)

        data.append([
            total_time(sample),
            tot_len,
            len(list_10_62) / tot_len,
            len(list_10_54) / tot_len,
            avg_wdw_10_62,
            avg_pdu_10_62,
            avg_wdw_10_54,
            avg_pdu_10_54,
            tot_tcp_10_62_len,
            tot_tcp_10_54_len,
            tot_tls_10_62_len,
            tot_tls_10_54_len,
            frac_tls_10_62,
            frac_tls_10_54,
            zone_number - 1,
        ])

In [None]:
# Shuffle the rows in place, then split every row into its 14 features (X)
# and its last element, the correct zone of the sample (Y).
shuffle(data)

X = np.array([row[:14] for row in data])
Y = np.array([row[14] for row in data])



In [None]:
#model creation

def create_model():
    """Build and compile the network used to predict the zone.

    The model takes 14 input features, passes them through 20 fully
    connected layers of 14 ReLU units each and ends with a single output
    unit. It is compiled with the mean squared error loss and the Adam
    optimizer.

    NOTE(review): the 'accuracy' metric is requested alongside a regression
    loss; the notebook evaluates predictions with compute_accuracy instead,
    so confirm whether the Keras metric is meaningful here.
    """
    inputs = tf.keras.Input(shape=(14,))

    hidden = inputs
    for _ in range(20):
        hidden = tf.keras.layers.Dense(14, activation=tf.nn.relu)(hidden)

    outputs = tf.keras.layers.Dense(1)(hidden)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    model.compile(
        loss="mean_squared_error",
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

In [None]:
def compute_accuracy(pred, truth):
    """Return the fraction of predictions that match the correct zone.

    Args:
        pred: predictions returned by the model, one sequence per sample
            whose first element is the (not yet rounded) predicted zone.
        truth: the correct zones, in the same order as ``pred``.

    Returns:
        ``correct / compared`` where a prediction is correct when its
        rounded value equals the true zone. The denominator is the number
        of samples actually compared rather than a fixed 100, so the result
        is right for any fold size. ``0.0`` is returned when there is
        nothing to compare.
    """
    compared = 0
    correct = 0
    for expected, predicted in zip(truth, pred):
        compared += 1
        if expected == round(predicted[0]):
            correct += 1

    if compared == 0:
        return 0.0
    return correct / compared
    

In [None]:
# Evaluate the model with a k-fold cross validation (n_split=10): for every
# fold a fresh model is trained on the training part and its accuracy on the
# held-out part is printed and stored in `acc`.
n_split = 10

evals = []
acc = []

folds = KFold(n_split)
for train_index, test_index in folds.split(X):
    x_train, y_train = X[train_index], Y[train_index]
    x_test, y_test = X[test_index], Y[test_index]

    # Start from a new, untrained model for each fold.
    model = create_model()
    model.fit(x_train, y_train, epochs=2000)

    pred = model.predict(x_test)
    a = compute_accuracy(pred, y_test)

    print("Accuracy: ", a)
    acc.append(a)


In [None]:
# Print the accuracy of every fold, then their mean.
for fold_accuracy in acc:
    print(fold_accuracy)

# Average over the folds actually recorded instead of a hard-coded 10, so the
# result stays correct if the number of folds changes. An empty list keeps
# yielding 0.0.
accuracy_avg = sum(acc) / len(acc) if acc else 0.0
print("Our final accuracy is: ", accuracy_avg)