# Laboratorio 5: Análisis de Red

## Parte 1: Análisis de Paquetes

### **Análisis Estadístico**

In [1]:
# Imports
from scapy.all import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [4]:
# Read the pre-existing .pcap, sniff a few live packets, and merge both into one DataFrame.
conf.promisc = True  # promiscuous mode; set once, before any capture

# Load the existing pcap file
paquetes_pcap = rdpcap('analisis_paquetes.pcap')
# Raw payload of the first stored packet. Falls back to empty bytes when the
# capture is empty or that packet carries no Raw layer, instead of raising.
primer_paquete = paquetes_pcap[0] if len(paquetes_pcap) else None
payload = primer_paquete.load if primer_paquete is not None and Raw in primer_paquete else b''

# Capture 10 packets on the "Wi-Fi" network interface
paquetes_capturados = sniff(count=10, iface="Wi-Fi")

# Merge the sniffed packets and the ones from the pcap file into a single list
paquetes_total = PacketList(paquetes_capturados + paquetes_pcap)


def _capa_transporte(paquete):
    """Return the packet's TCP layer, else its UDP layer, else None."""
    if TCP in paquete:
        return paquete[TCP]
    if UDP in paquete:
        return paquete[UDP]
    return None


# Only IP packets become rows; filter once so every column has the same length.
paquetes_ip = [p for p in paquetes_total if IP in p]
# IP packets without TCP/UDP (e.g. ICMP) get a missing port instead of crashing
# the whole cell on `p[UDP]`.
capas = [_capa_transporte(p) for p in paquetes_ip]

# Build the DataFrame with the requested columns
df = pd.DataFrame({
    'Src Address': [p[IP].src for p in paquetes_ip],
    'Dst Address': [p[IP].dst for p in paquetes_ip],
    'Src Port': [capa.sport if capa is not None else None for capa in capas],
    'Dst Port': [capa.dport if capa is not None else None for capa in capas],
    # Payload bytes interpreted as one big-endian integer; 0 when there is no Raw layer
    'Payload': [int.from_bytes(p[Raw].load, byteorder='big') if Raw in p else 0 for p in paquetes_ip],
})


In [None]:
# Draw five random rows (a notebook cell only renders its last expression).
df.sample(n=5)
# Payload value stored under index label 42.
df.loc[42, 'Payload']

0

### Estadísticas

In [None]:
# Most frequent source IP

# mode() returns the most common value(s); label 0 is the first of them.
moda_origen = df['Src Address'].mode()
ip_mas_frecuente = moda_origen[0]
print("La dirección IP de origen más frecuente es:", ip_mas_frecuente)

La dirección IP de origen más frecuente es: 10.1.10.53


In [None]:
# Most frequent destination IP

# mode() returns the most common value(s); label 0 is the first of them.
ip_mas_frecuente = df['Dst Address'].mode()[0]
# The message now says "destino": the value comes from the 'Dst Address' column.
print("La dirección IP de destino más frecuente es:", ip_mas_frecuente)

La dirección IP de origen más frecuente es: 10.1.10.53


In [None]:
# Which destination IPs does the most frequent source IP talk to?

ip_origen_mas_frecuente = df['Src Address'].mode()[0]
# Boolean mask selecting the rows sent by that source address
desde_ip_frecuente = df['Src Address'] == ip_origen_mas_frecuente
ips_destino = df.loc[desde_ip_frecuente, 'Dst Address'].unique()
print("La dirección IP de origen más frecuente es:", ip_origen_mas_frecuente)
print("Se comunica con las siguientes direcciones IP de destino:", ips_destino)

La dirección IP de origen más frecuente es: 10.1.10.53
Se comunica con las siguientes direcciones IP de destino: ['84.54.22.33' '75.75.75.75']


In [None]:
# Which destination ports does the most frequent source IP talk to?

# Use mode()[0], as the other "most frequent IP" cells do, so that ties are
# resolved the same way everywhere (value_counts().index[0] may pick another
# address when several share the top count).
ip_mas_frecuente = df['Src Address'].mode()[0]
# Rows sent by that source address
df_ip_frecuente = df[df['Src Address'] == ip_mas_frecuente]
puertos_destino_unicos = df_ip_frecuente['Dst Port'].unique()
print('Se comunica con los siguientes puertos',puertos_destino_unicos)

Se comunica con los siguientes puertos [53]


In [None]:
# Which source ports talk to the most frequent destination IP?

# Use mode()[0], as the other "most frequent IP" cells do, so that ties are
# resolved the same way everywhere (value_counts().index[0] may pick another
# address when several share the top count).
ip_mas_frecuente = df['Dst Address'].mode()[0]
# Rows addressed to that destination
df_ip_frecuente = df[df['Dst Address'] == ip_mas_frecuente]
puertos_origen_unicos = df_ip_frecuente['Src Port'].unique()
print('Se comunica con los siguientes puertos',puertos_origen_unicos)

Se comunica con los siguientes puertos [53]


### Indique el propósito de los puertos que más aparecen en los incisos anteriores
<br>
<div style="text-align: justify">
    El puerto <b>53</b> se utiliza comúnmente para el protocolo de sistema de nombres de dominio (DNS), que permite la resolución de nombres de dominio en direcciones IP. Cuando un usuario ingresa una URL en un navegador web, el navegador envía una solicitud DNS al servidor DNS designado para obtener la dirección IP correspondiente del servidor web que aloja la página web solicitada. Esta solicitud se realiza en el puerto 53 del servidor DNS. Por lo tanto, el puerto 53 es esencial para la navegación web y para la comunicación en línea en general.
</div>

Modelos con PCA y Kmeans

Modelos sin PCA

In [17]:
# Load the training set and preview its first rows (head() defaults to 5).
train = pd.read_csv('./datasets/Train_data.csv')

train.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,anomaly
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [18]:
# Load the test set and preview its first rows (head() defaults to 5).
test = pd.read_csv('./datasets/Test_data.csv')

test.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,tcp,private,REJ,0,0,0,0,0,0,...,255,10,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0
1,0,tcp,private,REJ,0,0,0,0,0,0,...,255,1,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,134,86,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,3,57,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,29,86,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training set's numeric columns.
train.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,25192.0,25192.0,25192.0,25192.0,25192.0,25192.0,25192.0,25192.0,25192.0,25192.0,...,25192.0,25192.0,25192.0,25192.0,25192.0,25192.0,25192.0,25192.0,25192.0,25192.0
mean,305.054104,24330.63,3491.847,7.9e-05,0.023738,4e-05,0.198039,0.001191,0.394768,0.22785,...,182.532074,115.063036,0.519791,0.082539,0.147453,0.031844,0.2858,0.279846,0.1178,0.118769
std,2686.55564,2410805.0,88830.72,0.00891,0.260221,0.0063,2.154202,0.045418,0.488811,10.417352,...,98.993895,110.64685,0.448944,0.187191,0.308367,0.110575,0.445316,0.446075,0.305869,0.317333
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,84.0,10.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,61.0,0.51,0.03,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,279.0,530.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,255.0,1.0,0.07,0.06,0.02,1.0,1.0,0.0,0.0
max,42862.0,381709100.0,5151385.0,1.0,3.0,1.0,77.0,4.0,1.0,884.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) of the test set's numeric columns.
test.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,22544.0,22544.0,22544.0,22544.0,22544.0,22544.0,22544.0,22544.0,22544.0,22544.0,...,22544.0,22544.0,22544.0,22544.0,22544.0,22544.0,22544.0,22544.0,22544.0,22544.0
mean,218.859076,10395.45,2056.019,0.000311,0.008428,0.00071,0.105394,0.021647,0.442202,0.119899,...,193.869411,140.750532,0.608722,0.09054,0.132261,0.019638,0.097814,0.099426,0.233385,0.226683
std,1407.176612,472786.4,21219.3,0.017619,0.142599,0.036473,0.928428,0.150328,0.496659,7.269597,...,94.035663,111.783972,0.435688,0.220717,0.306268,0.085394,0.273139,0.281866,0.387229,0.400875
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,121.0,15.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,54.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,168.0,0.92,0.01,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,287.0,601.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,255.0,1.0,0.06,0.03,0.01,0.0,0.0,0.36,0.17
max,57715.0,62825650.0,1345927.0,1.0,3.0,3.0,101.0,4.0,1.0,796.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
# 'num_outbound_cmds' is a redundant column, so remove it from both train & test datasets.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError because the
# column is already gone.
train = train.drop(columns=['num_outbound_cmds'], errors='ignore')
test = test.drop(columns=['num_outbound_cmds'], errors='ignore')

In [22]:
# Number of training rows per label in the 'class' column.
train['class'].value_counts()

normal     13449
anomaly    11743
Name: class, dtype: int64

In [23]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Numerical attributes, to be scaled to zero mean and unit variance
cols = train.select_dtypes(include=['float64','int64']).columns

# Fit the scaler on the training data only ...
sc_train = scaler.fit_transform(train[cols])
# ... and reuse those training statistics for the test set. Calling
# fit_transform here would refit on the test data and put the two sets on
# different scales. Selecting test[cols] also guarantees the same columns in
# the same order as the training matrix.
sc_test = scaler.transform(test[cols])

# Turn the results back into DataFrames
sc_traindf = pd.DataFrame(sc_train, columns = cols)
sc_testdf = pd.DataFrame(sc_test, columns = cols)

In [24]:
from sklearn.preprocessing import LabelEncoder

# Extract categorical attributes from both training and test sets
cattrain = train.select_dtypes(include=['object']).copy()
cattest = test.select_dtypes(include=['object']).copy()

# One encoder per categorical column, fitted on the training data only.
# Refitting on the test set would let the same category receive different
# integer codes in train and test.
encoders = {col: LabelEncoder().fit(cattrain[col]) for col in cattrain.columns}

# Encode the training attributes with their own column's encoder
traincat = cattrain.apply(lambda serie: encoders[serie.name].transform(serie))

# Encode the test attributes with the TRAINING mapping. pd.Categorical with the
# fitted classes yields the same codes as the encoder and marks categories that
# never appeared in training as -1 instead of raising.
testcat = cattest.apply(
    lambda serie: pd.Categorical(
        serie, categories=encoders[serie.name].classes_
    ).codes.astype('int64')
)

# Separate target column from encoded data
enctrain = traincat.drop(['class'], axis=1)
cat_Ytrain = traincat[['class']].copy()

In [25]:
# Feature matrix: scaled numeric columns side by side with the encoded categorical ones.
train_x = pd.concat([sc_traindf, enctrain], axis='columns')
# Target: the original (string) class labels.
train_y = train['class']
train_x.shape

(25192, 40)

In [26]:
# Test feature matrix: scaled numeric columns side by side with the encoded categorical ones.
test_df = pd.concat([sc_testdf, testcat], axis='columns')
test_df.shape

(22544, 40)

In [27]:
from sklearn.model_selection import train_test_split

# Seeded 70/30 split of the labelled data.
particion = train_test_split(
    train_x,
    train_y,
    train_size=0.70,
    random_state=2,
)
X_train, X_test, Y_train, Y_test = particion

In [28]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so each model is built and trained in one step.

# K-nearest-neighbours classifier
KNN_Classifier = KNeighborsClassifier(n_jobs=-1).fit(X_train, Y_train)

# Logistic regression classifier
LGR_Classifier = LogisticRegression(n_jobs=-1, random_state=0).fit(X_train, Y_train)
            

In [29]:
from sklearn import metrics

# (display name, fitted estimator) pairs to evaluate
models = []
models.append(('KNeighborsClassifier', KNN_Classifier))
models.append(('LogisticRegression', LGR_Classifier))

for nombre, modelo in models:
    # 10-fold cross-validation on the training split
    scores = cross_val_score(modelo, X_train, Y_train, cv=10)
    # Predict once and reuse the result for every metric; the original called
    # predict() three times per model, which is costly for KNN.
    predicciones = modelo.predict(X_train)
    # NOTE: these three metrics are computed on the training split itself.
    accuracy = metrics.accuracy_score(Y_train, predicciones)
    confusion_matrix = metrics.confusion_matrix(Y_train, predicciones)
    classification = metrics.classification_report(Y_train, predicciones)
    print()
    print('============================== {} Model Evaluation =============================='.format(nombre))
    print()
    print ("Cross Validation Mean Score:" "\n", scores.mean())
    print()
    print ("Model Accuracy:" "\n", accuracy)
    print()
    print("Confusion matrix:" "\n", confusion_matrix)
    print()
    print("Classification report:" "\n", classification) 
    print()



Cross Validation Mean Score:
 0.9914370153431007

Model Accuracy:
 0.9937620505840989

Confusion matrix:
 [[8168   77]
 [  33 9356]]

Classification report:
               precision    recall  f1-score   support

     anomaly       1.00      0.99      0.99      8245
      normal       0.99      1.00      0.99      9389

    accuracy                           0.99     17634
   macro avg       0.99      0.99      0.99     17634
weighted avg       0.99      0.99      0.99     17634




Cross Validation Mean Score:
 0.9537821727291786

Model Accuracy:
 0.9548599296812975

Confusion matrix:
 [[7763  482]
 [ 314 9075]]

Classification report:
               precision    recall  f1-score   support

     anomaly       0.96      0.94      0.95      8245
      normal       0.95      0.97      0.96      9389

    accuracy                           0.95     17634
   macro avg       0.96      0.95      0.95     17634
weighted avg       0.95      0.95      0.95     17634


