In [None]:
import os 
import pandas as pd
import numpy as np 
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Directory every later relative path is resolved against.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
DATA_DIR = "/scratch/dr2de/sorted_conns"
os.chdir(DATA_DIR)

In [None]:
# Stack the malicious and benign data frames (balanced) into one frame with a
# fresh 0..n-1 index.
# NOTE(review): df2 and df3 must already exist in the kernel; which one holds
# which class is not visible from this cell.
df = pd.concat(objs=[df2, df3], axis=0, ignore_index=True)
df.shape

In [None]:
# Drop unused columns, give the remaining ones readable names, coerce the
# byte/duration columns to numbers, and aggregate the records into one
# feature row per source IP.
# this takes a little while to run
drop_col = [1, 13, 10, 9]
df = df.drop(drop_col, axis=1)
# Column 1 was just dropped, so the 1:'ts' entry below matches nothing;
# rename() silently ignores keys that are not present.
df = df.rename({1:'ts',2:"src_ip", 3:"src_port", 4:'dest_ip',\
          5:'dest_port', 6:'duration', 7:'src_bytes',\
          8:'dest_bytes', 11:'src_pkts', 12:'dest_pkts',\
          14:'local', 15:'label'}, axis='columns')
# '-' cells are replaced by 0 before the numeric conversion.
df = df.replace('-', 0)
numeric_cols = ["src_bytes", "dest_bytes", "duration"]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

# Use pandas' own aggregation names rather than np.median / np.std: pandas
# currently maps those NumPy callables onto the same groupby methods, but
# warns that a future version will call them directly (np.std uses ddof=0,
# groupby 'std' uses ddof=1). The strings keep today's results.
stats = ['mean', 'median', 'std']
aggs = (
    df.groupby('src_ip')
      .agg({'src_port': 'nunique',
            'dest_ip': 'nunique',
            'dest_port': 'nunique',
            'src_pkts': stats,
            'dest_pkts': stats,
            'src_bytes': stats,
            'dest_bytes': stats,
            'duration': stats + ['count'],
            'label': 'mean'})
      .reset_index()
)

In [None]:
# rename columns
columns= ['src_ip','dc_src_port','dc_dest_ip','dc_dest_port','src_pkts_mean',
                                               'src_pkts_med', 'src_pkts_std', 'dest_pkts_mean', 'dest_pkts_med', 
                                               'dest_pkts_std', 'src_pkts_mean', 'src_pkts_med', 'src_pkts_std','src_bytes_mean',
                                               'src_bytes_med', 'src_bytes_std', 'duration_mean', 'duration_med',
                                                'duration_std', 'duration_count','label']
aggs.columns = columns
aggs.label.sum()

In [None]:
# sort by label, rebalance based on aggregation, drop source IP, replace NaNs
# will need to change 6000 value here
aggs = aggs.sort_values(by=['label'], ascending = False)
df_s = aggs.head(6000)
df_s.drop(['src_ip'], axis=1, inplace = True)
df_s = df_s.replace(np.NaN,0)
df_s.shape

In [None]:
#Standard scaling 
scaler = StandardScaler()
col= ['dc_src_port','dc_dest_ip','dc_dest_port','src_pkts_mean',
                                               'src_pkts_med', 'src_pkts_std', 'dest_pkts_mean', 'dest_pkts_med', 
                                               'dest_pkts_std', 'src_pkts_mean', 'src_pkts_med', 'src_pkts_std','src_bytes_mean',
                                               'src_bytes_med', 'src_bytes_std', 'duration_mean', 'duration_med',
                                                'duration_std', 'duration_count']
df_s[col] = scaler.fit_transform(df_s[col])
df_s.shape

In [None]:
# Sanity check: number of missing values remaining in each column.
df_s.isnull().sum(axis=0)

In [None]:
# Project every column except 'label' onto two principal components.
pca = PCA(n_components=2)
is_label = df_s.columns == 'label'
pred_variables = df_s.loc[:, ~is_label]   # predictors
resp_variables = df_s.loc[:, is_label]    # response, kept as a one-column frame
X = pred_variables
X_r = pca.fit_transform(X)
# Share of the total variance captured by each of the two components
print('explained variance ratio (first two components): %s'
      % str(pca.explained_variance_ratio_))

In [None]:
# Wrap the PCA output in a DataFrame so the response column can be attached
# next to the components.
X_c = pd.DataFrame(X_r)
X_c.head()

In [None]:
# Attach the response by position (raw values, so the differing indexes of
# X_c and resp_variables are not aligned), then name all three columns.
X_c['label'] = resp_variables['label'].to_numpy()
X_c.columns = ['PC1', 'PC2', "label"]
X_c.head()

In [None]:
#Plot
# Scatter of the two principal components, one colour per label value.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('2 component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
targets = [0, 1]
colors = ['r', 'g']
# NOTE(review): rows are selected by exact equality with 0 or 1, so any row
# whose label is some other value is not drawn -- confirm that is intended.
for target, color in zip(targets, colors):
    subset = X_c[X_c['label'] == target]
    ax.scatter(subset['PC1'], subset['PC2'], c=color, s=50)
ax.legend(targets)
ax.grid()