# Installing scikit-learn

In [4]:
!pip3 install -U scikit-learn scipy matplotlib

Collecting scikit-learn
  Downloading scikit_learn-0.23.1-cp36-cp36m-win32.whl (5.9 MB)
Requirement already up-to-date: scipy in c:\users\nilsvn\appdata\local\programs\python\python36-32\lib\site-packages (1.4.1)
Requirement already up-to-date: matplotlib in c:\users\nilsvn\appdata\local\programs\python\python36-32\lib\site-packages (3.2.1)
Collecting joblib>=0.11
  Downloading joblib-0.15.1-py3-none-any.whl (298 kB)
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.1.0-py3-none-any.whl (12 kB)
Could not build wheels for scipy, since package 'wheel' is not installed.
Could not build wheels for matplotlib, since package 'wheel' is not installed.
Could not build wheels for numpy, since package 'wheel' is not installed.
Could not build wheels for cycler, since package 'wheel' is not installed.
Could not build wheels for kiwisolver, since package 'wheel' is not installed.
Could not build wheels for python-dateutil, since package 'wheel' is not installed.
Could not build wheels 

You should consider upgrading via the 'c:\users\nilsvn\appdata\local\programs\python\python36-32\python.exe -m pip install --upgrade pip' command.


In [278]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
import time
from sklearn.preprocessing import StandardScaler


In [302]:
def get_df(attack_path, normal_path, max_normal_rows=5000):
    """Load the attack and normal CSV files into one labelled DataFrame.

    Parameters
    ----------
    attack_path : str or path-like
        CSV file with the attack records; every row is labelled ``Class = 1``.
    normal_path : str or path-like
        CSV file with the normal records; every row is labelled ``Class = 0``.
    max_normal_rows : int or None, default 5000
        Keep only the first ``max_normal_rows`` rows of the normal data.
        Pass ``None`` to keep everything if the machine can handle the
        larger dataset.

    Returns
    -------
    pandas.DataFrame
        Attack rows followed by normal rows, with a fresh 0..n-1 index and a
        ``Class`` column holding the label.
    """
    df_attack = pd.read_csv(attack_path)
    df_normal = pd.read_csv(normal_path)

    if max_normal_rows is not None:
        df_normal = df_normal.head(max_normal_rows)

    # .assign() returns a new frame, so the label is never written into the
    # slice produced by .head() (which would raise SettingWithCopyWarning).
    df_attack = df_attack.assign(Class=1)
    df_normal = df_normal.assign(Class=0)

    return pd.concat([df_attack, df_normal], ignore_index=True)

In [303]:
def get_X_y(df):
    """Split a labelled DataFrame into features and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Class`` column.

    Returns
    -------
    tuple
        ``(features, labels)``: ``df`` without the ``Class`` column, and the
        ``Class`` column itself as a Series. ``df`` is not modified.
    """
    features = df.drop(columns='Class')
    labels = df['Class']
    return features, labels

In [305]:
def SVM(X, y, threshold=0.5, test_size=0.5, random_state=None, stratify=False):
    """Train a linear-kernel SVC on a random split of the data and score it.

    The confusion matrix and the classification report (3 digits) for the
    held-out part are printed as a side effect.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like or pandas.Series
        Class labels aligned with ``X``.
    threshold : float, default 0.5
        Currently unused. TODO: use threshold param.
    test_size : float or int, default 0.5
        Passed to ``train_test_split``; the default keeps the original
        50/50 split.
    random_state : int or None, default None
        Seed for the split. ``None`` keeps the original unseeded behaviour;
        pass an integer to make the reported metrics reproducible.
    stratify : bool, default False
        When True, the split is stratified on ``y`` so both halves keep the
        class proportions (useful when one class is very rare).

    Returns
    -------
    tuple
        ``(ACC, PRC, REC)``: accuracy, precision and recall on the test part.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    model = svm.SVC(kernel='linear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(metrics.confusion_matrix(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred, digits=3))

    ACC = metrics.accuracy_score(y_test, y_pred)
    PRC = metrics.precision_score(y_test, y_pred)  # when multiclass: average=None
    REC = metrics.recall_score(y_test, y_pred)  # when multiclass: average=None
    return ACC, PRC, REC
    

In [306]:
def datetime_string_to_epoch(str):
    """Convert a datetime string to seconds since the epoch.

    Accepts ``YYYY-MM-DD HH:MM:SS`` or ``YYYY-MM-DDTHH:MM:SS``; anything
    after the first 19 characters (fractional seconds, a zone suffix) is
    ignored.

    Parameters
    ----------
    str : str
        The datetime string. The name shadows the built-in ``str`` inside
        this function; it is kept so existing callers are unaffected.

    Returns
    -------
    float
        The value of ``time.mktime`` for the parsed time, i.e. the string is
        interpreted as local time of the machine running the code.
    """
    # The character at index 10 separates the date from the time of day.
    pattern = '%Y-%m-%dT%H:%M:%S' if str[10] == 'T' else '%Y-%m-%d %H:%M:%S'
    return time.mktime(time.strptime(str[:19], pattern))

In [321]:
def preprocess_data(attack_path, normal_path):
    """Build the feature matrix and label vector from the two CSV files.

    Steps: load and label the data with ``get_df``, keep the selected
    columns, convert ``TimeGenerated`` to epoch seconds, split off the
    ``Class`` labels, one-hot encode the categorical columns and standardize
    ``TimeGenerated``.

    Parameters
    ----------
    attack_path : str or path-like
        CSV file with the attack records.
    normal_path : str or path-like
        CSV file with the normal records.

    Returns
    -------
    tuple
        ``(X, y)``: the encoded feature DataFrame and the ``Class`` Series.
    """
    feature_columns = ['TimeGenerated', 'CorrelationId', 'ConditionalAccessStatus']

    # create df from csv data
    df = get_df(attack_path, normal_path)
    # filter to keep most relevant columns; .copy() gives an independent frame
    # so the column assignment below does not raise SettingWithCopyWarning
    df = df[feature_columns + ['Class']].copy()
    # convert datetime strings to epoch
    df['TimeGenerated'] = df['TimeGenerated'].map(datetime_string_to_epoch)
    # split Pandas DataFrame into X (data) and y (corresponding class labels)
    X, y = get_X_y(df)
    # One-Hot Encoding for the nominal categorical variables
    # NOTE(review): CorrelationId gets one dummy column per distinct value; if
    # the ids are (nearly) unique per row this is very wide and may not
    # generalize -- confirm it is intended as a feature.
    X = pd.get_dummies(X[feature_columns])
    # Standardization of values
    # NOTE(review): the scaler is fitted on the full dataset, i.e. before any
    # train/test split done later, so test-set statistics influence the
    # scaling -- confirm this is acceptable for the evaluation.
    std = StandardScaler()
    X['TimeGenerated'] = std.fit_transform(X[['TimeGenerated']])

    return X, y

In [323]:
# Locations of the input CSV files (absolute paths on the author's machine;
# adjust them before running elsewhere).
ATTACK_PATH = "C:/Users/NilsvN/Desktop/Research/Azure/Attacks/01_brute_force/brute_force_1.csv"
NORMAL_PATH = "C:/Users/NilsvN/Desktop/Research/Azure/30_day_login/30days_anonymized_sigin_logs_column_corrected.csv"

# Build the feature matrix / labels and show the features.
X, y = preprocess_data(ATTACK_PATH, NORMAL_PATH)
print(X)

# Train and evaluate the classifier, then report accuracy, precision, recall.
ACC, PRC, REC = SVM(X, y)
print(
    'ACC: ' + str(ACC),
    'PRC: ' + str(PRC),
    'REC: ' + str(REC),
    sep='\n',
)

      TimeGenerated  CorrelationId_000079bc-608c-4e04-9190-b0f75b267ac8  \
0         12.100093                                                  0    
1         12.099962                                                  0    
2         12.100184                                                  0    
3         12.100083                                                  0    
4         12.100028                                                  0    
...             ...                                                ...    
5015       1.313873                                                  0    
5016       1.313888                                                  0    
5017       1.313888                                                  0    
5018       1.313893                                                  0    
5019       1.313893                                                  0    

      CorrelationId_0061e33a-cca1-49e6-9c92-f470ad9e0ebb  \
0                                      

[[2498    0]
 [   0   12]]
              precision    recall  f1-score   support

           0      1.000     1.000     1.000      2498
           1      1.000     1.000     1.000        12

    accuracy                          1.000      2510
   macro avg      1.000     1.000     1.000      2510
weighted avg      1.000     1.000     1.000      2510

ACC: 1.0
PRC: 1.0
REC: 1.0
