In [None]:
import pandas as pd
import pickle as pkl
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report as cr,confusion_matrix, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from pyod.models.iforest import IForest
# Preprocessing helpers created once at notebook start.
# `scale` is re-fitted by the experiment cells below; `le` is not referenced in
# the cells visible here.
scale = MinMaxScaler()
le = LabelEncoder()

In [None]:
# Load the two pickled feature-name collections and merge them into a single
# de-duplicated list that is later used to select CSV columns.
# NOTE(review): pickle can execute arbitrary code on load; only use trusted files.
# Context managers make sure the file handles are closed after loading.
with open('./action_type_featuers.pkl', 'rb') as feats_file:
    action_type_feats = pkl.load(feats_file)
with open('./target_type_featuers.pkl', 'rb') as feats_file:
    service_type_feats = pkl.load(feats_file)

# The `+` below requires both pickles to hold lists.
combined_feats = list(set(action_type_feats + service_type_feats))

### Load the data from the 5 users considered

In [None]:
# Read each user's master file, keeping only the columns named in `combined_feats`
# (the original referenced an undefined name `combined_feat`, which raised NameError).
user1 = pd.read_csv('./first_user_master.csv', usecols=combined_feats)
user2 = pd.read_csv('./second_user_master.csv', usecols=combined_feats)
user3 = pd.read_csv('./third_user_master.csv', usecols=combined_feats)
user4 = pd.read_csv('./fourth_user_master.csv', usecols=combined_feats)
user5 = pd.read_csv('./fifth_User_master.csv', usecols=combined_feats)

# Stack the five users into one frame. ignore_index=True gives every row a unique
# label; without it each file's 0..n-1 labels repeat, which makes later
# index-aligned assignments ambiguous.
data = pd.concat([user1, user2, user3, user4, user5], ignore_index=True)

# Columns to remove before each task: the other task's features, the user id and
# the task's own ground-truth column.
action_drop = list(set(service_type_feats + ['userid', 'actiontype']))
service_drop = list(set(action_type_feats + ['userid', 'service_type']))

### Results for predicting malicious behaviour

In [None]:
# Experiment bookkeeping: one row per classifier is appended to `results`; the
# three labels below are copied verbatim into every row.
results = []
normalize = 'Yes'
sampling = "No"
user = 'MultiUser'

# Column names for the results table (closed promptly via the context manager).
with open('./table_columns.pkl', 'rb') as columns_file:
    table_columns = pkl.load(columns_file)

# Candidate models, keyed by the label written to the results table.
# NOTE(review): IForest comes from pyod (an outlier-detection library); it is
# fitted through the same fit/predict calls as the supervised models — confirm
# that this is the intended comparison.
classifiers = {'ExtraTrees': ExtraTreesClassifier(),
               'RandomForest': RandomForestClassifier(),
               'XGBClassifier':XGBClassifier(),
               'DecisionTreeClassifier': DecisionTreeClassifier(),
               'IForest': IForest(n_estimators=100,max_samples='auto',verbose=2),
              }

X = data.drop(action_drop, axis = 1)
y = data['actiontype']

# Split first, then fit the scaler on the training rows only, so the min/max of
# the held-out rows never influence training. The split is seeded (same seed as
# the service-type experiment) so the partition is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 34123)
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)


### Training and testing different models

In [None]:
# Fit every candidate on the training split, score it on the held-out split and
# append one flat row of metrics per classifier to `results`.
report_sections = ('benign', 'malicious', 'macro avg', 'weighted avg')
report_metrics = ('precision', 'recall', 'f1-score', 'support')

for name, model in classifiers.items():

    print('User', user, 'Normalize ' , normalize , 'Classifier ', name, 'sampling', sampling)
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    report = cr(y_test, predicted, target_names=['benign','malicious'], output_dict=True)
    confusion_mat = confusion_matrix(y_test, predicted)
    auc_score = roc_auc_score(y_test, predicted)

    # Row layout: identifiers, per-section report metrics, accuracy, the four
    # confusion-matrix cells in row-major order, AUC and the training shape.
    row = [name, user, user, normalize, sampling]
    row.extend(report[section][metric]
               for section in report_sections
               for metric in report_metrics)
    row.append(report['accuracy'])
    row.extend(confusion_mat[r][c] for r in (0, 1) for c in (0, 1))
    row.append(auc_score)
    row.append(X_train.shape)
    results.append(row)

# Persist the collected rows as a table.
actiontype_results = pd.DataFrame(results, columns=table_columns)
actiontype_results.to_csv("./actiontype_results.csv")

### Pipelining the predictions of the best malicious-behaviour model into target classification

In [None]:
# Train the first pipeline stage (action-type prediction). X keeps the service
# columns and `userid` so that the rows flagged here can be reused by the
# target-classification cells below.
et = ExtraTreesClassifier()
X = data.drop(['actiontype'], axis = 1)
y = data['actiontype']
# Seeded so the set of rows forwarded to the second stage is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 34123)

# `action_drop` also lists 'actiontype', which X no longer contains; dropping it
# again would raise KeyError, so it is filtered out of the drop list here.
stage_one_drop = [col for col in action_drop if col != 'actiontype']
et.fit(X_train.drop(stage_one_drop, axis = 1), y_train)
y_predict = et.predict(X_test.drop(stage_one_drop, axis = 1))

In [None]:
# Attach predicted and true labels to the held-out rows and renumber the rows.
# Values are attached positionally (`to_numpy()`), not by index alignment, so
# re-running this cell after the index has been reset cannot misalign the labels.
# `assign` builds a new frame instead of mutating the slice returned by
# train_test_split.
X_test = X_test.assign(
    predicted_actiontype=y_predict,
    actual_actiontype=y_test.to_numpy(),
).reset_index(drop=True)

In [None]:
# Data required for target classification: only the rows that the first stage
# predicted as class 1.
service_type_data = X_test[X_test['predicted_actiontype'] == 1]

# Remove the action-type features, `userid` and the two bookkeeping label columns,
# but KEEP 'service_type': it is the ground truth that the next cell reads from
# this frame (the original dropped it here, causing a KeyError downstream).
# Names that are not present in the frame (e.g. the already-removed 'actiontype',
# if `service_drop` lists it) are skipped.
unwanted = set(service_drop) | {'actual_actiontype', 'predicted_actiontype'}
unwanted.discard('service_type')
columns_to_drop = [col for col in service_type_data.columns if col in unwanted]
target_classification_data = service_type_data.drop(columns_to_drop, axis = 1)

### Training and testing on different models for target classification

In [None]:
# Experiment bookkeeping for the service-type (target) classification stage.
results = []
user ="Multiuser"
normalize = "Yes"

# Column names for the results table (closed promptly via the context manager).
with open('./service_table_columns.pkl', 'rb') as columns_file:
    service_table_columns = pkl.load(columns_file)

# Candidate models and resampling strategies, keyed by the label that is written
# to the results table.
classifiers = {'ExtraTrees': ExtraTreesClassifier(n_estimators=13),
               'GradientBoosting': GradientBoostingClassifier(),
               'XGBClassifier':XGBClassifier(random_state=8055),
               'KNeighborsClassifier':KNeighborsClassifier(n_neighbors=3),
              }
# The under-sampler label was misspelled 'RandomUnderSamplerder' in the original,
# which propagated the typo into the saved results.
sampler =  {'RandomOverSampler':RandomOverSampler(),
            'RandomUnderSampler':RandomUnderSampler()}

X = target_classification_data.drop(['service_type'], axis = 1)
y = target_classification_data["service_type"]

# Split first, then fit the scaler on the training rows only so that the held-out
# rows do not influence the scaling parameters.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 34123, test_size=0.25)
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)

for clf, curr_clas in classifiers.items():
    for sampling, curr_sampler in sampler.items():
        # Resample the training split only; `fit_resample` replaces the
        # `fit_sample` method that imbalanced-learn removed.
        X_train_sample, y_train_sample = curr_sampler.fit_resample(X_train, y_train)
        curr_clas.fit(X_train_sample, y_train_sample)
        y_predict = curr_clas.predict(X_test)

        report = cr(y_test, y_predict, output_dict=True)
        confusion_mat = confusion_matrix(y_test, y_predict)

        print("==================",clf,"================",sampling)
        print(cr(y_test, y_predict))
        print(confusion_mat)
        results.append([clf, user ,normalize,sampling,report, confusion_mat])

# Persist the collected rows as a table.
service_type_results  = pd.DataFrame(results, columns = service_table_columns)
service_type_results.to_csv("./service_type_results.csv")

### Results for different test sizes

In [None]:
# Users in order of first appearance in `data` (the original read from an
# undefined `df`, and `set` ordering made the nested user subsets arbitrary).
user_list = data['userid'].unique().tolist()
results = []
normalize = 'Yes'
sampling = "No"
classifiers = {'ExtraTrees': ExtraTreesClassifier()}

# Column names for the results table (closed promptly via the context manager).
with open('./tsizes_table_columns.pkl', 'rb') as columns_file:
    tsizes_table_columns = pkl.load(columns_file)

# Fractions of the data held out for testing; loop-invariant, so defined once.
tsize = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
report_sections = ('benign', 'malicious', 'macro avg', 'weighted avg')
report_metrics = ('precision', 'recall', 'f1-score', 'support')

# Grow the user pool one user at a time and, for each pool, evaluate every
# classifier at every test size.
for i in range(1, len(user_list) + 1):
    data_testSizes = data[data['userid'].isin(user_list[:i])]
    X = data_testSizes[action_type_feats]
    y = data_testSizes['actiontype']

    for clf in classifiers:
        for size in tsize:
            # Seeded split; the scaler is fitted on the training rows only so the
            # held-out rows do not influence the scaling parameters.
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = size, random_state = 34123)
            X_train = scale.fit_transform(X_train)
            X_test = scale.transform(X_test)
            curr_clas = classifiers[clf]

            print('Users', user_list[:i], 'Normalize ', normalize , 'Classifier ', clf, 'TestSize', size, 'Df shape', data_testSizes.shape)
            curr_clas.fit(X_train, y_train)
            y_predict = curr_clas.predict(X_test)

            report = cr(y_test, y_predict, target_names=['benign','malicious'], output_dict=True)
            confusion_mat = confusion_matrix(y_test, y_predict)
            auc_score = roc_auc_score(y_test, y_predict)

            # Row layout: identifiers, per-section report metrics, accuracy, the
            # four confusion-matrix cells in row-major order, AUC, training shape.
            row = [clf, user_list[:i], i, size, normalize, sampling]
            row.extend(report[section][metric]
                       for section in report_sections
                       for metric in report_metrics)
            row.append(report['accuracy'])
            row.extend(confusion_mat[r][c] for r in (0, 1) for c in (0, 1))
            row.append(auc_score)
            row.append(X_train.shape)
            results.append(row)

varying_test_size_results  = pd.DataFrame(results, columns = tsizes_table_columns)
# Derived rates: false omission rate and false positive rate, computed from the
# confusion-matrix columns named in the loaded column list.
varying_test_size_results['FOR'] = varying_test_size_results['FalseNegative'] / (varying_test_size_results['FalseNegative'] + varying_test_size_results['TrueNegative'])
varying_test_size_results['FPR'] = varying_test_size_results['FalsePositive'] / (varying_test_size_results['FalsePositive'] + varying_test_size_results['TrueNegative'])
varying_test_size_results.to_csv('./varying_test_size_results.csv')