In [None]:
import seaborn as sns
import numpy as np 
import pandas as pd
import os
from matplotlib import pyplot as plt
from IPython.display import display, Markdown, HTML
from collections import Counter, defaultdict
from joblib import Memory
from shutil import rmtree

# Preprocessing modules
from scipy.stats import kstest, shapiro, normaltest
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler, MaxAbsScaler
from sklearn.model_selection import KFold, cross_validate, train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel

# Classifiers
from sklearn.naive_bayes import GaussianNB as NaiveBayes
from sklearn.svm import SVC as SVM
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.ensemble import GradientBoostingClassifier as GradientBoosting
from sklearn.ensemble import ExtraTreesClassifier, AdaBoostClassifier as AdaBoost


# Metrics
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings('ignore')
                        
# Auxiliary display functions
def show(_df):
    """Render a DataFrame in the cell output as an HTML table."""
    return display(HTML(_df.to_html()))


def shape(_df):
    """Print the number of columns (features) and rows (examples) of `_df`."""
    dims = _df.shape
    n_features = dims[1]
    n_examples = dims[0]
    print(f"Number of features: {n_features}\nNumber of examples: {n_examples}")

In [None]:
# Collect every file found under the Kaggle input directory.
# os.walk yields entries in arbitrary filesystem order, so the paths are sorted
# to make the selected file reproducible across sessions.
INPUT_DIR = '/kaggle/input'
input_files = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(INPUT_DIR)
    for filename in filenames
)
if not input_files:
    # Fail with an explicit message instead of a bare IndexError on an empty list
    raise FileNotFoundError(f"No input files found under {INPUT_DIR!r}")
# The notebook reads a single dataset file: use the first path found
input_file = input_files[0]

In [None]:
# Load the pipe-separated dataset, report its dimensions and keep the original column labels
df = pd.read_csv(input_file, delimiter='|')
shape(df)
features = df.columns

## Dataset overview

In [None]:
# Count the valid (non-null) and invalid (null) cells over the whole DataFrame
validity_counts = {
    'Valids': df.notna().sum().sum(),
    'Invalids': df.isna().sum().sum(),
}
show(pd.DataFrame(list(validity_counts.values()),
                  index=list(validity_counts.keys()),
                  columns=['values']))

In [None]:
# The dataset does not contain invalid values; let's check which data types it contains
dtype_counts = Counter(df.dtypes.values)
show(pd.DataFrame(list(dtype_counts.values()),
                  index=list(dtype_counts.keys()),
                  columns=['features']))

In [None]:
# Checking class balance: plot the count per class and tabulate absolute/relative sizes
sns.set(font_scale=2)
fig = plt.figure(figsize=(15, 2))
sns.countplot(y='legitimate', data=df);
class_counts = df.legitimate.value_counts()
class_proportions = ((class_counts / len(df)) * 100).round(2)

# Row label -> formatted value; label 0 is malware, label 1 is legitimate
balance_rows = {
    'Total number of examples': f"{class_counts[0] + class_counts[1]}",
    'Malware (0)': f"{class_counts[0]} ({class_proportions[0]}%)",
    'Legitimate (1)': f"{class_counts[1]} ({class_proportions[1]}%)",
}
show(pd.DataFrame(list(balance_rows.values()),
                  index=list(balance_rows.keys()),
                  columns=['']))

In [None]:
# All but 2 features contain only numeric values. Let's check those.
df.select_dtypes(include='object').sample(n=5)

In [None]:
# Both columns contain strings.
# The 'md5' feature brings no important information for our analysis.
# Malware examples do not inform file extensions, so the 'Name' feature may also be discarded.
df.drop(columns=['Name', 'md5'], inplace=True)
shape(df)

## Data distribution

### Normality tests

* H0 (null hypothesis): the data follow a normal distribution
* H1: the data do not follow a normal distribution


- $\alpha = 0.05$
- p-value $\leq \alpha$: reject H0 (data is not normally distributed)
- p-value $> \alpha$: do not reject H0 (the data are consistent with a normal distribution; this does not prove normality)

In [None]:
# Auxiliary functions
# Verdict labels returned by test_result(): H1 when fewer than two of the
# normality tests vote for normality, H0 otherwise.
H1 = "Reject H0"
H0 = "Do not Reject H0"
def shapiro_wilk(values: pd.Series, alpha=0.05):
    """Shapiro-Wilk normality vote.

    Returns 0 when the test p-value is at most `alpha` (normality rejected)
    and 1 otherwise.
    """
    p_value = shapiro(values)[1]
    if p_value <= alpha:
        return 0
    return 1

def k_squared(values: pd.Series, alpha=0.05):
    """D'Agostino K^2 normality vote (scipy.stats.normaltest).

    Returns 0 when the test p-value is at most `alpha` (normality rejected)
    and 1 otherwise.
    """
    p_value = normaltest(values)[1]
    if p_value <= alpha:
        return 0
    return 1

def kolmogorov_smirnov(values: pd.Series, mu: np.float64, sd: np.float64, alpha=0.05):
    """Kolmogorov-Smirnov normality vote against a normal with mean `mu` and std `sd`.

    Returns 0 when the test p-value is at most `alpha` (normality rejected)
    and 1 otherwise.
    """
    p_value = kstest(values, 'norm', args=(mu, sd))[1]
    if p_value <= alpha:
        return 0
    return 1

# Majority decision function
def test_result(tests: list):
    """Combine the 0/1 votes of the normality tests into a single verdict.

    Returns H1 when the votes add up to less than 2, and H0 otherwise.
    """
    votes_for_normality = sum(tests)
    if votes_for_normality < 2:
        return H1
    return H0

def is_normal(data: pd.DataFrame, description=None, dtypes=['float64','int64'], alpha=0.05):
    """Run three normality tests on every column of `data` and summarise the verdicts.

    Parameters
    ----------
    data : DataFrame whose columns are tested one by one.
    description : optional pre-computed `data.describe()` frame; computed here
        (restricted to `dtypes`) when anything other than a DataFrame is given.
    dtypes : dtypes passed to `describe(include=...)` when `description` is built here.
    alpha : significance level forwarded to each of the three tests.

    Returns
    -------
    DataFrame indexed by verdict label (H0 / H1) with a single 'features'
    column holding how many columns received that verdict. The names of the
    columns judged Gaussian, if any, are printed as a side effect.
    """
    if not isinstance(description, pd.DataFrame):
        description = data.describe(include=dtypes)

    normal_features = []
    result = []
    for feature in list(data.columns):
        series = data[feature]
        mean = description[feature]['mean']
        sd = description[feature]['std']

        # Forward `alpha` so that a caller-supplied significance level is
        # honoured instead of each test silently using its own default.
        tests = [shapiro_wilk(series, alpha),
                 k_squared(series, alpha),
                 kolmogorov_smirnov(series, mean, sd, alpha)]

        tres = test_result(tests)
        if tres == H0:
            normal_features.append(feature)

        result.append(tres)

    summary = dict(Counter(result))

    if normal_features:
        print(f"Gaussian: {', '.join(normal_features)}")

    return pd.DataFrame.from_dict(summary, orient='index', columns=['features'])

def drop_and_rebuild_class_dfs(data, to_drop, index=False):
    """Drop rows or columns from `data` and rebuild the per-class feature frames.

    Parameters
    ----------
    data : DataFrame containing the 'legitimate' label column.
    to_drop : labels to remove; row labels when `index` is True, column labels otherwise.
    index : whether `to_drop` refers to the index (rows) instead of the columns.

    Returns
    -------
    (new_data, legit, mal): the reduced frame, then its rows with
    legitimate == 1 and legitimate == 0, both without the label column.
    """
    if index:
        new_data = data.drop(index=to_drop)
    else:
        new_data = data.drop(columns=to_drop)
    legit = new_data[new_data.legitimate == 1].drop(['legitimate'], axis=1)
    mal = new_data[new_data.legitimate == 0].drop(['legitimate'], axis=1)
    # Report the shape of the frame that was just built; the notebook-level `df`
    # has not been reassigned yet at this point and would show the old shape.
    shape(new_data)
    return new_data, legit, mal

In [None]:
# Check whether the legitimate examples have normally distributed features
legit_df = df.loc[df.legitimate == 1].drop(columns=['legitimate'])
show(is_normal(legit_df))

In [None]:
# Check whether the malware examples have normally distributed features
mal_df = df.loc[df.legitimate == 0].drop(columns=['legitimate'])
show(is_normal(mal_df))

In [None]:
# LoaderFlags and NumberOfRvaAndSizes presented distinct distributions in Legitimate and Malware data.
# Let's take a closer look at these features.
selected_features = ['LoaderFlags', 'NumberOfRvaAndSizes']
legit_selected_stats = legit_df[selected_features].describe().round(2)
mal_selected_stats = mal_df[selected_features].describe().round(2)
show(pd.concat([legit_selected_stats, mal_selected_stats],
               axis=1,
               keys=['Legitimate Statistics', 'Malware Statistics']))

In [None]:
# The observed difference seems to be caused by outliers in Malware data.
# Let's plot the distributions, starting with the legitimate examples.
legit_df[selected_features].boxplot(figsize=(10,5))
plt.tight_layout()
plt.show()

In [None]:
# Same box plot for the malware examples, for comparison with the previous figure
mal_df[selected_features].boxplot(figsize=(10,5))
plt.tight_layout()
plt.show()

In [None]:
# Once the outliers are removed, these features will have zero standard deviation,
# so they will not add useful information to our model. We can safely remove them.
# The per-class frames are rebuilt from the reduced DataFrame in the same call.
df, legit_df, mal_df = drop_and_rebuild_class_dfs(df, selected_features)

In [None]:
# Font sizes available for the figures below (only SMALL_SIZE is applied here)
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 12
# Base font, axes titles/labels and tick labels all use the small size
plt.rc('font', size=SMALL_SIZE)
plt.rc('axes', titlesize=SMALL_SIZE, labelsize=SMALL_SIZE)
plt.rc(('xtick', 'ytick'), labelsize=SMALL_SIZE)

In [None]:
# Let's check if we have more constants: plot one histogram per feature
# of the legitimate examples.
legit_df.hist(figsize=(15,10), alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Apparently there are other constants, let's check.
def check_constants(data):
    """Return the set of columns whose 75th percentile equals their minimum.

    Such columns hold their minimum value in at least three quarters of the
    rows, which is the notebook's criterion for a (near-)constant feature.
    """
    summary = data.describe()
    return {
        column
        for column in data.columns
        if summary[column]['75%'] == summary[column]['min']
    }

def compare_dfs(data1, data2, name1='legit', name2='malware'):
    """Find the constant features of two frames and display how many each has.

    Returns the constant-feature set of `data1`, the set of `data2`, and the
    union of both as a list.
    """
    first_constants = check_constants(data1)
    second_constants = check_constants(data2)
    union = list(first_constants | second_constants)

    counts = {
        name1: [len(first_constants)],
        name2: [len(second_constants)],
        'Union': [len(union)],
    }
    show(pd.DataFrame.from_dict(counts, orient='index', columns=['constant_features']))

    return first_constants, second_constants, union

In [None]:
# Constant features of each class, plus the union of both sets (as a list)
legit_const, mal_const, constants = compare_dfs(legit_df, mal_df)

In [None]:
# Let's take a deeper look at these constant features
legit_const_stats = legit_df[constants].describe().round(2)
mal_const_stats = mal_df[constants].describe().round(2)
show(pd.concat([legit_const_stats, mal_const_stats],
               axis=1,
               keys=['Legitimate Statistics', 'Malware Statistics']).T)

In [None]:
# Since these constants do not add information and can bias the model, we chose to remove them.
df.drop(columns=constants, inplace=True)
# Rebuild the per-class feature frames from the reduced DataFrame
legit_df = df.loc[df.legitimate == 1].drop(columns=['legitimate'])
mal_df = df.loc[df.legitimate == 0].drop(columns=['legitimate'])
shape(df)

## Finding outliers

In [None]:
# Auxiliary functions
def iqr_score(data: pd.DataFrame):
    """Compute the IQR outlier fences of every described column.

    Returns a nested defaultdict: limits[feature]['inf'] is Q1 - 1.5 * IQR and
    limits[feature]['sup'] is Q3 + 1.5 * IQR. Unknown keys default to np.float64().
    """
    summary = data.describe()
    limits = defaultdict(lambda: defaultdict(np.float64))

    for feature in summary.columns:
        lower_quartile = summary.loc['25%', feature]
        upper_quartile = summary.loc['75%', feature]
        margin = 1.5 * (upper_quartile - lower_quartile)
        limits[feature]['inf'] = lower_quartile - margin
        limits[feature]['sup'] = upper_quartile + margin

    return limits

def find_outliers(data: pd.DataFrame):
    """Flag IQR outliers per feature and per row.

    Returns a tuple: an array with the fraction of outlying rows for each
    column of `data`, and the frame produced by instance_outlier_proportion()
    from the features flagged on each row.
    """
    limits = iqr_score(data)
    n_rows = data.shape[0]
    proportions = []
    outliers_index = defaultdict(list)

    for feature in data.columns:
        lower = limits[feature]['inf']
        upper = limits[feature]['sup']
        column = data[feature]
        is_outlier = (column < lower) | (column > upper)
        flagged = data.index[is_outlier.to_numpy()]
        proportions.append(len(flagged) / n_rows)
        # Remember which features are outlying on each flagged row
        for ix in flagged:
            outliers_index[ix].append(feature)

    return np.array(proportions), instance_outlier_proportion(data, outliers_index)

def instance_outlier_proportion(data, outliers_index):
    """Return a copy of `data` with an added 'outlier_proportion' column.

    Parameters
    ----------
    data : DataFrame of features; it is not modified.
    outliers_index : mapping from row label to the list of features flagged
        as outliers on that row.

    Returns
    -------
    DataFrame equal to `data` plus 'outlier_proportion': the number of flagged
    features of the row divided by the number of columns of `data`, and 0.0
    for rows absent from `outliers_index`.
    """
    aux = data.copy()
    n_features = len(data.columns)
    # Build the whole column at once; writing one cell at a time through .loc
    # is very slow when many rows are flagged.
    aux['outlier_proportion'] = np.array(
        [len(outliers_index[ix]) / n_features if ix in outliers_index else 0.0
         for ix in aux.index],
        dtype=float)

    return aux

def select_by_outlier_treshold(data, treshold, get_index=False, get_examples=False, simplify=False):
    """Select or summarise the rows whose outlier proportion reaches `treshold`.

    Parameters
    ----------
    data : DataFrame with an 'outlier_proportion' column.
    treshold : minimum outlier proportion (a fraction) for a row to be selected.
    get_examples : return the selected rows, sorted by decreasing proportion.
    get_index : return the index labels of the selected rows as a list.
    simplify : when printing, skip the descriptive statistics table.

    Returns
    -------
    The selected rows or their labels when requested (`get_examples` takes
    precedence over `get_index`). Otherwise None: a summary is printed
    instead. None is also returned when the column is missing.
    """
    if 'outlier_proportion' not in data.columns:
        return
    selection = data[data.outlier_proportion >= treshold]
    if get_examples:
        return selection.sort_values('outlier_proportion', ascending=False)
    if get_index:
        return selection.index.tolist()

    abs_val = selection.shape[0]
    total = data.shape[0]
    # The value is printed followed by '%', so express it as a percentage
    # (it used to be a fraction); an empty frame yields 0.0 instead of dividing by zero.
    prop = round(100 * abs_val / total, 2) if total else 0.0

    if not simplify:
        show(data.outlier_proportion.describe().to_frame())
    print(f'{abs_val} ({prop}%) examples contain at least {treshold*100}% of feature outliers\n')

def outliers_summary(outliers: np.array, instances: int):
    """Display a table summarising the per-feature outlier proportions.

    `outliers` holds one outlier fraction per feature and `instances` is the
    number of rows those fractions were computed over.
    """
    def as_percent(fraction):
        # Percentage with two decimals, followed by a percent sign
        return f"{round(fraction * 100, 2)}%"

    rows = {
        'Outliers': int(outliers.sum() * instances),
        'Outlier Proportion': as_percent(outliers.mean()),
        'Min Outlier Proportion': as_percent(outliers.min()),
        'Median': as_percent(np.median(outliers)),
        'Max Outlier Proportion': as_percent(outliers.max()),
    }
    table = pd.DataFrame.from_dict(rows, orient='index', columns=[''])
    display(HTML(table.to_html()))

def outlier_stats(data, name='', rate=0.5, simplify=False):
    """Print the outlier report of `data`.

    The per-feature overview is skipped when `simplify` is True; the per-row
    report (rows with at least `rate` of their features outlying) is always shown.
    """
    feature_proportions, rows_with_proportion = find_outliers(data)
    if not simplify:
        print(f'{name} data outliers overview')
        outliers_summary(feature_proportions, data.shape[0])

    print(f'{name} examples outlier stats')
    select_by_outlier_treshold(
        rows_with_proportion,
        rate,
        get_index=False,
        get_examples=False,
        simplify=simplify,
    )

def get_outlier_indexes(data, rate=0.5):
    """Return the labels of the rows with at least `rate` of their features outlying."""
    rows_with_proportion = find_outliers(data)[1]
    return select_by_outlier_treshold(rows_with_proportion, rate, get_index=True)

In [None]:
# Let's try to identify the outliers using the interquartile range (IQR).
for class_name, class_frame in (('Legit', legit_df), ('Malware', mal_df)):
    outlier_stats(class_frame, name=class_name)

In [None]:
# The number of rows with at least 50% of their features flagged as outliers is small for both classes.
# Removing them will help us standardize the data and balance the classes.
legit_outlier_rows = get_outlier_indexes(legit_df, 0.5)
mal_outlier_rows = get_outlier_indexes(mal_df, 0.5)
outlier_indexes = legit_outlier_rows + mal_outlier_rows
df, legit_df, mal_df = drop_and_rebuild_class_dfs(df, outlier_indexes, index=True)

In [None]:
# Let's verify the results
for class_name, class_frame in (('Legit', legit_df), ('Malware', mal_df)):
    outlier_stats(class_frame, name=class_name, simplify=True)

# Model evaluation

## Data Scaling

In [None]:
# Globals
# scale_results_dict[method][accuracy type] -> best accuracy recorded for the method
scale_results_dict = defaultdict(lambda: defaultdict(np.float64))
# scale_frequency[method][accuracy type] -> Counter of the scalers that ranked best
scale_frequency = defaultdict(lambda: defaultdict(Counter))
# Column-oriented record of the dimensionality reduction results
reduce_dim_results = {column: [] for column in ('classifier', 'k', 'accuracy')}

In [None]:
# Constants
label = 'legitimate'
y = df[label]
X = df.drop(columns=label)

# 5-fold split, shuffled with a fixed seed so every experiment sees the same folds
K_FOLD = KFold(n_splits=5, shuffle=True, random_state=1)
# (estimator, description of its relevant default parameters used in plot titles)
CLASSIFIERS = [(NaiveBayes(), ''),\
              (SVM(), '(kernel=rbf)'),\
              (AdaBoost(), '(n_estimators=50)'),\
              (ExtraTreesClassifier(), '(n_estimators=100)'),\
              (GradientBoosting(), '(n_estimators=100, learning_rate=0.1)'),\
              (RandomForest(), '(n_estimators=100)')]

# Scorer names accepted by scikit-learn's `scoring` argument.
# 'roc_curve' was removed: it is a function returning a curve, not a scorer
# name, and passing it to cross_validate raises a ValueError.
METRICS = ['accuracy','precision', 'recall', 'f1', 'roc_auc']

In [None]:
def cross_validation(X_train, y_train, classifier, cv=K_FOLD, _metrics=METRICS):
    """Cross-validate `classifier` and return scikit-learn's result dictionary.

    Train scores are included and all available cores are used.
    """
    scores = cross_validate(
        classifier,
        X_train,
        y_train,
        cv=cv,
        scoring=_metrics,
        return_train_score=True,
        n_jobs=-1,
    )
    return scores

def plot_scaler_comparison(classifier: str, scaler_df: pd.DataFrame, params=''):
    """Bar plot comparing the accuracies obtained with each scaler.

    Parameters
    ----------
    classifier : classifier name, used in the plot title.
    scaler_df : frame with one row per accuracy type and one column per scaler.
    params : text appended to the title after the classifier name.
    """
    # Order the scaler columns by their scores, row after row
    scaler_df = scaler_df.sort_values(by=list(scaler_df.index), axis=1)
    scaler_df.plot.bar(figsize=(7,5),title=f"{classifier} {params}", rot=0, fontsize=12)
    plt.legend(loc='upper left', bbox_to_anchor=(1,1), fancybox=True, framealpha=0.2)
    plt.ylabel('Valor')
    # Pass the on/off flag positionally: the `b` keyword was renamed to
    # `visible` in Matplotlib 3.5 and later removed, so `b=True` fails on
    # current releases while the positional form works on every version.
    plt.grid(True, which='major', color='#666666', linestyle='-', axis='y')
    plt.minorticks_on()
    plt.grid(True, which='minor', color='#999999', linestyle='-', alpha=0.6, axis='y')
    plt.show();

def feed_results_dict(val: np.float64, acc_type: str, method: str):
    """Record `val` in scale_results_dict[method][acc_type].

    When a non-zero value is already stored, the mean of the stored value and
    `val` is kept instead.
    """
    previous = scale_results_dict[method][acc_type]
    if previous:
        scale_results_dict[method][acc_type] = (val + previous)/2
    else:
        scale_results_dict[method][acc_type] = val
    
# Test models accuracy on each scaling method
def test_scalers(X_train, y_train, classifier, cv=K_FOLD, _metrics='accuracy', params=None, last=False):
    """Cross-validate `classifier` without scaling and with each scaler, then report.

    Parameters
    ----------
    X_train, y_train : features and labels used for cross-validation.
    classifier : estimator to evaluate; its class name labels the results.
    cv : cross-validation splitter.
    _metrics : scoring passed to cross_validate. The results are read from the
        'train_score' / 'test_score' keys, so a single scorer name is expected.
    params : text appended to the plot title.
    last : when True, also display the tables accumulated over all the calls.

    Side effects: updates scale_results_dict and scale_frequency, and displays
    a bar plot, the score table and the best scaler(s) for train and test.
    """
    method = repr(classifier).split('(')[0]
    scalers = [None, StandardScaler(), MaxAbsScaler(), RobustScaler()]
    test_acc = 'test_accuracy'
    train_acc = 'train_accuracy'

    res = {train_acc:[], test_acc:[]}
    cols =[]

    for scaler in scalers:
        # Evaluate the bare classifier, or a scaler + classifier pipeline,
        # through the same helper so that both paths share identical settings
        if scaler is None:
            estimator = classifier
        else:
            estimator = Pipeline(steps=[('scaler', scaler),
                                        ('method', classifier)])
        cross_val = cross_validation(X_train, y_train, estimator, cv, _metrics)

        cols.append(repr(scaler).split('(')[0])

        for k in res:
            res[k].append(cross_val[k.replace('accuracy', 'score')].mean())

    # Create results DataFrame
    res_df = pd.DataFrame.from_dict(res, orient='index', columns=cols)

    # For each accuracy type, record the best score and every scaler reaching it
    best = {}
    for acc_type in (train_acc, test_acc):
        ranked = res_df.loc[acc_type].sort_values()
        # .iloc makes the positional access explicit: the Series is indexed by
        # scaler name, and integer keys on such an index are deprecated
        top_score = ranked.iloc[-1]
        feed_results_dict(top_score, acc_type, method)
        winners = ranked[ranked == top_score].index.tolist()
        scale_frequency[method][acc_type].update(winners)
        best[acc_type] = ', '.join(winners)
    best_train = best[train_acc]
    best_test = best[test_acc]

    # Presents the results
    plot_scaler_comparison(method, res_df, params if params is not None else '')
    display(HTML(f"<center>{res_df.to_html()}</center>"))
    display(Markdown(f"<center><p>Method: <strong>{method}</strong><br />Best on training: <strong>{best_train}</strong><br />Best on testing: <strong>{best_test}</strong></p></center>"))
    display(Markdown('---'))

    if last:
        display(Markdown("<strong>Best results:</strong><br />"))
        show(pd.DataFrame.from_dict(scale_results_dict))
        show(pd.DataFrame.from_dict(scale_frequency))

# Evaluate different Scalers
def execute_test_scalers():
    """Run test_scalers() on every entry of CLASSIFIERS.

    The last entry is evaluated with last=True so that the accumulated
    summary tables are displayed once, at the end.
    """
    final_position = len(CLASSIFIERS) - 1

    for position, (method, params) in enumerate(CLASSIFIERS):
        if position == final_position:
            test_scalers(X, y, method, params=params, last=True)
        else:
            test_scalers(X, y, method, params=params)

In [None]:
# Compare the scalers for every classifier listed in CLASSIFIERS
execute_test_scalers()

## Dimensionality reduction

### Using SelectKBest

In [None]:
def execute_test_dimensionality_reduction():
    """Run the SelectKBest experiment for each classifier with its chosen scaler."""

    # Best Scaling results: (classifier, scaler to apply or None)
    best_scaling = [
        (NaiveBayes(), None),
        (SVM(), StandardScaler()),
        (AdaBoost(), StandardScaler()),
        (ExtraTreesClassifier(), None),
        (GradientBoosting(), None),
        (RandomForest(), MaxAbsScaler()),
    ]

    for method, scaler in best_scaling:
        test_dimentionality_reduction(X, y, method, scaler)

# Test the effect of dimensionality reduction on accuracy
def test_dimentionality_reduction(X_train, y_train, classifier, scaler):
    """Grid-search the number of features kept by SelectKBest for `classifier`.

    Parameters
    ----------
    X_train : DataFrame of features; every k from 1 to its number of columns is tried.
    y_train : labels.
    classifier : estimator placed at the end of the pipeline.
    scaler : scaler placed at the start of the pipeline, or None for no scaling.

    Side effects: appends the classifier name, best k and best accuracy to
    reduce_dim_results, plots the accuracy against k, and creates and then
    removes the 'cachedir' directory used to cache the pipeline transformers.
    """
    location = 'cachedir'
    memory = Memory(location=location, verbose=0)

    # Number of features to test
    n_features_to_test = np.arange(1, len(X_train.columns)+1)

    # [scale ->] reduce dimensionality -> classify
    steps = [('reduce_dim', SelectKBest()),
             ('classifier', classifier)]
    if scaler is not None:
        steps.insert(0, ('scaler', scaler))
    pipe = Pipeline(memory=memory, steps=steps)

    params = [
        {'reduce_dim__k': n_features_to_test}
        ]

    try:
        # Apply GridSearchCV on the data received as arguments (the notebook-level
        # X and y used to be fitted here regardless of what was passed in)
        grid = GridSearchCV(pipe, params, cv=K_FOLD, refit='accuracy', scoring='accuracy', n_jobs=-1).fit(X_train, y_train)
    finally:
        # Delete the temporary cache folder, even when the search fails
        memory.clear(warn=False)
        rmtree(location, ignore_errors=True)
    best = grid.best_params_

    # Record the best results
    class_name = repr(classifier).split('(')[0]
    best_k = best['reduce_dim__k']
    red_dim_method = f"SelectKBest(k={best_k})"
    best_score = grid.best_score_

    reduce_dim_results['classifier'].append(class_name)
    reduce_dim_results['k'].append(best_k)
    reduce_dim_results['accuracy'].append(best_score)

    param =  'param_reduce_dim__k'
    grid_df = pd.DataFrame.from_dict(grid.cv_results_)
    grid_df = grid_df[[param, 'mean_test_score' ]][grid_df[param].notna()].set_index(param)

    # Plot results
    ax = grid_df.plot(figsize=(10,5))
    ax.set_title(f"{class_name}, {red_dim_method} (best score: {round(best_score, 4)})", fontsize=16)
    ax.set_ylabel('accuracy', fontsize=14)
    ax.set_xlabel('features', fontsize=14)
    # Mark the k that reached the highest mean test accuracy
    ax.axvline(grid_df.idxmax().values[0],color='r', linestyle='--', label='best_accuracy')
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.legend(loc='lower center', framealpha =0.4, fontsize=12)
    plt.show()
    display(Markdown('---'))

In [None]:
# Run the SelectKBest grid search for every classifier / scaler pair
execute_test_dimensionality_reduction()

In [None]:
# Tabulate the SelectKBest results and plot the best accuracy (in %) per classifier
results = pd.DataFrame(reduce_dim_results).set_index('classifier')
results['accuracy'] = (100 * results['accuracy']).round(2)
show(results)
results.plot(kind='bar', y='accuracy', rot=35, figsize=(10, 7))
plt.legend(fontsize=10)
plt.ylabel('%')

### Using SelectFromModel

In [None]:
# NumPy views of the features and labels, plus the accumulator for the SelectFromModel results
_X = df.drop(columns=['legitimate']).values
_y = df['legitimate'].values
new_res = defaultdict(list)

In [None]:
def selec_from_model_reduce_dim(classifier, x=_X, y=_y):
    """Select features with SelectFromModel, then cross-validate `classifier` on them.

    Parameters
    ----------
    classifier : estimator used both to select the features and to classify.
    x, y : feature matrix and label vector (NumPy arrays).

    Side effects: appends to new_res the classifier name, the number of
    selected features, the mean accuracy (%) over the folds, and the false
    positive / false negative rates (%) computed from the confusion matrices
    of all the folds added together.
    """
    name = repr(classifier).split('(')[0]
    # NOTE(review): the features are selected on the full data set before the
    # cross-validation, so the held-out folds take part in the selection and
    # the scores below may be optimistic.
    selector = SelectFromModel(classifier.fit(x, y), prefit=True)
    X_transformed = selector.transform(x)
    dim = X_transformed.shape[1]

    # Fix the label order so that every fold yields a matrix of the same shape
    class_labels = np.unique(y)
    cm = np.zeros((len(class_labels), len(class_labels)), dtype=int)
    res = []
    for train_ix, test_ix in K_FOLD.split(X_transformed):
        X_train, X_test = X_transformed[train_ix], X_transformed[test_ix]
        y_train, y_test = y[train_ix], y[test_ix]
        classifier.fit(X_train, y_train)
        # Predict once and derive both the accuracy and the confusion matrix from it
        pred = classifier.predict(X_test)
        res.append(accuracy(y_test, pred))
        # Accumulate over the folds: the error rates used to come from the
        # last fold only, while the accuracy was averaged over all of them
        cm += confusion_matrix(y_test, pred, labels=class_labels)
    new_res['classifier'].append(name)
    new_res['k'].append(dim)
    new_res['accuracy'].append(round(100*sum(res)/len(res),2))
    # Row 0 holds the true class 0 (malware), row 1 the true class 1 (legitimate)
    new_res['false positives'].append(round(100*cm[0][1] / float(sum(cm[0])), 2))
    new_res['false negatives'].append(round(100*cm[1][0] / float(sum(cm[1])), 2))

In [None]:
# Apply SelectFromModel only with the last three entries of CLASSIFIERS
# (ExtraTrees, GradientBoosting and RandomForest)
for classifier, params in CLASSIFIERS[-3:]:
    selec_from_model_reduce_dim(classifier)

In [None]:
# Tabulate the SelectFromModel results and plot the accuracy (in %) per classifier
new_results = pd.DataFrame(new_res).set_index('classifier')
show(new_results)
new_results.plot(kind='bar', y='accuracy', rot=35, figsize=(10, 7))
plt.legend(fontsize=10)
plt.ylabel('%')

# Conclusion

While data scaling proved inadequate for our data, dimensionality reduction was very effective. Although SelectKBest led to better accuracy than SelectFromModel, the latter was more efficient at reducing dimensionality without impairing accuracy. One result worth highlighting is that, with SelectFromModel, the GradientBoostingClassifier reached an accuracy of 98% using only 3 features.