### Loading libraries

In [20]:
import pickle
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, f1_score

### Loading data

In [2]:
# Read the records table from the gzip-compressed CSV. "infer" is pandas' default
# and selects the decompression from the ".gz" file extension.
dataframe = pd.read_csv(
    filepath_or_buffer="data/new_data/old_records.csv.gz",
    compression="infer",
)

### Checking null values

In [3]:
# Rows holding at least one missing value, plus the counts needed for the report.
data_null_values = dataframe[dataframe.isna().any(axis=1)]
len_null = len(data_null_values)
len_data = len(dataframe)

# Share of incomplete rows as a true percentage (0-100). An empty frame reports 0.0
# instead of raising ZeroDivisionError.
null_percentage = 100 * len_null / len_data if len_data else 0.0

# Print message
print("Validity of the data:")
print(f"{len_null} out of {len_data} contains at least one null value", end=" ")
print(f"representing {null_percentage} % of the instances")

# Keep only the complete rows.
cleaned_data = dataframe.dropna(how='any', axis=0)
float_columns = cleaned_data.select_dtypes(include=['float64'])

# Cast every float64 column to int64 in one call. `astype` returns a new frame, so
# nothing is written into a possible view of `dataframe` (no SettingWithCopyWarning).
# Fractional parts, if any, are truncated by the cast.
cleaned_data = cleaned_data.astype({col: 'int64' for col in float_columns.columns})

# Drop the 'id', 'v1' and 'v2' columns; the remaining columns feed the analysis below.
filtered_data = cleaned_data.drop(['id', 'v1', 'v2'], axis=1)

Validity of the data:
0 out of 66137 contains at least one null value representing 0.0 % of the instances


### Correlation analysis (Changed)

In [4]:
# Spearman correlation is computed on the numeric columns only. pandas >= 2.0 raises
# on non-numeric columns instead of silently dropping them (presumably `release` is
# still text at this point -- TODO confirm), so the selection is made explicit.
numeric_data = filtered_data.select_dtypes(include="number")
correlation_matrix = numeric_data.corr(method="spearman")
unstacked_correlation_matrix = correlation_matrix.unstack()

# Remove the trivial self-pairs (feature vs. itself) and undefined correlations
# explicitly, rather than relying on a hard-coded slice whose bounds depend on the
# number of columns.
first_feature = unstacked_correlation_matrix.index.get_level_values(0)
second_feature = unstacked_correlation_matrix.index.get_level_values(1)
feature_pairs = unstacked_correlation_matrix[first_feature != second_feature].dropna()
sorted_values = feature_pairs.sort_values(kind="quicksort")

# Most correlated features excluding comparison between the same features.
# Each pair shows up twice, as (a, b) and (b, a), so 10 rows cover the top 5 pairs.
print()
print("Most correlated features:")
print(sorted_values.tail(10))


Most correlated features:
TaskAddition           BlockAddition            0.708950
BlockAddition          TaskAddition             0.708950
                       TaskFileAddition         0.721393
TaskFileAddition       BlockAddition            0.721393
TaskRelocation         TaskAddition             0.748993
TaskAddition           TaskRelocation           0.748993
HandlerTaskRelocation  HandlerBlockRemoval      0.764927
HandlerBlockRemoval    HandlerTaskRelocation    0.764927
HandlerBlockAddition   HandlerTaskAddition      0.824182
HandlerTaskAddition    HandlerBlockAddition     0.824182
dtype: float64


### Dropping the second correlated feature

In [5]:
# Dropping the second feature of the most strongly correlated pair
filtered_data = filtered_data.drop(columns=['HandlerBlockAddition'])

### Transforming data into Numpy arrays

In [6]:
# Integer encoding of the release type: patch < minor < major.
to_transform = {"release": {"patch": 0, "minor": 1, "major": 2}}

# Build a new frame instead of mutating in place, and pin the label dtype explicitly:
# the automatic object -> integer downcast performed by `replace` is deprecated in
# recent pandas, and an object-typed label array would not be usable downstream.
filtered_data = filtered_data.replace(to_transform).astype({"release": "int64"})

# Every column but the last is a feature; the last column is used as the target
# (assumes `release` is the last column -- TODO confirm).
X = filtered_data[filtered_data.columns[:-1]].to_numpy()
y = filtered_data[filtered_data.columns[-1]].to_numpy()

# Print shapes
print()
print("Shapes of the data:")
print(X.shape)
print(y.shape)


Shapes of the data:
(66137, 40)
(66137,)


### Normalisation for the model that's going to select the features

In [7]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the whole feature matrix, then apply it to that same matrix.
# With its default settings RobustScaler centres each column on its median and
# scales it by its interquartile range.
transformer = RobustScaler().fit(X)
X_normal = transformer.transform(X)

### Configuring classifier

In [8]:
# Estimator shared by the feature-selection and training cells below.
# A fixed seed makes the forest (and every score derived from it) repeatable across
# runs; 0 matches the seed already used for the permutation importances.
clf = RandomForestClassifier(n_jobs=8, random_state=0)

### All feature importance weights by the Random Forest model

In [9]:
# Average the impurity-based importances of a random forest over stratified folds.
n_folds = 10
fold_splitter = StratifiedKFold(n_splits=n_folds)

# Seeded so the reported weights are repeatable across runs.
clf2 = RandomForestClassifier(n_jobs=8, random_state=0)
feature_importances = np.zeros(X_normal.shape[1])

# Only the training indices are needed: the importances come from the fitted model,
# not from predictions on the held-out part.
for train, test in fold_splitter.split(X_normal, y):
    clf2.fit(X_normal[train], y[train])
    feature_importances += clf2.feature_importances_

# Divide by the actual number of folds so the mean stays right if `n_folds` changes.
feature_importances /= fold_splitter.get_n_splits()

print("Weights of the features:")
for feature_name, feature_weight in zip(filtered_data.columns[:-1], feature_importances):
    print(feature_name, "-->", feature_weight)

Weights of the features:
BlockAddition --> 0.039648374943447647
BlockEdit --> 0.006993247683843925
BlockRelocation --> 0.007363420330569246
BlockRemoval --> 0.023507362073561182
RoleVariableAddition --> 0.0447254176815704
RoleVariableEdit --> 0.0340306215259378
RoleVariableRelocation --> 0.0017671864240601885
RoleVariableRemoval --> 0.020916053550904756
RoleVarFileAddition --> 0.01635927918633266
RoleVarFileRelocation --> 0.003931998623383815
RoleVarFileRemoval --> 0.008080246554922333
DefaultVariableAddition --> 0.11958434658399393
DefaultVariableEdit --> 0.048997465538168694
DefaultVariableRelocation --> 0.0002564869009449184
DefaultVariableRemoval --> 0.04283327016715107
DefaultVarFileAddition --> 0.005643632810421509
DefaultVarFileRelocation --> 0.0003889098762036113
DefaultVarFileRemoval --> 0.0018192655469374617
DependencyAddition --> 0.008862047612200212
DependencyRemoval --> 0.007278095081349463
HandlerBlockEdit --> 1.0455925667852486e-05
HandlerBlockRelocation --> 1.2777487477

### Feature importances by feature permutation

In [10]:
from sklearn.inspection import permutation_importance

# NOTE(review): the forest is fitted on (X_normal, y) and the permutation importances
# are then measured on that same data, so they describe the fit on the training set
# rather than on held-out data -- confirm this is intended.
print("Fitting model ...")
clf3 = RandomForestClassifier(n_jobs=8).fit(X_normal, y)
print("Done!")

print("Calculating permutation importances ...")
result = permutation_importance(
    estimator=clf3,
    X=X_normal,
    y=y,
    n_repeats=10,
    random_state=0,
)
print("Done!")

Fitting model ...
Done!
Calculating permutation importances ...
Done!


In [11]:
# One line per feature: its name and its mean importance over the permutation repeats.
for name, mean_importance in zip(filtered_data.columns[:-1], result.importances_mean):
    print(name, "-->", mean_importance)

BlockAddition --> 0.008678954291848784
BlockEdit --> 0.000952568153983413
BlockRelocation --> 0.0007076220572448166
BlockRemoval --> 0.0044604381813508785
RoleVariableAddition --> 0.020891482831093055
RoleVariableEdit --> 0.011213087984033188
RoleVariableRelocation --> 6.048051771323504e-05
RoleVariableRemoval --> 0.0038616810559898517
RoleVarFileAddition --> 0.0035910307392231335
RoleVarFileRelocation --> 0.00027821038148089227
RoleVarFileRemoval --> 0.000721230173730314
DefaultVariableAddition --> 0.07402210562922422
DefaultVariableEdit --> 0.025757140481122532
DefaultVariableRelocation --> 0.0
DefaultVariableRemoval --> 0.02261971362474865
DefaultVarFileAddition --> 0.0014197801533181397
DefaultVarFileRelocation --> 7.560064714151604e-05
DefaultVarFileRemoval --> 0.00022831395436748725
DependencyAddition --> 0.0023436200613877302
DependencyRemoval --> 0.0014787486580885357
HandlerBlockEdit --> 0.0
HandlerBlockRelocation --> 0.0
HandlerBlockRemoval --> 0.0004415077793065936
HandlerTa

### Feature selection

In [19]:
# Recursive feature elimination with 10-fold stratified cross-validation, scored by
# macro-averaged F1.
rfecv = RFECV(
    estimator=clf,
    cv=StratifiedKFold(n_splits=10),
    scoring='f1_macro',
    n_jobs=-1,
)

# Fit the selector and keep only the columns it selected.
X_new = rfecv.fit_transform(X_normal, y)

print()
print("Optimal number of features : %d" % rfecv.n_features_)

# `support_` is a boolean mask aligned with the feature columns. Both lists are built
# from it in column order, so the discarded list is deterministic (a set difference
# would give an arbitrary order).
column_names = filtered_data.columns[:-1].tolist()
selected_features = [name for name, kept in zip(column_names, rfecv.support_) if kept]
discarded_features = [name for name, kept in zip(column_names, rfecv.support_) if not kept]

print()
print("Original Features on Data: ", column_names)
print("Ranking of Features: ", rfecv.ranking_)
print("Selected Features: ", selected_features)
print("Discarded Features: ", discarded_features)


Optimal number of features : 28

Original Features on Data:  ['BlockAddition', 'BlockEdit', 'BlockRelocation', 'BlockRemoval', 'RoleVariableAddition', 'RoleVariableEdit', 'RoleVariableRelocation', 'RoleVariableRemoval', 'RoleVarFileAddition', 'RoleVarFileRelocation', 'RoleVarFileRemoval', 'DefaultVariableAddition', 'DefaultVariableEdit', 'DefaultVariableRelocation', 'DefaultVariableRemoval', 'DefaultVarFileAddition', 'DefaultVarFileRelocation', 'DefaultVarFileRemoval', 'DependencyAddition', 'DependencyRemoval', 'HandlerBlockEdit', 'HandlerBlockRelocation', 'HandlerBlockRemoval', 'HandlerTaskAddition', 'HandlerTaskEdit', 'HandlerTaskRelocation', 'HandlerTaskRemoval', 'HandlerFileAddition', 'HandlerFileRelocation', 'HandlerFileRemoval', 'MetaEdit', 'PlatformAddition', 'PlatformRemoval', 'TaskAddition', 'TaskEdit', 'TaskRelocation', 'TaskRemoval', 'TaskFileAddition', 'TaskFileRelocation', 'TaskFileRemoval']
Ranking of Features:  [ 1  1  1  1  1  1  7  1  1  2  1  1  1 10  1  1  9  8  1  

In [91]:
# One line per feature with its RFECV rank (selected features carry rank 1).
for name, rank in zip(column_names, rfecv.ranking_):
    print(name, rank)

BlockAddition 1
BlockEdit 1
BlockRelocation 1
BlockRemoval 1
RoleVariableAddition 1
RoleVariableEdit 1
RoleVariableRelocation 8
RoleVariableRemoval 1
RoleVarFileAddition 1
RoleVarFileRelocation 2
RoleVarFileRemoval 1
DefaultVariableAddition 1
DefaultVariableEdit 1
DefaultVariableRelocation 10
DefaultVariableRemoval 1
DefaultVarFileAddition 1
DefaultVarFileRelocation 9
DefaultVarFileRemoval 7
DependencyAddition 1
DependencyRemoval 1
HandlerBlockEdit 12
HandlerBlockRelocation 11
HandlerBlockRemoval 1
HandlerTaskAddition 1
HandlerTaskEdit 1
HandlerTaskRelocation 1
HandlerTaskRemoval 5
HandlerFileAddition 3
HandlerFileRelocation 13
HandlerFileRemoval 6
MetaEdit 1
PlatformAddition 1
PlatformRemoval 1
TaskAddition 1
TaskEdit 1
TaskRelocation 1
TaskRemoval 1
TaskFileAddition 1
TaskFileRelocation 4
TaskFileRemoval 1


### Normalisation of the input with the features already selected

In [94]:
from sklearn.preprocessing import RobustScaler, StandardScaler

# Fit a fresh scaler on the selected-feature matrix and apply it to that same matrix.
# NOTE(review): `X_new` was produced from the already scaled `X_normal`, so this second
# scaling looks redundant -- confirm before removing. `X_normal` is rebound here and
# from now on holds only the selected features.
transformer = RobustScaler().fit(X_new)
X_normal = transformer.transform(X_new)

### Stratified Training

In [96]:
# 10-fold stratified cross-validation of `clf` on the selected, scaled features.
cv = StratifiedKFold(n_splits=10)

# From here on `X` refers to the scaled matrix of selected features.
X = X_normal

# Per-fold results, aggregated after the loop.
precision_scores = []
recall_scores = []
confusion_matrices = []

for train, test in cv.split(X, y):
    y_true = y[test]
    y_predict = clf.fit(X[train], y[train]).predict(X[test])

    # Macro averaging weights every class equally, whatever its frequency.
    precision_scores.append(precision_score(y_true, y_predict, average="macro"))
    recall_scores.append(recall_score(y_true, y_predict, average="macro"))
    confusion_matrices.append(confusion_matrix(y_true, y_predict))

print()
print("Mean of Precisions: ", np.mean(precision_scores))
print("Mean of Recalls: ", np.mean(recall_scores))

# Element-wise sum of the per-fold matrices.
print("Confusion matrix:")
print(sum(confusion_matrices))


Mean of Precisions:  0.7289960910830076
Mean of Recalls:  0.5653869618309398
Confusion matrix:
[[44897  2867   209]
 [ 8463  6335   207]
 [ 1517   574  1068]]


In [24]:
# 10-fold stratified cross-validation of `clf`, reporting macro precision, recall and F1.
cv = StratifiedKFold(n_splits=10)

# From here on `X` refers to the scaled matrix of selected features.
X = X_normal

# Pin the label set so every per-fold confusion matrix has the same shape and
# row/column order, even if a fold (or its predictions) lacks a class. Otherwise
# the element-wise sum below could fail or misalign.
class_labels = np.unique(y)

precision_scores = list()
recall_scores = list()
f1_scores = list()
confusion_matrices = list()

# NOTE: `clf` is refitted on every fold, so after the loop it holds the model trained
# on the last fold's training split only.
for train, test in cv.split(X, y):
    clf.fit(X[train], y[train])
    y_predict = clf.predict(X[test])

    # Macro averaging weights every class equally, whatever its frequency.
    precision_value = precision_score(y[test], y_predict, average="macro")
    recall_value = recall_score(y[test], y_predict, average="macro")
    f1_value = f1_score(y[test], y_predict, average="macro")
    conf_matrix = confusion_matrix(y[test], y_predict, labels=class_labels)

    precision_scores.append(precision_value)
    recall_scores.append(recall_value)
    f1_scores.append(f1_value)
    confusion_matrices.append(conf_matrix)

print()
print("Mean of Precisions: ", np.mean(precision_scores))
print("Mean of Recalls: ", np.mean(recall_scores))
print("Mean of F1-Scores: ", np.mean(f1_scores))

# Element-wise sum of the per-fold matrices.
print("Confusion matrix:")
print(np.sum(confusion_matrices, axis=0))


Mean of Precisions:  0.7306407673441868
Mean of Recalls:  0.5641582810197747
Mean of F1-Scores:  0.610542723443403
Confusion matrix:
[[44789  2968   216]
 [ 8471  6361   173]
 [ 1519   582  1058]]


### Saving

In [98]:
# Persist the classifier and the arrays it was trained/evaluated on.
# Each file is opened in a `with` block so the handle is flushed and closed even if
# pickling fails part-way.
# NOTE(review): `clf` is whatever the last executed training cell left behind; with
# the cross-validation loop above that is the model fitted on the final fold's
# training split -- confirm this is the model meant to be saved.

# Model
with open("data/models/randomForest_old_no_balance.pickle", "wb") as model_file:
    pickle.dump(clf, model_file)  # Model for the old data

# Data
with open("data/transformed_data/X_train_old.pickle", "wb") as features_file:
    pickle.dump(X, features_file)
with open("data/transformed_data/y_train_old.pickle", "wb") as labels_file:
    pickle.dump(y, labels_file)