### Loading libraries

In [1]:
import pickle
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix

### Loading data

In [2]:
# Read the raw records table from the gzip-compressed CSV into a DataFrame.
# The path is relative, so the notebook has to be run from the directory that
# contains the `data/` folder.
records_path = "data/new_data/old_records.csv.gz"
dataframe = pd.read_csv(records_path)

### Checking null values

In [3]:
# Rows that contain at least one missing value in any column.
data_null_values = dataframe[dataframe.isna().any(axis=1)]
len_null = len(data_null_values)
len_data = len(dataframe)

# The message is labelled "%", so scale the ratio to an actual percentage.
# An empty frame is reported as 0 % instead of raising ZeroDivisionError.
null_percentage = 100 * len_null / len_data if len_data else 0.0

# Print message
print("Validity of the data:")
print(f"{len_null} out of {len_data} contains at least one null value", end=" ")
print(f"representing {null_percentage} % of the instances")

# Drop the columns that are not used by the rest of the analysis; everything
# downstream works on `filtered_data`, `dataframe` itself is left untouched.
filtered_data = dataframe.drop(['id', 'v1', 'v2'], axis=1)

Validity of the data:
0 out of 71722 contains at least one null value representing 0.0 % of the instances


### Correlation analysis

In [4]:
# Pairwise correlation between the numeric columns only. The label column is
# presumably still textual at this point (it is only mapped to integers in a
# later cell), and recent pandas versions raise on non-numeric columns instead
# of silently skipping them, so the numeric subset is selected explicitly.
numeric_features = filtered_data.select_dtypes(include=["number", "bool"])
correlation_matrix = numeric_features.corr()
unstacked_correlation_matrix = correlation_matrix.unstack()
sorted_values = unstacked_correlation_matrix.sort_values(kind="quicksort")

# Most correlated features excluding comparison between the same features.
# Self-pairs are filtered by name rather than by a hard-coded slice position,
# so this keeps working if the number of columns changes. Undefined (NaN)
# correlations are dropped so they cannot crowd out real values at the tail.
first_feature = sorted_values.index.get_level_values(0)
second_feature = sorted_values.index.get_level_values(1)
distinct_pairs = sorted_values[first_feature != second_feature].dropna()

# Each pair shows up twice, as (A, B) and (B, A), because the matrix is symmetric.
print()
print("Most correlated features:")
print(distinct_pairs.tail(10))


Most correlated features:
TaskEdit              TaskRelocation          0.733124
TaskRelocation        TaskEdit                0.733124
HandlerTaskAddition   HandlerBlockAddition    0.752078
HandlerBlockAddition  HandlerTaskAddition     0.752078
BlockAddition         TaskAddition            0.762167
TaskAddition          BlockAddition           0.762167
TaskFileAddition      BlockAddition           0.764727
BlockAddition         TaskFileAddition        0.764727
BlockRemoval          TaskFileRemoval         0.790119
TaskFileRemoval       BlockRemoval            0.790119
dtype: float64


### Transforming data into Numpy arrays

In [5]:
# Encode the textual release type as an integer code (patch=0, minor=1, major=2).
to_transform = {"release": {"patch": 0, "minor": 1, "major": 2}}

# Rebind instead of mutating in place, and cast the label explicitly: relying on
# `replace` to silently downcast an object column to integers is deprecated in
# pandas, and an object-dtype label array is not a valid classification target.
# The cast also fails loudly if a label outside the mapping is present.
filtered_data = filtered_data.replace(to_transform).astype({"release": "int64"})

# Features are every column except the last one; the last column is the target.
# NOTE(review): this assumes "release" is the last column of the frame — confirm.
X = filtered_data.iloc[:, :-1].to_numpy()
y = filtered_data.iloc[:, -1].to_numpy()

# Print shapes
print()
print("Shapes of the data:")
print(X.shape)
print(y.shape)


Shapes of the data:
(71722, 41)
(71722,)


### Configuring classifier

In [6]:
# Classifier shared by the feature-selection, evaluation and saving cells.
# A fixed random_state makes the forest (and therefore the selected features,
# the scores and the saved model) reproducible across kernel restarts.
clf = RandomForestClassifier(n_jobs=8, random_state=42)

### All feature weights

In [7]:
# Separate forest used only to estimate feature weights; seeded so the reported
# weights are reproducible.
clf2 = RandomForestClassifier(n_jobs=8, random_state=42)
importance_cv = StratifiedKFold(n_splits=10)
feature_importances = np.zeros(X.shape[1])

# Accumulate the importances of a forest fitted on the training part of each
# fold; the held-out part is not needed here.
for train, _ in importance_cv.split(X, y):
    clf2.fit(X[train], y[train])
    feature_importances += clf2.feature_importances_

# Average over the folds. The divisor comes from the splitter itself so it can
# never drift out of sync with `n_splits`.
feature_importances /= importance_cv.get_n_splits()

print("Weights of the features:")
for feature_name, feature_weight in zip(filtered_data.columns[:-1], feature_importances):
    print(feature_name, "-->", feature_weight)

Weights of the features:
BlockAddition --> 0.04079927819963627
BlockEdit --> 0.006757660056284679
BlockRelocation --> 0.007106680480522672
BlockRemoval --> 0.023472544800970357
RoleVariableAddition --> 0.04403860545855521
RoleVariableEdit --> 0.03387128184208607
RoleVariableRelocation --> 0.001765464722767463
RoleVariableRemoval --> 0.020371089689186498
RoleVarFileAddition --> 0.016124686223035624
RoleVarFileRelocation --> 0.003827046973491953
RoleVarFileRemoval --> 0.007842588134052883
DefaultVariableAddition --> 0.12277418658302694
DefaultVariableEdit --> 0.04928946666031486
DefaultVariableRelocation --> 0.0002572836387056643
DefaultVariableRemoval --> 0.042280091253386054
DefaultVarFileAddition --> 0.0055245663476297365
DefaultVarFileRelocation --> 0.000390875459968689
DefaultVarFileRemoval --> 0.0018337240454194994
DependencyAddition --> 0.008513141125612725
DependencyRemoval --> 0.007132454063582705
HandlerBlockAddition --> 0.004615282337731415
HandlerBlockEdit --> 8.4839405191005

### Feature selection

In [8]:
# Recursive feature elimination with cross-validation: one feature is removed
# per step and each candidate subset is scored with macro F1 on 10 stratified
# folds. `X_new` keeps only the columns of `X` that were selected.
rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(10), scoring='f1_macro')
X_new = rfecv.fit_transform(X, y)

print()
print("Optimal number of features : %d" % rfecv.n_features_)

# Feature names are every column except the last (target) one, in the same
# order as the columns of X, so they line up with the boolean `support_` mask.
column_names = filtered_data.columns[:-1].tolist()
selected_features = [name for name, kept in zip(column_names, rfecv.support_) if kept]

# Built from the mask (not from a set difference) so the list keeps the column
# order and is printed identically on every run.
discarded_features = [name for name, kept in zip(column_names, rfecv.support_) if not kept]

print()
print("Original Features on Data: ", column_names)
print("Ranking of Features: ", rfecv.ranking_)
print("Selected Features: ", selected_features)
print("Discarded Features: ", discarded_features)


Optimal number of features : 37

Original Features on Data:  ['BlockAddition', 'BlockEdit', 'BlockRelocation', 'BlockRemoval', 'RoleVariableAddition', 'RoleVariableEdit', 'RoleVariableRelocation', 'RoleVariableRemoval', 'RoleVarFileAddition', 'RoleVarFileRelocation', 'RoleVarFileRemoval', 'DefaultVariableAddition', 'DefaultVariableEdit', 'DefaultVariableRelocation', 'DefaultVariableRemoval', 'DefaultVarFileAddition', 'DefaultVarFileRelocation', 'DefaultVarFileRemoval', 'DependencyAddition', 'DependencyRemoval', 'HandlerBlockAddition', 'HandlerBlockEdit', 'HandlerBlockRelocation', 'HandlerBlockRemoval', 'HandlerTaskAddition', 'HandlerTaskEdit', 'HandlerTaskRelocation', 'HandlerTaskRemoval', 'HandlerFileAddition', 'HandlerFileRelocation', 'HandlerFileRemoval', 'MetaEdit', 'PlatformAddition', 'PlatformRemoval', 'TaskAddition', 'TaskEdit', 'TaskRelocation', 'TaskRemoval', 'TaskFileAddition', 'TaskFileRelocation', 'TaskFileRemoval']
Ranking of Features:  [1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 

In [20]:
# One line per feature: its name followed by its RFECV rank
# (rank 1 marks a selected feature; larger ranks were eliminated).
for name, rank in zip(column_names, rfecv.ranking_.tolist()):
    print(f"{name} {rank}")

BlockAddition 1
BlockEdit 1
BlockRelocation 1
BlockRemoval 1
RoleVariableAddition 1
RoleVariableEdit 1
RoleVariableRelocation 1
RoleVariableRemoval 1
RoleVarFileAddition 1
RoleVarFileRelocation 1
RoleVarFileRemoval 1
DefaultVariableAddition 1
DefaultVariableEdit 1
DefaultVariableRelocation 2
DefaultVariableRemoval 1
DefaultVarFileAddition 1
DefaultVarFileRelocation 1
DefaultVarFileRemoval 1
DependencyAddition 1
DependencyRemoval 1
HandlerBlockAddition 1
HandlerBlockEdit 4
HandlerBlockRelocation 3
HandlerBlockRemoval 1
HandlerTaskAddition 1
HandlerTaskEdit 1
HandlerTaskRelocation 1
HandlerTaskRemoval 1
HandlerFileAddition 1
HandlerFileRelocation 5
HandlerFileRemoval 1
MetaEdit 1
PlatformAddition 1
PlatformRemoval 1
TaskAddition 1
TaskEdit 1
TaskRelocation 1
TaskRemoval 1
TaskFileAddition 1
TaskFileRelocation 1
TaskFileRemoval 1


### Stratified Training

In [9]:
# Evaluate the classifier on the RFECV-selected features with 10 stratified folds.
# NOTE(review): `X` is rebound to the reduced matrix here, so cells above that
# read `X` will see fewer columns if they are re-run after this one.
X = X_new
cv = StratifiedKFold(n_splits=10)

precision_scores, recall_scores, confusion_matrices = [], [], []

for train, test in cv.split(X, y):
    # Fit on the training part of the fold, predict the held-out part.
    clf.fit(X[train], y[train])
    y_predict = clf.predict(X[test])
    y_true = y[test]

    # Macro averaging: every class contributes equally to the score.
    precision_scores.append(precision_score(y_true, y_predict, average="macro"))
    recall_scores.append(recall_score(y_true, y_predict, average="macro"))
    confusion_matrices.append(confusion_matrix(y_true, y_predict))

# NOTE(review): after the loop `clf` holds the fit from the last fold only,
# i.e. it has not been trained on the full data set — confirm this is intended.

print()
print("Mean of Precisions: ", np.mean(precision_scores))
print("Mean of Recalls: ", np.mean(recall_scores))

# Element-wise sum of the per-fold matrices gives the counts over all folds.
print("Confusion matrix:")
print(sum(confusion_matrices))


Mean of Precisions:  0.7265540195231619
Mean of Recalls:  0.5586017902857003
Confusion matrix:
[[49880  3125   232]
 [ 8670  6370   175]
 [ 1626   597  1047]]


### Saving

In [10]:
# Model
# Persist the classifier as it currently stands in `clf`. The context manager
# guarantees the file is flushed and closed even if pickling fails.
# NOTE: only ever unpickle this file from a trusted location — loading a pickle
# executes arbitrary code.
with open("data/models/randomForest_old_no_balance.pickle", "wb") as model_file:
    pickle.dump(clf, model_file)