### Loading libraries

In [1]:
import pickle
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix

### Loading data

In [2]:
# Location of the input records (a .csv.gz file, read directly by pandas).
records_path = "data/new_data/old_records.csv.gz"
dataframe = pd.read_csv(records_path)

# Bare last expression so the notebook displays the column index.
dataframe.columns

Index(['id', 'v1', 'v2', 'BlockAddition', 'BlockEdit', 'BlockRelocation',
       'BlockRemoval', 'RoleVariableAddition', 'RoleVariableEdit',
       'RoleVariableRelocation', 'RoleVariableRemoval', 'RoleVarFileAddition',
       'RoleVarFileRelocation', 'RoleVarFileRemoval',
       'DefaultVariableAddition', 'DefaultVariableEdit',
       'DefaultVariableRelocation', 'DefaultVariableRemoval',
       'DefaultVarFileAddition', 'DefaultVarFileRelocation',
       'DefaultVarFileRemoval', 'DependencyAddition', 'DependencyRemoval',
       'HandlerBlockAddition', 'HandlerBlockEdit', 'HandlerBlockRelocation',
       'HandlerBlockRemoval', 'HandlerTaskAddition', 'HandlerTaskEdit',
       'HandlerTaskRelocation', 'HandlerTaskRemoval', 'HandlerFileAddition',
       'HandlerFileRelocation', 'HandlerFileRemoval', 'MetaEdit',
       'PlatformAddition', 'PlatformRemoval', 'TaskAddition', 'TaskEdit',
       'TaskRelocation', 'TaskRemoval', 'TaskFileAddition',
       'TaskFileRelocation', 'TaskFileRemoval

### Checking null values

In [3]:
# Rows that contain at least one missing value in any column.
data_null_values = dataframe[dataframe.isna().any(axis=1)]
len_null = len(data_null_values)
len_data = len(dataframe)

# Share of affected rows as a real percentage (0-100). The ratio alone is a
# fraction, so it must be scaled before being labelled "%". An empty frame is
# reported as 0.0 instead of raising ZeroDivisionError.
null_percentage = 100 * len_null / len_data if len_data else 0.0

# Print message
print("Validity of the data:")
print(f"{len_null} out of {len_data} contains at least one null value", end=" ")
print(f"representing {null_percentage} % of the instances")

# Discard the first three columns (id, v1, v2), which are not used as features.
# NOTE(review): the previous comment also mentioned dropping "the column with
# all values as zero", but no such column is removed here -- confirm whether
# one still needs to be dropped.
filtered_data = dataframe.drop(['id', 'v1', 'v2'], axis=1)

Validity of the data:
0 out of 71722 contains at least one null value representing 0.0 % of the instances


### Correlation analysis

In [4]:
# Correlate numeric columns only. At this point the frame may still hold a
# non-numeric column (the "release" labels are encoded in a later cell), and
# pandas >= 2.0 raises on such columns instead of silently skipping them.
correlation_matrix = filtered_data.select_dtypes(include="number").corr()
unstacked_correlation_matrix = correlation_matrix.unstack()
sorted_values = unstacked_correlation_matrix.sort_values(kind="quicksort")

# Show the ten strongest positive correlations between *different* features.
# Self-pairs (always 1.0) and undefined correlations (NaN, e.g. for constant
# columns) are filtered explicitly instead of relying on a hard-coded slice
# that is only valid for one specific number of columns.
first_feature = sorted_values.index.get_level_values(0)
second_feature = sorted_values.index.get_level_values(1)
is_self_pair = first_feature == second_feature

print()
print(sorted_values[~is_self_pair].dropna().tail(10))


TaskEdit              TaskRelocation          0.733124
TaskRelocation        TaskEdit                0.733124
HandlerTaskAddition   HandlerBlockAddition    0.752078
HandlerBlockAddition  HandlerTaskAddition     0.752078
BlockAddition         TaskAddition            0.762167
TaskAddition          BlockAddition           0.762167
TaskFileAddition      BlockAddition           0.764727
BlockAddition         TaskFileAddition        0.764727
BlockRemoval          TaskFileRemoval         0.790119
TaskFileRemoval       BlockRemoval            0.790119
dtype: float64


### Transform data into Numpy arrays

In [5]:
# Ordinal encoding of the release type.
to_transform = {"release": {"patch": 0, "minor": 1, "major": 2}}

# Encode into a new frame instead of mutating `filtered_data` in place, so the
# cell can be re-run safely and earlier cells keep seeing the data they showed.
release_codes = filtered_data["release"].map(to_transform["release"])

# Labels missing from the mapping become NaN; fail loudly rather than letting
# unexpected values leak into the target vector.
unmapped = release_codes.isna()
if unmapped.any():
    unknown_labels = sorted(filtered_data.loc[unmapped, "release"].astype(str).unique())
    raise ValueError(f"Unexpected release labels: {unknown_labels}")

# Explicit integer dtype: do not depend on pandas inferring it after the
# substitution.
encoded_data = filtered_data.assign(release=release_codes.astype(int))

# Transforming data into Numpy arrays
# Every column except the last is a feature; the last column is the target.
X = encoded_data[encoded_data.columns[:-1]].to_numpy()
y = encoded_data[encoded_data.columns[-1]].to_numpy()

# Print shapes
print()
print("Shapes of the data:")
print(X.shape)
print(y.shape)


Shapes of the data:
(71722, 41)
(71722,)


### Balancing

#### Undersampling 

In [6]:
from imblearn.under_sampling import RandomUnderSampler, NearMiss, NeighbourhoodCleaningRule
from imblearn.pipeline import Pipeline


def _class_sizes(labels):
    """Return how many samples carry label 0, 1 and 2, as a tuple of ints."""
    return tuple(int(np.count_nonzero(labels == code)) for code in (0, 1, 2))


undersample = NeighbourhoodCleaningRule(n_jobs=-1)

# Class distribution before resampling.
print(_class_sizes(y))

# NOTE: X and y are rebound to the resampled arrays, so running this cell a
# second time resamples data that has already been resampled.
X, y = undersample.fit_resample(X, y)

# Print shapes
print()
print("Shapes of the data:")
print(X.shape)
print(y.shape)

# Class distribution after resampling.
print()
print(_class_sizes(y))

(53237, 15215, 3270)

Shapes of the data:
(58451, 41)
(58451,)

(48639, 6542, 3270)


### Configuring classifier

In [7]:
# Fixed seed so that feature selection, cross-validation scores and the saved
# model can be reproduced on a fresh kernel run.
RANDOM_STATE = 42

# Random forest shared by the feature-selection and training cells; trees are
# built using 8 parallel jobs.
clf = RandomForestClassifier(n_jobs=8, random_state=RANDOM_STATE)

### Feature selection

In [8]:
# Recursive feature elimination, removing one feature per step and scoring
# each candidate subset with 10-fold stratified CV on macro-F1.
rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(10), scoring='f1_macro')
X_new = rfecv.fit_transform(X, y)

print()
print("Optimal number of features : %d" % rfecv.n_features_)

column_names = filtered_data.columns[:-1].tolist()

# The support mask is positional. If X no longer has one column per original
# feature (e.g. this cell is re-run after X was replaced by the reduced
# matrix), names and mask would be silently misaligned.
assert len(column_names) == len(rfecv.support_), (
    "feature names and RFECV support mask differ in length; "
    "re-run the notebook from the top"
)

# Split the names by the mask, keeping the original column order so the
# printed and pickled lists are identical from run to run.
selected_features = [name for name, keep in zip(column_names, rfecv.support_) if keep]
discarded_features = [name for name, keep in zip(column_names, rfecv.support_) if not keep]

print()
print("Original Features on Data: ", column_names)
print("Ranking of Features: ", rfecv.ranking_)
print("Selected Features: ", selected_features)
print("Discarded Features: ", discarded_features)


Optimal number of features : 37

Original Features on Data:  ['BlockAddition', 'BlockEdit', 'BlockRelocation', 'BlockRemoval', 'RoleVariableAddition', 'RoleVariableEdit', 'RoleVariableRelocation', 'RoleVariableRemoval', 'RoleVarFileAddition', 'RoleVarFileRelocation', 'RoleVarFileRemoval', 'DefaultVariableAddition', 'DefaultVariableEdit', 'DefaultVariableRelocation', 'DefaultVariableRemoval', 'DefaultVarFileAddition', 'DefaultVarFileRelocation', 'DefaultVarFileRemoval', 'DependencyAddition', 'DependencyRemoval', 'HandlerBlockAddition', 'HandlerBlockEdit', 'HandlerBlockRelocation', 'HandlerBlockRemoval', 'HandlerTaskAddition', 'HandlerTaskEdit', 'HandlerTaskRelocation', 'HandlerTaskRemoval', 'HandlerFileAddition', 'HandlerFileRelocation', 'HandlerFileRemoval', 'MetaEdit', 'PlatformAddition', 'PlatformRemoval', 'TaskAddition', 'TaskEdit', 'TaskRelocation', 'TaskRemoval', 'TaskFileAddition', 'TaskFileRelocation', 'TaskFileRemoval']
Ranking of Features:  [1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 

### Stratified Training

In [9]:
cv = StratifiedKFold(n_splits=10)

# From here on, work with the feature subset chosen by RFECV.
X = X_new

precision_scores = list()
recall_scores = list()
confusion_matrices = list()

# Pin the label order so every per-fold confusion matrix has the same shape
# and the matrices can be summed element-wise.
class_labels = np.unique(y)

for train, test in cv.split(X, y):
    clf.fit(X[train], y[train])
    y_predict = clf.predict(X[test])

    precision_scores.append(precision_score(y[test], y_predict, average="macro"))
    recall_scores.append(recall_score(y[test], y_predict, average="macro"))
    confusion_matrices.append(confusion_matrix(y[test], y_predict, labels=class_labels))

print()
print("Mean of Precisions: ", np.mean(precision_scores))
print("Mean of Recalls: ", np.mean(recall_scores))

print("Confusion matrix:")
print(np.sum(confusion_matrices, axis=0))

# After the loop `clf` has only seen the training split of the last fold.
# Refit on the complete data so the estimator persisted later is trained on
# every available sample.
clf.fit(X, y)


Mean of Precisions:  0.8371557497695594
Mean of Recalls:  0.7565171016649413
Confusion matrix:
[[47625   667   347]
 [  585  5769   188]
 [ 1445   489  1336]]


### Saving

In [11]:
def _dump_pickle(obj, path):
    """Pickle `obj` to `path`, closing the file handle even if dumping fails."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


# Model
_dump_pickle(clf, "data/models/randomForest_old_data.pickle")

# Data Transformed
_dump_pickle(X, "data/transformed_data/X_train_old.pickle")
_dump_pickle(y, "data/transformed_data/y_train_old.pickle")

# Feature-related
_dump_pickle(column_names, "data/features/features.pickle")
_dump_pickle(rfecv.ranking_, "data/features/ranking_features.pickle")
_dump_pickle(selected_features, "data/features/selected_features.pickle")
_dump_pickle(discarded_features, "data/features/discarded_features.pickle")