### Loading libraries

In [1]:
import pickle
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix

### Loading data

In [2]:
dataframe = pd.read_csv("data/new_data/old_records.csv.gz")

### Droping null values and discarded features from the previous step

In [3]:
data_null_values = dataframe[dataframe.isna().any(axis=1)]
len_null = len(data_null_values)
len_data = len(dataframe)

# Print message
print("Validity of the data:")
print(f"{len_null} out of {len_data} contains at least one null value", end=" ")
print(f"representing {len_null / len_data} % of the instances")

cleaned_data = dataframe.dropna(how='any', axis=0)
float_columns = cleaned_data.select_dtypes(include=['float64'])

for col in float_columns.columns.values:
    cleaned_data[col] = cleaned_data[col].astype('int64')
    
previous_discarded_features = ['TaskFileRelocation', 'RoleVariableRelocation', 'HandlerBlockEdit', 
                               'DefaultVarFileRemoval', 'DefaultVarFileRelocation', 'HandlerFileRemoval', 
                               'DefaultVariableRelocation', 'HandlerBlockRelocation', 'RoleVarFileRelocation', 
                               'HandlerFileRelocation', 'HandlerTaskRemoval', 'HandlerFileAddition']

excluded_data = ['id', 'v1', 'v2', 'HandlerBlockAddition'] + previous_discarded_features

filtered_data = cleaned_data.drop(excluded_data, axis=1)

Validity of the data:
0 out of 66137 contains at least one null value representing 0.0 % of the instances


### Transform data into Numpy arrays

In [98]:
to_transform = {"release": {"patch": 0, "minor": 1, "major": 2}}
filtered_data.replace(to_transform, inplace=True)

# Transforming data into Numpy arrays
X = filtered_data[filtered_data.columns[:-1]].to_numpy()
y = filtered_data[filtered_data.columns[-1]].to_numpy()

# Print shapes
print()
print("Shapes of the data:")
print(X.shape)
print(y.shape)


Shapes of the data:
(66137, 28)
(66137,)


### Configuring Undersampling Techniques

In [99]:
from imblearn.under_sampling import RandomUnderSampler, NearMiss, NeighbourhoodCleaningRule, TomekLinks, OneSidedSelection

# undersampler = RandomUnderSampler()
# undersampler = NearMiss()
# undersampler = NeighbourhoodCleaningRule(n_jobs=-1)
undersampler = TomekLinks(n_jobs=-1)
# undersampler = OneSidedSelection(n_neighbors=5, n_jobs=-1)

### Configuring classifier

In [100]:
clf = RandomForestClassifier(n_jobs=8)

### Making pipeline

In [106]:
from imblearn.pipeline import make_pipeline

pipeline = make_pipeline(undersampler, clf)

### Stratified Training

In [107]:
cv = StratifiedKFold(n_splits=10)

precision_scores = list()
recall_scores = list()
confusion_matrices = list()

for i, (train, test) in enumerate(cv.split(X, y)):
    y_predict = pipeline.fit(X[train], y[train]).predict(X[test])

    precision_value = precision_score(y[test], y_predict, average="macro")
    recall_value = recall_score(y[test], y_predict, average="macro")
    conf_matrix = confusion_matrix(y[test], y_predict)

    precision_scores.append(precision_value)
    recall_scores.append(recall_value)
    confusion_matrices.append(conf_matrix)

print()
print("Mean of Precisions: ", np.mean(precision_scores))
print("Mean of Recalls: ", np.mean(recall_scores))
    
print("Confusion matrix:")
print(sum(confusion_matrices))


Mean of Precisions:  0.7228779637924349
Mean of Recalls:  0.565713583471212
Confusion matrix:
[[44856  2884   233]
 [ 8474  6315   216]
 [ 1499   582  1078]]


### Saving

In [17]:
# Model
pickle.dump(clf, open("data/models/randomForest_old_data.pickle", "wb"))

# Data Transformed
pickle.dump(X, open("data/transformed_data/X_train_old.pickle", "wb"))
pickle.dump(y, open("data/transformed_data/y_train_old.pickle", "wb"))

# Feature-related
pickle.dump(column_names, open("data/features/features.pickle", "wb"))
pickle.dump(rfecv.ranking_, open("data/features/ranking_features.pickle", "wb"))
pickle.dump(selected_features, open("data/features/selected_features.pickle", "wb"))
pickle.dump(discarded_features, open("data/features/discarded_features.pickle", "wb"))