In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold as SKF
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
test = pd.read_csv('metrics_data/cloudify_metrics.csv', low_memory=True)
train = pd.read_csv('metrics_data/gradle_metrics copy 2.csv', low_memory=True)

In [3]:
test['is_train'] = 0
train['is_train'] = 1 

In [4]:
df_combine = pd.concat([train, test], axis=0, ignore_index=True)

In [5]:
y = df_combine['is_train'].values #labels
x = df_combine.drop('is_train', axis=1).values #covariates or our independent variables

In [6]:
m = RandomForestClassifier(n_jobs=-1, max_depth=5, min_samples_leaf = 5)
predictions = np.zeros(y.shape)

In [28]:
skf = SKF(n_splits=20, shuffle=True, random_state=100)
for fold, (train_idx, test_idx) in enumerate(skf.split(x, y)):
 X_train, X_test = x[train_idx], x[test_idx]
 y_train, y_test = y[train_idx], y[test_idx]
 
 m.fit(X_train, y_train)
 probs = m.predict_proba(X_test)[:, 1] #calculating the probability
 predictions[test_idx] = probs

In [29]:
print("ROC-AUC for train and test distributions:", auc(y, predictions))

ROC-AUC for train and test distributions: 0.49531862488662765


In [30]:
print(predictions)

[0.87415043 0.79598332 0.8411511  ... 0.33158128 0.20217515 0.03843564]


In [31]:
print(X_test)

[[20169913       38        0 ...        0     1170     1170]
 [21072770        2        0 ...        0      966      966]
 [21368488      952        0 ...        0     1115     1115]
 ...
 [24881389        4        0 ...        0      191      191]
 [25182822        4        0 ...        0      195      195]
 [25480294        4        0 ...        0      199      199]]


In [32]:
probs = m.predict(X_test)
predictions[test_idx] = probs

In [33]:
print(probs)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [27]:
print("ROC-AUC for train and test distributions:", auc(y, predictions))

ROC-AUC for train and test distributions: 0.494937022850195


In [34]:
with np.printoptions(threshold=np.inf):
    print(y)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 

In [35]:
with np.printoptions(threshold=np.inf):
    print(predictions)

[8.74150433e-01 7.95983321e-01 8.41151097e-01 8.40701521e-01
 1.00000000e+00 9.69723739e-01 9.07824172e-01 2.99935286e-01
 3.01927242e-01 6.62701679e-01 9.21676534e-01 9.11078189e-01
 9.45697738e-01 9.29648156e-01 2.63585054e-01 7.57014959e-01
 9.43648321e-01 9.73374358e-01 9.05152213e-01 9.53712810e-01
 9.61609155e-01 9.12152852e-01 9.74130326e-01 2.63421627e-01
 4.44305560e-01 3.82212665e-01 9.69422830e-01 9.21963793e-01
 9.56565834e-01 3.86862220e-01 9.27484562e-01 6.32827239e-01
 9.53678999e-01 9.34463296e-01 9.57158081e-01 9.54981147e-01
 9.57705109e-01 9.61119978e-01 9.52741980e-01 9.61607315e-01
 9.51148298e-01 9.01814283e-01 9.65140235e-01 9.34135078e-01
 9.57433102e-01 9.58810564e-01 9.45726512e-01 9.64766542e-01
 9.76509470e-01 9.73683751e-01 9.45618634e-01 9.88970000e-01
 9.54651023e-01 9.44950391e-01 9.66419924e-01 9.94560673e-01
 9.85582113e-01 9.61542270e-01 9.71388333e-01 9.83764505e-01
 9.77281490e-01 9.26122797e-01 9.22539258e-01 1.00000000e+00
 4.07850917e-01 9.745813