# 

# 1. Data Preparation
## 1. Feature Selection
Before we can partition our data, we need to select only the targeted features in the specification. This step requires us to select the appropriate features within the data and further clean it before we are ready to train

In [1]:
import os
import glob
import pandas as pd
from pathlib import Path as path
import matplotlib
matplotlib.use('Qt5Agg') 
import matplotlib.pyplot as plt
import numpy as np
import featuretools as ft
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import RFECV, VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix,
                             f1_score, precision_score, recall_score, roc_auc_score)
from sklearn.model_selection import StratifiedKFold, train_test_split

  from pandas_profiling import ProfileReport


In order to simplify repertory changes, we'll initialize two variables, containning the paths of this current repository and the path of your clone of the Apache Hive repertory.

In [2]:
project_repo = path("/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model")
hive_repo = path("/home/nicolas-richard/Desktop/.Apache_Hive")

In [3]:
def extract_version(filename):
    file_part = filename.split('/')[-1]

    parts = file_part.split('-')
    if len(parts) < 2:
        raise ValueError("Filename does not contain a version segment.")

    version_part = parts[1].split('_')[0]
    return version_part

In [4]:
input_files = glob.glob(os.path.join(project_repo, 'UND_hive_updated_data', '*.csv'))

concatenated_df = pd.DataFrame()

for file in input_files:
    df = pd.read_csv(file)
    
    version = extract_version(file)
    df.insert(loc=2, column='Version', value=version)
    concatenated_df = pd.concat([concatenated_df, df], ignore_index=True)

concatenated_df.info()
count = concatenated_df['Version'].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147744 entries, 0 to 147743
Data columns (total 100 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Bug                                 147744 non-null  int64  
 1   FileName                            147744 non-null  object 
 2   Version                             147744 non-null  object 
 3   AvgCountLine                        147744 non-null  float64
 4   AvgCountLineBlank                   147744 non-null  float64
 5   AvgCountLineCode                    147744 non-null  float64
 6   AvgCountLineComment                 147744 non-null  float64
 7   AvgCyclomatic                       147744 non-null  float64
 8   AvgCyclomaticModified               147744 non-null  float64
 9   AvgCyclomaticStrict                 147744 non-null  float64
 10  AvgCyclomaticStrictModified         147744 non-null  float64
 11  AvgEssential             

In [5]:
print(concatenated_df['Version'].nunique())
concatenated_df.info()

23
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147744 entries, 0 to 147743
Data columns (total 100 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Bug                                 147744 non-null  int64  
 1   FileName                            147744 non-null  object 
 2   Version                             147744 non-null  object 
 3   AvgCountLine                        147744 non-null  float64
 4   AvgCountLineBlank                   147744 non-null  float64
 5   AvgCountLineCode                    147744 non-null  float64
 6   AvgCountLineComment                 147744 non-null  float64
 7   AvgCyclomatic                       147744 non-null  float64
 8   AvgCyclomaticModified               147744 non-null  float64
 9   AvgCyclomaticStrict                 147744 non-null  float64
 10  AvgCyclomaticStrictModified         147744 non-null  float64
 11  AvgEssential          

In [6]:
concatenated_df.drop_duplicates
data = concatenated_df.iloc[:,[0,7,8,9,11,3,4,5,6,20,21,22,27,28,29,33,36,38,39,
                               40,42,43,44,45,46,47,53,54,55,57,64,65,66,75,76,
                               77,78,80,16,17,19,70,73,82,83,84,85,86,87,88,89,
                               90,91,92,93]]
data.dropna(inplace=True)
data = data.reset_index(drop=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147340 entries, 0 to 147339
Data columns (total 55 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Bug                        147340 non-null  int64  
 1   AvgCyclomatic              147340 non-null  float64
 2   AvgCyclomaticModified      147340 non-null  float64
 3   AvgCyclomaticStrict        147340 non-null  float64
 4   AvgEssential               147340 non-null  float64
 5   AvgCountLine               147340 non-null  float64
 6   AvgCountLineBlank          147340 non-null  float64
 7   AvgCountLineCode           147340 non-null  float64
 8   AvgCountLineComment        147340 non-null  float64
 9   CountDeclClass             147340 non-null  float64
 10  CountDeclClassMethod       147340 non-null  float64
 11  CountDeclClassVariable     147340 non-null  float64
 12  CountDeclFunction          147340 non-null  float64
 13  CountDeclInstanceMethod    14

We now have 55 columns. This includes all the *46 independent variables* at study (37 related to files, 5 related to classes and 4 related to methods (expanded to 12 for simplicity's sake) mentionned in *Yatish et al., 2019*) in addition to the first column, specify whether there is a bug in a given file. The dataframe is structured as follows

In [7]:
profile = ProfileReport(data, 
                        title='Apache Hive Bug Dataset Profile Report',
                        pool_size=4,
                        html={'style': {'full_width': True}},
                        minimal=True)
profile.to_notebook_iframe()
data.to_csv("Data_Preparation_Output.csv",index=False)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

## 2. Outlier Detection and Handling
In this step, we'll get rid of the outliers within our data

In [8]:
numeric_features = data.select_dtypes(include=['float64']).columns.tolist()

def remove_outliers(df, numeric_cols, std_threshold=3):
    df_clean = data.copy()
    
    for col in numeric_cols:
        mean = df_clean[col].mean()
        std = df_clean[col].std()
        df_clean = df_clean[abs(df_clean[col] - mean) <= std_threshold * std]
    
    return df_clean

data = remove_outliers(data, numeric_features)

In [9]:
plt.figure(figsize=(20, 15))
for i, feature in enumerate(numeric_features, 1):
    plt.subplot(15, 4, i)
    sns.boxplot(x=data[feature])
    plt.title(f'{feature}')
plt.tight_layout()
plt.show()


  plt.show()


## 3. Data Partitionning
As per the specification of the project, we'll need to test different train-test partitions. We'll settle on a given partition size for each trainning model in the following notebook. We need to do the following at this point in order to follow-up with data engineering.
We have setttled on the following partition train-test partion sizes:
- 30 - 70
- 50 - 50
- 70 - 30
- 80 - 20
- 90 - 10

As we have a highly unbalanced dataset, we'll use Stratified Shuffle Split

In [10]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

X = data.drop('Bug', axis=1)
Y = data['Bug']

for train_index, test_index in sss.split(X, Y):
    X_train_full, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train_full, y_test = Y.iloc[train_index], Y.iloc[test_index]

train_sizes = [0.3, 0.5, 0.7, 0.8, 0.9]
train_subsets = []

for size in train_sizes:
    sss_subset = StratifiedShuffleSplit(n_splits=1, train_size=size, random_state=42)
    for train_subset_index, _ in sss_subset.split(X_train_full, y_train_full):
        X_train_subset = X_train_full.iloc[train_subset_index]
        Y_train_subset = y_train_full.iloc[train_subset_index]
        train_subsets.append((X_train_subset, Y_train_subset))

for i, (X_subset, Y_subset) in enumerate(train_subsets):
    print(f"\nTraining Size {train_sizes[i]*100}% - Class Distribution:\n", Y_subset.value_counts(normalize=True))


Training Size 30.0% - Class Distribution:
 Bug
0    0.992689
1    0.007311
Name: proportion, dtype: float64

Training Size 50.0% - Class Distribution:
 Bug
0    0.992677
1    0.007323
Name: proportion, dtype: float64

Training Size 70.0% - Class Distribution:
 Bug
0    0.992672
1    0.007328
Name: proportion, dtype: float64

Training Size 80.0% - Class Distribution:
 Bug
0    0.992682
1    0.007318
Name: proportion, dtype: float64

Training Size 90.0% - Class Distribution:
 Bug
0    0.992669
1    0.007331
Name: proportion, dtype: float64


## 4. Feature & Correlation Analysis
Here, we want to remove low variance features in or trainning sets and to identify and remove highly correlated features

In [11]:
selector = VarianceThreshold(threshold=0.01)

for i, (X_subset, Y_subset) in enumerate(train_subsets):
    print(f"\nProcessing Subset {i + 1}")

    X_subset_transformed = selector.fit_transform(X_subset)

    features_kept = X_subset.columns[selector.get_support(indices=True)]

    X_subset = pd.DataFrame(X_subset_transformed, columns=features_kept)

    corr_matrix = X_subset.corr().abs()

    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

    grouped_features = {
        'AvgCyclomatic': ['AvgCyclomaticModified', 'AvgCyclomaticStrict'],
        'LineCounts': ['CountLine', 'CountLineCode', 'CountLineCodeExe', 'CountLineBlank']
    }
    for group_name, group_features in grouped_features.items():
        if set(group_features).issubset(X_subset.columns):
            X_subset[group_name] = X_subset[group_features].mean(axis=1)
            to_drop.extend(group_features)

    to_drop = list(set(to_drop))

    X_reduced = X_subset.drop(columns=to_drop)

    print(f"Removed columns: {to_drop}")
    print(f"Subset {i + 1}: {len(to_drop)} columns removed")

    train_subsets[i] = (X_reduced, Y_subset)


Processing Subset 1
Removed columns: ['CountLineBlank', 'MaxCyclomaticModified', 'AvgCountLineCode', 'AvgCyclomaticStrict', 'CountDeclInstanceMethod', 'CountStmtDecl', 'CountDeclMethod', 'CountStmtExe', 'MaxCyclomaticStrict', 'CountLineCodeExe', 'AvgCyclomaticModified', 'CountLineCode', 'SumCyclomaticModified', 'SumCyclomaticStrict', 'CountLine', 'CountStmt']
Subset 1: 16 columns removed

Processing Subset 2
Removed columns: ['CountLineBlank', 'MaxCyclomaticModified', 'AvgCountLineCode', 'AvgCyclomaticStrict', 'CountDeclInstanceMethod', 'CountStmtDecl', 'CountDeclMethod', 'CountStmtExe', 'MaxCyclomaticStrict', 'CountLineCodeExe', 'AvgCyclomaticModified', 'CountLineCode', 'SumCyclomaticModified', 'SumCyclomaticStrict', 'CountLine', 'CountStmt']
Subset 2: 16 columns removed

Processing Subset 3
Removed columns: ['CountLineBlank', 'MaxCyclomaticModified', 'AvgCountLineCode', 'AvgCyclomaticStrict', 'CountDeclInstanceMethod', 'CountStmtDecl', 'CountDeclMethod', 'CountStmtExe', 'MaxCyclomat

## 5. *Synthetic Minority Oversampling Technique*

In [12]:
for i, (X_subset, Y_subset) in enumerate(train_subsets):
    print(f"\nOriginal Class Distribution for {train_sizes[i]*100}% Training Size:")
    print(Counter(Y_subset))

    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_subset, Y_subset)

    print(f"Resampled Class Distribution for {train_sizes[i]*100}% Training Size:")
    print(Counter(y_train_res))


Original Class Distribution for 30.0% Training Size:
Counter({0: 16022, 1: 118})
Resampled Class Distribution for 30.0% Training Size:
Counter({0: 16022, 1: 16022})

Original Class Distribution for 50.0% Training Size:
Counter({0: 26704, 1: 197})
Resampled Class Distribution for 50.0% Training Size:
Counter({0: 26704, 1: 26704})

Original Class Distribution for 70.0% Training Size:
Counter({0: 37386, 1: 276})
Resampled Class Distribution for 70.0% Training Size:
Counter({0: 37386, 1: 37386})

Original Class Distribution for 80.0% Training Size:
Counter({0: 42727, 1: 315})
Resampled Class Distribution for 80.0% Training Size:
Counter({0: 42727, 1: 42727})

Original Class Distribution for 90.0% Training Size:
Counter({0: 48067, 1: 355})
Resampled Class Distribution for 90.0% Training Size:
Counter({0: 48067, 1: 48067})


In [13]:
for i, (X_subset, Y_subset) in enumerate(train_subsets):
    print(f"{train_sizes[i]*100}% Training Size:")
    print("Features/Columns in this subset:")
    print(X_subset.columns.tolist())
    print(f"Number of features/columns: {len(X_subset.columns)}\n\n")

30.0% Training Size:
Features/Columns in this subset:
['AvgCyclomatic', 'AvgEssential', 'AvgCountLine', 'AvgCountLineBlank', 'AvgCountLineComment', 'CountDeclClass', 'CountDeclClassMethod', 'CountDeclClassVariable', 'CountDeclFunction', 'CountDeclInstanceVariable', 'CountDeclMethodDefault', 'CountDeclMethodPrivate', 'CountDeclMethodProtected', 'CountDeclMethodPublic', 'CountLineCodeDecl', 'CountLineComment', 'CountSemicolon', 'MaxCyclomatic', 'RatioCommentToCode', 'SumCyclomatic', 'SumEssential', 'CountInputMin', 'CountInputMean', 'CountInputMax', 'CountOutputMin', 'CountOutputMean', 'CountOutputMax', 'CountPathMin', 'CountPathMean', 'CountPathMax', 'MaxNestingMin', 'MaxNestingMean', 'MaxNestingMax', 'LineCounts']
Number of features/columns: 34


50.0% Training Size:
Features/Columns in this subset:
['AvgCyclomatic', 'AvgEssential', 'AvgCountLine', 'AvgCountLineBlank', 'AvgCountLineComment', 'CountDeclClass', 'CountDeclClassMethod', 'CountDeclClassVariable', 'CountDeclFunction', 'Count

# 2. Trainning with Cross-Validation

## 1. Logistic Regression

In [14]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

log_reg = LogisticRegression(
    solver='saga',
    penalty='l1',
    max_iter=10000,
    n_jobs=-1,
    random_state=42
)

rfecv_log_reg = RFECV(
    estimator=log_reg,
    step=0.2,
    cv=cv,
    scoring='roc_auc',
    verbose=0,
    n_jobs=-1,
    min_features_to_select=10
)

rfecv_log_reg_pipeline = Pipeline([
    ('scaler', StandardScaler()),  #
    ('rfecv', rfecv_log_reg)
])

for i, (X_subset, Y_subset) in enumerate(train_subsets):
    print(f"\nTraining on Subset {i + 1}:")

    sss = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    for train_index, val_index in sss.split(X_subset, Y_subset):
        X_train, X_val = X_subset.iloc[train_index], X_subset.iloc[val_index]
        Y_train, Y_val = Y_subset.iloc[train_index], Y_subset.iloc[val_index]

    rfecv_log_reg_pipeline.fit(X_train, Y_train)

    y_pred_log_reg = rfecv_log_reg_pipeline.predict(X_val)
    y_pred_proba_log_reg = rfecv_log_reg_pipeline.predict_proba(X_val)[:, 1]

    roc_auc = roc_auc_score(Y_val, y_pred_proba_log_reg)
    print(f"ROC-AUC Score for Logistic Regression on Subset {i + 1}: {roc_auc:.4f}")
    print(classification_report(Y_val, y_pred_log_reg))



Training on Subset 1:
ROC-AUC Score for Logistic Regression on Subset 1: 0.6341
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5340
           1       0.00      0.00      0.00        40

    accuracy                           0.99      5380
   macro avg       0.50      0.50      0.50      5380
weighted avg       0.99      0.99      0.99      5380


Training on Subset 2:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ROC-AUC Score for Logistic Regression on Subset 2: 0.6041
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      8901
           1       0.00      0.00      0.00        66

    accuracy                           0.99      8967
   macro avg       0.50      0.50      0.50      8967
weighted avg       0.99      0.99      0.99      8967


Training on Subset 3:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ROC-AUC Score for Logistic Regression on Subset 3: 0.6572
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     12462
           1       0.00      0.00      0.00        92

    accuracy                           0.99     12554
   macro avg       0.50      0.50      0.50     12554
weighted avg       0.99      0.99      0.99     12554


Training on Subset 4:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KeyboardInterrupt: 

## 2. Random Forest

In [None]:
random_forest = RandomForestClassifier(
    n_estimators=1500,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=5,
    max_features='sqrt',
    bootstrap=True,
    n_jobs=-1,
    random_state=42
)

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

rfecv_rf = RFECV(
    estimator=random_forest,
    step=0.2,  # Remove 20% of features per iteration
    cv=cv,
    scoring='roc_auc',
    verbose=0,
    n_jobs=-1,
    min_features_to_select=10
)

variance_threshold = VarianceThreshold(threshold=0.01)  # Example threshold
rfecv_rf_pipeline = Pipeline([
    ('variance_threshold', variance_threshold),
    ('scaler', StandardScaler()),
    ('rfecv', rfecv_rf)
])

for i, (X_subset, Y_subset) in enumerate(train_subsets):
    print(f"\nTraining on Subset {i + 1}:")

    sss = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    for train_index, val_index in sss.split(X_subset, Y_subset):
        X_train, X_val = X_subset.iloc[train_index], X_subset.iloc[val_index]
        Y_train, Y_val = Y_subset.iloc[train_index], Y_subset.iloc[val_index]

    rfecv_rf_pipeline.fit(X_train, Y_train)

    y_pred_rf = rfecv_rf_pipeline.predict(X_val)
    y_pred_proba_rf = rfecv_rf_pipeline.predict_proba(X_val)[:, 1]

    roc_auc = roc_auc_score(Y_val, y_pred_proba_rf)
    print(f"ROC-AUC Score for Random Forest on Subset {i + 1}: {roc_auc:.4f}")
    print(classification_report(Y_val, y_pred_rf))

    selected_features = rfecv_rf.support_
    print(f"Selected Features for Subset {i + 1}: {X_train.columns[selected_features].tolist()}")


Training on Subset 1:
ROC-AUC Score for Random Forest on Subset 1: 0.6495
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5340
           1       0.00      0.00      0.00        40

    accuracy                           0.99      5380
   macro avg       0.50      0.50      0.50      5380
weighted avg       0.99      0.99      0.99      5380

Selected Features for Subset 1: ['AvgCountLine', 'CountDeclClassMethod', 'CountDeclFunction', 'CountDeclInstanceVariable', 'CountDeclMethodDefault', 'CountDeclMethodProtected', 'CountDeclMethodPublic', 'CountLineCodeDecl', 'CountLineComment', 'CountSemicolon', 'RatioCommentToCode', 'SumCyclomatic', 'SumEssential', 'CountInputMin', 'CountInputMean', 'CountInputMax', 'CountOutputMin', 'CountOutputMean', 'CountOutputMax', 'CountPathMean', 'MaxNestingMean', 'LineCounts']

Training on Subset 2:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ROC-AUC Score for Random Forest on Subset 2: 0.7028
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      8901
           1       0.00      0.00      0.00        66

    accuracy                           0.99      8967
   macro avg       0.50      0.50      0.50      8967
weighted avg       0.99      0.99      0.99      8967

Selected Features for Subset 2: ['AvgCyclomatic', 'AvgEssential', 'AvgCountLine', 'AvgCountLineBlank', 'AvgCountLineComment', 'CountDeclClass', 'CountDeclClassMethod', 'CountDeclClassVariable', 'CountDeclFunction', 'CountDeclInstanceVariable', 'CountDeclMethodDefault', 'CountDeclMethodPrivate', 'CountDeclMethodProtected', 'CountDeclMethodPublic', 'CountLineCodeDecl', 'CountLineComment', 'CountSemicolon', 'MaxCyclomatic', 'RatioCommentToCode', 'SumCyclomatic', 'SumEssential', 'CountInputMin', 'CountInputMean', 'CountInputMax', 'CountOutputMin', 'CountOutputMean', 'CountOutputMax', 'CountPathMin', 'CountPathMean', '

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ROC-AUC Score for Random Forest on Subset 3: 0.7216
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     12462
           1       0.00      0.00      0.00        92

    accuracy                           0.99     12554
   macro avg       0.50      0.50      0.50     12554
weighted avg       0.99      0.99      0.99     12554

Selected Features for Subset 3: ['AvgCountLine', 'CountDeclClassVariable', 'CountDeclFunction', 'CountDeclInstanceVariable', 'CountDeclMethodPublic', 'CountLineCodeDecl', 'CountLineComment', 'CountSemicolon', 'MaxCyclomatic', 'RatioCommentToCode', 'SumCyclomatic', 'SumEssential', 'CountInputMin', 'CountInputMean', 'CountInputMax', 'CountOutputMin', 'CountOutputMean', 'CountOutputMax', 'CountPathMean', 'CountPathMax', 'MaxNestingMean', 'LineCounts']

Training on Subset 4:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ROC-AUC Score for Random Forest on Subset 4: 0.7456
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     14242
           1       0.00      0.00      0.00       105

    accuracy                           0.99     14347
   macro avg       0.50      0.50      0.50     14347
weighted avg       0.99      0.99      0.99     14347

Selected Features for Subset 4: ['AvgCyclomatic', 'AvgCountLine', 'AvgCountLineBlank', 'CountDeclClassMethod', 'CountDeclClassVariable', 'CountDeclFunction', 'CountDeclInstanceVariable', 'CountDeclMethodDefault', 'CountDeclMethodPrivate', 'CountDeclMethodProtected', 'CountDeclMethodPublic', 'CountLineCodeDecl', 'CountLineComment', 'CountSemicolon', 'MaxCyclomatic', 'RatioCommentToCode', 'SumCyclomatic', 'SumEssential', 'CountInputMin', 'CountInputMean', 'CountInputMax', 'CountOutputMin', 'CountOutputMean', 'CountOutputMax', 'CountPathMean', 'CountPathMax', 'MaxNestingMean', 'LineCounts']

Training on Subset 5:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ROC-AUC Score for Random Forest on Subset 5: 0.7340
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     16022
           1       0.00      0.00      0.00       118

    accuracy                           0.99     16140
   macro avg       0.50      0.50      0.50     16140
weighted avg       0.99      0.99      0.99     16140

Selected Features for Subset 5: ['AvgCountLine', 'AvgCountLineBlank', 'AvgCountLineComment', 'CountDeclClassMethod', 'CountDeclClassVariable', 'CountDeclFunction', 'CountDeclInstanceVariable', 'CountDeclMethodDefault', 'CountDeclMethodPrivate', 'CountDeclMethodProtected', 'CountDeclMethodPublic', 'CountLineCodeDecl', 'CountLineComment', 'CountSemicolon', 'MaxCyclomatic', 'RatioCommentToCode', 'SumCyclomatic', 'SumEssential', 'CountInputMin', 'CountInputMean', 'CountInputMax', 'CountOutputMin', 'CountOutputMean', 'CountOutputMax', 'CountPathMean', 'CountPathMax', 'MaxNestingMean', 'LineCounts']


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
