In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#1. LOADING & PRE-PROCESSING CLASS-LEVEL DATASET
# Load class-level dataset
df = pd.read_csv('Data/class-smell2.csv', low_memory=False)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373400 entries, 0 to 373399
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Address                   373400 non-null  object 
 1   Brain Class               373400 non-null  bool   
 2   Data Class                373400 non-null  bool   
 3   Futile Abstract Pipeline  373400 non-null  bool   
 4   Futile Hierarchy          373400 non-null  bool   
 5   God Class                 373400 non-null  bool   
 6   Hierarchy Duplication     373400 non-null  bool   
 7   Model Class               373400 non-null  bool   
 8   Schizofrenic Class        373400 non-null  bool   
 9   ABUSEINH                  373400 non-null  int64  
 10  AMW                       373400 non-null  float64
 11  ATFD                      373400 non-null  int64  
 12  BOvM                      373400 non-null  int64  
 13  BUR                       373400 non-null  f

In [9]:
# Check missing data in dataset
for col in df.columns:
  missing_data=df[col].isna().sum()
  if (missing_data>0):
    print(f"column {col} has {missing_data} missing data")

In [10]:
# Define and initialise a predictive result dataset
rs= pd.DataFrame({'Code_smell':[],'Algo':[],'Balance':[],'Ratio':[] , 'Accuracy':[],'Precision':[], 'F1_score':[],'AUC':[]})

In [11]:
# 2. BUILDING THE MACHINELEARNING MODEL

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, roc_auc_score, f1_score
def _train_and_test(model, _data_train, algo):
  global newResult, accuracy,precision,f1,roc
  model.fit(_data_train[features], _data_train[target])
  predictions = model.predict_proba(data_test[features])
  pred_label = model.predict(data_test[features]) 
  accuracy = accuracy_score(data_test[target], pred_label)
  precision = precision_score(data_test[target], pred_label)
  f1 = f1_score(data_test[target], pred_label)
  roc = roc_auc_score(data_test[target], predictions[:,1])
  print('{} Accuracy score on test: {}'.format(algo, accuracy))
  print('{} Precision score on test: {}'.format(algo, precision))
  print('{} ROC score on test: {}'.format(algo, roc))
  print('{} F1 score on test: {}'.format(algo, f1))
  print('{} Classification Report: '.format(algo))
  print(classification_report(data_test[target], pred_label))
  newResult = {'Code_smell':target,'Algo':_algo,'Balance':_balance,'Ratio':_ratio , 'Accuracy':accuracy,'Precision':precision, 'F1_score':f1,'AUC':roc}
  return newResult

In [12]:
# 3. SEQUENTLY, CODE SMELL PREDICTING BY EACH OTHER MODELS
features = list(df.select_dtypes(include=['int64', 'float64']).columns)
target = 'Brain Class'
df[target] = df[target].astype(int)

In [13]:
# Split the Brain-class dataset into subsets: training-set, validation-set, and testing-set.
y = df[target]
X = df[features]

id_pos = np.where(y.values.reshape(-1) == 1)[0]
id_neg = np.where(y.values.reshape(-1) == 0)[0]

np.random.shuffle(id_pos)
np.random.shuffle(id_neg)

train_pos_size = 500
train_neg_size = 223500
val_pos_size = 170
val_neg_size = 74500

In [14]:
# Creating training-set:
id_train_pos = id_pos[:train_pos_size]
id_train_neg = id_neg[:train_neg_size] 
id_train = np.concatenate((id_train_pos, id_train_neg), axis = 0)

In [15]:
# Creating validation-set:
id_val_pos = id_pos[train_pos_size:(train_pos_size + val_pos_size)]
id_val_neg = id_neg[train_neg_size:(train_neg_size + val_neg_size)]
id_val = np.concatenate((id_val_pos, id_val_neg), axis = 0)

In [16]:
# Creating testing-set:
id_test_pos = id_pos[(train_pos_size + val_pos_size):(train_pos_size + 2*val_pos_size)]
id_test_neg = id_neg[(train_neg_size + val_neg_size):(train_neg_size + 2*val_neg_size)]
id_test = np.concatenate((id_test_pos, id_test_neg), axis = 0)

In [17]:
# initialize datasets
data_train = df.iloc[id_train]
data_val = df.iloc[id_val]
data_test = df.iloc[id_test] 

In [18]:
## Using the Undersampling method, balancing the training-set in different ratios 
# Create the training-set in the ratio 80:20 (~ 4*train_pos_size:train_pos_size) by keeping 4*train_pos_size random negative samples from it.
np.random.shuffle(id_train_neg)
id_train_neg_80_20 = id_train_neg[:4*train_pos_size]
id_train_80_20 = np.concatenate((id_train_neg_80_20, id_train_pos), axis = 0)


In [19]:
# Create the training-set in the ratio 75:25 (~ 3*train_pos_size:train_pos_size) by keeping 3*train_pos_size random negative samples from it.
np.random.shuffle(id_train_neg)
id_train_neg_75_25 = id_train_neg[:3*train_pos_size]
id_train_75_25 = np.concatenate((id_train_neg_75_25, id_train_pos), axis = 0) 

In [20]:
# Create the training-set in the ratio 60:40 (~ 1.5*train_pos_size:train_pos_size) by keeping 1.5*train_pos_size random negative samples from it.
np.random.shuffle(id_train_neg)
id_train_neg_60_40 = id_train_neg[:int(1.5*train_pos_size)]
id_train_60_40 = np.concatenate((id_train_neg_60_40, id_train_pos), axis = 0) 


In [21]:
# initialize training-set
data_train_80_20 = df.iloc[id_train_80_20]
data_train_75_25 = df.iloc[id_train_75_25]
data_train_60_40 = df.iloc[id_train_60_40]

In [18]:
#The validation-set is used for model tuning to determine the best-selected model.
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score
import matplotlib.pyplot as plt

model_1 = RandomForestClassifier(n_estimators=100,
                                max_depth=5,
                                min_samples_split=200,
                                class_weight=None,
                                max_features=10)

model_2 = RandomForestClassifier(n_estimators=500, 
                                max_depth=10, 
                                min_samples_split=400, 
                                random_state=12, 
                                class_weight="balanced",
                                max_features="sqrt")

model_3 = RandomForestClassifier(n_estimators=800, 
                                max_depth=10, 
                                min_samples_split=200, 
                                random_state=12, 
                                class_weight="balanced",
                                max_features="sqrt")

def _tunning_model(model , X_train, y_train, X_val, y_val):
  model.fit(X_train, y_train)
  model_predictions = model.predict_proba(X_val)
  model_pred = model.predict(X_val[features]) 
  model_roc_score = roc_auc_score(y_val, 
                                  model_predictions[:,1])
  model_f1_score = f1_score(y_val, model_pred)
  return model, model_roc_score, model_f1_score

model_1, model_1_roc_score, model_1_f1_score = _tunning_model(model_1, 
                                          data_train[features], data_train[target],
                                          data_val[features], data_val[target])
print('model 1 F1 score on val dataset: ', model_1_f1_score)
#print('model 1 ROC score on validation-set: ', model_1_roc_score)

model_2, model2_roc_score, model_2_f1_score = _tunning_model(model_2, 
                                          data_train[features], data_train[target],
                                          data_val[features], data_val[target])
print('model 2 F1 score on val dataset: ', model_2_f1_score)
#print('model 2 ROC score on validation-set: ', model_2_roc_score)


model_3, model3_roc_score, model_3_f1_score = _tunning_model(model_3, 
                                          data_train[features], data_train[target],
                                          data_val[features], data_val[target])
print('model 3 F1 score on val dataset: ', model_3_f1_score)
#print('model 3 ROC score on validation-set: ', model_3_roc_score)

model 1 F1 score on val dataset:  0.7813620071684588
model 2 F1 score on val dataset:  0.5405405405405406
model 3 F1 score on val dataset:  0.5629139072847682


In [17]:

#3.1 Creating the best-selected model using Random Forest Classifier algorithm
from sklearn.ensemble import RandomForestClassifier
RFC_model = RandomForestClassifier(n_estimators=100,
                                max_depth=5,
                                min_samples_split=200,
                                class_weight=None,
                                max_features=10)
_algo = 'RFC'


In [20]:

# Training & testing the model on the imbanlance training-set.
_balance ='_None_'
_ratio = '*'
new_row = pd.DataFrame([_train_and_test(RFC_model, data_train, _algo + _balance + _ratio)])
rs = pd.concat([rs, new_row], ignore_index=True)


RFC_None_* Accuracy score on test: 0.9990491495915361
RFC_None_* Precision score on test: 0.9900990099009901
RFC_None_* ROC score on test: 0.9999776549545992
RFC_None_* F1 score on test: 0.7380073800738007
RFC_None_* Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     74500
           1       0.99      0.59      0.74       170

    accuracy                           1.00     74670
   macro avg       0.99      0.79      0.87     74670
weighted avg       1.00      1.00      1.00     74670



In [22]:
# Training & testing the model on training-set with different ratio of Undersampling balancing method.
_balance = '_unsam_'

# First ratio: 80_20
_ratio = '80_20'
new_row_80_20 = pd.DataFrame([_train_and_test(RFC_model, data_train_80_20, _algo + _balance + _ratio)])
rs = pd.concat([rs, new_row_80_20], ignore_index=True)

# Second ratio: 75_25
_ratio = '75_25'
new_row_75_25 = pd.DataFrame([_train_and_test(RFC_model, data_train_75_25, _algo + _balance + _ratio)])
rs = pd.concat([rs, new_row_75_25], ignore_index=True)

# Third ratio: 60_40
_ratio = '60_40'
new_row_60_40 = pd.DataFrame([_train_and_test(RFC_model, data_train_60_40, _algo + _balance + _ratio)])
rs = pd.concat([rs, new_row_60_40], ignore_index=True)


RFC_unsam_80_20 Accuracy score on test: 0.9948573724387304
RFC_unsam_80_20 Precision score on test: 0.30685920577617326
RFC_unsam_80_20 ROC score on test: 0.9987393604421634
RFC_unsam_80_20 F1 score on test: 0.4696132596685083
RFC_unsam_80_20 Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     74500
           1       0.31      1.00      0.47       170

    accuracy                           0.99     74670
   macro avg       0.65      1.00      0.73     74670
weighted avg       1.00      0.99      1.00     74670

RFC_unsam_75_25 Accuracy score on test: 0.9949377259943752
RFC_unsam_75_25 Precision score on test: 0.3102189781021898
RFC_unsam_75_25 ROC score on test: 0.9985388866956177
RFC_unsam_75_25 F1 score on test: 0.4735376044568245
RFC_unsam_75_25 Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     74500
           1       0.31      1.

In [28]:
# Training & testing the model on training-set with different Oversampling balancing method.
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import (RandomOverSampler, SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN)

oversam = {0: 'RandomOverSampler',
           1: 'SMOTE',
           2: 'BorderlineSMOTE',
           3: 'SVMSMOTE',
           4: 'ADASYN'}
_balance = '_oversam_'

for i, sampler in enumerate((RandomOverSampler(sampling_strategy=1, random_state=0), 
                             SMOTE(sampling_strategy=1, random_state=0),
                             BorderlineSMOTE(sampling_strategy=1, random_state=0, kind='borderline-1'),
                             SVMSMOTE(sampling_strategy=1, random_state=0),
                             ADASYN(sampling_strategy=1, random_state=0))):
    pipe_line = make_pipeline(sampler, RFC_model)
    _ratio = oversam[i]
    
    # Convert the result of _train_and_test to DataFrame
    new_row = pd.DataFrame([_train_and_test(pipe_line, data_train, _algo + _balance + _ratio)])
    
    # Concatenate the new row with the results DataFrame
    rs = pd.concat([rs, new_row], ignore_index=True)

# Save the results to a CSV file
rs.to_csv('Class_BrainClass_RFC_rs.csv', header=True, sep=';', decimal=',')


RFC_oversam_RandomOverSampler Accuracy score on test: 0.995486808624615
RFC_oversam_RandomOverSampler Precision score on test: 0.33530571992110453
RFC_oversam_RandomOverSampler ROC score on test: 0.9999215159889459
RFC_oversam_RandomOverSampler F1 score on test: 0.5022156573116692
RFC_oversam_RandomOverSampler Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     74500
           1       0.34      1.00      0.50       170

    accuracy                           1.00     74670
   macro avg       0.67      1.00      0.75     74670
weighted avg       1.00      1.00      1.00     74670

RFC_oversam_SMOTE Accuracy score on test: 0.9954198473282443
RFC_oversam_SMOTE Precision score on test: 0.33203125
RFC_oversam_SMOTE ROC score on test: 0.999206316620608
RFC_oversam_SMOTE F1 score on test: 0.49853372434017595
RFC_oversam_SMOTE Classification Report: 
              precision    recall  f1-score   support

           0  

In [30]:

#3.2 Creating the best-selected model using Light Gradient Boosting algorithm
import lightgbm as lgb

LGB_model = lgb.LGBMClassifier(n_estimator = 800, 
                                    objective = 'binary', 
                                    class_weight = 'balanced',
                                    learning_rate = 0.05,
                                    reg_alpha = 0.1,
                                    reg_lambda = 0.1,
                                    subsample = 0.8,
                                    n_job = -1,
                                    random_state = 12
                                   )
_algo = 'LGB'


In [32]:
# Training & testing the model on the imbalance training-set.
_balance = '_None_'
_ratio = '*'

# Convert the result of _train_and_test to a DataFrame before concatenation
new_row = pd.DataFrame([_train_and_test(LGB_model, data_train, _algo + _balance + _ratio)])

# Concatenate the new row with the existing DataFrame rs
rs = pd.concat([rs, new_row], ignore_index=True)


[LightGBM] [Info] Number of positive: 500, number of negative: 223500
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010732 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3179
[LightGBM] [Info] Number of data points in the train set: 224000, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
LGB_None_* Accuracy score on test: 0.9999062541850811
LGB_None_* Precision score on test: 0.9657142857142857
LGB_None_* ROC score on test: 0.9999995262534545
LGB_None_* F1 score on test: 0.9797101449275363
LGB_None_* Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     74500
           1       0.97      0.99      0.98       170

    accuracy                           

In [33]:
# Training & testing the model on training-set with different ratio of Undersampling balancing method.
_balance = '_unsam_'

# First ratio: 80_20
_ratio = '80_20'
new_row_80_20 = pd.DataFrame([_train_and_test(LGB_model, data_train_80_20, _algo + _balance + _ratio)])
rs = pd.concat([rs, new_row_80_20], ignore_index=True)

# Second ratio: 75_25
_ratio = '75_25'
new_row_75_25 = pd.DataFrame([_train_and_test(LGB_model, data_train_75_25, _algo + _balance + _ratio)])
rs = pd.concat([rs, new_row_75_25], ignore_index=True)

# Third ratio: 60_40
_ratio = '60_40'
new_row_60_40 = pd.DataFrame([_train_and_test(LGB_model, data_train_60_40, _algo + _balance + _ratio)])
rs = pd.concat([rs, new_row_60_40], ignore_index=True)


[LightGBM] [Info] Number of positive: 500, number of negative: 2000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000414 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1661
[LightGBM] [Info] Number of data points in the train set: 2500, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LGB_unsam_80_20 Accuracy score on test: 0.998660774072586
LGB_unsam_80_20 Precision score on test: 0.6296296296296297
LGB_unsam_80_20 ROC score on test: 0.999997236478484
LGB_unsam_80_20 F1 score on test: 0.7727272727272727
LGB_unsam_80_20 Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     74500
           1       0.63      1.00      0.77       170

    accuracy                           1.00     74670
   macro avg       0.81      1.00      0.89     74670
weighted avg       1.00      

In [35]:
# Training & testing the model on training-set with different Oversampling balancing method.
_balance = '_oversam_'
for i, sampler in enumerate((
    RandomOverSampler(sampling_strategy=1, random_state=0), 
    SMOTE(sampling_strategy=1, random_state=0),
    BorderlineSMOTE(sampling_strategy=1, random_state=0, kind='borderline-1'),
    SVMSMOTE(sampling_strategy=1, random_state=0),
    ADASYN(sampling_strategy=1, random_state=0)
)):
    pipe_line = make_pipeline(sampler, LGB_model)
    _ratio = oversam[i]
    
    # Convert the result of _train_and_test to DataFrame before concatenation
    new_row = pd.DataFrame([_train_and_test(pipe_line, data_train, _algo + _balance + _ratio)])
    
    # Concatenate the new row with the existing DataFrame rs
    rs = pd.concat([rs, new_row], ignore_index=True)


[LightGBM] [Info] Number of positive: 223500, number of negative: 223500
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2804
[LightGBM] [Info] Number of data points in the train set: 447000, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LGB_oversam_RandomOverSampler Accuracy score on test: 0.9999732154814517
LGB_oversam_RandomOverSampler Precision score on test: 0.9883720930232558
LGB_oversam_RandomOverSampler ROC score on test: 0.9999999999999999
LGB_oversam_RandomOverSampler F1 score on test: 0.9941520467836257
LGB_oversam_RandomOverSampler Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     74500
           1       0.99      1.00      0.99      

In [36]:

#3.3 Creating the best-selected model using KNeighbors Classifier algorithm
import lightgbm as lgb

from sklearn.neighbors import KNeighborsClassifier

KNN_model = KNeighborsClassifier(n_neighbors = 5, 
                                      weights = 'distance',
                                      algorithm = 'kd_tree',
                                      metric = 'minkowski'
                                      )
_algo = 'KNN'


In [None]:
# Training & testing the model on the imbalance training-set.
_balance = '_None_'
_ratio = '*'

# Convert the result of _train_and_test to DataFrame before concatenation
new_row = pd.DataFrame([_train_and_test(KNN_model, data_train, _algo + _balance + _ratio)])

# Concatenate the new row with the existing DataFrame rs
rs = pd.concat([rs, new_row], ignore_index=True)


In [None]:
# Training & testing the model on training-set with different ratio of Undersampling balancing method.
_balance = '_unsam_'

# First ratio: 80_20
_ratio = '80_20'
new_row_80_20 = pd.DataFrame([_train_and_test(KNN_model, data_train_80_20, _algo + _balance + _ratio)])
rs = pd.concat([rs, new_row_80_20], ignore_index=True)

# Second ratio: 75_25
_ratio = '75_25'
new_row_75_25 = pd.DataFrame([_train_and_test(KNN_model, data_train_75_25, _algo + _balance + _ratio)])
rs = pd.concat([rs, new_row_75_25], ignore_index=True)

# Third ratio: 60_40
_ratio = '60_40'
new_row_60_40 = pd.DataFrame([_train_and_test(KNN_model, data_train_60_40, _algo + _balance + _ratio)])
rs = pd.concat([rs, new_row_60_40], ignore_index=True)


In [None]:

# Training & testing the model on training-set with differrent Oversampling balancing method.
_balance ='_oversam_'
for i, sampler in enumerate((RandomOverSampler(sampling_strategy = 1, random_state=0), 
                             SMOTE(sampling_strategy = 1, random_state=0),
                             BorderlineSMOTE(sampling_strategy = 1, random_state=0, kind='borderline-1'),
                             SVMSMOTE(sampling_strategy = 1, random_state=0),
                             ADASYN(sampling_strategy = 1, random_state=0))):
  pipe_line = make_pipeline(sampler, KNN_model)
  _ratio = oversam[i]
  rs = rs.append(_train_and_test(pipe_line, data_train, _algo + _balance + _ratio),ignore_index=True)
rs.to_csv('Class_BrainClass_KNN_rs.csv', header=True, sep=';', decimal=',') 


In [19]:

#3.4 Creating the best-selected model using Linear Logistic Regression algorithm
from sklearn.linear_model import LogisticRegression
LLR_model = LogisticRegression(C = 0.0001)
_algo = 'LLR'


In [20]:
# Training & testing the model on the imbalance training-set.
_balance = '_None_'
_ratio = '*'

# Convert the result of _train_and_test to DataFrame before concatenation
new_row = pd.DataFrame([_train_and_test(LLR_model, data_train, _algo + _balance + _ratio)])

# Concatenate the new row with the existing DataFrame rs
rs = pd.concat([rs, new_row], ignore_index=True)


LLR_None_* Accuracy score on test: 0.9975358242935584
LLR_None_* Precision score on test: 0.28125
LLR_None_* ROC score on test: 0.4006306356099487
LLR_None_* F1 score on test: 0.0891089108910891
LLR_None_* Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     74500
           1       0.28      0.05      0.09       170

    accuracy                           1.00     74670
   macro avg       0.64      0.53      0.54     74670
weighted avg       1.00      1.00      1.00     74670



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# Training & testing the model on training-set with different ratio of Undersampling balancing method.
_balance = '_unsam_'

# First ratio: 80_20
_ratio = '80_20'
new_row_80_20 = pd.DataFrame([_train_and_test(LLR_model, data_train_80_20, _algo + _balance + _ratio)])
rs = pd.concat([rs, new_row_80_20], ignore_index=True)

# Second ratio: 75_25
_ratio = '75_25'
new_row_75_25 = pd.DataFrame([_train_and_test(LLR_model, data_train_75_25, _algo + _balance + _ratio)])
rs = pd.concat([rs, new_row_75_25], ignore_index=True)

# Third ratio: 60_40
_ratio = '60_40'
new_row_60_40 = pd.DataFrame([_train_and_test(LLR_model, data_train_60_40, _algo + _balance + _ratio)])
rs = pd.concat([rs, new_row_60_40], ignore_index=True)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LLR_unsam_80_20 Accuracy score on test: 0.9770188830855765
LLR_unsam_80_20 Precision score on test: 0.08663101604278074
LLR_unsam_80_20 ROC score on test: 0.9924127516778524
LLR_unsam_80_20 F1 score on test: 0.1588235294117647
LLR_unsam_80_20 Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     74500
           1       0.09      0.95      0.16       170

    accuracy                           0.98     74670
   macro avg       0.54      0.97      0.57     74670
weighted avg       1.00      0.98      0.99     74670

LLR_unsam_75_25 Accuracy score on test: 0.9686755055577876
LLR_unsam_75_25 Precision score on test: 0.06533066132264528
LLR_unsam_75_25 ROC score on test: 0.9765120015791552
LLR_unsam_75_25 F1 score on test: 0.12232645403377111
LLR_unsam_75_25 Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     74500
           1       0.07      

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Training & testing the model on training-set with different Oversampling balancing method.
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import (RandomOverSampler, SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN)

oversam = {0: 'RandomOverSampler',
           1: 'SMOTE',
           2: 'BorderlineSMOTE',
           3: 'SVMSMOTE',
           4: 'ADASYN'}

_balance = '_oversam_'

for i, sampler in enumerate((RandomOverSampler(sampling_strategy=1, random_state=0), 
                             SMOTE(sampling_strategy=1, random_state=0),
                             BorderlineSMOTE(sampling_strategy=1, random_state=0, kind='borderline-1'),
                             SVMSMOTE(sampling_strategy=1, random_state=0),
                             ADASYN(sampling_strategy=1, random_state=0))):
  
    pipe_line = make_pipeline(sampler, LLR_model)
    _ratio = oversam[i]
    
    # Create a DataFrame from the results of _train_and_test
    new_row = pd.DataFrame([_train_and_test(pipe_line, data_train, _algo + _balance + _ratio)])
    
    # Concatenate the new row with the existing DataFrame rs
    rs = pd.concat([rs, new_row], ignore_index=True)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LLR_oversam_RandomOverSampler Accuracy score on test: 0.9685147984464979
LLR_oversam_RandomOverSampler Precision score on test: 0.06605650616792678
LLR_oversam_RandomOverSampler ROC score on test: 0.9804503355704697
LLR_oversam_RandomOverSampler F1 score on test: 0.12374207976146105
LLR_oversam_RandomOverSampler Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     74500
           1       0.07      0.98      0.12       170

    accuracy                           0.97     74670
   macro avg       0.53      0.97      0.55     74670
weighted avg       1.00      0.97      0.98     74670



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LLR_oversam_SMOTE Accuracy score on test: 0.9726931833400294
LLR_oversam_SMOTE Precision score on test: 0.07231121281464531
LLR_oversam_SMOTE ROC score on test: 0.9571607185155943
LLR_oversam_SMOTE F1 score on test: 0.13418259023354565
LLR_oversam_SMOTE Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     74500
           1       0.07      0.93      0.13       170

    accuracy                           0.97     74670
   macro avg       0.54      0.95      0.56     74670
weighted avg       1.00      0.97      0.98     74670



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LLR_oversam_BorderlineSMOTE Accuracy score on test: 0.9780500870496853
LLR_oversam_BorderlineSMOTE Precision score on test: 0.0889759373251259
LLR_oversam_BorderlineSMOTE ROC score on test: 0.9607372285827083
LLR_oversam_BorderlineSMOTE F1 score on test: 0.16249361267245785
LLR_oversam_BorderlineSMOTE Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     74500
           1       0.09      0.94      0.16       170

    accuracy                           0.98     74670
   macro avg       0.54      0.96      0.58     74670
weighted avg       1.00      0.98      0.99     74670



In [22]:

#3.5 Creating the best-selected model using Linear Support Vector Classification algorithm
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

SVM_model = LinearSVC(penalty='l2', 
                           loss='squared_hinge',
                           tol=0.0001,
                           C=0.9,
                           dual=False,
                           class_weight='balanced',
                           max_iter=1000
                          )
SVM_model = CalibratedClassifierCV(SVM_model) 
_algo = 'SVM'

In [None]:
# Training & testing the model on the imbalance training-set.
_balance = '_None_'
_ratio = '*'

# Convert the result of _train_and_test to DataFrame before concatenation
new_row = pd.DataFrame([_train_and_test(SVM_model, data_train, _algo + _balance + _ratio)])

# Concatenate the new row with the existing DataFrame rs
rs = pd.concat([rs, new_row], ignore_index=True)


In [None]:
# Assuming previous imports and function definitions are already in place

# Initialize an empty DataFrame or ensure 'rs' is defined
# Example:
rs = pd.DataFrame(columns=['Code_smell', 'Algo', 'Balance', 'Ratio', 'Accuracy', 'Precision', 'F1_score', 'AUC'])

# Training & testing the model on training-set with different ratios of Undersampling balancing method
_balance = '_unsam_'

# List of ratios to use for undersampling
ratios = ['80_20', '75_25', '60_40']

for _ratio in ratios:
    # Call _train_and_test for each undersampling ratio
    new_row = pd.DataFrame([_train_and_test(SVM_model, eval(f'data_train_{_ratio.replace("_", "_")}', ), _algo + _balance + _ratio)])
    
    # Concatenate the new row with the existing DataFrame rs
    rs = pd.concat([rs, new_row], ignore_index=True)


In [None]:
# Assuming previous imports and function definitions are already in place

# Initialize an empty DataFrame or ensure 'rs' is defined
# Example:
rs = pd.DataFrame(columns=['Code_smell', 'Algo', 'Balance', 'Ratio', 'Accuracy', 'Precision', 'F1_score', 'AUC'])

# Training & testing the model on training-set with different Oversampling balancing methods
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import (RandomOverSampler, SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN)

# Dictionary mapping index to oversampling method names
oversam = {
    0: 'RandomOverSampler',
    1: 'SMOTE',
    2: 'BorderlineSMOTE',
    3: 'SVMSMOTE',
    4: 'ADASYN'
}

_balance = '_oversam_'

# Iterate through each oversampling method
for i, sampler in enumerate((RandomOverSampler(sampling_strategy=1, random_state=0),
                             SMOTE(sampling_strategy=1, random_state=0),
                             BorderlineSMOTE(sampling_strategy=1, random_state=0, kind='borderline-1'),
                             SVMSMOTE(sampling_strategy=1, random_state=0),
                             ADASYN(sampling_strategy=1, random_state=0))):
    
    # Create a pipeline with the sampler and SVM model
    pipe_line = make_pipeline(sampler, SVM_model)
    _ratio = oversam[i]

    # Call _train_and_test and convert results to DataFrame
    new_row = pd.DataFrame([_train_and_test(pipe_line, data_train, _algo + _balance + _ratio)])

    # Concatenate the new row with the existing DataFrame rs
    rs = pd.concat([rs, new_row], ignore_index=True)

# Save the results to a CSV file
rs.to_csv('Class_BrainClass_SVM_rs.csv', header=True, sep=';', decimal=',', index=False)
