In [34]:
from collections import Counter

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline

from imblearn.over_sampling import RandomOverSampler
from mlxtend.feature_selection import SequentialFeatureSelector
import joblib

import warnings
import os

In [35]:
warnings.filterwarnings('ignore')

In [36]:
df = pd.read_csv('data/dataset/dataclass_metrics.csv')

X = df.drop(columns=['label'])
y = df['label']

numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

In [37]:
imputer = SimpleImputer(strategy='median')
X[numerical_cols] = imputer.fit_transform(X[numerical_cols])

print("Performed NAN Removal by Median Imputation")

Performed NAN Removal by Median Imputation


In [38]:
scaler = RobustScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

print("Performed Scaling using RobustScaler")

Performed Scaling using RobustScaler


In [39]:
print("Class distribution before ROSE:", Counter(y))

rose = RandomOverSampler(random_state=42)
X_resampled, y_resampled = rose.fit_resample(X, y)

df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['label'] = y_resampled

print("Class distribution after ROSE:", Counter(y_resampled))

Class distribution before ROSE: Counter({0: 1875, 1: 284})
Class distribution after ROSE: Counter({1: 1875, 0: 1875})


In [40]:
total_features = X_resampled.shape[1]

k_fisher = int(0.6 * total_features)
selector = SelectKBest(score_func=f_classif, k=k_fisher)
X_fisher_selected = selector.fit_transform(X_resampled, y_resampled)

fisher_selected_indices = selector.get_support(indices=True)
fisher_selected_features = X_resampled.columns[fisher_selected_indices].tolist()

print(f"Top {k_fisher} features out of {total_features} selected by Fisher's Score:")
print(fisher_selected_features)

Top 25 features out of 43 selected by Fisher's Score:
['cbo', 'dit', 'rfc', 'lcom', 'tcc', 'lcc', 'totalMethodsQty', 'staticMethodsQty', 'publicMethodsQty', 'privateMethodsQty', 'defaultMethodsQty', 'abstractMethodsQty', 'finalMethodsQty', 'staticFieldsQty', 'publicFieldsQty', 'protectedFieldsQty', 'defaultFieldsQty', 'nosi', 'returnQty', 'loopQty', 'comparisonsQty', 'numbersQty', 'maxNestedBlocksQty', 'lambdasQty', 'logStatementsQty']


In [41]:
model = LogisticRegression(solver='liblinear', random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X_resampled[fisher_selected_features], y_resampled, test_size=0.3, random_state=42, stratify=y_resampled)

sfs = SequentialFeatureSelector(model, k_features='best', forward=True, floating=False, scoring='accuracy', cv=2)
sfs = sfs.fit(X_train, y_train)
final_selected_features = list(sfs.k_feature_names_)
final_number_features = len(final_selected_features)

print(f"Final {final_number_features} Features out of {total_features} selected after Sequential Forward Selection:")
print(final_selected_features)

Final 13 Features out of 43 selected after Sequential Forward Selection:
['cbo', 'rfc', 'totalMethodsQty', 'staticMethodsQty', 'publicMethodsQty', 'privateMethodsQty', 'finalMethodsQty', 'staticFieldsQty', 'publicFieldsQty', 'nosi', 'loopQty', 'numbersQty', 'lambdasQty']


In [42]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[final_selected_features])
X_test_scaled = scaler.transform(X_test[final_selected_features])

print("Performed Scaling using StandardScaler")

Performed Scaling using StandardScaler


In [43]:
n_components = len(final_selected_features)  
pca = PCA(n_components=n_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

X_train_pca_df = pd.DataFrame(X_train_pca, columns=[f'PC{i+1}' for i in range(n_components)])
X_test_pca_df = pd.DataFrame(X_test_pca, columns=[f'PC{i+1}' for i in range(n_components)])

print("Performed Feature transformation using PCA")

Performed Feature transformation using PCA


In [44]:
final_model = RandomForestClassifier(random_state=42, n_estimators=300)

final_model.fit(X_train_pca, y_train)
y_pred_final = final_model.predict(X_test_pca)

print("Prediction by Random Forest")

Prediction by Random Forest


In [45]:
accuracy = accuracy_score(y_test, y_pred_final)
f1 = f1_score(y_test, y_pred_final, average='weighted')
precision = precision_score(y_test, y_pred_final, average='weighted')
recall = recall_score(y_test, y_pred_final, average='weighted')

print("Performance Metrics with selected features:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Performance Metrics with selected features:
Accuracy: 0.9760
F1-Score: 0.9760
Precision: 0.9771
Recall: 0.9760


In [46]:
preprocessing_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('robust_scaler', RobustScaler()),
    ('standard_scaler', StandardScaler()),
    ('pca', PCA(n_components=len(final_selected_features), random_state=42))
])

preprocessing_pipeline.fit(X_train[final_selected_features], y_train)

os.makedirs('output', exist_ok=True)

pipeline_filename = 'output/code_smell_preprocessing_pipeline.joblib'
joblib.dump(preprocessing_pipeline, pipeline_filename)

model_filename = 'output/code_smell_detection_model.joblib'
joblib.dump(final_model, model_filename)

features_filename = 'output/selected_features.joblib'
joblib.dump(final_selected_features, features_filename)

print(f"\nPreprocessing pipeline saved as: {pipeline_filename}")
print(f"Trained final model saved as: {model_filename}")
print(f"List of selected features saved as: {features_filename}")



Preprocessing pipeline saved as: output/code_smell_preprocessing_pipeline.joblib
Trained final model saved as: output/code_smell_detection_model.joblib
List of selected features saved as: output/selected_features.joblib
