In [1]:
import pandas as pd
import joblib
import os
import warnings
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

warnings.filterwarnings('ignore')

In [2]:
def create_model(codesmell):
    codesmell_metrics_path = f'data/dataset/{codesmell}/{codesmell}_metrics.csv'

    df = pd.read_csv(codesmell_metrics_path)
    X = df.drop(columns=['label'])
    y = df['label']

    numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

    preprocessing_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler())
    ])

    X[numerical_cols] = preprocessing_pipeline.fit_transform(X[numerical_cols])
    
    k_fisher = int(0.6 * X.shape[1])
    selector = SelectKBest(score_func=f_classif, k=k_fisher)
    X_fisher_selected = selector.fit_transform(X, y)
    fisher_selected_features = X.columns[selector.get_support()].tolist()

    X_train, X_test, y_train, y_test = train_test_split(X[fisher_selected_features], y, test_size=0.3, random_state=42, stratify=y)

    sfs_model = LogisticRegression(random_state=42)
    sfs = SequentialFeatureSelector(sfs_model, k_features='best', forward=True, floating=False, scoring='accuracy', cv=2)
    sfs.fit(X_train, y_train)
    final_selected_features = list(sfs.k_feature_names_)
    
    final_model = RandomForestClassifier(random_state=42, n_estimators=300)
    final_model.fit(X_train[final_selected_features], y_train)

    output_folder = f'output/{codesmell}'
    os.makedirs(output_folder, exist_ok=True)
    joblib.dump(preprocessing_pipeline, os.path.join(output_folder, 'preprocessing_pipeline.joblib'))
    joblib.dump(final_selected_features, os.path.join(output_folder, 'selected_features.joblib'))
    joblib.dump(final_model, os.path.join(output_folder, 'final_model.joblib'))

    print("Training Complete. Model and pipeline saved.")
    
    preprocessing_pipeline = joblib.load(os.path.join(output_folder, 'preprocessing_pipeline.joblib'))
    selected_features = joblib.load(os.path.join(output_folder, 'selected_features.joblib'))
    final_model = joblib.load(os.path.join(output_folder, 'final_model.joblib'))

    test_data = pd.read_csv(codesmell_metrics_path)
    X_test = test_data.drop(columns=['label'])
    y_test = test_data['label']
    
    numerical_cols = X_test.select_dtypes(include=['number']).columns.tolist()
    X_test[numerical_cols] = preprocessing_pipeline.transform(X_test[numerical_cols])

    X_test_final = X_test[selected_features]
    y_pred = final_model.predict(X_test_final)
    
    test_accuracy = accuracy_score(y_test, y_pred)
    test_aucroc = roc_auc_score(y_test, y_pred)
    test_f1 = f1_score(y_test, y_pred)

    print("Performance Metrics:")
    print(f"Accuracy: {test_accuracy:.4f}")
    print(f"AUC-ROC: {test_aucroc:.4f}")
    print(f"F1-Score: {test_f1:.4f}")

In [3]:
create_model('featureenvy')

Training Complete. Model and pipeline saved.
Performance Metrics:
Accuracy: 1.0000
AUC-ROC: 1.0000
F1-Score: 1.0000


In [4]:
create_model('godclass')

Training Complete. Model and pipeline saved.
Performance Metrics:
Accuracy: 0.9613
AUC-ROC: 0.9644
F1-Score: 0.9653


In [5]:
create_model('longmethod')

Training Complete. Model and pipeline saved.
Performance Metrics:
Accuracy: 0.9987
AUC-ROC: 0.9987
F1-Score: 0.9987


In [6]:
create_model('longparameter')

Training Complete. Model and pipeline saved.
Performance Metrics:
Accuracy: 0.9898
AUC-ROC: 0.9880
F1-Score: 0.9870


In [7]:
create_model('refusedbequest')

Training Complete. Model and pipeline saved.
Performance Metrics:
Accuracy: 1.0000
AUC-ROC: 1.0000
F1-Score: 1.0000


In [8]:
create_model('shotgunsurgery')

Training Complete. Model and pipeline saved.
Performance Metrics:
Accuracy: 1.0000
AUC-ROC: 1.0000
F1-Score: 1.0000
