In [16]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import label_binarize
from sklearn.svm import SVC, LinearSVC

from src.dataset_factory import DatasetFactory
from src.experiment_runner import ExperimentRunner

In [17]:

factory = DatasetFactory()
df = factory.create_ml_dataset('master_ml')


Generating ML dataset for experiment: master_ml...
Dataset ML guardado en data/experiments/master_ml/dataset_ml.csv


In [None]:
print(df['journal'].value_counts())

journal
3 Expert Systems with Applications    10242
5 Pattern Recognition                  4518
1 Applied Ergonomics                   1125
6 Robotics and Autonomous Systems       842
Name: count, dtype: int64


In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'], df['journal'], test_size=0.3, stratify=df['journal'], random_state=42
)

runner = ExperimentRunner()
max_features = 3000

In [20]:
rf_pipe = ImbPipeline([
    ('tfidf', TfidfVectorizer(max_features=max_features)),
    ('smote', SMOTE()),
    ('clf', RandomForestClassifier(n_estimators=100))
])
runner.run('RF_Tfidf_Smote', rf_pipe, X_train, X_test, y_train, y_test)


>>> Starting Experiment: RF_Tfidf_Smote
Model saved in: results/experiments\RF_Tfidf_Smote\RF_Tfidf_Smote_model.joblib
Results saved in: results/experiments\RF_Tfidf_Smote


'                                    precision    recall  f1-score   support\n\n              1 Applied Ergonomics       0.96      0.88      0.92       337\n3 Expert Systems with Applications       0.84      0.86      0.85      3073\n             5 Pattern Recognition       0.71      0.69      0.70      1356\n 6 Robotics and Autonomous Systems       0.76      0.79      0.78       253\n\n                          accuracy                           0.81      5019\n                         macro avg       0.82      0.80      0.81      5019\n                      weighted avg       0.81      0.81      0.81      5019\n'

In [21]:
svm_pipe = ImbPipeline([
    ('tfidf', TfidfVectorizer(max_features=max_features)),
    ('smote', SMOTE()),
    ('clf', LinearSVC(class_weight='balanced'))
])
runner.run('SVM_Linear_Smote', svm_pipe, X_train, X_test, y_train, y_test)


>>> Starting Experiment: SVM_Linear_Smote
Model saved in: results/experiments\SVM_Linear_Smote\SVM_Linear_Smote_model.joblib
Results saved in: results/experiments\SVM_Linear_Smote


'                                    precision    recall  f1-score   support\n\n              1 Applied Ergonomics       0.93      0.96      0.94       337\n3 Expert Systems with Applications       0.88      0.82      0.85      3073\n             5 Pattern Recognition       0.68      0.77      0.73      1356\n 6 Robotics and Autonomous Systems       0.71      0.83      0.77       253\n\n                          accuracy                           0.82      5019\n                         macro avg       0.80      0.84      0.82      5019\n                      weighted avg       0.82      0.82      0.82      5019\n'

In [22]:
svm_rbf_pipe = ImbPipeline([
    ('tfidf', TfidfVectorizer(max_features=max_features)),
    ('smote', SMOTE()),
    ('clf', SVC(kernel='rbf', class_weight='balanced'))
])
runner.run('SVM_RBF_Smote', svm_rbf_pipe, X_train, X_test, y_train, y_test)


>>> Starting Experiment: SVM_RBF_Smote
Model saved in: results/experiments\SVM_RBF_Smote\SVM_RBF_Smote_model.joblib
Results saved in: results/experiments\SVM_RBF_Smote


'                                    precision    recall  f1-score   support\n\n              1 Applied Ergonomics       0.97      0.92      0.94       337\n3 Expert Systems with Applications       0.85      0.89      0.87      3073\n             5 Pattern Recognition       0.77      0.72      0.74      1356\n 6 Robotics and Autonomous Systems       0.81      0.74      0.77       253\n\n                          accuracy                           0.84      5019\n                         macro avg       0.85      0.82      0.83      5019\n                      weighted avg       0.84      0.84      0.84      5019\n'

In [23]:
nb_pipe = ImbPipeline([
    ('tfidf', TfidfVectorizer(max_features=max_features)),
    ('smote', SMOTE()),
    ('clf', MultinomialNB())
])

runner.run('Naive_Bayes_Tfidf', nb_pipe, X_train, X_test, y_train, y_test)


>>> Starting Experiment: Naive_Bayes_Tfidf
Model saved in: results/experiments\Naive_Bayes_Tfidf\Naive_Bayes_Tfidf_model.joblib
Results saved in: results/experiments\Naive_Bayes_Tfidf


'                                    precision    recall  f1-score   support\n\n              1 Applied Ergonomics       0.87      0.97      0.92       337\n3 Expert Systems with Applications       0.91      0.71      0.80      3073\n             5 Pattern Recognition       0.62      0.84      0.71      1356\n 6 Robotics and Autonomous Systems       0.54      0.90      0.68       253\n\n                          accuracy                           0.77      5019\n                         macro avg       0.74      0.86      0.78      5019\n                      weighted avg       0.81      0.77      0.78      5019\n'

In [24]:
lr_pipe = ImbPipeline([
    ('tfidf', TfidfVectorizer(max_features=max_features)),
    ('smote', SMOTE()),
    ('clf', LogisticRegression(multi_class='ovr', max_iter=1000))
])

runner.run('Logistic_Regression_Tfidf', lr_pipe, X_train, X_test, y_train, y_test)


>>> Starting Experiment: Logistic_Regression_Tfidf




Model saved in: results/experiments\Logistic_Regression_Tfidf\Logistic_Regression_Tfidf_model.joblib
Results saved in: results/experiments\Logistic_Regression_Tfidf


'                                    precision    recall  f1-score   support\n\n              1 Applied Ergonomics       0.91      0.97      0.94       337\n3 Expert Systems with Applications       0.90      0.81      0.85      3073\n             5 Pattern Recognition       0.70      0.82      0.75      1356\n 6 Robotics and Autonomous Systems       0.68      0.87      0.77       253\n\n                          accuracy                           0.82      5019\n                         macro avg       0.80      0.87      0.83      5019\n                      weighted avg       0.84      0.82      0.83      5019\n'