# Machine Learning Model

## Import libraries

In [1]:
# data_management
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Models and scores
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, accuracy_score

## Load data

In [2]:
# Relative path to the cleaned test CSV; it is read into `test` further down.
# Note the numbering: file_path_1 is the TEST split, file_path_2 the TRAIN split.
file_path_1 = '../data/test_clean.csv'

In [3]:
# Relative path to the cleaned training CSV; it is read into `train` below.
# The file name suggests the classes were balanced upstream — TODO confirm.
file_path_2 = '../data/train_clean_balanced.csv'

In [4]:
# Read the training split from file_path_2.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like an
# index written out by an earlier to_csv(); consider index_col=0 once it is
# confirmed that nothing downstream relies on that column.
train = pd.read_csv(file_path_2, encoding='utf-8', engine='python')
# Bare last expression: renders a preview of the frame as the cell output.
train

Unnamed: 0,text,label
0,coordinator summary first job retail salespers...,HR
1,financial institution examiner summary well re...,BANKING
2,support officer executive profile seeking assi...,BPO
3,finance manager summary pro active results ori...,FINANCE
4,manager summary human resource manager practic...,HR
...,...,...
2130,business account lead executive profile strong...,BPO
2131,team lead senior analyst professional summary ...,BPO
2132,digital medium service content distribution pr...,DIGITAL-MEDIA
2133,sale representative highlight business tool sa...,DIGITAL-MEDIA


In [5]:
# Read the test split from file_path_1, with the same parser options as the
# training split.
# NOTE(review): the preview shows an 'Unnamed: 0' column here as well, which
# looks like a saved index — confirm whether it is needed.
test = pd.read_csv(file_path_1, encoding='utf-8', engine='python')
# Bare last expression: renders a preview of the frame as the cell output.
test

Unnamed: 0,text,label
0,designer summary get strong foothold career la...,DESIGNER
1,digital marketing director summary background ...,DIGITAL-MEDIA
2,laboer floor construction worker round experie...,CONSTRUCTION
3,medical record technician professional summary...,HEALTHCARE
4,construction manager project coordinator inspe...,CONSTRUCTION
...,...,...
492,engineering intern summary looking opportunity...,ENGINEERING
493,professional fitness trainer group instructor ...,FITNESS
494,software support specialist professional summa...,AUTOMOBILE
495,coordinator summary certified human resource p...,HR


In [6]:
def sanitize(df):
    """Return a cleaned copy of ``df`` that is safe to vectorize.

    Steps, in order:
      1. check that the ``text`` and ``label`` columns are present;
      2. drop rows whose ``label`` is missing;
      3. replace missing ``text`` with ``''``, cast to ``str`` and strip
         surrounding whitespace;
      4. drop rows whose ``text`` is empty after stripping.

    The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame expected to contain at least ``text`` and ``label`` columns.

    Returns
    -------
    pandas.DataFrame
        The filtered frame; any other columns and the original index labels
        are kept.

    Raises
    ------
    ValueError
        If ``text`` and/or ``label`` is missing; the message names the
        missing column(s).
    """
    missing = {'text', 'label'} - set(df.columns)
    if missing:
        # Sorted so the message is deterministic when both columns are absent.
        raise ValueError(f'Faltan columnas: {sorted(missing)}')

    # Copy after filtering: the caller's frame stays untouched and the column
    # assignment below operates on an independent object.
    df = df.dropna(subset=['label']).copy()
    df['text'] = df['text'].fillna('').astype(str).str.strip()
    df = df[df['text'].str.len() > 0]
    return df

In [7]:
# Clean both splits with the same rules. The names are rebound to the cleaned
# frames, so from here on `train` / `test` no longer hold the raw CSV contents.
train = sanitize(train)
test  = sanitize(test)

In [8]:
# Invariant after sanitize(): neither split may contain a missing text value.
assert all(frame['text'].notna().all() for frame in (train, test))

In [9]:
# Training inputs (the text column) and targets (the label column).
X_train = train['text']
y_train = train['label']

In [10]:
# Test inputs (the text column) and targets (the label column).
X_test = test['text']
y_test = test['label']

## Vectorize data

In [11]:
# Feature set 1: raw counts over unigrams and bigrams (ngram_range=(1, 2)).
# lowercase=False skips the vectorizer's own lower-casing; the previewed text
# already looks lower-cased — TODO confirm against the cleaning step.
vec_1 = CountVectorizer(ngram_range=(1,2), lowercase=False)
Xtr_1 = vec_1.fit_transform(X_train)  # fitted on the training text only
Xte_1 = vec_1.transform(X_test)  # test text is transformed with the fitted vectorizer

In [12]:
# Feature set 2: TF-IDF weights over unigrams and bigrams, using the same
# ngram_range / lowercase settings as the count features for comparison.
vec_2 = TfidfVectorizer(ngram_range=(1,2), lowercase=False)
Xtr_2 = vec_2.fit_transform(X_train)  # fitted on the training text only
Xte_2 = vec_2.transform(X_test)  # test text is transformed with the fitted vectorizer

## Models

In [13]:
def resume_gs(gs, nombre, Xte, y_test):
    """Print and return a summary of a fitted multi-metric grid search.

    Prints the best parameter combination, its mean cross-validated
    ``f1_macro`` and ``accuracy``, and a classification report of the
    refitted best estimator on the supplied test data.

    Parameters
    ----------
    gs : fitted search object
        Must expose ``best_params_``, ``best_index_``, ``best_estimator_`` and
        a ``cv_results_`` mapping with ``mean_test_f1_macro`` and
        ``mean_test_accuracy`` entries (i.e. it was run with scorers named
        ``f1_macro`` and ``accuracy``).
    nombre : str
        Label used in the printed header.
    Xte : feature matrix
        Test features, passed to ``gs.best_estimator_.predict``.
    y_test : array-like
        True labels for ``Xte``.

    Returns
    -------
    dict
        Keys ``best_estimator``, ``best_params``, ``cv_f1_macro`` and
        ``cv_accuracy`` (the two scores as plain floats).
    """
    print(f'\n=== {nombre} | Best combination ===')
    print(gs.best_params_)

    # Read both CV metrics at the index of the winning parameter combination.
    best_idx = gs.best_index_
    mean_f1 = gs.cv_results_['mean_test_f1_macro'][best_idx]
    mean_acc = gs.cv_results_['mean_test_accuracy'][best_idx]
    print(f'f1_macro (CV): {mean_f1:.4f}')
    print(f'accuracy (CV): {mean_acc:.4f}')

    y_pred = gs.best_estimator_.predict(Xte)
    print('\nReport on test:')
    # zero_division=0 reports 0.0 for classes that are never predicted, the
    # same value the default "warn" setting yields, but without emitting an
    # UndefinedMetricWarning for each averaging mode.
    print(classification_report(y_test, y_pred, zero_division=0))
    return {
        'best_estimator': gs.best_estimator_,
        'best_params': gs.best_params_,
        'cv_f1_macro': float(mean_f1),
        'cv_accuracy': float(mean_acc)
    }

### Random Forest

In [14]:
# Base estimator shared by both searches; the seed is fixed for repeatability.
rf = RandomForestClassifier(random_state=42)

# Hyper-parameter grid: 2 x 2 = 4 candidate combinations per search.
# NOTE(review): scikit-learn documents 'log_loss' and 'entropy' as the same
# Shannon-information-gain criterion, so this axis may be redundant — confirm
# whether 'gini' was intended as the alternative.
param_grid_rf = {
    'max_features': ['log2', 0.5],
    'criterion': ['log_loss', 'entropy']
}

# Search settings common to both feature sets: track accuracy and macro-F1,
# and refit the final model on the macro-F1 winner. `cv` is not passed, so
# the library default applies.
rf_search_kwargs = {
    'scoring': {'accuracy': 'accuracy', 'f1_macro': 'f1_macro'},
    'refit': 'f1_macro',
}

# --- Bag-of-words counts ---
gs_rf_count = GridSearchCV(rf, param_grid_rf, **rf_search_kwargs)
gs_rf_count.fit(Xtr_1, y_train)
res_rf_count = resume_gs(gs_rf_count, 'Random Forest (Count)', Xte_1, y_test)

# --- TF-IDF weights ---
gs_rf_tfidf = GridSearchCV(rf, param_grid_rf, **rf_search_kwargs)
gs_rf_tfidf.fit(Xtr_2, y_train)
res_rf_tfidf = resume_gs(gs_rf_tfidf, 'Random Forest (TF-IDF)', Xte_2, y_test)


=== Random Forest (Count) | Best combination ===
{'criterion': 'log_loss', 'max_features': 0.5}
f1_macro (CV): 0.7230
accuracy (CV): 0.7367

Report on test:
                        precision    recall  f1-score   support

            ACCOUNTANT       0.69      1.00      0.81        24
              ADVOCATE       0.81      0.54      0.65        24
           AGRICULTURE       0.40      0.31      0.35        13
               APPAREL       0.62      0.26      0.37        19
                  ARTS       0.60      0.43      0.50        21
            AUTOMOBILE       0.50      0.14      0.22         7
              AVIATION       0.83      0.79      0.81        24
               BANKING       0.92      0.48      0.63        23
                   BPO       1.00      0.25      0.40         4
  BUSINESS-DEVELOPMENT       0.63      1.00      0.77        24
                  CHEF       0.87      0.83      0.85        24
          CONSTRUCTION       0.88      0.95      0.91        22
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
