# Machine Learning Model

## Import libraries

In [None]:
# Data management and visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Models and scores
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, accuracy_score

## Load data

In [107]:
# Relative path to the test CSV; it is read into `test` further down.
file_path_1 = '../data/test_clean.csv'

In [108]:
# Relative path to the training CSV; it is read into `train` in the next cell.
file_path_2 = '../data/train_clean_balanced.csv'

In [109]:
# Read the training CSV as UTF-8 using pandas' pure-Python parser.
train = pd.read_csv(
    file_path_2,
    engine='python',
    encoding="utf-8",
)
# Bare expression so the notebook renders the loaded frame.
train

Unnamed: 0,text,label
0,coordinator summary first job retail salespers...,HR
1,financial institution examiner summary well re...,BANKING
2,support officer executive profile seeking assi...,BPO
3,finance manager summary pro active results ori...,FINANCE
4,manager summary human resource manager practic...,HR
...,...,...
2130,business account lead executive profile strong...,BPO
2131,team lead senior analyst professional summary ...,BPO
2132,digital medium service content distribution pr...,DIGITAL-MEDIA
2133,sale representative highlight business tool sa...,DIGITAL-MEDIA


In [110]:
# Read the test CSV as UTF-8 using pandas' pure-Python parser.
test = pd.read_csv(
    file_path_1,
    engine='python',
    encoding="utf-8",
)
# Bare expression so the notebook renders the loaded frame.
test

Unnamed: 0,text,label
0,designer summary get strong foothold career la...,DESIGNER
1,digital marketing director summary background ...,DIGITAL-MEDIA
2,laboer floor construction worker round experie...,CONSTRUCTION
3,medical record technician professional summary...,HEALTHCARE
4,construction manager project coordinator inspe...,CONSTRUCTION
...,...,...
492,engineering intern summary looking opportunity...,ENGINEERING
493,professional fitness trainer group instructor ...,FITNESS
494,software support specialist professional summa...,AUTOMOBILE
495,coordinator summary certified human resource p...,HR


In [111]:
def sanitize(df):
    """Return a cleaned copy of *df* with a label and non-empty text in every row.

    The input frame is left untouched. Rows whose ``label`` is missing are
    dropped. Missing ``text`` values become empty strings, every text is cast
    to ``str`` and stripped of surrounding whitespace, and rows whose text
    ends up empty are dropped as well. The original index labels are kept.

    Raises:
        ValueError: if ``df`` lacks the ``text`` or the ``label`` column.
    """
    missing = {'text','label'} - set(df.columns)
    if missing:
        raise ValueError(f"Faltan columnas: {missing}")

    # Work on a copy so the caller's frame is never mutated.
    cleaned = df.copy().dropna(subset=['label'])

    # Normalise the text column: NaN -> '', force str, trim whitespace.
    normalised_text = cleaned['text'].fillna('').astype(str).str.strip()
    cleaned['text'] = normalised_text

    # Keep only the rows that still carry some text after trimming.
    has_text = cleaned['text'].str.len().gt(0)
    return cleaned.loc[has_text]

In [112]:
# Replace both frames with their sanitized versions (sanitize returns a new
# frame, so the names are rebound rather than modified in place).
train = sanitize(train)
test  = sanitize(test)

In [113]:
# Sanity check: neither frame may contain a missing value in its text column.
assert not (train['text'].isna().any() or test['text'].isna().any())

In [114]:
# Training inputs (text column) and targets (label column).
X_train = train['text']
y_train = train['label']

In [115]:
# Test inputs (text column) and targets (label column).
X_test = test['text']
y_test = test['label']

## Vectorize data

In [117]:
# Token counts over unigrams and bigrams; lowercasing is switched off.
vec_1 = CountVectorizer(lowercase=False, ngram_range=(1, 2))
# The vectorizer is fitted on the training texts only and then reused,
# unchanged, to encode the test texts.
Xtr_1 = vec_1.fit_transform(X_train)
Xte_1 = vec_1.transform(X_test)

In [118]:
# TF-IDF weights over unigrams and bigrams; lowercasing is switched off.
vec_2 = TfidfVectorizer(lowercase=False, ngram_range=(1, 2))
# The vectorizer is fitted on the training texts only and then reused,
# unchanged, to encode the test texts.
Xtr_2 = vec_2.fit_transform(X_train)
Xte_2 = vec_2.transform(X_test)

## Models