# Processing

## Library imports

In [3]:
# data_management
import pandas as pd
import numpy as np
import chardet
import re, unicodedata
from collections import Counter

# NLTK resources
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download("stopwords")
nltk.download('wordnet')

# NLP
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Split of data
from sklearn.model_selection import train_test_split

# Classes Balancing
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler


[nltk_data] Downloading package punkt to /home/trodriten/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/trodriten/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/trodriten/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/trodriten/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load data

In [4]:
# Path to the raw resume dataset, relative to the notebook's working directory.
file_path = '../data/Resume.csv'
# Sniff the encoding from the first 100,000 bytes only, so the whole file is
# not read just for detection.
# NOTE(review): the guess is based on a prefix of the file; bytes further in
# could disagree with it - revisit if read_csv raises a decoding error.
with open(file_path, 'rb') as f:
    result = chardet.detect(f.read(100000))
encoding = result['encoding']
encoding

'utf-8'

In [5]:
# Load the CSV with the detected encoding; the pure-Python parser engine is
# requested explicitly.
data = pd.read_csv(file_path, encoding=encoding, engine='python')
data.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


## Useful functions

In [6]:
def remove_non_ascii(words):
    """Strip non-ASCII characters from a list of tokenized words.

    Each token is NFKD-normalized, so accented letters decompose into a base
    letter plus combining marks, and is then encoded to ASCII with errors
    ignored, which discards the marks and every other non-ASCII character.

    Args:
        words: Iterable of string tokens; ``None`` entries are skipped.

    Returns:
        list[str]: ASCII-only tokens in their original order. Tokens that
        contain no ASCII characters at all are dropped instead of being
        returned as empty strings.
    """
    new_words = []
    for word in words:
        if word is None:
            continue
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('ascii')
        # A token made only of non-ASCII characters collapses to ''; keeping
        # it would leave stray blanks once the tokens are joined back to text.
        if new_word:
            new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Lowercase every token in a list of tokenized words.

    Args:
        words: Iterable of string tokens; ``None`` entries are skipped, in
            line with the other token-cleaning helpers in this notebook.

    Returns:
        list[str]: Lowercased tokens in their original order.
    """
    return [word.lower() for word in words if word is not None]

def remove_punctuation(words):
    """Strip punctuation from each token and discard tokens left empty.

    Word characters, whitespace and the symbols ``+ # . - / &`` are kept;
    every other character is deleted. ``None`` entries are skipped.
    """
    stripped = (
        re.sub(r'[^\w\s\+\#\.\-\/&]', '', token)
        for token in words
        if token is not None
    )
    return [token for token in stripped if token != '']


# Combined Spanish + English stop-word set, built on first use and then reused.
_STOP_WORDS_CACHE = None


def _default_stop_words():
    """Return the cached union of NLTK's Spanish and English stop words."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('spanish')) | frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def remove_stopwords(words, stop_words=None):
    """Remove stop words from a list of tokenized words.

    Args:
        words: Iterable of tokens. Matching is exact, so tokens should
            already be lowercased to match the lowercase NLTK lists.
        stop_words: Optional collection of tokens to drop. When omitted, the
            union of NLTK's Spanish and English stop-word lists is used; that
            set is built once and cached rather than rebuilt on every call.

    Returns:
        list: The tokens that are not stop words, in their original order.
    """
    if stop_words is None:
        stop_words = _default_stop_words()
    return [word for word in words if word not in stop_words]

def preprocessing(words):
    """Run the token-cleaning pipeline over a list of tokens.

    Steps, in order: strip punctuation, lowercase, drop stop words, fold to
    ASCII, then drop stop words once more.

    Args:
        words: List of raw string tokens.

    Returns:
        list[str]: The cleaned tokens.
    """
    words = remove_punctuation(words)
    words = to_lowercase(words)
    # Filter before ASCII folding: an accented stop word loses its accent
    # when folded and would then no longer match its stop-word list entry.
    words = remove_stopwords(words)
    words = remove_non_ascii(words)
    # Filter again: folding can turn a token into the unaccented spelling of
    # a stop word.
    words = remove_stopwords(words)
    return words

def preprocess_text(s):
    """Clean one raw document and return it as a space-joined string.

    Any non-string input is treated as an empty document.
    """
    text = s if isinstance(s, str) else ""
    cleaned_tokens = preprocessing(word_tokenize(text))
    return " ".join(cleaned_tokens)

## Column selection

In [7]:
# Features: the resume text column.
X_raw = data['Resume_str']
X_raw

0                HR ADMINISTRATOR/MARKETING ASSOCIATE\...
1                HR SPECIALIST, US HR OPERATIONS      ...
2                HR DIRECTOR       Summary      Over 2...
3                HR SPECIALIST       Summary    Dedica...
4                HR MANAGER         Skill Highlights  ...
                              ...                        
2479             RANK: SGT/E-5 NON- COMMISSIONED OFFIC...
2480             GOVERNMENT RELATIONS, COMMUNICATIONS ...
2481             GEEK SQUAD AGENT         Professional...
2482             PROGRAM DIRECTOR / OFFICE MANAGER    ...
2483             STOREKEEPER II       Professional Sum...
Name: Resume_str, Length: 2484, dtype: object

In [8]:
# Target: the category label of each resume.
y = data['Category']
y

0             HR
1             HR
2             HR
3             HR
4             HR
          ...   
2479    AVIATION
2480    AVIATION
2481    AVIATION
2482    AVIATION
2483    AVIATION
Name: Category, Length: 2484, dtype: object

## Train/test split

In [9]:
# 80/20 split, stratified on the label so each category keeps its proportion
# in both parts; the fixed seed makes the split reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_raw, y, stratify=y, random_state=42, test_size=0.2)

In [10]:
# Number of training documents.
X_train_raw.shape

(1987,)

In [11]:
# Number of test documents.
X_test_raw.shape

(497,)

In [12]:
# Must match the number of training documents.
y_train.shape

(1987,)

In [13]:
# Must match the number of test documents.
y_test.shape

(497,)

## Data cleaning

In [14]:
# Apply the same per-document cleaning to each split; .apply keeps the
# original row index, so the cleaned text stays paired with its label.
X_train_clean = X_train_raw.apply(preprocess_text)
X_test_clean = X_test_raw.apply(preprocess_text)

## Training data balancing

In [15]:
def cap_boost_resample(X_text, y, under_cap=100, target_min=80, random_state=42):
    """Balance classes by capping large ones and boosting small ones.

    Classes with more than ``under_cap`` samples are randomly under-sampled
    down to ``under_cap``. Afterwards, classes with fewer than ``target_min``
    samples are randomly over-sampled up to ``target_min``. Classes already
    between the two bounds are left untouched.

    Args:
        X_text: Iterable of documents, one per sample.
        y: Labels aligned positionally with ``X_text``.
        under_cap: Maximum number of samples kept for any class.
        target_min: Minimum number of samples every class should end up with.
        random_state: Seed passed to both samplers.

    Returns:
        tuple: ``(X_b, y_b)``. ``X_b`` is a plain list of documents and
        ``y_b`` holds the matching labels. When ``y_b`` is a pandas object
        its index is reset to ``0..n-1`` so that it lines up positionally
        with ``X_b``.
    """
    # The samplers work on 2-D feature arrays, so each document gets its own
    # single-column row.
    X_arr = np.array(list(X_text), dtype=object).reshape(-1, 1)

    under_strategy = {c: under_cap for c, n in Counter(y).items() if n > under_cap}
    if under_strategy:
        rus = RandomUnderSampler(sampling_strategy=under_strategy, random_state=random_state)
        X_u, y_u = rus.fit_resample(X_arr, y)
    else:
        X_u, y_u = X_arr, y

    # Class counts are taken after capping, which is the state the
    # over-sampler actually sees.
    over_strategy = {c: target_min for c, n in Counter(y_u).items() if n < target_min}
    if over_strategy:
        ros = RandomOverSampler(sampling_strategy=over_strategy, random_state=random_state)
        X_b, y_b = ros.fit_resample(X_u, y_u)
    else:
        X_b, y_b = X_u, y_u

    # If no resampling happened, y_b is still the caller's object and may
    # carry a non-positional index (e.g. after a shuffled split). Since X_b
    # is returned as a plain list, such an index would misalign rows when the
    # two are later combined by index.
    if hasattr(y_b, "reset_index"):
        y_b = y_b.reset_index(drop=True)

    return X_b.ravel().tolist(), y_b

In [16]:
# Balance the training split only; the test split is not resampled.
X_train_bal, y_train_bal = cap_boost_resample(X_train_clean, y_train, under_cap=100, target_min=80)

In [17]:
# Number of training documents after balancing.
len(X_train_bal)

2135

In [18]:
# Must match the number of balanced training documents.
len(y_train_bal)

2135

## Data saving

In [20]:
# Persist the balanced training set as a two-column CSV (text, label).
# NOTE(review): the DataFrame constructor aligns the two Series on their
# index; X_train_bal is a list (index 0..n-1), so this relies on y_train_bal
# also having a 0..n-1 index - confirm.
df_train_bal = pd.DataFrame({
    "text": pd.Series(X_train_bal, dtype="object"),
    "label": pd.Series(y_train_bal, dtype="object")
})
df_train_bal.to_csv("../data/train_clean_balanced.csv", index=False, encoding="utf-8")

In [21]:
# Persist the cleaned (unbalanced) test set as a two-column CSV (text, label).
# X_test_clean and y_test both carry the row index produced by the split, so
# the index alignment in the DataFrame constructor keeps each text with its
# label.
df_test_clean = pd.DataFrame({
    "text": pd.Series(X_test_clean, dtype="object"),
    "label": pd.Series(y_test, dtype="object")
})
df_test_clean.to_csv("../data/test_clean.csv", index=False, encoding="utf-8")
