In [None]:
!pip install googletrans==4.0.0-rc1
!pip install pandas 
!pip install spacy 
!pip install unicodedata 
!pip install nltk
!pip install gensim
!python3 -m spacy download en_core_web_sm

In [None]:
import pandas as pd
from googletrans import Translator
from tqdm import tqdm
import spacy
import unicodedata
import re
import nltk
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import strip_punctuation
import pickle
# Fetch the NLTK 'stopwords' corpus; add_token_col reads it via stopwords.words('english').
nltk.download('stopwords')

In [None]:
# Switch for the data-preparation cell below:
#   True  -> read train_set.csv / test_set.csv, translate and tokenise them,
#            and write new_train_set.csv / new_test_set.csv
#   False -> skip that work and load the previously written new_*.csv files
flag = True

In [None]:
def get_new_df(df):
    """Translate every job offer to English and return the result as a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'Job_offer'`` and ``'Label'``.

    Returns
    -------
    pandas.DataFrame
        A frame with a fresh index and the columns ``'Job_offer'``, ``'Label'``
        and ``'Traslation'`` (the misspelling is kept because other cells read
        the column under that name). Rows whose translation request fails are
        left out, so the result can be shorter than ``df``; the number of
        skipped rows is printed.
    """
    translator = Translator()
    records = []
    skipped = 0
    last_error = None
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        try:
            traslation = translator.translate(str(row['Job_offer']), dest='en').text
        except Exception as exc:
            # Best effort: a failed request drops the row. Catching Exception
            # (not a bare except) keeps Ctrl-C / kernel interrupt working
            # during this long-running loop.
            skipped += 1
            last_error = exc
            continue
        records.append({
            'Job_offer': row['Job_offer'],
            'Label': row['Label'],
            'Traslation': traslation,
        })
    if skipped:
        print(f'get_new_df: skipped {skipped} of {df.shape[0]} rows '
              f'(last error: {last_error!r})')
    # Passing columns explicitly keeps the three columns even when no row survived.
    new_df = pd.DataFrame(records, columns=['Job_offer', 'Label', 'Traslation'])
    return new_df

In [None]:
# English spaCy pipeline used only for lemmatisation; the parser and NER
# components are disabled when loading.
sp = spacy.load('en_core_web_sm',  disable=['parser', 'ner'])


def _clean_lemmas(message, stop_words):
    """Return the cleaned, lower-cased lemmas of one English text as a list."""
    cleaned = []
    for token in sp(message):
        # Lemma -> Unicode NFKD normalisation -> lowercase.
        word = unicodedata.normalize('NFKD', token.lemma_).lower()
        # Keep letters and digits only (drops punctuation, underscores, spaces).
        word = re.sub(r'[\W_]+', '', word)
        # Drop pure numbers (words that merely contain digits stay), words of
        # one or two characters (this also removes emptied tokens), and stop words.
        if word.isnumeric() or len(word) <= 2 or word in stop_words:
            continue
        # Kept from the original pipeline. After the regex above there should be
        # no punctuation left to strip - presumably a no-op; verify before removing.
        cleaned.append(strip_punctuation(word))
    return cleaned


def add_token_col(df):
    """Return a copy of ``df`` with an extra ``'token'`` column of cleaned lemmas.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'Job_offer'``, ``'Label'`` and ``'Traslation'``.

    Returns
    -------
    pandas.DataFrame
        A frame with a fresh index and the columns ``'Job_offer'``, ``'Label'``,
        ``'Traslation'`` and ``'token'``, where ``'token'`` is the space-joined
        result of lemmatising and cleaning ``'Traslation'``. A missing
        translation (NaN/None) yields an empty ``'token'`` string instead of
        raising inside spaCy.
    """
    # Loaded once per call instead of once per row; a set makes the membership
    # test constant-time.
    stop_words = frozenset(stopwords.words('english'))
    records = []
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        message = row['Traslation']
        if not isinstance(message, str):
            # spaCy only accepts text; treat a missing value as empty text.
            message = '' if pd.isna(message) else str(message)
        records.append({
            'Job_offer': row['Job_offer'],
            'Label': row['Label'],
            'Traslation': row['Traslation'],
            'token': ' '.join(_clean_lemmas(message, stop_words)),
        })
    new_df = pd.DataFrame(records, columns=['Job_offer', 'Label', 'Traslation', 'token'])
    return new_df

In [None]:
# Build the translated + tokenised datasets, or reload them from disk.
if flag:
    train_df = pd.read_csv('train_set.csv')
    test_df = pd.read_csv('test_set.csv')
    # Step 1: translate. The result is written immediately so the translation
    # work is on disk before tokenisation starts.
    new_train_df = get_new_df(train_df)
    new_train_df.to_csv('new_train_set.csv', index=False)
    new_test_df = get_new_df(test_df)
    new_test_df.to_csv('new_test_set.csv', index=False)
    # Step 2: tokenise and overwrite the same files, now including 'token'.
    new_train_df = add_token_col(new_train_df)
    new_train_df.to_csv('new_train_set.csv', index=False)
    new_test_df = add_token_col(new_test_df)
    new_test_df.to_csv('new_test_set.csv', index=False)
else:
    new_train_df = pd.read_csv('new_train_set.csv')
    new_test_df = pd.read_csv('new_test_set.csv')
    # An empty 'token' string is stored as an empty CSV field, which read_csv
    # returns as NaN. Restore the empty string so both branches hand the same
    # values to the vectoriser (CountVectorizer rejects NaN documents).
    new_train_df['token'] = new_train_df['token'].fillna('')
    new_test_df['token'] = new_test_df['token'].fillna('')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,precision_score, recall_score,roc_curve,roc_auc_score


# Bag-of-words features: turns each space-joined token string into a count vector.
vectorizer = CountVectorizer()

# Base estimator: a 200-tree random forest with a fixed seed.
rf = RandomForestClassifier(random_state=42, n_estimators=200)

# Multi-output wrapper around the random forest.
multi_target = MultiOutputClassifier(estimator=rf)

# Full model: vectorise the text, then classify.
RandomForest = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('multi_target', multi_target),
])

# Train on the 'token' text against the single 'Label' column (passed as a
# one-column DataFrame). The trailing semicolon keeps the notebook from echoing
# the fitted pipeline.
RandomForest.fit(new_train_df['token'], new_train_df[['Label']]);

In [None]:
def evaluate(model, new_test_df):
    """Print weighted recall, precision and F1 of ``model`` on a prepared frame.

    Parameters
    ----------
    model
        Fitted estimator whose ``predict`` accepts the ``'token'`` column.
    new_test_df : pandas.DataFrame
        Must provide the columns ``'token'`` (model input) and ``'Label'``
        (ground truth).

    Returns
    -------
    None
        The three scores are printed, not returned.
    """
    y_true = new_test_df['Label']
    y_pred = model.predict(new_test_df['token'])
    # scikit-learn metrics take (y_true, y_pred) in that order. Passing them the
    # other way round swaps precision with recall and weights the averages by
    # the predicted class counts instead of the true ones.
    r1 = recall_score(y_true, y_pred, average='weighted')
    print('recall_score', r1)
    p1 = precision_score(y_true, y_pred, average='weighted')
    print('precision_score', p1)
    f1 = f1_score(y_true, y_pred, average='weighted')
    print('f1_score', f1)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,precision_score, recall_score,roc_curve,roc_auc_score


# Bag-of-words features: turns each space-joined token string into a count vector.
vectorizer = CountVectorizer()

# Base estimator: a multi-layer perceptron with one hidden layer of 15 units,
# trained with the L-BFGS solver and a fixed seed.
# NOTE(review): the variable is still named `rf` although it holds an MLP,
# not a random forest.
rf = MLPClassifier(
    hidden_layer_sizes=(15,),
    solver='lbfgs',
    alpha=1e-5,
    random_state=1,
)

# Multi-output wrapper around the perceptron.
multi_target = MultiOutputClassifier(estimator=rf)

# Full model: vectorise the text, then classify.
MLP = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('multi_target', multi_target),
])

# Train on the 'token' text against the single 'Label' column. The trailing
# semicolon keeps the notebook from echoing the fitted pipeline.
MLP.fit(new_train_df['token'], new_train_df[['Label']]);

In [None]:
# Score the in-memory random-forest pipeline on the prepared test frame.
evaluate(RandomForest, new_test_df)

In [None]:
# Score the in-memory MLP pipeline on the prepared test frame.
evaluate(MLP, new_test_df)

In [None]:
# Persist both fitted pipelines. The context managers close (and thereby flush)
# each file as soon as it has been written.
with open('RND.pkl', 'wb') as rnd_file:
    pickle.dump(RandomForest, rnd_file)
with open('MLP.pkl', 'wb') as mlp_file:
    pickle.dump(MLP, mlp_file)

In [None]:
# CSV file that the following cells read, translate, tokenise and score with
# the pickled models.
file_path = 'test_set.csv'

In [None]:
# Run the full preparation (translate, then tokenise) on the file named by file_path.
df = pd.read_csv(file_path)
df = get_new_df(df)
df = add_token_col(df)

# Reload the pipelines from disk; the context managers close the files afterwards.
# NOTE: pickle.load can execute arbitrary code - only load .pkl files you created
# yourself or otherwise trust.
with open('RND.pkl', 'rb') as rnd_file:
    RND = pickle.load(rnd_file)
with open('MLP.pkl', 'rb') as mlp_file:
    MLP = pickle.load(mlp_file)

In [None]:
# Score the random-forest pipeline reloaded from RND.pkl on the freshly prepared frame.
evaluate(RND, df)

In [None]:
# Score the MLP pipeline reloaded from MLP.pkl on the freshly prepared frame.
evaluate(MLP, df)