In [1]:
from pandas import DataFrame, ExcelFile, read_csv, read_excel 
from sklearn.svm import SVC
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, make_scorer
from sklearn.externals import joblib
import datetime

  from numpy.core.umath_tests import inner1d


In [3]:
path = '../2016 - 2018 Taleo Job Postings - Updated.xlsx'
xls = ExcelFile(path, sep = ';', encoding = 'utf-8')
df = read_excel(xls, 'All')

In [4]:
qualifications_external = DataFrame(df[['Qualifications - External']])
qualifications_external.dropna(inplace=True)

In [5]:
#TODO: find a better split than ' - ' because of cases like '2 - 4 years in a directly related position.(Asset)', 
#yet similar cases like '4 – 6 years commercial experience (asset).' has a bigger hyphen. 
#The similar cases appear way more than the first one.

all_values = [value.replace('\xa0', '').lower().strip() for row in qualifications_external.values for value in ''.join(row).split(' - ')[1:]]

In [6]:
#build the classifier

In [7]:
    def transform_data_to_tfidf(texts, tfidf_model):
        strs = []
        for text in texts:
            stra = tokenize_text(text)
            stra = ' '.join(stra)
            strs.append(stra)
        response = tfidf_model.transform(strs)
        return response

In [8]:
    def get_bow(tokenized_texts):
        bow = []
        for tokenized_text in tokenized_texts:
            bow.extend(tokenized_text)
        return bow

In [9]:
    def tokenize_text(text):
        return word_tokenize(text)

In [10]:
    def get_tf_idf_model(train_X, stopwords):
        tokenized_texts = [tokenize_text(text) for text in train_X]
        bow = get_bow(tokenized_texts)
        tfidf = TfidfVectorizer(tokenizer=tokenize_text, stop_words=stopwords)
        tfs = tfidf.fit_transform(bow)
        return tfidf

In [11]:
    def generating_stratified_k_folds(X, y, n_folds, stopwords_):
        skf = StratifiedKFold(n_splits=n_folds, random_state=0)
        k_folds = []
        for train, test in skf.split(X, y):
            train_x = [X[i] for i in train]
            test_x = [X[i] for i in test]

            tfidf = get_tf_idf_model(train_x, stopwords_)
            #joblib.dump(tfidf, 'tfidf'+self.time_of_training_+'.pkl')

            train_x = transform_data_to_tfidf(train_x, tfidf)
            test_x = transform_data_to_tfidf(test_x, tfidf)

            train_y = [y[i] for i in train]
            test_y = [y[i] for i in test]

            k_folds.append([train_x, train_y, test_x, test_y])
            break
        return tfidf, k_folds

In [12]:
total = read_csv('total_jp.csv', sep=';', encoding='utf-8', usecols=['CA', 'CP', 'EI', 'ER', 'TS'])

examples = dict()

for column in total.columns:
    selected_column = DataFrame(total[[column]])
    selected_column.dropna(inplace=True)
    column_values = [value.lower().strip() for row in selected_column.values for value in ''.join(row).split('$$$%%%&&&')]
    examples[column] = column_values
    
X = []
y = []

for key, values in examples.items():
    for value in values:
        X.append(value)
        y.append(key)

In [None]:
en_stopwords = stopwords.words('english')
tfidfmodel, kfolds = generating_stratified_k_folds(X, y, 5, en_stopwords)

In [None]:
#tuned_parameters = {'n_estimators':[50, 200], 'criterion':['gini', 'entropy'],
#                    'class_weight': ['balanced', 'balanced_subsample' or None]}
#clf = GridSearchCV(RandomForestClassifier(random_state = 12), tuned_parameters, cv=5, return_train_score=True)
#model = clf.fit(kfolds[0][0], kfolds[0][1])

In [None]:
tuned_parameters = {'n_estimators':[50]}
clf = GridSearchCV(RandomForestClassifier(random_state = 12), tuned_parameters, cv=5, return_train_score=True)
model = clf.fit(kfolds[0][0], kfolds[0][1])
print(model.classes_)
confusion_matrix(model.predict(kfolds[0][2]), kfolds[0][3])

In [None]:
confusion_matrix(model.predict(kfolds[0][0]), kfolds[0][1])

In [None]:
time_of_training_ = (str(datetime.datetime.now().date()) + " " + str(datetime.datetime.now().time())).replace(':', '')
joblib.dump(model, 'classifier'+time_of_training_+'.pkl')
joblib.dump(tfidfmodel, 'tfidf_model'+time_of_training_+'.pkl')

In [121]:
#rfc = RandomForestClassifier(n_estimators=50, random_state=12)
#rfc.fit(kfolds[0][0], kfolds[0][1])
#conf_matrix = confusion_matrix(rfc.predict(kfolds[0][2]), kfolds[0][3])
#conf_matrix

array([[ 260,   10,    2,    0,    0,    1],
       [  31,  381,    0,    1,    2,   10],
       [   0,    0,  231,   27,    8,    1],
       [   0,    0,   51,  493,   30,    2],
       [   1,   10,   14,   63, 1028,   27],
       [   8,   17,    4,   11,   17, 1992]], dtype=int64)

In [156]:
taleo_tests = all_values[0:20]
taleo_tfidf = transform_data_to_tfidf(taleo_tests, tfidfmodel)
print([(item, predicted_class) for item, predicted_class in zip(taleo_tests, model.predict(taleo_tfidf))])

[('bs in chemical engineering, environmental engineering or other technical degree', 'CA'), ('minimum of 2 years of experience as an environmental engineer/specialist', 'ER'), ('proficient in use of excel, word and air services', 'TS'), ('ability to negotiate with regulatory agencies assets:', 'TS'), ("master's degree in engineering or mba", 'CA'), ('professional engineers license', 'CP'), ('minimum of 2 years environmental experience in the petrochemical industry', 'EI'), ('or other relevant experience the lima refining company is an equal opportunity employer. all qualified applicants will receive consideration for employment without regard to race, color, religion, sex national origin, veteran status, disability, sexual orientation or gender identity.', 'ER'), ("bachelor's degree in engineering", 'CA'), ('professional engineer (p.eng)', 'CP'), ('specialization in engineering, process or mechanical engineering preferred', 'CA'), ('strong knowledge of capital afes and project delivery

In [146]:
model.classes_

array(['CA', 'CP', 'EI', 'ER', 'TS', 'responsabilities'], dtype='<U16')