In [1]:
from pandas import DataFrame, ExcelFile, read_csv, read_excel 
from sklearn.svm import SVC
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, make_scorer
from sklearn.externals import joblib
import datetime

  from numpy.core.umath_tests import inner1d


In [37]:
path = 'Notebooks/husky/database/2016 - 2018 Taleo Job Postings - Updated.xlsx'
xls = ExcelFile(path, sep = ';', encoding = 'utf-8')
df = read_excel(xls, 'All')

In [38]:
qualifications_external = DataFrame(df[['Qualifications - External']])
qualifications_external.dropna(inplace=True)

In [39]:
#TODO: find a better split than ' - ' because of cases like '2 - 4 years in a directly related position.(Asset)', 
#yet similar cases like '4 – 6 years commercial experience (asset).' has a bigger hyphen. 
#The similar cases appear way more than the first one.

all_values = [value.replace('\xa0', '').lower().strip() for row in qualifications_external.values for value in ''.join(row).split(' - ')[1:]]

In [40]:
#build the classifier

In [41]:
    def transform_data_to_tfidf(texts, tfidf_model):
        strs = []
        for text in texts:
            stra = tokenize_text(text)
            stra = ' '.join(stra)
            strs.append(stra)
        response = tfidf_model.transform(strs)
        return response

In [42]:
    def get_bow(tokenized_texts):
        bow = []
        for tokenized_text in tokenized_texts:
            bow.extend(tokenized_text)
        return bow

In [43]:
    def tokenize_text(text):
        return word_tokenize(text)

In [44]:
    def get_tf_idf_model(train_X, stopwords):
        tokenized_texts = [tokenize_text(text) for text in train_X]
        bow = get_bow(tokenized_texts)
        tfidf = TfidfVectorizer(tokenizer=tokenize_text, stop_words=stopwords)
        tfs = tfidf.fit_transform(bow)
        return tfidf

In [45]:
    def generating_stratified_k_folds(X, y, n_folds, stopwords_):
        skf = StratifiedKFold(n_splits=n_folds, random_state=0)
        k_folds = []
        for train, test in skf.split(X, y):
            train_x = [X[i] for i in train]
            test_x = [X[i] for i in test]

            tfidf = get_tf_idf_model(train_x, stopwords_)
            #joblib.dump(tfidf, 'tfidf'+self.time_of_training_+'.pkl')

            train_x = transform_data_to_tfidf(train_x, tfidf)
            test_x = transform_data_to_tfidf(test_x, tfidf)

            train_y = [y[i] for i in train]
            test_y = [y[i] for i in test]

            k_folds.append([train_x, train_y, test_x, test_y])
            break
        return tfidf, k_folds

In [46]:
total = read_csv('Notebooks/husky/database/mais_completo/total_jp.csv', sep=';', encoding='utf-8', usecols=['CA', 'CP', 'EI', 'ER', 'TS'])

examples = dict()

for column in total.columns:
    selected_column = DataFrame(total[[column]])
    selected_column.dropna(inplace=True)
    column_values = [value.lower().strip() for row in selected_column.values for value in ''.join(row).split('$$$%%%&&&')]
    examples[column] = column_values
    
X = []
y = []

for key, values in examples.items():
    for value in values:
        X.append(value)
        y.append(key)

In [47]:
en_stopwords = stopwords.words('english')
tfidfmodel, kfolds = generating_stratified_k_folds(X, y, 5, en_stopwords)

In [48]:
#tuned_parameters = {'n_estimators':[50, 200], 'criterion':['gini', 'entropy'],
#                    'class_weight': ['balanced', 'balanced_subsample' or None]}
#clf = GridSearchCV(RandomForestClassifier(random_state = 12), tuned_parameters, cv=5, return_train_score=True)
#model = clf.fit(kfolds[0][0], kfolds[0][1])

In [50]:
tuned_parameters = {'n_estimators':[50]}
clf = GridSearchCV(RandomForestClassifier(random_state = 12), tuned_parameters, cv=5, return_train_score=True)
model = clf.fit(kfolds[0][0], kfolds[0][1])
print(model.classes_)
confusion_matrix(model.predict(kfolds[0][2]), kfolds[0][3])

['CA' 'CP' 'EI' 'ER' 'TS']


array([[ 539,    9,    3,    5,    4],
       [  46,  810,    0,    2,   25],
       [   0,    0,  483,   52,    7],
       [   5,    3,   87,  984,   78],
       [  10,   14,   31,  146, 2056]], dtype=int64)

In [51]:
print((kfolds[0][0]).shape)
confusion_matrix(model.predict(kfolds[0][0]), kfolds[0][1])

(21592, 5710)


array([[2375,   28,    0,    0,    2],
       [  19, 3307,    3,    7,   15],
       [   0,    1, 2358,   39,    8],
       [   1,    1,   37, 4605,   63],
       [   4,    6,   18,  105, 8590]], dtype=int64)

In [52]:
time_of_training_ = (str(datetime.datetime.now().date()) + " " + str(datetime.datetime.now().time())).replace(':', '')
joblib.dump(model, 'classificador_secao_'+time_of_training_+'.pkl')
joblib.dump(tfidfmodel, 'tfidf_model'+time_of_training_+'.pkl')

['tfidf_model2019-01-31 162010.669333.pkl']

In [121]:
#rfc = RandomForestClassifier(n_estimators=50, random_state=12)
#rfc.fit(kfolds[0][0], kfolds[0][1])
#conf_matrix = confusion_matrix(rfc.predict(kfolds[0][2]), kfolds[0][3])
#conf_matrix

array([[ 260,   10,    2,    0,    0,    1],
       [  31,  381,    0,    1,    2,   10],
       [   0,    0,  231,   27,    8,    1],
       [   0,    0,   51,  493,   30,    2],
       [   1,   10,   14,   63, 1028,   27],
       [   8,   17,    4,   11,   17, 1992]], dtype=int64)

In [156]:
taleo_tests = all_values[0:20]

taleo_tfidf = transform_data_to_tfidf(taleo_tests, tfidfmodel)
print([(item, predicted_class) for item, predicted_class in zip(taleo_tests, model.predict(taleo_tfidf))])

[('bs in chemical engineering, environmental engineering or other technical degree', 'CA'), ('minimum of 2 years of experience as an environmental engineer/specialist', 'ER'), ('proficient in use of excel, word and air services', 'TS'), ('ability to negotiate with regulatory agencies assets:', 'TS'), ("master's degree in engineering or mba", 'CA'), ('professional engineers license', 'CP'), ('minimum of 2 years environmental experience in the petrochemical industry', 'EI'), ('or other relevant experience the lima refining company is an equal opportunity employer. all qualified applicants will receive consideration for employment without regard to race, color, religion, sex national origin, veteran status, disability, sexual orientation or gender identity.', 'ER'), ("bachelor's degree in engineering", 'CA'), ('professional engineer (p.eng)', 'CP'), ('specialization in engineering, process or mechanical engineering preferred', 'CA'), ('strong knowledge of capital afes and project delivery

In [61]:
path = 'Cleansed Content Taleo Format.xlsx'
xls = ExcelFile(path, sep = ';', encoding = 'utf-8')
df = read_excel(xls, 'Sheet1')
qualifications_external = DataFrame(df[['Qualifications - External']])
qualifications_external.dropna(inplace=True)
all_values = [value.replace('\xa0', '').lower().strip() for row in qualifications_external.values for value in ''.join(row).split('\n')[1:]]

taleo_tfidf = transform_data_to_tfidf(all_values, tfidfmodel)
sd=[(item, predicted_class) for item, predicted_class in zip(all_values, model.predict(taleo_tfidf))]





print(sd[4])

('bachelors degree in human resources', 'CA')


In [68]:
output_df = DataFrame()
ds={"CA":[],"CP":[],"EI":[],"ER":[],"TS":[]}

for i in sd:
    ds[i[1]].append(i[0])
for i in ds.keys():print(len(ds[i]))

for i in ds.keys():
    while len(ds[i]) < 132:
        ds[i].append("Ragav")
#ds
output_df = DataFrame(ds)
print(output_df)
output_df.to_csv("")

#output_df["CA"]=ds["CA"]
#output_df["CP"]=ds["CP"]
    

92
83
34
132
19
                                                    CA  \
0                                      doctoral degree   
1                   masters of business administration   
2                  bachelors degree in human resources   
3                                bachelors of commerce   
4                       certificate in adult education   
5       bachelors degree in organizational development   
6                    post-secondary degree in business   
7                       bachelors degree in psychology   
8                  bachelors degree in social sciences   
9                                diploma in psychology   
10                          diploma in social sciences   
11                              bachelors of education   
12                      bachelors degree in accounting   
13                         bachelors degree in finance   
14                        masters degree in accounting   
15                           masters degree in finance  

In [146]:
output_df = DataFrame()
        #output_df['Position Title'] = getattr(db, 'Requisition Title')
        #output_df['Department'] = db['Job Family']
        #db['External: Responsibilities'] = db['External: Responsibilities'].fillna('')
        #output_df['responsabilities'] = db['External: Responsibilities'].apply(lambda x: '$$$%%%&&&'.join(x.strip().split('-')[1:]) if x else None)
        #output_df['POSITION SUMMARY'] = db['Original Description Section - External'] 
        #output_df['Date Revised'] = db['Req. Creation Date']
        #db['Qualifications - External'] = db['Qualifications - External'].fillna('')
output_df['CA'] =  db['Qualifications - External'].apply(lambda x: self.categorize_items(x, 'CA'))
output_df['CP'] =  db['Qualifications - External'].apply(lambda x: self.categorize_items(x, 'CP'))
output_df['EI'] =  db['Qualifications - External'].apply(lambda x: self.categorize_items(x, 'EI'))
output_df['ER'] =  db['Qualifications - External'].apply(lambda x: self.categorize_items(x, 'ER'))
output_df['TS'] =  db['Qualifications - External'].apply(lambda x: self.categorize_items(x, 'TS'))

array(['CA', 'CP', 'EI', 'ER', 'TS', 'responsabilities'], dtype='<U16')