# Preprocessing

In [1]:
import re
import string

import numpy as np
import pandas as pd
import gensim

from  statistics import mean, stdev
from os import listdir
from os.path import isfile, join

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora

In [2]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.gridspec as gridspec

### Reading CVs

In [22]:
job_opts = [f for f in listdir('./vagas')]

list_job_descs = []
list_cvs_ap = []
list_cvs_inap = []

for e in job_opts:
    list_job_descs.append(
        [f for f in listdir('./vagas/' + e) if f[-3:] == 'txt'])
    list_cvs_ap.append(
        [f for f in listdir('./vagas/' + e + '/Aptos/') if f[-3:] == 'txt'])
    list_cvs_inap.append(
        [f for f in listdir('./vagas/' + e + '/Inaptos/') if f[-3:] == 'txt'])   

In [38]:
'./vagas/' + job_opts[k] + '/Inaptos/' + list_cvs_inap[k][j]

'./vagas/DataScientist/Inaptos/Walterino.txt'

In [35]:
job_descs, cvs_ap, cvs_inap = [], [], []
line_count_descs, line_count_ap, line_count_inap = [], [], []
word_count_descs, word_count_ap, word_count_inap = [], [], []

for k in range(len(job_opts)):
    cvs_ap.append([[] for e in list_cvs_ap[k]])
    cvs_inap.append([[] for e in list_cvs_inap[k]])
    
    df = pd.read_table('./vagas/' + job_opts[k] + '/' + list_job_descs[k][0], header=None)
    job_descs.append([e[0] for e in df.values])
    line_count_descs.append(len(job_descs[k]))
    word_count_descs.append([e.count(' ') for e in job_descs[k]])
    
    for i in range(len(list_cvs_ap[k])):
        df = pd.read_table('./vagas/' + job_opts[k] + '/Aptos/' + list_cvs_ap[k][i], header=None)
        cvs_ap[k][i] = [str(e[0]) for e in df.values]
        line_count_ap.append(len(cvs_ap[k][i]))
        word_count_ap.append([line.count(' ') for line in cvs_ap[k][i]])
    
    for j in range(len(list_cvs_inap[k])):    
        df = pd.read_table('./vagas/' + job_opts[k] + '/Inaptos/' + list_cvs_inap[k][j], header=None)
        cvs_inap[k][j] = [str(e[0]) for e in df.values]
        line_count_inap.append(len(cvs_inap[k][j]))   
        word_count_inap.append([line.count(' ') for line in cvs_inap[k][j]])

ParserError: Error tokenizing data. C error: Expected 1 fields in line 54, saw 2


### Getting some insights

In [6]:
def statistics(line_count, word_count):
    return (mean(line_count),
            stdev(line_count),
            mean([sum(l) for l in word_count]),
            stdev([sum(l) for l in word_count]),
            mean([e for l in word_count for e in l]),
            stdev([e for l in word_count for e in l]))

pd.DataFrame({'descr': statistics(line_count_descs, word_count_descs),
              'cv_ap': statistics(line_count_ap, word_count_ap),
              'cv_inap': statistics(line_count_inap, word_count_inap)},
             index=['mean line', 'stdev', 'mean word', 'stdev word', 'mean word/line', 'stdev word/line'])

Unnamed: 0,cv_ap,cv_inap,descr
mean line,44.783505,50.541353,29.666667
stdev,26.734118,34.082588,6.658328
mean word,672.175258,672.969925,437.333333
stdev word,363.450908,516.007678,406.586194
mean word/line,15.009438,13.315234,14.741573
stdev word/line,24.809468,22.764787,74.483148


### Cleaning CVs

In [7]:
exclude = set(string.punctuation + '–•') 
stop = set(stopwords.words('portuguese') + stopwords.words('english')) 
lemma = WordNetLemmatizer()

def clean(doc):
    punc_free = ''.join(ch for ch in doc.lower() if ch not in exclude)
    stop_free = ' '.join([i for i in punc_free.split() if i not in stop])
    ctrl_free = ' '.join([i for i in stop_free.split() if i not in ['\uf0b7', '\uf0d8', '\uf076']])
    normalized = ' '.join(lemma.lemmatize(word) for word in ctrl_free.split())
    return normalized

clean_job_descs = [[clean(l).split() for l in jd if len(clean(l)) > 0] for jd in job_descs]
clean_cvs_ap = [[[clean(l).split() for l in cv if len(clean(l)) > 0] for cv in cvs] for cvs in cvs_ap]
clean_cvs_inap = [[[clean(l).split() for l in cv if len(clean(l)) > 0] for cv in cvs] for cvs in cvs_inap]
clean_cvs_plus_descs = [[job_descs[i]] + [cvs_ap[i]] + [cvs_inap[i]] for i in range(len(job_descs))]

### Flattening CVs

In [8]:
flat_clean_jds = pd.Series([' '.join(w for l in jd for w in l) \
                                          for jd in clean_job_descs])

flat_clean_cvs_ap = pd.Series([' '.join(w for l in cv for w in l) \
                                             for jd in clean_cvs_ap for cv in jd])

flat_clean_cvs_inap = pd.Series([' '.join(w for l in cv for w in l) \
                                               for jd in clean_cvs_inap for cv in jd])

flat_clean_dataset = pd.concat([flat_clean_jds, flat_clean_cvs_ap, flat_clean_cvs_inap],
                               keys=['job_descs', 'cvs_ap', 'cvs_inap'])

In [9]:
jds_id = pd.DataFrame([(i, 0) for i in range(len(job_descs))],
                         columns=['jds_id', 'cvs_id'])

cvs_ap_id = pd.DataFrame([(i, 'ap_{}'.format(j)) for i in range(len(cvs_ap)) \
                                 for j in range(len(cvs_ap[i]))],
                         columns=['jds_id', 'cvs_id'])

cvs_inap_id = pd.DataFrame([(i, 'inap_{}'.format(j)) for i in range(len(cvs_inap)) \
                                   for j in range(len(cvs_inap[i]))],
                           columns=['jds_id', 'cvs_id'])

dataset_id = pd.concat([jds_id, cvs_ap_id, cvs_inap_id], keys=['jds', 'cvs_ap', 'cvs_inap'])

# TF_IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=0.003,
                     max_df=0.5,
                     max_features=500,
                     stop_words=stop)

vectorizer.fit(flat_clean_dataset)
jds_matrix = vectorizer.transform(flat_clean_dataset.loc[['job_descs']])
cvs_matrix = vectorizer.transform(flat_clean_dataset.loc[['cvs_ap', 'cvs_inap']])
tfidf_feature_names = vectorizer.get_feature_names()

In [11]:
cvs_df = pd.DataFrame(cvs_matrix.todense(), columns=tfidf_feature_names)
cvs_df['jds_id'] = dataset_id['jds_id'].loc[['cvs_ap', 'cvs_inap']].tolist()
cvs_df['cvs_id'] = dataset_id['cvs_id'].loc[['cvs_ap', 'cvs_inap']].tolist()

cvs_df

Unnamed: 0,10,2000,2001,2003,2004,2005,2006,2007,2008,2009,...,vivência,vmware,window,word,work,year,áreas,órgãos,jds_id,cvs_id
0,0.042491,0.000000,0.026231,0.049406,0.000000,0.000000,0.062066,0.019354,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.027480,0.053141,0.000000,0.000000,0,ap_0
1,0.038206,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.278433,0.061727,0.033086,...,0.000000,0.000000,0.000000,0.000000,0.049417,0.000000,0.030221,0.000000,0,ap_1
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.050469,0.000000,0,ap_2
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.265475,0.000000,0.220228,0.141651,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,ap_3
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.029063,...,0.000000,0.000000,0.041435,0.000000,0.000000,0.000000,0.000000,0.000000,0,ap_4
5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.020827,0.022327,...,0.000000,0.000000,0.000000,0.000000,0.016674,0.016122,0.000000,0.000000,0,ap_5
6,0.000000,0.000000,0.000000,0.000000,0.000000,0.099649,0.000000,0.084975,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.096793,0.120653,0.000000,0.000000,0.000000,0,ap_6
7,0.000000,0.000000,0.071201,0.000000,0.000000,0.000000,0.000000,0.105067,0.093172,0.099881,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,ap_7
8,0.030496,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.027781,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.038140,0.000000,0.000000,0,ap_8
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.088524,0.000000,0.000000,0.067482,0.000000,0,ap_9


In [12]:
jds_df = pd.DataFrame(jds_matrix.todense(), columns=tfidf_feature_names)
jds_df['jds_id'] = dataset_id['jds_id'].loc[['jds']].tolist()

jds_df

Unnamed: 0,10,2000,2001,2003,2004,2005,2006,2007,2008,2009,...,visando,vivência,vmware,window,word,work,year,áreas,órgãos,jds_id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.345049,0.087038,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213169,0.080658,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.215308,0.0,0.0,0.0,0.0,0.0,0.0,0.067656,0.025599,2


In [13]:
from sklearn.metrics.pairwise import cosine_similarity

dic_jds_id = dict(zip(jds_id['jds_id'], jds_matrix.todense()))

cvs_matrix_dense = cvs_matrix.todense()

cos_sim = []

for i, j in enumerate(cvs_df['jds_id']):
    cvs_vec = cvs_matrix_dense[i]
    jds_vec = dic_jds_id[j]
    cos_sim.append(cosine_similarity(jds_vec, cvs_vec)[0][0])
   
cos_sim

[0.091186284020945363,
 0.24466688312264967,
 0.22683009633359674,
 0.088598610290065785,
 0.16877875122425778,
 0.17028561859184749,
 0.14793467602106869,
 0.17309545184498412,
 0.02780190096797722,
 0.25296452207274495,
 0.082892558479540621,
 0.21759444116594576,
 0.092123303389430047,
 0.10594370907183451,
 0.052337941099573689,
 0.073486066006483636,
 0.061655372527053887,
 0.18219342344413997,
 0.079846971722028176,
 0.16589009993942755,
 0.14431577830747905,
 0.29364121850292868,
 0.27207732013329877,
 0.18882697845209356,
 0.041031989831512378,
 0.0,
 0.27155665660739248,
 0.23193058768077537,
 0.15839379758789998,
 0.2976883629870849,
 0.1395698637384894,
 0.0,
 0.14020933477371375,
 0.084557240443426079,
 0.12387069701258818,
 0.025259681860519204,
 0.11831324105390773,
 0.23396487307429251,
 0.37810398366122083,
 0.25798354412445756,
 0.30452091478096627,
 0.35450773965575688,
 0.41784636158609006,
 0.25294451298752396,
 0.30036093182697432,
 0.39286584831970428,
 0.32610250

In [20]:
cvs_df['temp_id'] = cvs_df.index # necessário para ordenar o dataframe após o merge
final_df = pd.merge(jds_df, cvs_df, on='jds_id', suffixes=('_jd', '_cv'), how='inner')
final_df = final_df.sort_values('temp_id')

final_df['cosine_similarity'] = cos_sim

del final_df['temp_id']
del final_df['cvs_id']
#del final_df['jds_id']

cols = final_df.columns.tolist()
cols = [e for e in cols if e != 'jds_id'] + ['jds_id']
final_df = final_df.reindex(columns = cols)

final_df['ground_truth'] = [1 for _ in range(len(flat_clean_cvs_ap))] + \
                           [0 for _ in range(len(flat_clean_cvs_inap))]

final_df

Unnamed: 0,10_jd,2000_jd,2001_jd,2003_jd,2004_jd,2005_jd,2006_jd,2007_jd,2008_jd,2009_jd,...,vmware_cv,window_cv,word_cv,work_cv,year_cv,áreas_cv,órgãos_cv,cosine_similarity,jds_id,ground_truth
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.027480,0.053141,0.000000,0.000000,0.091186,0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.049417,0.000000,0.030221,0.000000,0.244667,0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.050469,0.000000,0.226830,0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.088599,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.041435,0.000000,0.000000,0.000000,0.000000,0.000000,0.168779,0,1
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.016674,0.016122,0.000000,0.000000,0.170286,0,1
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.096793,0.120653,0.000000,0.000000,0.000000,0.147935,0,1
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.173095,0,1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.038140,0.000000,0.000000,0.027802,0,1
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.088524,0.000000,0.000000,0.067482,0.000000,0.252965,0,1


In [29]:
from sklearn.model_selection import train_test_split

ncol = final_df.columns.size - 1
x_train, x_test, y_train, y_test = np.empty([0, ncol]), np.empty([0, ncol]), np.empty([0, ]), np.empty([0, ])

print('number of cvs: [ap, inap, total]')
      
for i in range(len(job_opts)):
    ith_final_df = final_df[final_df['jds_id'] == i]
    
    ith_x_train, ith_x_test, ith_y_train, ith_y_test = train_test_split(ith_final_df.values[:,:-1],
                                                                        ith_final_df.values[:,-1],
                                                                        test_size=0.33, random_state=42,
                                                                        stratify=ith_final_df.values[:,-1])
    x_train = np.concatenate((x_train, ith_x_train), axis=0)
    x_test = np.concatenate((x_test, ith_x_test), axis=0)
    y_train = np.concatenate((y_train, ith_y_train), axis=0)
    y_test = np.concatenate((y_test, ith_y_test), axis=0)
    
    a, b, c = sum(ith_y_train), len(ith_y_train)-sum(ith_y_train), ith_y_train.shape[0]
    d, f, g = sum(ith_y_test), len(ith_y_test)-sum(ith_y_test), ith_y_test.shape[0]
    print('\n *jd {}'.format(i))
    print('   train [{:2.0f}, {:2.0f}, {}]'.format(a, b, c))
    print('   test [{:2.0f}, {:2.0f}, {}]'.format(d, f, g))
    print('   TOTAL [{:2.0f}, {:2.0f}, {}]'.format(a+d, b+f, c+g))
    
id_train = x_train[:, -1]
id_test = x_test[:, -1]
x_train = x_train[:, :-1]
x_test = x_test[:, :-1]

number of cvs: [ap, inap, total]

 *jd 0
   train [25, 58, 83]
   test [13, 29, 42]
   TOTAL [38, 87, 125]

 *jd 1
   train [23, 56, 79]
   test [11, 28, 39]
   TOTAL [34, 84, 118]

 *jd 2
   train [17, 63, 80]
   test [ 8, 32, 40]
   TOTAL [25, 95, 120]


In [105]:
def recall_at_n(id_test, sort_pred, job_opts):
    
    recall_at_1, recall_at_2, recall_at_3 = [], [], []
    all_at_1, all_at_2, all_at_3 = [], [], []
    for i in map(lambda x: int(x), set(id_test)): 
        ith_sort_pred = [e for e in sort_pred if e[0] == i]
    
        total_ap = sum(map(lambda x: x[1], ith_sort_pred))
        total_inap = len(ith_sort_pred) - total_ap
        print('\n{} CVs aptos e {} CVs inaptos p/ a vaga {}\n'.format(int(total_ap),
                                                                      int(total_inap),
                                                                      job_opts[i]))

        ith_sort_pred_ap = [e for e in ith_sort_pred if e[1] == 1.0]
        ith_sort_pred_inap = [e for e in ith_sort_pred if e[1] == 0.0][:9]

        at_1, at_2, at_3 = [], [], []
        for e in ith_sort_pred_ap:
            jth_ith_sort_pred = sorted([e] + ith_sort_pred_inap, key=lambda x: -x[2])
            at_1.append(sum(map(lambda x: x[1], jth_ith_sort_pred[:1])))
            at_2.append(sum(map(lambda x: x[1], jth_ith_sort_pred[:2])))
            at_3.append(sum(map(lambda x: x[1], jth_ith_sort_pred[:3])))
    
        recall_at_1.append(mean(at_1))
        print('Recall@1: {:.2}'.format(recall_at_1[i]))

        recall_at_2.append(mean(at_2))
        print('Recall@2: {:.2}'.format(recall_at_2[i]))
    
        recall_at_3.append(mean(at_3))
        print('Recall@3: {:.2}'.format(recall_at_3[i]))
    
        all_at_1.append(at_1)
        all_at_2.append(at_2)
        all_at_3.append(at_3)
    
    print('\nMean Recall@1: {:.2}'.format(mean([se for e in all_at_1 for se in e])))
    print('Mean Recall@2: {:.2}'.format(mean([se for e in all_at_2 for se in e])))
    print('Mean Recall@3: {:.2}'.format(mean([se for e in all_at_3 for se in e])))

### Regressão Logística

In [30]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(class_weight='balanced', random_state=42)
lr_model = lr.fit(x_train, y_train)

y_train_lr_pred_prob = lr_model.predict_proba(x_train)
y_test_lr_pred_prob = lr_model.predict_proba(x_test)

In [106]:
print('# Regreção Logística')
for s in ['Train', 'Test']:
    print('\n## {} set'.format(s))
    if s == 'Train':
        sort_pred = sorted(zip(id_train, y_train, y_train_lr_pred_prob[:,1]), key=lambda x: -x[2])
    elif s == 'Test':
        sort_pred = sorted(zip(id_test, y_test, y_test_lr_pred_prob[:,1]), key=lambda x: -x[2])

    recall_at_n(id_test, sort_pred, job_opts)

# Regreção Logística

## Train set

25 CVs aptos e 58 CVs inaptos p/ a vaga Consultor de Desenvolvimento de Novos Negocios

Recall@1: 0.8
Recall@2: 0.8
Recall@3: 0.88

23 CVs aptos e 56 CVs inaptos p/ a vaga Consultor de Relacao com Investidores

Recall@1: 0.13
Recall@2: 0.13
Recall@3: 0.39

17 CVs aptos e 63 CVs inaptos p/ a vaga Gerente Executivo de Sustentabilidade

Recall@1: 0.82
Recall@2: 0.94
Recall@3: 0.94

Mean Recall@1: 0.57
Mean Recall@2: 0.6
Mean Recall@3: 0.72

## Test set

13 CVs aptos e 29 CVs inaptos p/ a vaga Consultor de Desenvolvimento de Novos Negocios

Recall@1: 0.0
Recall@2: 0.077
Recall@3: 0.077

11 CVs aptos e 28 CVs inaptos p/ a vaga Consultor de Relacao com Investidores

Recall@1: 0.36
Recall@2: 0.82
Recall@3: 0.91

8 CVs aptos e 32 CVs inaptos p/ a vaga Gerente Executivo de Sustentabilidade

Recall@1: 0.62
Recall@2: 0.75
Recall@3: 0.75

Mean Recall@1: 0.28
Mean Recall@2: 0.5
Mean Recall@3: 0.53


### Redes Neurais

In [107]:
from sklearn.neural_network import MLPClassifier

nn = MLPClassifier(hidden_layer_sizes=(1024, 5), random_state=42)
nn_model = nn.fit(x_train, y_train)

y_train_nn_pred_prob = nn_model.predict_proba(x_train)
y_test_nn_pred_prob = nn_model.predict_proba(x_test)

In [108]:
print('Rede Neural')
for s in ['Train', 'Test']:
    print('\n## {} set'.format(s))
    if s == 'Train':
        sort_pred = sorted(zip(id_train, y_train, y_train_nn_pred_prob[:,1]), key=lambda x: -x[2])
    elif s == 'Test':
        sort_pred = sorted(zip(id_test, y_test, y_test_nn_pred_prob[:,1]), key=lambda x: -x[2])

    recall_at_n(id_test, sort_pred, job_opts)

Rede Neural

## Train set

25 CVs aptos e 58 CVs inaptos p/ a vaga Consultor de Desenvolvimento de Novos Negocios

Recall@1: 1.0
Recall@2: 1.0
Recall@3: 1.0

23 CVs aptos e 56 CVs inaptos p/ a vaga Consultor de Relacao com Investidores

Recall@1: 1.0
Recall@2: 1.0
Recall@3: 1.0

17 CVs aptos e 63 CVs inaptos p/ a vaga Gerente Executivo de Sustentabilidade

Recall@1: 1.0
Recall@2: 1.0
Recall@3: 1.0

Mean Recall@1: 1.0
Mean Recall@2: 1.0
Mean Recall@3: 1.0

## Test set

13 CVs aptos e 29 CVs inaptos p/ a vaga Consultor de Desenvolvimento de Novos Negocios

Recall@1: 0.0
Recall@2: 0.23
Recall@3: 0.69

11 CVs aptos e 28 CVs inaptos p/ a vaga Consultor de Relacao com Investidores

Recall@1: 0.36
Recall@2: 0.36
Recall@3: 0.45

8 CVs aptos e 32 CVs inaptos p/ a vaga Gerente Executivo de Sustentabilidade

Recall@1: 0.38
Recall@2: 0.88
Recall@3: 0.88

Mean Recall@1: 0.22
Mean Recall@2: 0.44
Mean Recall@3: 0.66


### Random Forest

In [109]:
from sklearn.ensemble import RandomForestClassifier


rf = RandomForestClassifier(n_estimators=300, max_depth=10, class_weight='balanced', random_state=42)
rf_model = rf.fit(x_train, y_train)

y_train_rf_pred_prob = rf_model.predict_proba(x_train)
y_test_rf_pred_prob = rf_model.predict_proba(x_test)

In [110]:
print('# Random Forest')
for s in ['Train', 'Test']:
    print('\n## {} set'.format(s))
    if s == 'Train':
        sort_pred = sorted(zip(id_train, y_train, y_train_rf_pred_prob[:,1]), key=lambda x: -x[2])
    elif s == 'Test':
        sort_pred = sorted(zip(id_test, y_test, y_test_rf_pred_prob[:,1]), key=lambda x: -x[2])

    recall_at_n(id_test, sort_pred, job_opts)

# Random Forest

## Train set

25 CVs aptos e 58 CVs inaptos p/ a vaga Consultor de Desenvolvimento de Novos Negocios

Recall@1: 1.0
Recall@2: 1.0
Recall@3: 1.0

23 CVs aptos e 56 CVs inaptos p/ a vaga Consultor de Relacao com Investidores

Recall@1: 1.0
Recall@2: 1.0
Recall@3: 1.0

17 CVs aptos e 63 CVs inaptos p/ a vaga Gerente Executivo de Sustentabilidade

Recall@1: 1.0
Recall@2: 1.0
Recall@3: 1.0

Mean Recall@1: 1.0
Mean Recall@2: 1.0
Mean Recall@3: 1.0

## Test set

13 CVs aptos e 29 CVs inaptos p/ a vaga Consultor de Desenvolvimento de Novos Negocios

Recall@1: 0.0
Recall@2: 0.23
Recall@3: 0.38

11 CVs aptos e 28 CVs inaptos p/ a vaga Consultor de Relacao com Investidores

Recall@1: 0.36
Recall@2: 0.73
Recall@3: 0.82

8 CVs aptos e 32 CVs inaptos p/ a vaga Gerente Executivo de Sustentabilidade

Recall@1: 0.12
Recall@2: 0.88
Recall@3: 0.88

Mean Recall@1: 0.16
Mean Recall@2: 0.56
Mean Recall@3: 0.66


In [None]:
sorted(zip(rf_model.feature_importances_, final_df.columns[:-2]), key=lambda x: -x[0])

In [None]:
#####

In [None]:
sorted(list(zip(tfidf_feature_names,tfidf_matrix[100,:].todense().tolist()[0])), key=lambda x: -x[1])

In [None]:
tfidf_inv_matrix = vectorizer.inverse_transform(tfidf_matrix)

In [None]:
len(tfidf_inv_matrix[2])

In [None]:
id_cvs[0], tfidf_inv_matrix[2]

# LDA

### Preparing Document-Term Matrix

In [None]:
# Creating the term dictionary of our courpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(clean_cvs_plus_desc)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
cv_term_matrix = [dictionary.doc2bow(doc) for doc in clean_cvs_plus_desc]

# [len(e) for e in [cv for cv in clean_cvs_ok]]
# [1523, 705, 668, 935, 687, 748, 313, 370, 583, 423, 976]

# [len(e) for e in [dictionary.doc2bow(cv) for cv in clean_cvs_ok]]
# [740, 372, 411, 523, 431, 469, 198, 226, 366, 300, 513]

In [None]:
print(len(cv_term_matrix))
print([len(e) for e in cv_term_matrix])

In [None]:
cv_term_matrix_2 = [sorted([e for e in cv if e[1] >= 5], key=lambda x: x[1], reverse=True) \
                    for cv in cv_term_matrix]

print(len(cv_term_matrix_2))
print([len(e) for e in cv_term_matrix_2])

In [None]:
[e for e in list(dictionary.items()) if e[1] == 'inglês']

### Running LDA Model

In [None]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(cv_term_matrix_2, num_topics=10, id2word = dictionary, passes=500)

### Results

In [None]:
n = 10

for i in range(n):
    print(ldamodel.print_topics(num_topics=n, num_words=5)[i][1])

# doc2vec

In [None]:
import locale
import glob
import os.path
import requests
import tarfile
import sys
import codecs
import smart_open
import time

import os.path

import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

import numpy as np
import statsmodels.api as sm
from random import sample

# For timing
from contextlib import contextmanager
from timeit import default_timer
import time 

from collections import defaultdict

from random import shuffle
import datetime

import random


import random
from IPython.display import HTML

from gensim.models import KeyedVectors

import logging

### Load corpus

In [None]:
dirname = 'aclImdb'
filename = 'aclImdb_v1.tar.gz'
locale.setlocale(locale.LC_ALL, 'C')

if sys.version > '3':
    control_chars = [chr(0x85)]
else:
    control_chars = [unichr(0x85)]

# Convert text to lower-case and strip punctuation/symbols from words
def normalize_text(text):
    norm_text = text.lower()
    # Replace breaks with spaces
    norm_text = norm_text.replace('<br />', ' ')
    # Pad punctuation with spaces on both sides
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
        norm_text = norm_text.replace(char, ' ' + char + ' ')
    return norm_text

start = time.clock()

if not os.path.isfile('aclImdb/alldata-id.txt'):
    if not os.path.isdir(dirname):
        if not os.path.isfile(filename):
            # Download IMDB archive
            print("Downloading IMDB archive...")
            url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename
            r = requests.get(url)
            with open(filename, 'wb') as f:
                f.write(r.content)
        tar = tarfile.open(filename, mode='r')
        tar.extractall()
        tar.close()

    # Concatenate and normalize test/train data
    print("Cleaning up dataset...")
    folders = ['train/pos', 'train/neg', 'test/pos', 'test/neg', 'train/unsup']
    alldata = u''
    for fol in folders:
        temp = u''
        output = fol.replace('/', '-') + '.txt'
        # Is there a better pattern to use?
        txt_files = glob.glob(os.path.join(dirname, fol, '*.txt'))
        for txt in txt_files:
            with smart_open.smart_open(txt, "rb") as t:
                t_clean = t.read().decode("utf-8")
                for c in control_chars:
                    t_clean = t_clean.replace(c, ' ')
                temp += t_clean
            temp += "\n"
        temp_norm = normalize_text(temp)
        with smart_open.smart_open(os.path.join(dirname, output), "wb") as n:
            n.write(temp_norm.encode("utf-8"))
        alldata += temp_norm

    with smart_open.smart_open(os.path.join(dirname, 'alldata-id.txt'), 'wb') as f:
        for idx, line in enumerate(alldata.splitlines()):
            num_line = u"_*{0} {1}\n".format(idx, line)
            f.write(num_line.encode("utf-8"))

end = time.clock()
print ("Total running time: ", end-start)

In [None]:
import os.path
assert os.path.isfile("aclImdb/alldata-id.txt"), "alldata-id.txt unavailable"

In [None]:
SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

alldocs = []  # Will hold all docs in original order
with open('aclImdb/alldata-id.txt', encoding='utf-8') as alldata:
    for line_no, line in enumerate(alldata):
        tokens = gensim.utils.to_unicode(line).split()
        words = tokens[1:]
        tags = [line_no] # 'tags = [tokens[0]]' would also work at extra memory cost
        split = ['train', 'test', 'extra', 'extra'][line_no//25000]  # 25k train, 25k test, 25k extra
        sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no//12500] # [12.5K pos, 12.5K neg]*2 then unknown
        alldocs.append(SentimentDocument(words, tags, split, sentiment))

train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
doc_list = alldocs[:]  # For reshuffling per pass

print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))

### Set-up Doc2Vec Training & Evaluation Models

In [None]:
cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

simple_models = [
    # PV-DM w/ concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DBOW 
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DM w/ average
    Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
]

# Speed up setup by sharing results of the 1st model's vocabulary scan
simple_models[0].build_vocab(alldocs)  # PV-DM w/ concat requires one special NULL word so it serves as template
print(simple_models[0])
for model in simple_models[1:]:
    model.reset_from(simple_models[0])
    print(model)

models_by_name = OrderedDict((str(model), model) for model in simple_models)

In [None]:
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])

### Predictive Evaluation Methods

In [None]:
@contextmanager
def elapsed_timer():
    start = default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    end = default_timer()
    elapser = lambda: end-start
    
def logistic_predictor_from_data(train_targets, train_regressors):
    logit = sm.Logit(train_targets, train_regressors)
    predictor = logit.fit(disp=0)
    # print(predictor.summary())
    return predictor

def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):
    """Report error rate on test_doc sentiments, using supplied model and train_docs"""

    train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]]) for doc in train_set])
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_data = test_set
    if infer:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    else:
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_docs]
    test_regressors = sm.add_constant(test_regressors)
    
    # Predict & evaluate
    test_predictions = predictor.predict(test_regressors)
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

### Bulk Training

In [None]:
best_error = defaultdict(lambda: 1.0)  # To selectively print only best errors achieved

In [None]:
alpha, min_alpha, passes = (0.025, 0.001, 20)
alpha_delta = (alpha - min_alpha) / passes

print("START %s" % datetime.datetime.now())

for epoch in range(passes):
    shuffle(doc_list)  # Shuffling gets best results
    
    for name, train_model in models_by_name.items():
        # Train
        duration = 'na'
        train_model.alpha, train_model.min_alpha = alpha, alpha
        with elapsed_timer() as elapsed:
            train_model.train(doc_list, total_examples=len(doc_list), epochs=1)
            duration = '%.1f' % elapsed()
            
        # Evaluate
        eval_duration = ''
        with elapsed_timer() as eval_elapsed:
            err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs)
        eval_duration = '%.1f' % eval_elapsed()
        best_indicator = ' '
        if err <= best_error[name]:
            best_error[name] = err
            best_indicator = '*' 
        print("%s%f : %i passes : %s %ss %ss" % (best_indicator, err, epoch + 1, name, duration, eval_duration))

        if ((epoch + 1) % 5) == 0 or epoch == 0:
            eval_duration = ''
            with elapsed_timer() as eval_elapsed:
                infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs, infer=True)
            eval_duration = '%.1f' % eval_elapsed()
            best_indicator = ' '
            if infer_err < best_error[name + '_inferred']:
                best_error[name + '_inferred'] = infer_err
                best_indicator = '*'
            print("%s%f : %i passes : %s %ss %ss" % (best_indicator, infer_err, epoch + 1, name + '_inferred', duration, eval_duration))

    print('Completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta
    
print("END %s" % str(datetime.datetime.now()))

### Achieved Sentiment-Prediction Accuracy

In [None]:
# Print best error rates achieved
print("Err rate Model")
for rate, name in sorted((rate, name) for name, rate in best_error.items()):
    print("%f %s" % (rate, name))

### Examining Results

#### Are inferred vectors close to the precalculated ones?

In [None]:
doc_id = np.random.randint(simple_models[0].docvecs.count)  # Pick random doc; re-run cell for more examples
print('for doc %d...' % doc_id)
for model in simple_models:
    inferred_docvec = model.infer_vector(alldocs[doc_id].words)
    print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))

#### Do close documents seem more related than distant ones?

In [None]:
doc_id = np.random.randint(simple_models[0].docvecs.count)  # pick random doc, re-run cell for more examples
model = random.choice(simple_models)  # and a random model
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)  # get *all* similar documents
print(u'TARGET (%d): «%s»\n' % (doc_id, ' '.join(alldocs[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(alldocs[sims[index][0]].words)))

#### Do the word vectors show useful similarities?

In [None]:
word_models = simple_models[:]

In [None]:
list(word_models[0].wv.index2word)

In [None]:
# pick a random word with a suitable number of occurences
while True:
    word = random.choice(word_models[0].wv.index2word)
    if word_models[0].wv.vocab[word].count > 10:
        break
# or uncomment below line, to just pick a word from the relevant domain:
#word = 'comedy/drama'
similars_per_model = [str(model.most_similar(word, topn=20)).replace('), ','),<br>\n') for model in word_models]
similar_table = ("<table><tr><th>" +
    "</th><th>".join([str(model) for model in word_models]) + 
    "</th></tr><tr><td>" +
    "</td><td>".join(similars_per_model) +
    "</td></tr></table>")
print("most similar words for '%s' (%d occurences)" % (word, simple_models[0].wv.vocab[word].count))
HTML(similar_table)

#### Are the word vectors from this dataset any good at analogies?

In [None]:
# Download this file: https://github.com/nicholas-leonard/word2vec/blob/master/questions-words.txt
# and place it in the local directory
# Note: this takes many minutes
if os.path.isfile('questions-words.txt'):
    for model in word_models:
        sections = model.accuracy('questions-words.txt')
        correct, incorrect = len(sections[-1]['correct']), len(sections[-1]['incorrect'])
        print('%s: %0.2f%% correct (%d of %d)' % (model, float(correct*100)/(correct+incorrect), correct, correct+incorrect))

### Slop

In [None]:
This cell left intentionally erroneous.

In [None]:
w2v_g100b = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
w2v_g100b.compact_name = 'w2v_g100b'
word_models.append(w2v_g100b)

In [None]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
rootLogger = logging.getLogger()
rootLogger.setLevel(logging.INFO)

In [None]:
%load_ext autoreload
%autoreload 2