In [1]:
import re

from mean_f1 import *  ## library with mean_f1 metric
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.manifold import TSNE
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import f1_score, roc_auc_score
import xgboost as xgb
 

from lxml import etree
from lxml import objectify

import nltk
import string

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from multiprocessing import Pool

# Number of worker processes given to the multiprocessing Pool in read_data_gz.
N_PROCS = 8



In [2]:
##   Reading XML data

def parse_tree_struct(xml_root_k):
    """Turn one serialized XML record into a flat pandas Series.

    Parameters
    ----------
    xml_root_k : tuple
        ``(xml_string, index)`` pair as produced by ``read_data_gz``.  Only the
        XML string is used; the index is accepted for call compatibility.

    Returns
    -------
    pandas.Series
        One entry per direct child tag (value = the tag's text), an ``id``
        entry taken from the record's ``id`` attribute, the first child of
        ``term`` / ``requirement`` stored under that child's own tag, and a
        ``y_industr`` entry holding a dict ``{industry text: 1}``.
    """
    # Explicit unpacking instead of a tuple parameter: tuple parameters were
    # removed from the language (PEP 3113) while pool.map still passes a
    # single tuple per task.
    xml_root, _index = xml_root_k

    dt_row = dict()       ## for storing rows
    dt_industr = dict()   ## for storing industries

    record = etree.fromstring(xml_root)
    dt_row['id'] = record.get("id")

    for tree_elem in record:
        dt_row[tree_elem.tag] = tree_elem.text
        children = list(tree_elem)

        if tree_elem.tag in ('term', 'requirement'):
            # The value of interest is the first nested element; an empty
            # container simply contributes nothing instead of raising.
            if children:
                first_child = children[0]
                dt_row[first_child.tag] = first_child.text
        elif tree_elem.tag == 'industries':
            for ind_elem in children:
                dt_industr[ind_elem.text] = 1

    dt_row = pd.Series(dt_row)
    dt_row['y_industr'] = dt_industr

    return dt_row

def read_data_gz(filename_str):
    """Load an XML dump into a feature frame and an industry-label frame.

    Parameters
    ----------
    filename_str : str
        Path handed directly to ``etree.parse``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``dt_data`` has one row per record with the fixed columns listed in
        ``col_names`` (minus ``y_industr``); ``y_train`` has one row per record
        and one column per industry seen, holding 1 where the record lists
        that industry and NaN elsewhere.
    """
    # initiate parser
    parser = etree.XMLParser(ns_clean=True, encoding='utf-8')
    data_train_tree = etree.parse(filename_str, parser)

    # set up column names
    col_names = ['id','update-date','industries','job-name','salary','currency','employment',
                 'schedule','description','experience','contract','y_industr']

    # Each record is serialized to a string before being handed to the pool
    # (presumably because parsed elements cannot be sent to worker processes).
    records = list(data_train_tree.getroot())
    tasks = [(etree.tostring(node), k) for k, node in enumerate(records)]

    ## processing xml data in pool
    pool = Pool(processes=N_PROCS)
    try:
        parsed_rows = pool.map(parse_tree_struct, tasks)
    finally:
        # Always release the worker processes, even when parsing fails.
        pool.close()
        pool.join()

    dt_data = pd.DataFrame(parsed_rows, columns=col_names)
    y_train = pd.DataFrame([pd.Series(dict(x)) for x in dt_data['y_industr']])

    dt_data = dt_data.drop('y_industr', axis=1)

    return dt_data, y_train
    
dt_train, y_class = read_data_gz ("Data/train.xml.gz")
dt_test, y_test = read_data_gz ("Data/test.xml.gz")

print "data loaded"

data loaded


In [3]:
print dt_train.shape
dt_train.head()

(10000, 11)


Unnamed: 0,id,update-date,industries,job-name,salary,currency,employment,schedule,description,experience,contract
0,174673,2015-08-18 00:31:14 GMT+4,\n,Бетонщик,,,Полная занятость,Полный день,<p>В Компанию с центральным офисом в Москве (в...,1-3 года,постоянный
1,130222,2015-08-19 00:15:58 GMT+4,\n,Руководитель интернет-проекта (обувь),от 50000 до 70000,RUR,Полная занятость,Полный день,<p><strong>Должностные обязанности:</strong></...,3-6 лет,постоянный
2,121331,2015-08-19 01:44:02 GMT+4,\n,Врач-офтальмолог,,,Полная занятость,Полный день,<p><strong>Обязанности:</strong></p> <ul> <li>...,3-6 лет,постоянный
3,30170,2015-08-20 11:29:29 GMT+4,\n,Специалист аналитического отдела,,,Проектная/Временная работа,Полный день,<p><strong>Обязанности:</strong></p> <ul> <li>...,1-3 года,постоянный
4,43059,2015-08-20 11:01:03 GMT+4,\n,Руководитель проекта лаборатории нефтехимическ...,,,Полная занятость,Полный день,<p>Руководитель проекта лаборатории нефтехимич...,1-3 года,постоянный


In [4]:
# joining all data
dt_all = pd.concat((dt_train,dt_test),axis=0)
dt_all.index = range (dt_all.shape[0])

print max(dt_all.index)

19999


In [5]:
## removing nans from y_class
# Industries a record does not list are NaN after loading; encode them as 0.
y_class = y_class.fillna(0)

In [6]:
# Columns that are not turned into features below.
unused_cols = ['industries', 'currency', 'update-date']
dt_all = dt_all.drop(unused_cols, axis=1)

In [7]:
# salary_str - salary as raw text, e.g. "от 50000 до 70000"
def process_salary(salary_str):
    """Extract the lower and upper salary bounds from a salary string.

    The lower bound is the number following ``от `` and the upper bound the
    number following ``до ``.  Each bound is parsed independently, so a
    missing or non-numeric lower bound no longer discards a valid upper bound.

    Parameters
    ----------
    salary_str : unicode or None
        Raw salary text; non-text values (e.g. ``None`` for a record without
        a salary) yield no bounds.

    Returns
    -------
    (float, float)
        ``(low, high)``; a bound that is absent is ``nan``.
    """
    l_bound = np.nan
    h_bound = np.nan

    try:
        # "+" (not "*") so the group can never be empty and float() cannot fail.
        low_match = re.search(u'от ([0-9]+)', salary_str)
        high_match = re.search(u'до ([0-9]+)', salary_str)
    except TypeError:
        # Not a string at all: treat as "no salary information".
        return l_bound, h_bound

    if low_match:
        l_bound = float(low_match.group(1))
    if high_match:
        h_bound = float(high_match.group(1))
    return l_bound, h_bound


salary_hl = pd.DataFrame(map(process_salary, dt_all['salary']),columns=['low_sal','high_sal'])

print dt_all.shape
print salary_hl.shape

dt_all = dt_all.join(salary_hl)
dt_all = dt_all.drop('salary', axis = 1)

(20000, 8)
(20000, 2)


In [8]:
## replacing nans with approximations
# Average high/low ratio over the rows where both bounds are known.
mean_rel = (dt_all['high_sal'] / dt_all['low_sal']).mean()

# A missing bound is estimated from the other one through that ratio.
dt_all['low_sal'] = dt_all['low_sal'].fillna(dt_all['high_sal'] / mean_rel)
dt_all['high_sal'] = dt_all['high_sal'].fillna(dt_all['low_sal'] * mean_rel)


## rows that are still empty carry no salary data at all: flag them, then zero-fill
null_rows = dt_all['high_sal'].isnull()
dt_all['no_sal_data'] = null_rows.astype(int)
dt_all.loc[null_rows, ['low_sal', 'high_sal']] = 0

In [9]:
# Preview the frame after the salary columns were rebuilt.
dt_all.head(5)

Unnamed: 0,id,job-name,employment,schedule,description,experience,contract,low_sal,high_sal,no_sal_data
0,174673,Бетонщик,Полная занятость,Полный день,<p>В Компанию с центральным офисом в Москве (в...,1-3 года,постоянный,0,0,1
1,130222,Руководитель интернет-проекта (обувь),Полная занятость,Полный день,<p><strong>Должностные обязанности:</strong></...,3-6 лет,постоянный,50000,70000,0
2,121331,Врач-офтальмолог,Полная занятость,Полный день,<p><strong>Обязанности:</strong></p> <ul> <li>...,3-6 лет,постоянный,0,0,1
3,30170,Специалист аналитического отдела,Проектная/Временная работа,Полный день,<p><strong>Обязанности:</strong></p> <ul> <li>...,1-3 года,постоянный,0,0,1
4,43059,Руководитель проекта лаборатории нефтехимическ...,Полная занятость,Полный день,<p>Руководитель проекта лаборатории нефтехимич...,1-3 года,постоянный,0,0,1


In [10]:
## Creating one-hot features for category columns

def make_bin_cols(table_data):
    """Build 0/1 indicator columns for every distinct value of every column.

    Parameters
    ----------
    table_data : pandas.DataFrame
        Frame whose columns hold categorical string values.

    Returns
    -------
    pandas.DataFrame
        Same index as ``table_data``; one float column per (column, value)
        pair named ``"<column>_<value>"``, holding 1.0 where the row has that
        value and 0.0 elsewhere.  Columns follow the input column order, with
        values in ``np.unique`` order inside each column.
    """
    bin_cols = pd.DataFrame(index=table_data.index)
    for src_col in table_data.columns.values:
        col_values = table_data[src_col]
        for row_val in np.unique(col_values):
            bin_cols[src_col + "_" + row_val] = np.array(col_values == row_val).astype(float)
    return bin_cols


col_names = ['employment','schedule','experience','contract']
dt_bin = make_bin_cols(dt_all[col_names])

# Swap the raw categorical columns for their indicator columns.
dt_all = dt_all.join(dt_bin)
dt_all = dt_all.drop(col_names, axis=1)
##pd.Series(np.unique(dt_all['employment']))

In [11]:
def clean_str(mystr):
    """Return ``mystr`` with ``<...>`` tags and ``&quot;`` entities deleted."""
    # Patterns are applied in order; every match is replaced by nothing.
    for pattern in (u"<.*?>", u"&quot;"):
        mystr = re.sub(pattern, "", mystr)
    return mystr
    
# Both free-text columns go through the same markup cleanup.
for text_col in ('description', 'job-name'):
    dt_all[text_col] = map(clean_str, dt_all[text_col])


In [12]:
# Display the frame's (rows, columns) after the one-hot expansion and text cleanup.
dt_all.shape

(20000, 20)

In [None]:
##  nltk.download("punkt")
##
##  http://www.markhneedham.com/blog/2015/02/15/pythonscikit-learn-calculating-tfidf-on-how-i-met-your-mother-transcripts/
##

# Built once and reused: tokenize() is called for every document.
_RUSSIAN_STEMMER = SnowballStemmer("russian")


def tokenize(text):
    """Normalize a text and split it into stemmed tokens.

    Characters other than Latin/Cyrillic letters, digits, spaces and hyphens
    are deleted, stray hyphens are dropped, a space before ``000`` is removed
    (so ``50 000`` becomes ``50000``), and every token produced by
    ``nltk.word_tokenize`` is stemmed with the Russian Snowball stemmer.

    Parameters
    ----------
    text : unicode
        Text to tokenize.

    Returns
    -------
    list
        Stemmed tokens in order of appearance.
    """
    # "ё"/"Ё" lie outside the а-я / А-Я code-point ranges and must be listed
    # explicitly, otherwise they are deleted from the middle of words.
    text = re.sub(u"[^a-zA-Z0-9а-яА-ЯёЁ -]", "", text)
    text = re.sub(u"--", "", text)
    # Hyphen at the very start, after a space, or at the very end.  (The
    # anchors were previously written as "$-" and "-^", which cannot match.)
    text = re.sub(u"(^-)|( -)|(-$)", "", text)
    text = re.sub(u" 000", u"000", text)

    return [_RUSSIAN_STEMMER.stem(token) for token in nltk.word_tokenize(text)]


def make_tfidf_decomposition (train_dt, test_dt, pca_size):
    """Fit TF-IDF + PCA on the training texts and project both text sets.

    Parameters
    ----------
    train_dt, test_dt : iterable of unicode
        Training and test documents.  The vectorizer and the PCA are fitted
        on ``train_dt`` only and then applied to ``test_dt``.
    pca_size : int
        Number of principal components to keep.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame, pandas.Series)
        PCA projections of the training and test TF-IDF matrices, and the
        vectorizer's feature names.
    """
    rus_stopwords = stopwords.words("russian")
    # The vectorizer filters stop words from the tokenizer's output, and
    # tokenize() returns stems, so the stemmed forms must be listed as well
    # or most stop words slip through.
    stemmed_stopwords = [stem for word in rus_stopwords for stem in tokenize(word)]
    stop_list = sorted(set(rus_stopwords) | set(stemmed_stopwords))

    tfidf = TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 5), min_df=15, stop_words=stop_list)

    tfs_train = tfidf.fit_transform(train_dt)
    tfs_test  = tfidf.transform(test_dt)

    pca_tf = PCA(n_components = pca_size)
    ## pca_tf = TSNE(n_components = pca_size)

    # toarray() gives a plain ndarray; todense() would give np.matrix.
    tfs_train = pd.DataFrame(pca_tf.fit_transform(tfs_train.toarray()))
    tfs_test  = pd.DataFrame(pca_tf.transform(tfs_test.toarray()))

    feature_names = pd.Series(tfidf.get_feature_names())

    return tfs_train, tfs_test, feature_names

# pd.set_option('display.max_rows', 1000)
# print feature_names
# pd.set_option('display.max_rows', 100)

In [None]:
## Create TFIDF vectors from description data
##

n_shape = dt_all.shape[0]/2
n_shape
descr_train, descr_test, feat_name = make_tfidf_decomposition (dt_all['description'][0:n_shape], dt_all['description'][n_shape:], pca_size = 200)

descr_train.columns = map (lambda x: 'descr ' + (str(x)), descr_train.columns.values)
descr_test.columns  = map (lambda x: 'descr ' + (str(x)), descr_test.columns.values)

descr_all = pd.concat((descr_train,descr_test),axis=0)
descr_all.index = range(descr_all.shape[0])

print dt_all.shape

dt_all = dt_all.join(descr_all)

print descr_all.shape, dt_all.shape

## del descr_train, descr_test, descr_all

In [None]:
## Create TFIDF vectors from job-name data
##

# dt_all is dt_train followed by dt_test, so the train/test boundary is the
# number of training rows (not "half of the rows", which only holds when both
# sets happen to be the same size).
n_shape = dt_train.shape[0]
title_train, title_test, feat_name = make_tfidf_decomposition(
    dt_all['job-name'].iloc[:n_shape], dt_all['job-name'].iloc[n_shape:], pca_size = 30)

title_train.columns = ['title ' + str(x) for x in title_train.columns.values]
title_test.columns  = ['title ' + str(x) for x in title_test.columns.values]

# Re-stack train and test components and renumber to match dt_all's index.
title_all = pd.concat((title_train, title_test),axis=0)
title_all.index = range(title_all.shape[0])

dt_all = dt_all.join(title_all)

del title_train, title_test, title_all

In [None]:
## look at feature names
##

pd.set_option('display.max_rows', 300)
print feat_name
pd.set_option('display.max_rows', 50)

In [None]:
## generate bool index columns with special words:

# Word stems whose presence is turned into 0/1 indicator features.
# All entries are unicode literals; "английск" was previously misspelled
# ("анлгийск") and therefore never matched.
special_words = [
    u"руковод", u"медиа", u"маркетин", u"автомоб", u"машин", u"юрист", u"бухгалт", u"страхова",
    u"менедж", u"продаж", u"реклам",
    u"инженер", u"английск", u"english", u"лаборат", u"исследов", u"медицин", u"программ", u"риск",
    u"финанс",
    u"закуп", u"снабжен", u"кассир", u"дизайнер", u"архитектур", u"строитель", u"туризм", u"логист",
    u"страхов",
]


for i_word in special_words:
    # Same flag for both text columns: 1.0 where the lower-cased text
    # contains the stem, 0.0 otherwise.
    for text_col in ("job-name", "description"):
        tt = np.array([re.search(i_word, x.lower()) is not None for x in dt_all[text_col]])
        dt_all[text_col + " " + i_word] = tt.astype(float)

dt_all.head()

In [None]:
# Software / product names whose presence is turned into 0/1 indicator features.
progs = [
    u"1с бухгалтерия", u"office", u"MS office", u"excel", u"word", u"outlook", u"autocad",
    u"Adobe Illustrator",
    u"Photoshop", u"Corel Draw", u"PHP", u"КАСКО", u"ОСАГО", u"ДГО",
]

# (A leftover debug lookup that used `test_index` and `i` was removed here:
# both names are only defined by later cells, so the cell failed on a fresh
# top-to-bottom run.)

for i_word in progs:
    needle = i_word.lower()
    # Same flag for both text columns: 1.0 where the lower-cased text
    # contains the lower-cased name, 0.0 otherwise.
    for text_col in ("job-name", "description"):
        tt = np.array([re.search(needle, x.lower()) is not None for x in dt_all[text_col]])
        dt_all[text_col + " " + needle] = tt.astype(float)

dt_all.head()

In [None]:
X = dt_all.copy(deep = True)
print X.shape
print dt_all.shape

ids = dt_all['id']
X = X.drop(['id','job-name','description'], axis = 1)


In [None]:
# Number of records carrying each industry label.
(y_class == 1).sum()

In [None]:
## Changing values in y_class to class numbers and lists
## i.e. from [0,1,0,1,0,0,1 ... ] to [2,4,7,..]
##

def makePredictionList(Ypred):
    """Convert per-class label vectors into one list of labels per sample.

    ``Ypred`` is a sequence of equal-length vectors, one per class, where a
    non-zero entry is the class number assigned to that sample.  The result
    has one list per sample holding its non-zero entries as ints, in class
    order.
    """
    per_sample = np.vstack(Ypred).T   # rows = samples, columns = classes
    return [row[row != 0].astype(int).tolist() for row in per_sample]


y_class_vals = pd.DataFrame(index=range(y_class.shape[0]))

# One vector per class: (column position + 1) where the record has the
# class, 0 elsewhere.
Ytrue = []
for i_col, col_name in enumerate(y_class.columns.values):
    has_class = np.array(y_class.loc[:, col_name] == 1)
    Ytrue.append(np.where(has_class, i_col + 1, 0).astype(float))

# Collapse the per-class vectors into one label list per record.
Ytrue = makePredictionList(Ytrue)


In [None]:
## Count number of categories for each id
##

num_cat_list = map (lambda tt: len(Ytrue[tt]), range(len(Ytrue)))

for k in np.unique(num_cat_list):
    print k, sum(num_cat_list==k)

In [None]:
import operator

## sorting probas and return top_2
def getTopClassByProba (prob_list, top_num):
    """Return the class numbers with the highest probabilities.

    Parameters
    ----------
    prob_list : sequence of float
        Probability per class; position ``p`` belongs to class ``p + 1``.
    top_num : int
        How many classes to return.

    Returns
    -------
    list of int
        Up to ``top_num`` 1-based class numbers, most probable first.  Equal
        probabilities keep their original order.  An empty ``prob_list`` or
        ``top_num == 0`` yields ``[]`` instead of raising.
    """
    # sorted() is stable, also with reverse=True, so ties stay in class order.
    ranked = sorted(enumerate(prob_list, 1), key=lambda pair: pair[1], reverse=True)
    return [int(class_no) for class_no, _ in ranked[:top_num]]

In [None]:
## based on Yprob matrix and ytrue real classes return threshold to maximize f1 statistic
##

def getBestProbVals(Yproba, y_true):
    """Find, per class, the probability threshold with the best F1 score.

    Thresholds 0.00, 0.01, ..., 0.99 are tried for every class; a sample is
    predicted positive when its probability is strictly greater than the
    threshold.

    Parameters
    ----------
    Yproba : pandas.DataFrame or 2-D array
        Predicted probabilities, rows = samples, columns = classes.
    y_true : pandas.DataFrame
        True 0/1 labels with the same layout.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``prob_arr`` - best threshold per class (0 if no threshold gave a
        positive F1); ``fmax_arr`` - the F1 score reached at that threshold.
    """
    # Convert once so that plain arrays work too (column access uses .iloc).
    Yproba = pd.DataFrame(Yproba)
    num_class = y_true.shape[1]

    prob_arr = np.zeros(num_class)
    fmax_arr = np.zeros(num_class)
    thresholds = np.arange(0, 1, 0.01)

    for t_cat in range(num_class):
        col_prob = Yproba.iloc[:, t_cat]
        col_true = y_true.iloc[:, t_cat]
        fmax = 0
        prob_best = 0
        for p_prob in thresholds:
            col_class = np.array(col_prob > p_prob).astype(int)
            col_f1_score = f1_score(col_true, col_class)
            # Strict ">" keeps the lowest threshold among equally good ones.
            if col_f1_score > fmax:
                prob_best = p_prob
                fmax = col_f1_score

        prob_arr[t_cat] = prob_best
        fmax_arr[t_cat] = fmax

    return prob_arr, fmax_arr
 

In [None]:
## convert predicted probas (y_prob) to classes if they are greater than the per-class thresholds in prob_arr
##

def probasToClasses(y_prob, prob_arr):
    """List the 1-based class numbers whose probability exceeds its threshold.

    ``y_prob`` holds one probability per class and ``prob_arr`` the matching
    thresholds; the comparison is strict (``>``).
    """
    above = np.asarray(y_prob > prob_arr)
    # Positions are 0-based, class numbers start at 1.
    return (np.flatnonzero(above) + 1).tolist()


##map (lambda x: probasToClasses(Yproba[x,],prob_arr) ,range(Yproba.shape[0]))

In [None]:
###  Making data classification:
###
###  1. doing standard xgboost classification on each class separately
###  2. Using class probas to estimate effective probability threshold for each class.
###  3. Estimating classes from threshold
###  4. If the set of classes is empty for any id, then the two most probable classes are used
###


Ypred  = []
Yproba = []
Ytrainproba = []


train_test_split = StratifiedShuffleSplit(y_class.iloc[:,0], n_iter = 1, test_size = 0.3)

for train_index, test_index in train_test_split:
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train_all, y_test_all = y_class.iloc[train_index,:], y_class.iloc[test_index,:]
    
    
    for t_cat in  range(y_class.shape[1]):

        y_train, y_test = y_train_all.iloc[:,t_cat], y_test_all.iloc[:,t_cat]

#         clf = ExtraTreesClassifier (n_estimators= 200, min_samples_leaf=5)
#         clf = RandomForestClassifier (n_estimators= 200, min_samples_leaf=5, n_jobs = -1)

        clf = xgb.XGBClassifier(missing=np.nan, max_depth=5, n_estimators=310, 
                                learning_rate=0.03, nthread=8, subsample=0.95, colsample_bytree=0.85, seed=4242)
        
        clf.fit(X_train,y_train)
        y_train_proba = clf.predict_proba(X_train)
        
        y_proba = clf.predict_proba(X_test)
        y_pred = y_proba[:,1]>0.2
        y_pred = y_pred.astype(int)
        y_pred [y_pred==1] = t_cat+1
        
        Ypred.append(y_pred)
        Yproba.append(y_proba[:,1])
        Ytrainproba.append(y_train_proba[:,1])
    
    Yproba = np.vstack(Yproba)
    Yproba = np.transpose(Yproba)
    
    Ytrainproba = np.vstack(Ytrainproba)
    Ytrainproba = np.transpose(Ytrainproba)
    Ytrainproba = pd.DataFrame(Ytrainproba)
    
    prob_arr, fmax_arr = getBestProbVals(Ytrainproba, y_train_all)
    Ypred = map (lambda x: probasToClasses(Yproba[x,],prob_arr) ,range(Yproba.shape[0]))
    
    # replace null answers with top 2
    for k in range(len(Ypred)):
        if len(Ypred[k]) == 0:
            Ypred[k] = getTopClassByProba (Yproba[k,:], 2)
    
    
    ## Ypred = map (lambda x: getTopClassByProba(Yproba[x,:], 2),range(Yproba.shape[0]))

    Ytrue_test = np.array(Ytrue)[test_index].tolist()

    print mean_f1 (Ytrue_test,Ypred)     

In [None]:
Ypred  = []

for t_cat in  range(y_class.shape[1]):
    
    y_train, y_test = y_train_all.iloc[:,t_cat], y_test_all.iloc[:,t_cat]
    
    clf = RandomForestClassifier (n_estimators= 200, min_samples_leaf=5, n_jobs = -1)
    clf.fit (Ytrainproba, y_train)
    y_pred = clf.predict(Yproba)
    y_pred [y_pred==1] = t_cat+1
    Ypred.append(y_pred)
    Ynew_proba = clf.predict_proba(Yproba)
    

Ypred = makePredictionList(Ypred)

Ynew_proba = np.vstack(Ynew_proba)
Ynew_proba = np.transpose(Ynew_proba)

for k in range(len(Ypred)):
    if len(Ypred[k]) == 0:
        Ypred[k] = getTopClassByProba (Yproba[k,:], 2)
        
print mean_f1 (Ytrue_test,Ypred) 

In [None]:
print len(Ytrue_test)
print len(Ypred)

# print zip(np.arange(1,len(Yproba[i])),prob_arr)
# print probasToClasses(Yproba[0,:],prob_arr)
print ""

for i in range(20):
    print Ytrue_test[i],Ypred[i],zip(np.arange(1,len(Yproba[i])),Yproba[i])

In [None]:
i = 188
print Ytrue_test[i],Ypred[i],zip(np.arange(1,len(Yproba[i])+1),Yproba[i])
test_index[i]

In [None]:
print dt_all.loc[test_index[i],"job-name"]
print dt_all.loc[test_index[i],"description"]

In [None]:
# Full 0/1 label row of the record under inspection.
y_class.iloc[test_index[i]]

In [None]:
print dt_all.loc[test_index[i],"description"].lower()