In [133]:
import re

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import xgboost as xgb
 

from lxml import etree
from lxml import objectify

import nltk
import string

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from multiprocessing import Pool

N_PROCS = 8

In [134]:
filename_str = "Data/train.xml.gz"

parser = etree.XMLParser(ns_clean=True, encoding='utf-8')
data_train_tree = etree.parse(filename_str,parser)
root = data_train_tree.getroot()  

len(root.getchildren())
# pd.DataFrame(zip(map (etree.tostring, root.getchildren()[0:1000]),
#                                                     np.array(range(1000))))

10000

In [141]:
##   Reading XML data

def parse_tree_struct((xml_root, k)):

    dt_row = dict()   ## for storing rows
    dt_industr =  dict()  ## for storing industries
    
    b_childs = etree.fromstring(xml_root)
    dt_row['id'] = b_childs.get("id")
    b_childs = b_childs.getchildren()
    

    for tree_elem in b_childs:
        tag_val = tree_elem.tag
        num_val = tree_elem.text
        dt_row[tag_val] = num_val

        if tree_elem.tag == 'term':
            new_tag = tree_elem.getchildren()[0].tag
            dt_row[new_tag] = tree_elem.getchildren()[0].text

        if tree_elem.tag == 'requirement':
            new_tag = tree_elem.getchildren()[0].tag
            dt_row[new_tag] = tree_elem.getchildren()[0].text

        if tree_elem.tag == 'industries':
            for ind_elem in tree_elem.getchildren():
                dt_industr [ind_elem.text] = 1

    dt_row = pd.Series(dt_row)
    dt_row['y_industr'] = dt_industr
    
    return dt_row 

def read_data_gz(filename_str):
    
    # initiate parser
    parser = etree.XMLParser(ns_clean=True, encoding='utf-8')
    data_train_tree = etree.parse(filename_str,parser)

    # set up column names
    col_names = ['id','update-date','industries','job-name','salary','currency','employment','schedule','description','experience','contract','y_industr']

    root = data_train_tree.getroot()  
    root_data_size = len(root.getchildren())
    
    ## processing xml data in pool
    pool = Pool(processes = N_PROCS)
    dt_data = pool.map(parse_tree_struct, zip(map (etree.tostring, root.getchildren()),
                                                    np.array(range(root_data_size))))
    pool.close()
    pool.join()
        
    dt_data = pd.DataFrame(dt_data,columns=col_names)    
    tt = map (lambda x: pd.Series(dict(x)), dt_data['y_industr'])
    y_train = pd.DataFrame(tt)
    
    dt_data = dt_data.drop('y_industr',axis = 1)
    ## dt_train = dt_train.append(dt_row, ignore_index=True)
    ## y_train = y_train.append(dt_industr, ignore_index=True)
        
    return dt_data, y_train ##dt_train, y_train
    
dt_train, y_train = read_data_gz ("Data/train.xml.gz")
dt_test, y_test = read_data_gz ("Data/test.xml.gz")

print "data loaded"

data loaded


In [142]:
dt_train.head()

Unnamed: 0,id,update-date,industries,job-name,salary,currency,employment,schedule,description,experience,contract
0,174673,2015-08-18 00:31:14 GMT+4,\n,Бетонщик,,,Полная занятость,Полный день,<p>В Компанию с центральным офисом в Москве (в...,1-3 года,постоянный
1,130222,2015-08-19 00:15:58 GMT+4,\n,Руководитель интернет-проекта (обувь),от 50000 до 70000,RUR,Полная занятость,Полный день,<p><strong>Должностные обязанности:</strong></...,3-6 лет,постоянный
2,121331,2015-08-19 01:44:02 GMT+4,\n,Врач-офтальмолог,,,Полная занятость,Полный день,<p><strong>Обязанности:</strong></p> <ul> <li>...,3-6 лет,постоянный
3,30170,2015-08-20 11:29:29 GMT+4,\n,Специалист аналитического отдела,,,Проектная/Временная работа,Полный день,<p><strong>Обязанности:</strong></p> <ul> <li>...,1-3 года,постоянный
4,43059,2015-08-20 11:01:03 GMT+4,\n,Руководитель проекта лаборатории нефтехимическ...,,,Полная занятость,Полный день,<p>Руководитель проекта лаборатории нефтехимич...,1-3 года,постоянный


In [143]:
# joining all data
dt_all = pd.concat((dt_train,dt_test),axis=0)
dt_all.index = range (dt_all.shape[0])

print max(dt_all.index)

19999


In [144]:
## removing nans from y_train
y_train [np.isnan(y_train)] = 0

In [145]:
dt_all = dt_all.drop(['industries','currency','update-date'], axis=1)

In [146]:
# s - string salary
def process_salary(salary_str):
    l_bound = np.nan
    h_bound = np.nan
    try:
        m = re.search(u'от ([0-9]*)', salary_str)
        if m: 
            l_bound = float(m.group(1))
        m = re.search(u'до ([0-9]*)', salary_str)
        if m: 
            h_bound = float(m.group(1))
    except:
        pass
    return l_bound, h_bound  


salary_hl = pd.DataFrame(map(process_salary, dt_all['salary']),columns=['low_sal','high_sal'])

print dt_all.shape
print salary_hl.shape

dt_all = dt_all.join(salary_hl)
dt_all = dt_all.drop('salary', axis = 1)

(20000, 8)
(20000, 2)


In [147]:
## replacing nans with approximations
mean_rel = np.mean(dt_all['high_sal']/dt_all['low_sal'])

null_rows = np.isnan(dt_all['low_sal'])
dt_all.loc[null_rows,'low_sal'] = dt_all['high_sal'][null_rows] / mean_rel

null_rows = np.isnan(dt_all['high_sal'])
dt_all.loc[null_rows,'high_sal'] = dt_all['low_sal'][null_rows] * mean_rel


## replacing all missing values with zeroes
dt_all['no_sal_data'] = 0
null_rows = np.isnan(dt_all['high_sal'])
dt_all.loc[null_rows,'no_sal_data'] = 1
dt_all.loc[null_rows,'low_sal'] = 0
dt_all.loc[null_rows,'high_sal'] = 0

In [148]:
dt_all.head()

Unnamed: 0,id,job-name,employment,schedule,description,experience,contract,low_sal,high_sal,no_sal_data
0,174673,Бетонщик,Полная занятость,Полный день,<p>В Компанию с центральным офисом в Москве (в...,1-3 года,постоянный,0.0,0.0,1
1,130222,Руководитель интернет-проекта (обувь),Полная занятость,Полный день,<p><strong>Должностные обязанности:</strong></...,3-6 лет,постоянный,50000.0,70000.0,0
2,121331,Врач-офтальмолог,Полная занятость,Полный день,<p><strong>Обязанности:</strong></p> <ul> <li>...,3-6 лет,постоянный,0.0,0.0,1
3,30170,Специалист аналитического отдела,Проектная/Временная работа,Полный день,<p><strong>Обязанности:</strong></p> <ul> <li>...,1-3 года,постоянный,0.0,0.0,1
4,43059,Руководитель проекта лаборатории нефтехимическ...,Полная занятость,Полный день,<p>Руководитель проекта лаборатории нефтехимич...,1-3 года,постоянный,0.0,0.0,1


In [149]:
## Creating one-hot features for category columns

def make_bin_cols(table_data):
   pass 

col_names = ['employment','schedule','experience','contract']
dt_bin = pd.DataFrame()
dt = dt_all[col_names]
j = 0

for j in range(dt.shape[1]):
    for row_val in np.unique(dt.iloc[:,j]):
        new_col = np.zeros(dt.shape[0])   
        new_col[np.array(dt.iloc[:,j] == row_val)] = 1
        col_name = dt.columns.values[j] + "_" + row_val
        dt_bin[col_name] = new_col
        
dt_all = dt_all.join(dt_bin)
dt_all = dt_all.drop(col_names, axis =1 )
##pd.Series(np.unique(dt_all['employment']))

In [150]:
def clean_str(mystr):
    text = re.sub(u"<.*?>","", mystr)
    text = re.sub(u"&quot;","", text)
    return text
    
dt_all['description'] = map(clean_str, dt_all['description'])
dt_all['job-name'] = map(clean_str, dt_all['job-name'])


In [151]:
dt_all.head()

Unnamed: 0,id,job-name,description,low_sal,high_sal,no_sal_data,employment_Полная занятость,employment_Проектная/Временная работа,employment_Стажировка,employment_Частичная занятость,schedule_Вахтовый метод,schedule_Гибкий график,schedule_Полный день,schedule_Сменный график,schedule_Удаленная работа,experience_1-3 года,experience_3-6 лет,experience_Более 6 лет,experience_Нет опыта,contract_постоянный
0,174673,Бетонщик,В Компанию с центральным офисом в Москве (вход...,0.0,0.0,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,130222,Руководитель интернет-проекта (обувь),Должностные обязанности: Управление ИНТЕРНЕТ ...,50000.0,70000.0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,121331,Врач-офтальмолог,Обязанности: проведение полной диагностики со...,0.0,0.0,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,30170,Специалист аналитического отдела,Обязанности: Обработка больших массивов инфор...,0.0,0.0,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,43059,Руководитель проекта лаборатории нефтехимическ...,Руководитель проекта лаборатории нефтехимическ...,0.0,0.0,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [152]:
##  nltk.download("punkt")
##
##  http://www.markhneedham.com/blog/2015/02/15/pythonscikit-learn-calculating-tfidf-on-how-i-met-your-mother-transcripts/
##

def tokenize(text):
    stemmer = SnowballStemmer("russian")
    text = re.sub(u"[^a-zA-Z0-9а-яА-Я -]","",text)
    text = re.sub(u"--","",text) 
    text = re.sub(u"($-)|( -)|(-^)","",text)
    text = re.sub(u" 000",u"000",text)

    tokens = nltk.word_tokenize(text)
    stems = []
    
    for item in tokens:
        stems.append(stemmer.stem(item))
    return stems


def make_tfidf_decomposition (train_dt, test_dt, pca_size):

    rus_stopwords = stopwords.words("russian")

    tfidf = TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 5), min_df=15, stop_words = rus_stopwords)

    tfs_train = tfidf.fit_transform(train_dt)
    tfs_test  = tfidf.transform(test_dt)

    pca_tf = PCA(n_components = pca_size)
    tfs_train = pd.DataFrame(pca_tf.fit_transform (tfs_train.todense()))
    tfs_test  = pd.DataFrame(pca_tf.transform (tfs_test.todense()))
    
    feature_names = pd.Series(tfidf.get_feature_names())

    return tfs_train, tfs_test, feature_names

# pd.set_option('display.max_rows', 1000)
# print feature_names
# pd.set_option('display.max_rows', 100)

In [153]:
descr_train, descr_test, feat_name = make_tfidf_decomposition (dt_all['description'][0:1000], dt_all['description'][1000:2000], pca_size = 200)

descr_train.columns = map (lambda x: 'descr ' + (str(x)), descr_train.columns.values)
descr_test.columns  = map (lambda x: 'descr ' + (str(x)), descr_test.columns.values)

descr_all = pd.concat((descr_train,descr_test),axis=0)
dt_all = dt_all.join(descr_all)

del descr_train, descr_test, descr_all

In [154]:
title_train, title_test, feat_name = make_tfidf_decomposition (dt_all['job-name'][0:1000], dt_all['job-name'][1000:2000], pca_size = 30)

title_train.columns = map (lambda x: 'title ' + (str(x)), title_train.columns.values)
title_test.columns  = map (lambda x: 'title ' + (str(x)), title_test.columns.values)

title_all = pd.concat((title_train, title_test),axis=0)
dt_all = dt_all.join(title_all)

del title_train, title_test, title_all

In [155]:
X = dt_all.copy(deep = True)
print X.shape
print dt_all.shape

(23000, 250)
(23000, 250)


In [156]:
## look at feature names
##

pd.set_option('display.max_rows', 3000)
print feat_name
pd.set_option('display.max_rows', 100)

0                     агент
1             администратор
2                 бухгалтер
3                     ведущ
4                     главн
5                     групп
6                  директор
7                   инженер
8                    кассир
9                    клиент
10                        м
11                  магазин
12                 менеджер
13          менеджер продаж
14           менеджер работ
15    менеджер работ клиент
16             мерчендайзер
17                начальник
18                    отдел
19             отдел продаж
20             представител
21     продавец-консультант
22                   продаж
23                   проект
24                    работ
25             работ клиент
26                   развит
27               региональн
28      региональн менеджер
29              руководител
30               специалист
31                    старш
32                   торгов
33      торгов представител
34                 финансов
dtype: object


In [157]:
ids = dt_all['id']
X = X.drop(['id','job-name','description'], axis = 1)

In [158]:
X.head()

Unnamed: 0,low_sal,high_sal,no_sal_data,employment_Полная занятость,employment_Проектная/Временная работа,employment_Стажировка,employment_Частичная занятость,schedule_Вахтовый метод,schedule_Гибкий график,schedule_Полный день,...,title 20,title 21,title 22,title 23,title 24,title 25,title 26,title 27,title 28,title 29
0,0.0,0.0,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.021529,0.010153,0.038994,-0.072299,-0.010667,0.029637,-0.003615,-0.003033,-0.000129,-0.006611
0,0.0,0.0,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.06973,-0.534233,0.004862,-0.046774,-0.20765,0.027178,0.122384,-0.00668,0.000626,-0.009482
0,0.0,0.0,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.021529,0.010153,0.038994,-0.072299,-0.010667,0.029637,-0.003615,-0.003033,-0.000129,-0.006611
0,0.0,0.0,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.06973,-0.534233,0.004862,-0.046774,-0.20765,0.027178,0.122384,-0.00668,0.000626,-0.009482
1,50000.0,70000.0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.021529,0.010153,0.038994,-0.072299,-0.010667,0.029637,-0.003615,-0.003033,-0.000129,-0.006611


In [None]:
N_PROCS = 4

def unwrapLS(arg, **kwarg):
    return CLFLS.oneFit(*arg, **kwarg)

class CLFLS(BaseEstimator, ClassifierMixin):
    def __init__(self, X, y, param, final):
        self.X = X
        self.y = y
        self.param = param        
        self.final = final
        self.clf = []          
    
    def oneFit(self, i):
        if len(set(self.y[:, i])) == 2:
            #return SVC(C = 1e-3, probability = True).fit(self.X, self.y[:, i])
            return LinearSVC(C = self.param).fit(self.X, self.y[:, i])
            #return Ridge().fit(self.X, self.y[:, i])
            #return RidgeClassifier().fit(self.X, self.y[:, i])
            #return BernoulliNB(alpha = 0.01).fit(self.X, self.y[:, i])
            #return LinearRegression().fit(self.X, self.y[:, i])
            #return LogisticRegression().fit(self.X, self.y[:, i])
            #return SGDClassifier(loss = 'perceptron', penalty = 'elasticnet', n_iter = 50).fit(self.X, self.y[:, i])
        
    def fit(self):
        pool = Pool(processes = N_PROCS)
        self.clf = pool.map(unwrapLS, zip([self] * 200, array(range(200))))
        pool.close()
        pool.join()            

    def predict(self, X):
        predicted = []
        for i in range(200):
            if len(set(self.y[:, i])) == 2:
                pred = self.clf[i].decision_function(X)
                #pred = self.clf[i].predict(X)
                #pred = self.clf[i].predict_proba(X)[:, 0]
            else:
                pred = [-10] * shape(X)[0]

            predicted.append(pred)
        del self.clf
        gc.collect()
        
        return array(predicted).T

In [159]:
clf = RandomForestClassifier()

np.sum(y_train == 1)

Автомобильный бизнес                                      289
Административный персонал                                 613
Банки, инвестиции, лизинг                                1081
Безопасность                                              199
Бухгалтерия, управленческий учет, финансы предприятия     946
Высший менеджмент                                         448
Государственная служба, некоммерческие организации         25
Добыча сырья                                              215
Домашний персонал                                         102
Закупки                                                   219
Инсталляция и сервис                                      174
Информационные технологии, интернет, телеком             1380
Искусство, развлечения, масс-медиа                        200
Консультирование                                          260
Маркетинг, реклама, PR                                   1344
Медицина, фармацевтика                                    472
Наука, о