# NOTEBOOK TO STUDY NA's ON USERS

Lets check NAs in Users and try to fill somehow

### Load users

In [1]:
## First load main entities - load users

import numpy as np
import pandas as pd
import time

#start timer
t_start = time.time()

userData=pd.read_table('datasets/users.csv',header=0,dtype=object,nrows=10000000)
userData.columns = ['id','jobroles','career_level','discipline_id','industry_id','country','region',\
                    'experience_n_entries_class','experience_years_experience','experience_years_in_current',\
                    'edu_degree','edu_fieldofstudies','wtcj','premium']

userData['id']=userData['id'].astype(np.int32)
userData['jobroles']=userData['jobroles'].apply(lambda x: [np.int32(i) for i in x.split(',')])
userData['career_level']=userData['career_level'].astype(np.int8)
userData['discipline_id']=userData['discipline_id'].astype(np.int8)
userData['industry_id']=userData['industry_id'].astype(np.int8)
userData['region']=userData['region'].astype(np.int8)
userData['experience_n_entries_class']=userData['experience_n_entries_class'].astype(np.int8)
userData['experience_years_experience']=userData['experience_years_experience'].astype(np.int8)
userData['experience_years_in_current']=userData['experience_years_in_current'].astype(np.int8)
userData['edu_degree']=userData['edu_degree'].astype(np.int8)
userData['edu_fieldofstudies']=userData['edu_fieldofstudies'].apply(lambda x: [] if pd.isnull(x) else [np.int32(i) for i in x.split(',')])
userData['wtcj']=userData['wtcj'].astype(np.int8)
userData['premium']=userData['premium'].astype(np.int8)

#userData=userData.set_index('id')

#end timer
t_end=time.time()
print "Time invested "+str((t_end-t_start))+" s"

# Show structure
print userData.shape

Time invested 20.003000021 s
(1497020, 14)


### Let's remember user structure

    [id] anonymized ID of the user (referenced as user_id in the other datasets above)
    jobroles comma-separated list of jobrole terms (numeric IDs) that were extracted from the user's current job titles
    career_level career level ID (e.g. beginner, experienced, manager):
        0 = unknown
        1 = Student/Intern
        2 = Entry Level (Beginner)
        3 = Professional/Experienced
        4 = Manager (Manager/Supervisor)
        5 = Executive (VP, SVP, etc.)
        6 = Senior Executive (CEO, CFO, President)
    [discipline_id] anonymized IDs represent disciplines such as "Consulting", "HR", etc.
    [industry_id] anonymized IDs represent industries such as "Internet", "Automotive", "Finance", etc.
    [country] describes the country in which the user is currently working
        de = Germany
        at = Austria
        ch = Switzerland
        non dach = non of the above countries
    [region] is specified for some users who have as country de. Meaning of the regions see below
    [experience_n_entries_class] identifies the number of CV entries that the user has listed as work experiences
        0 = no entries
        1 = 1
        2 entries
        2 = 3
        4 entries
        3 = 5 or more entries
    [experience_years_experience] is the estimated number of years of work experience that the user has
        0 = unknown
        1 = less than 1 year
        2 = 1 - 3 years
        3 = 3 - 5 years
        4 = 5 - 10 years
        5 = 10 - 20 years
        6 = more than 20 years
    [experience_years_in_current] is the estimated number of years that the user is already working in her current job. Meaning of numbers: same as experience_years_experience
    [edu_degree] estimated university degree of the user
        0 or NULL = unknown
        1 = bachelor
        2 = master
        3 = phd
    [edu_fieldofstudies] comma
    separated fields of studies that the user studied. 0 means "unknown" and edu_fieldofstudies > 0 entries refer to broad field of studies such as Engineering, Economics and Legal, ...
    [wtcj] an estimation regarding the user's willingness to change jobs
        0 XING predicts the user has a low interest of changing her job soon
        1 XING predicts the user has a high interest in changing her current position
    [premium] the user subscribed to XING's payed premium membership
        0 no subscription
        1 active subscription


### Some quick checks on NAs

In [2]:
print userData.isnull().values.any()

print userData['career_level'].unique()
print "Num. user with no career_level",len(userData[userData['career_level']==0])

print userData['discipline_id'].unique()
print "Num. user with no discipline_id",len(userData[userData['discipline_id']==0])

print userData['industry_id'].unique()
print "Num. user with no industry_id",len(userData[userData['industry_id']==0])

False
[3 0 6 5 4 1 2]
Num. user with no career_level 537021
[ 0  4 17 23 11 21  1 14 15 16 22  5  8 18  6 19  2  7 13  3  9 10 20 12]
Num. user with no discipline_id 1115980
[ 0 15  4 16  6 12  5 14  3 20 22  9  7 11  8 17  2 13 18 23  1 19 21 10]
Num. user with no industry_id 486175


In [7]:
print len(userData[userData['career_level']!=0])
print len(userData)
print 1.0*len(userData[userData['career_level']!=0])/len(userData)
userData[userData['career_level']!=0].groupby(['career_level'])['career_level'].size()

959999
1497020
0.641273329682


career_level
1     10213
2     58386
3    478680
4    228961
5     76512
6    107247
Name: career_level, dtype: int64

In [8]:
print len(userData[userData['discipline_id']!=0])
print len(userData)
print 1.0*len(userData[userData['discipline_id']!=0])/len(userData)
userData[userData['discipline_id']!=0].groupby(['discipline_id'])['discipline_id'].size()

381040
1497020
0.254532337577


discipline_id
1     11850
2      4819
3     27950
4     50388
5     61599
6     22102
7      5333
8      7555
9      4000
10     5013
11    12999
12      422
13    12368
14     6121
15     5686
16    26183
17    59672
18    15070
19     1744
20     3447
21    17016
22     5576
23    14127
Name: discipline_id, dtype: int64

In [9]:
print len(userData[userData['industry_id']!=0])
print len(userData)
print 1.0*len(userData[userData['industry_id']!=0])/len(userData)
userData[userData['industry_id']!=0].groupby(['industry_id'])['industry_id'].size()

1010845
1497020
0.675238139771


industry_id
1      17276
2      28451
3      83591
4      30225
5      29927
6      61371
7     117766
8      49003
9      14956
10     14930
11     12392
12     19585
13     17865
14     28825
15     68461
16    156798
17     55906
18     22284
19     19353
20    112508
21     22057
22     13553
23     13762
Name: industry_id, dtype: int64

### Build feature vector to rebuild discipline_id

Let's assume we can build a LSI/LDA based on jobroles bag of words (plus other fields of user that may be related)

In [3]:
userData.head(10)

Unnamed: 0,id,jobroles,career_level,discipline_id,industry_id,country,region,experience_n_entries_class,experience_years_experience,experience_years_in_current,edu_degree,edu_fieldofstudies,wtcj,premium
0,30,[2551922],3,0,0,de,0,1,3,1,2,[2],1,0
1,50,"[4375874, 3415336, 2152789, 1431010]",3,4,15,de,7,3,4,1,2,"[5, 8]",1,0
2,70,"[851763, 2070276]",3,17,4,de,2,2,7,2,0,[],1,0
3,90,"[2139882, 2177068, 1520218, 3113130, 399936]",0,0,0,de,0,1,0,0,0,[],0,0
4,100,"[233434, 3142896, 3836967, 987884]",3,0,16,non_dach,0,3,5,5,0,[],0,0
5,120,"[3152052, 4009156]",3,0,6,de,1,3,4,3,0,[3],1,0
6,130,[0],0,0,0,de,0,3,4,0,2,"[2, 5]",1,0
7,150,"[519876, 302644, 821798]",3,0,12,de,9,2,5,3,2,[8],1,1
8,170,[4099592],0,0,16,de,1,3,6,2,0,[],1,0
9,180,[2967883],6,23,5,ch,0,3,6,2,2,[3],1,0


### First, we build the lda model (plus the dictionary, plus the "Corpus", ...)

- We choose 50 topics but dont really know best option
- Best option is to use LdaMulticore
- Best option is to increase passes parameter (but using 10 I need around 3h, so no way to use much more) --> It's important as LDA is somehow random, and lots of passes seems to "agregate" in more stable model


In [78]:
import gensim
from gensim import corpora

#start timer
t_start = time.time()                               

docs=userData['jobroles'].apply(lambda row: map(str,row)) 

#end timer
t_end=time.time()
print "Time invested (docs)"+str((t_end-t_start))+" s"

#start timer
t_start = time.time()                               

# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary=corpora.Dictionary(docs)
print(dictionary)
dictionary.save("na/recsys_dict_v2.pkl")

#end timer
t_end=time.time()
print "Time invested (create dictionary)"+str((t_end-t_start))+" s"

#start timer
t_start = time.time()   

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix=[dictionary.doc2bow(doc) for doc in docs] ## equal to "corpus"??

#end timer
t_end=time.time()
print "Time invested (create doc_term_matrix)"+str((t_end-t_start))+" s"



Time invested (docs)10.0169999599 s
Dictionary(12677 unique tokens: [u'4114496', u'2790498', u'4512151', u'2942918', u'3591036']...)
Time invested (create dictionary)18.6909999847 s
Time invested (create doc_term_matrix)14.2119998932 s


### Here we do the incremetal LDA build (to see progress and even to allow split the building)

In [79]:
# Creating the object for LDA model using gensim library
LdaMulticore=gensim.models.ldamulticore.LdaMulticore

#start timer
t_start = time.time()   

### NOW A LOOP
batch_size=50000
start=0
end=len(docs)
while start<end:
    nextstart=start+batch_size
    print "generating from "+str(start)+" to "+str(nextstart-1)
    #start timer
    t_start_inner=time.time() 
    inner_doc_term_matrix=[dictionary.doc2bow(doc) for doc in docs[start:nextstart]]
    if start==0:
        lda_model=LdaMulticore(inner_doc_term_matrix,num_topics=80,id2word=dictionary,workers=3,passes=25)
    else:
        lda_model.update(inner_doc_term_matrix)
    lda_model.save('na/ldamodel_v2.pkl')
    start=nextstart
    #end timer
    t_end=time.time()
    print "Time invested (this step)"+str((t_end-t_start_inner))+" s"
    
#end timer
t_end=time.time()
print "Time invested (whole lda training)"+str((t_end-t_start))+" s"



generating from 0 to 49999
Time invested (this step)854.621999979 s
generating from 50000 to 99999
Time invested (this step)823.195000172 s
generating from 100000 to 149999
Time invested (this step)816.54399991 s
generating from 150000 to 199999
Time invested (this step)815.621000051 s
generating from 200000 to 249999
Time invested (this step)813.414000034 s
generating from 250000 to 299999
Time invested (this step)836.289999962 s
generating from 300000 to 349999
Time invested (this step)813.648999929 s
generating from 350000 to 399999
Time invested (this step)813.599999905 s
generating from 400000 to 449999
Time invested (this step)812.913000107 s
generating from 450000 to 499999
Time invested (this step)816.351999998 s
generating from 500000 to 549999
Time invested (this step)816.601000071 s
generating from 550000 to 599999
Time invested (this step)845.978999853 s
generating from 600000 to 649999
Time invested (this step)811.725000143 s
generating from 650000 to 699999
Time invested 

### Optional step to reload all LDA assets

- We need to load saved dictionary & lda model
- We need to rebuild the "corpus" (docs & doc_term_matrix) --> we can think how to persist not to rebuild

In [46]:
import gensim
from gensim import corpora, models, similarities

## NOTE: we need user dataframe loaded!!!!

# Reload dictionary & lda model
dictionary = corpora.Dictionary.load('na/recsys_dict_v2.pkl')
print dictionary
lda_model=gensim.models.ldamulticore.LdaMulticore.load("na/ldamodel_v2.pkl")

#start timer
t_start = time.time()  

# rebuild docs
docs=userData['jobroles'].apply(lambda row: map(str,row)) 

#end timer
t_end=time.time()
print "Time invested (docs)"+str((t_end-t_start))+" s"

#start timer
t_start = time.time()                               

# rebuild doc term matrix
doc_term_matrix=[dictionary.doc2bow(doc) for doc in docs] ## equal to "corpus"??

#end timer
t_end=time.time()
print "Time invested (doc term matrix)"+str((t_end-t_start))+" s"

Dictionary(12677 unique tokens: [u'2790498', u'4512151', u'2942918', u'3591036', u'2653966']...)
Time invested (docs)12.7279999256 s
Time invested (doc term matrix)13.868999958 s


### Now we build a dataframe to represent every "doc" in corpus (user) with topic distribution as feature vector

- We can convert "bow" document to the list of topics but that only show the topics with enough probability 

In [3]:
## Let's retrieve the topic for one doc (user)

doc_user=doc_term_matrix[0] ## We build before the doc_term_matrix with doc2bow of every user jobroles

print lda_model[doc_user]

[(41, 0.50625000000000042)]


### According to articles, we can use "inference" operation to recover "tetha" values in the internal model of the LDA and rebuild full topic distribution

See  https://stackoverflow.com/questions/17310933/document-topical-distribution-in-gensim-lda 

In [5]:
#start timer
t_start = time.time() 

#theta, _ = lda_model.inference(doc_term_matrix[0:10000])
theta, _ = lda_model.inference(doc_term_matrix)
# Next one, makes the normalization of the theta values (should sum 1 the distribution of every doc)
## NOTE: long operation --> Any way to optimize?
theta /= theta.sum(axis=1)[:, None]

#end timer
t_end=time.time()
print "Time invested "+str((t_end-t_start))+" s"

theta

Time invested 1859.60800004 s


array([[ 0.00625   ,  0.00625   ,  0.00625   , ...,  0.00625   ,
         0.00625   ,  0.00625   ],
       [ 0.0025    ,  0.0025    ,  0.0025    , ...,  0.0025    ,
         0.0025    ,  0.0025    ],
       [ 0.00416667,  0.00416667,  0.00416667, ...,  0.00416667,
         0.00416667,  0.00416667],
       ..., 
       [ 0.003125  ,  0.003125  ,  0.003125  , ...,  0.003125  ,
         0.003125  ,  0.003125  ],
       [ 0.00178571,  0.00178571,  0.00178571, ...,  0.00178571,
         0.00178571,  0.00178571],
       [ 0.0015625 ,  0.0015625 ,  0.0015625 , ...,  0.0015625 ,
         0.0015625 ,  0.0015625 ]])

### Let's build a dataframe

In [6]:
import pandas as pd

userFeatures=pd.DataFrame(theta)
# Convert column name to str
userFeatures.columns=map(str,userFeatures.columns)

print userFeatures.shape

## save the dataframe
userFeatures.to_pickle('na/userLdaFeatures_v2.pkl')

userFeatures.head()

(1497020, 80)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
0,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,...,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625
1,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,...,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025
2,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,...,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167
3,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,...,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083
4,0.0025,0.0025,0.0025,0.0025,0.4025,0.0025,0.0025,0.0025,0.0025,0.0025,...,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025


### Now complete the user feature dataframe

we can try several options and/or add to the mix other fields that may contribute (we can also add others as industry even if the had lots of NAs)

In [3]:
import pandas as pd

## Optional step to reload Lda features dictionary

userFeatures=pd.read_pickle('na/userLdaFeatures_v2.pkl')

userFeatures.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
0,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,...,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625
1,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,...,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025
2,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,...,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167
3,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,...,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083
4,0.0025,0.0025,0.0025,0.0025,0.4025,0.0025,0.0025,0.0025,0.0025,0.0025,...,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025


In [4]:
### Let's select add user_id and Discipline....just for the sake of clarity

#userFeat=pd.concat([userData[['id','discipline_id']][0:100000],userFeatures],axis=1)

## First test onnly with LDA:
#userFeat=pd.concat([userData[['id','discipline_id']],userFeatures],axis=1)

## 2nd test add more features from users:
userFeat=pd.concat([userData[['id','career_level','discipline_id','industry_id','region','edu_degree','experience_n_entries_class'\
                              ,'experience_years_experience','experience_years_in_current','wtcj','premium']],userFeatures],axis=1)

## Next one to validate without LDA --> I did and LAD make improvement
#userFeat=userData[['id','career_level','discipline_id','industry_id','region','edu_degree','experience_n_entries_class'\
#                   ,'experience_years_experience','experience_years_in_current','wtcj','premium']]
                    


'''
[career_level]
[discipline_id]
[industry_id]
[country]
[region]
[experience_n_entries_class]
[experience_years_experience]
[experience_years_in_current]
[edu_degree]
[edu_fieldofstudies]
[wtcj] 
[premium]
'''

print userFeat.shape

userFeat.head()

(1497020, 91)


Unnamed: 0,id,career_level,discipline_id,industry_id,region,edu_degree,experience_n_entries_class,experience_years_experience,experience_years_in_current,wtcj,...,70,71,72,73,74,75,76,77,78,79
0,30,3,0,0,0,2,1,3,1,1,...,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625
1,50,3,4,15,7,2,3,4,1,1,...,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025
2,70,3,17,4,2,0,2,7,2,1,...,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167
3,90,0,0,0,0,0,1,0,0,0,...,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083
4,100,3,0,16,0,0,3,5,5,0,...,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025


### We can work and add (optional) the couuntry and edu_fieldofstudies fields

In [5]:
#start timer
t_start=time.time()

countries=pd.Series(userData['country'].astype('category').cat.codes.values)
countries=pd.DataFrame(countries)
countries.columns=['ctry']
edufields=userData['edu_fieldofstudies'].astype(str).str.strip('[]').str.get_dummies(sep=', ').astype(np.int8)
edufields.columns=['e1','e2','e3','e4','e5','e6','e7','e8','e9']
userFeat=pd.concat([userFeat,countries,edufields],axis=1)

#end timer
t_end=time.time()
print "Time invested "+str((t_end-t_start))+" s"

Time invested 15.5950000286 s


In [6]:
userFeat.head()

Unnamed: 0,id,career_level,discipline_id,industry_id,region,edu_degree,experience_n_entries_class,experience_years_experience,experience_years_in_current,wtcj,...,ctry,e1,e2,e3,e4,e5,e6,e7,e8,e9
0,30,3,0,0,0,2,1,3,1,1,...,2,0,1,0,0,0,0,0,0,0
1,50,3,4,15,7,2,3,4,1,1,...,2,0,0,0,0,1,0,0,1,0
2,70,3,17,4,2,0,2,7,2,1,...,2,0,0,0,0,0,0,0,0,0
3,90,0,0,0,0,0,1,0,0,0,...,2,0,0,0,0,0,0,0,0,0
4,100,3,0,16,0,0,3,5,5,0,...,3,0,0,0,0,0,0,0,0,0


In [7]:
##userFeat=userFeat[0:500000]
userClasif=userFeat[userFeat['discipline_id']!=0].copy()
userClasif=userClasif.reset_index(drop=True)
Y=userClasif['discipline_id']
X=userClasif[userClasif.columns.difference(['id','discipline_id'])]

print X.shape

(381040, 99)


### Let's try several classifiers

In [9]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)

#start timer
t_start=time.time()

## We use 70% as we're going to use later to predict (so use most of data)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=0)

#clf = tree.DecisionTreeClassifier()
clf = RandomForestClassifier(n_estimators=60,n_jobs=3)
#clf = GradientBoostingClassifier(n_estimators=60)
clf = clf.fit(X_train, y_train)

y1 = clf.predict(X_train)
y2 = clf.predict(X_test)

#end timer
t_end=time.time()
print "Time invested "+str((t_end-t_start))+" s"

print float(sum(y1-y_train==0))/len(X_train)*100
print float(sum(y2-y_test==0))/len(X_test)*100

from sklearn.metrics import accuracy_score
print accuracy_score(y_train,y1)
print accuracy_score(y_test,y2)

Time invested 78.0859999657 s
99.2852379247
62.9400237945
0.992852379247
0.629400237945


### Insight/Conclusion

 - It seems we can rebuild without much stress 40% of discipline_id (only with the LDA) 
 - <b>We get 58,3% using rest of fields too</b>
 - Increasing to a LDA of 80 topics we move to 63%
 - I tested not using the LDA and only the other user fields and the prediction moves to 40% (so LDA brings value)
 - It's possible getting the best classifier, plus more data, plus GridSearchCV, plus more features to improve
 - Important to add more features of user to check
 - Question: is it best idea to fill NAs or to add a new feature called estimated_discipline?

#### Let's predict now over all orphan discipline to rebuild

In [10]:
print userFeat.shape

orphanUserClasif=userFeat[userFeat['discipline_id']==0].copy()
#orphanUserClasif=orphanUserClasif.reset_index(drop=True)
#Y=orphanUserClasif['discipline_id']
X=orphanUserClasif[orphanUserClasif.columns.difference(['id','discipline_id'])]

print X.shape

X.head()

(1497020, 101)
(1115980, 99)


Unnamed: 0,0,1,10,11,12,13,14,15,16,17,...,e8,e9,edu_degree,experience_n_entries_class,experience_years_experience,experience_years_in_current,industry_id,premium,region,wtcj
0,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,...,0,0,2,1,3,1,0,0,0,1
3,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,...,0,0,0,1,0,0,0,0,0,0
4,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,0.0025,...,0,0,0,3,5,5,16,0,0,0
5,0.004167,0.004167,0.3375,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,...,0,0,0,3,4,3,6,0,1,1
6,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,...,0,0,2,3,4,0,0,0,0,1


#### Predict over orphan discipline records

In [11]:
#start timer
t_start=time.time()

Y = clf.predict(X)

#end timer
t_end=time.time()
print "Time invested "+str((t_end-t_start))+" s"


Time invested 74.1010000706 s


#### Add the predicted discipline as "est_descipline_id"

In [12]:
orphanUserClasif['est_discipline_id']=Y
#userData=userData.join()
newUserData=userData.join(orphanUserClasif['est_discipline_id'])
#newUserData[newUserData['est_discipline_id']==None]
newUserData.head()

Unnamed: 0,id,jobroles,career_level,discipline_id,industry_id,country,region,experience_n_entries_class,experience_years_experience,experience_years_in_current,edu_degree,edu_fieldofstudies,wtcj,premium,est_discipline_id
0,30,[2551922],3,0,0,de,0,1,3,1,2,[2],1,0,13.0
1,50,"[4375874, 3415336, 2152789, 1431010]",3,4,15,de,7,3,4,1,2,"[5, 8]",1,0,
2,70,"[851763, 2070276]",3,17,4,de,2,2,7,2,0,[],1,0,
3,90,"[2139882, 2177068, 1520218, 3113130, 399936]",0,0,0,de,0,1,0,0,0,[],0,0,13.0
4,100,"[233434, 3142896, 3836967, 987884]",3,0,16,non_dach,0,3,5,5,0,[],0,0,17.0


#### Now, reuse known discipline_id in the est_discipline_id

In [13]:
import numpy as np

est_discipline_id=[(i if np.isnan(j) else j) for i,j in zip(newUserData['discipline_id'],newUserData['est_discipline_id'])]
newUserData['est_discipline_id']=est_discipline_id
newUserData.head()


Unnamed: 0,id,jobroles,career_level,discipline_id,industry_id,country,region,experience_n_entries_class,experience_years_experience,experience_years_in_current,edu_degree,edu_fieldofstudies,wtcj,premium,est_discipline_id
0,30,[2551922],3,0,0,de,0,1,3,1,2,[2],1,0,13.0
1,50,"[4375874, 3415336, 2152789, 1431010]",3,4,15,de,7,3,4,1,2,"[5, 8]",1,0,4.0
2,70,"[851763, 2070276]",3,17,4,de,2,2,7,2,0,[],1,0,17.0
3,90,"[2139882, 2177068, 1520218, 3113130, 399936]",0,0,0,de,0,1,0,0,0,[],0,0,13.0
4,100,"[233434, 3142896, 3836967, 987884]",3,0,16,non_dach,0,3,5,5,0,[],0,0,17.0


#### Create new version of our userData dataframe

In [14]:
newUserData.to_pickle('datasets/users_fill_discipline.pkl')

### Test now with XGBoost  (ignored the XGB for the moment)

Some commands are specific to windows (to my installation)

(Note: we havent tried full data and all fields....initially seemed to perform better RandomForest...strange!!)

In [69]:
#https://www.ibm.com/developerworks/community/blogs/jfp/entry/Installing_XGBoost_For_Anaconda_on_Windows?lang=en

## Next ones seem to be window specific (and for my installation)
import os
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-6.3.0-posix-seh-rt_v5-rev2\\mingw64\\bin'
os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']

## The next ones should be default?
import xgboost as xgb
import numpy as np

#start timer
t_start=time.time()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.7, random_state=0)

clf=xgb.XGBClassifier(objective='multi:softmax',max_depth=3,n_estimators=25,learning_rate=0.01,nthread=3)
clf = clf.fit(X_train, y_train)

y1 = clf.predict(X_train)
y2 = clf.predict(X_test)

#end timer
t_end=time.time()
print "Time invested "+str((t_end-t_start))+" s"

print float(sum(y1-y_train==0))/len(X_train)*100
print float(sum(y2-y_test==0))/len(X_test)*100

from sklearn.metrics import accuracy_score
print accuracy_score(y_train,y1)
print accuracy_score(y_test,y2)





Time invested 52.8710000515 s
37.8624515859
36.4538704829
0.378624515859
0.364538704829


<font color=red>NOTE: It's pending re-train with all existing data and predict over missing NAs and move the values back to userData dataframe (that will some coding work)</font>

## Let's repeat to rebuild Industry

In [15]:
## We assume previous sections were executed

userFeat2=userFeat##[0:500000] #we reduce a little to speed up 
userFeat2['est_discipline_id']=newUserData['est_discipline_id'] ##### Optional to test

userClasif=userFeat2[userFeat2['industry_id']!=0].copy()
userClasif=userClasif.reset_index(drop=True)
Y=userClasif['industry_id']
X=userClasif[userClasif.columns.difference(['id','industry_id'])]

print X.shape

(1010845, 100)


In [17]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)

#start timer
t_start=time.time()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=0)

#clf = tree.DecisionTreeClassifier()
clf = RandomForestClassifier(n_estimators=60,n_jobs=3)
#clf = GradientBoostingClassifier(n_estimators=40)
clf = clf.fit(X_train, y_train)

y1 = clf.predict(X_train)
y2 = clf.predict(X_test)

#end timer
t_end=time.time()
print "Time invested "+str((t_end-t_start))+" s"

print float(sum(y1-y_train==0))/len(X_train)*100
print float(sum(y2-y_test==0))/len(X_test)*100

from sklearn.metrics import accuracy_score
print accuracy_score(y_train,y1)
print accuracy_score(y_test,y2)

Time invested 461.833999872 s
94.1272180052
55.536942629
0.941272180052
0.55536942629


### Insights:

 - It seems not so good this method to predict industry (even so we get around 32% not using all data)
 - With LDA of 80 topics we move to 35%
 - It may be a good idea to first add "estimated" disciplined....maybe some will improve -> If we do that we move to 55%, but maybe it's not a good idea (it could be overfitting?)


## Let's enrich now userData with estimated industry id

In [18]:
print userFeat2.shape
orphanUserClasif=userFeat2[userFeat2['industry_id']==0].copy()
X=orphanUserClasif[orphanUserClasif.columns.difference(['id','industry_id'])]
print X.shape

#start timer
t_start=time.time()

Y = clf.predict(X)

#end timer
t_end=time.time()
print "Time invested "+str((t_end-t_start))+" s"

orphanUserClasif['est_industry_id']=Y
newUserData=newUserData.join(orphanUserClasif['est_industry_id'])
est_industry_id=[(i if np.isnan(j) else j) for i,j in zip(newUserData['industry_id'],newUserData['est_industry_id'])]
newUserData['est_industry_id']=est_industry_id

newUserData.to_pickle('datasets/users_fill_discipline_industry.pkl')

(1497020, 102)
(486175, 100)
Time invested 38.8699998856 s


### Let's try too asses the possibiity to rebuild career_level

In [19]:
## We assume previous sections were executed

#userFeat3=userFeat ## Depend if we want to reuse predicted ones
userFeat3=userFeat2#[0:700000] #we reduce a little to speed up 
userFeat3['est_industry_id']=newUserData['est_industry_id'] ##### Optional to test
userClasif=userFeat3[userFeat3['career_level']!=0].copy()
userClasif=userClasif.reset_index(drop=True)
Y=userClasif['career_level']
X=userClasif[userClasif.columns.difference(['id','career_level'])]

print X.shape

(959999, 101)


In [20]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)

#start timer
t_start=time.time()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=0)

#clf = tree.DecisionTreeClassifier()
clf = RandomForestClassifier(n_estimators=60,n_jobs=3)
#clf = GradientBoostingClassifier(n_estimators=40)
clf = clf.fit(X_train, y_train)

y1 = clf.predict(X_train)
y2 = clf.predict(X_test)

#end timer
t_end=time.time()
print "Time invested "+str((t_end-t_start))+" s"

print float(sum(y1-y_train==0))/len(X_train)*100
print float(sum(y2-y_test==0))/len(X_test)*100

from sklearn.metrics import accuracy_score
print accuracy_score(y_train,y1)
print accuracy_score(y_test,y2)

Time invested 339.145999908 s
70.8361111111
0.708361111111


### Insight

- We predict around 60% in career level
- With LDA of 80 topics move to 62%
- Using too estimated discipline and industry mvoes to 68% (but overfit??)
- And we can study to include estimated disciplined_id (and/or estimated industry) plus use GridSearchCV and even other better classifiers (GradientBoostings?)

#### Let's enrich user data with estimated career level

In [21]:
print userFeat3.shape
orphanUserClasif=userFeat3[userFeat3['career_level']==0].copy()
X=orphanUserClasif[orphanUserClasif.columns.difference(['id','career_level'])]
print X.shape

#start timer
t_start=time.time()

Y = clf.predict(X)

#end timer
t_end=time.time()
print "Time invested "+str((t_end-t_start))+" s"

orphanUserClasif['est_career_level']=Y
newUserData=newUserData.join(orphanUserClasif['est_career_level'])
est_industry_id=[(i if np.isnan(j) else j) for i,j in zip(newUserData['career_level'],newUserData['est_career_level'])]
newUserData['est_career_level']=est_industry_id

newUserData.to_pickle('datasets/users_fill_discipline_industry_career.pkl')

(1497020, 103)
(537021, 101)
Time invested 16.4530000687 s


### Do the experiment to predict again discipline with new estimated industry and career level (will overfit???)

In [22]:
userFeat4=userFeat3 
userFeat4['est_career_level']=newUserData['est_career_level'] 
userClasif=userFeat4[userFeat4['discipline_id']!=0].copy()
userClasif=userClasif.reset_index(drop=True)
Y=userClasif['discipline_id']
X=userClasif[userClasif.columns.difference(['id','discipline_id','est_discipline_id'])] ## remove first est discipline_id

print X.shape

(381040, 101)


In [24]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)

#start timer
t_start=time.time()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=0)

#clf = tree.DecisionTreeClassifier()
clf = RandomForestClassifier(n_estimators=60,n_jobs=3)
#clf = GradientBoostingClassifier(n_estimators=40)
clf = clf.fit(X_train, y_train)

y1 = clf.predict(X_train)
y2 = clf.predict(X_test)

#end timer
t_end=time.time()
print "Time invested "+str((t_end-t_start))+" s"

print float(sum(y1-y_train==0))/len(X_train)*100
print float(sum(y2-y_test==0))/len(X_test)*100

from sklearn.metrics import accuracy_score
print accuracy_score(y_train,y1)
print accuracy_score(y_test,y2)

Time invested 80.2869999409 s
99.2858554297
63.5313877808
0.992858554297
0.635313877808


#### Same results!!!! No really kno why :)

Different order combinations in prediction will make difference? 