In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np

from keras.layers import Dense, Flatten, Input
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras import Sequential
import keras
from sklearn.preprocessing import OneHotEncoder

from keras.layers import Dropout
import h5py  # compress and save features

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Loading cleaned data

In [2]:
# Load the cleaned training split from CSV and preview its first rows.
train = pd.read_csv('salary_train_fixed.csv')
train.head()

Unnamed: 0,Category,ContractTime,ContractType,FullDescription,Id,SalaryNormalized,SalaryRaw,company,location_1,location_2,location_3,location_4,location_5,location_6,sourcename,title
0,8,1,0,Engineering Systems Analyst Dorking Surrey Sal...,12612628,25000,20000 - 30000/annum 20-30K,912,1,0,53,201,353,462,42,0
1,8,1,0,Stress Engineer Glasgow Salary **** to **** We...,12612830,30000,25000 - 35000/annum 25-35K,912,1,0,0,0,962,29,42,0
2,8,1,0,Mathematical Modeller / Simulation Analyst / O...,12612844,30000,20000 - 40000/annum 20-40K,912,1,0,0,0,1004,23,42,0
3,8,1,0,Engineering Systems Analyst / Mathematical Mod...,12613049,27500,25000 - 30000/annum 25K-30K negotiable,912,1,0,0,185,1066,11,42,0
4,8,1,0,"Pioneer, Miser Engineering Systems Analyst Do...",12613647,25000,20000 - 30000/annum 20-30K,912,1,0,0,185,1066,11,42,0


In [2]:
# Load the cleaned test split from CSV and preview its first rows.
test = pd.read_csv('salary_test_fixed.csv')
test.head()

Unnamed: 0,Category,ContractTime,ContractType,FullDescription,Id,SalaryNormalized,SalaryRaw,company,location_1,location_2,location_3,location_4,location_5,location_6,sourcename,title
0,26,1,0,The Company: Our client is a national training...,11888454,,,1256,1,3,63,135,1219,22,42,1
1,3,1,0,The Company: Founded in **** our client is a U...,11988350,,,1256,1,3,63,135,1219,22,42,495
2,8,1,0,Engineering Systems Analysts Surrey ****K Loca...,12612558,,,912,1,0,0,185,1066,11,42,0
3,8,1,0,CIS Systems Engineering Consultant Bristol So...,12613014,,,912,1,0,54,183,177,10,42,0
4,17,1,0,"CNC Miller / Programmer Fanac Fleet, Hampshire...",22454872,,,912,1,0,38,152,250,276,42,0


In [39]:
# Re-load of the test split (reads the same file as the earlier test-loading cell).
test = pd.read_csv('salary_test_fixed.csv')
test.head()

Unnamed: 0,Category,ContractTime,ContractType,FullDescription,Id,SalaryNormalized,SalaryRaw,company,location_1,location_2,location_3,location_4,location_5,location_6,sourcename,title
0,26,1,0,The Company: Our client is a national training...,11888454,,,1256,1,3,63,135,1219,22,42,1
1,3,1,0,The Company: Founded in **** our client is a U...,11988350,,,1256,1,3,63,135,1219,22,42,495
2,8,1,0,Engineering Systems Analysts Surrey ****K Loca...,12612558,,,912,1,0,0,185,1066,11,42,0
3,8,1,0,CIS Systems Engineering Consultant Bristol So...,12613014,,,912,1,0,54,183,177,10,42,0
4,17,1,0,"CNC Miller / Programmer Fanac Fleet, Hampshire...",22454872,,,912,1,0,38,152,250,276,42,0


**Full description to vector**

In [41]:
# Only the free-text description column is needed here, so parse just that
# column instead of materialising every column of the combined dataset.
full_des = pd.read_csv('./dataset/salay_first_dataset.csv',
                       usecols=['FullDescription'],
                       low_memory=False)['FullDescription']

In [42]:
# Integer-encode the documents with the hashing trick (each word is hashed to
# a bucket id below vocab_size).
# one_hot() hashes with Python's built-in hash(), which is randomised per
# interpreter session for strings, so the features written to disk later would
# differ from run to run. The 'md5' hash function is stable across runs.
vocab_size = 2000
encoded_docs = [
    keras.preprocessing.text.hashing_trick(d, vocab_size, hash_function='md5')
    for d in tqdm(full_des)
]

100%|████████████████████████████████████████████████████████████████████████| 367231/367231 [01:04<00:00, 5710.49it/s]


In [43]:
# Right-pad every encoded document to the length of the longest one.
maxlength = max(map(len, encoded_docs))
padded_docs = pad_sequences(encoded_docs, padding='post', maxlen=maxlength)
padded_docs.shape

(367231, 2081)

In [44]:
# Normalize: divide the padded bucket ids by vocab_size.
# NOTE(review): this cell rebinds padded_docs from itself, so running it a
# second time divides the values again.
padded_docs = padded_docs/vocab_size

In [45]:
# Persist the full (train + test) description matrix to HDF5.
with h5py.File('features_fulldes_all.h5', 'w') as h5_out:
    h5_out.create_dataset('fulldes', data=padded_docs)

In [46]:
# The combined description matrix has len(test) + len(train) rows
# (122463 + 244768 = 367231). The training rows are the block that follows the
# test rows, so they start at index len(test) and run to the end. The previous
# slice [122462:367230] was shifted by one: it included the last test row and
# dropped the last training row.
fullDes = padded_docs[len(test):]
# Guard against a silently misaligned feature matrix.
assert fullDes.shape[0] == len(train), (fullDes.shape[0], len(train))
print(fullDes.shape)

(244768, 2081)


In [47]:
# Drop the large text intermediates; only the training slice is needed from here on.
del (
    padded_docs,
    encoded_docs,
    full_des,
)

In [48]:
# Persist the training slice of the description matrix to HDF5.
with h5py.File('features_fulldes_train.h5', 'w') as h5_out:
    h5_out.create_dataset('fulldes_train', data=fullDes)

**Train - other features to one-hot vector**

In [3]:
# Pull the Category column out of the training frame.
Category = train.loc[:, 'Category']
Category.shape

(244768,)

In [4]:
# One-hot encode Category; the width is the number of distinct values in train.
# NOTE(review): this only works if the ids are contiguous 0..k-1 — confirm.
category_onehot = keras.utils.to_categorical(Category, num_classes=Category.nunique())
category_onehot.shape

(244768, 29)

In [5]:
# Pull the ContractTime column out of the training frame.
ContractTime = train.loc[:, 'ContractTime']
ContractTime.shape

(244768,)

In [6]:
# One-hot encode ContractTime; the width is the number of distinct values in train.
ContractTime_onehot = keras.utils.to_categorical(ContractTime, num_classes=ContractTime.nunique())
ContractTime_onehot.shape

(244768, 3)

In [7]:
# Pull the ContractType column out of the training frame.
ContractType = train.loc[:, 'ContractType']
ContractType.shape

(244768,)

In [8]:
# One-hot encode ContractType; the width is the number of distinct values in train.
ContractType_onehot = keras.utils.to_categorical(ContractType, num_classes=ContractType.nunique())
ContractType_onehot.shape

(244768, 3)

In [9]:
# Pull the location_1 column out of the training frame.
location_1 = train.loc[:, 'location_1']
location_1.shape

(244768,)

In [10]:
# One-hot encode location_1; the width is the number of distinct values in train.
location_1_onehot = keras.utils.to_categorical(location_1, num_classes=location_1.nunique())
location_1_onehot.shape

(244768, 2)

In [11]:
# Pull the location_2 column out of the training frame.
location_2 = train.loc[:, 'location_2']
location_2.shape

(244768,)

In [12]:
# One-hot encode location_2; the width is the number of distinct values in train.
location_2_onehot = keras.utils.to_categorical(location_2, num_classes=location_2.nunique())
location_2_onehot.shape

(244768, 12)

In [13]:
# Pull the location_3 column out of the training frame.
location_3 = train.loc[:, 'location_3']
location_3.shape

(244768,)

In [14]:
# One-hot encode location_3 with a fixed width of 74 columns.
# NOTE(review): 74 is presumably the id vocabulary size shared with the test split — confirm.
location_3_onehot = keras.utils.to_categorical(location_3, num_classes=74)
location_3_onehot.shape

(244768, 74)

In [15]:
# Pull the location_4 column out of the training frame.
location_4 = train.loc[:, 'location_4']
location_4.shape

(244768,)

In [16]:
# One-hot encode location_4 with a fixed width of 241 columns.
# NOTE(review): 241 is presumably the id vocabulary size shared with the test split — confirm.
location_4_onehot = keras.utils.to_categorical(location_4, num_classes=241)
location_4_onehot.shape

(244768, 241)

In [17]:
# Pull the location_5 column out of the training frame.
location_5 = train.loc[:, 'location_5']
location_5.shape

(244768,)

In [18]:
# One-hot encode location_5 with a fixed width of 1239 columns.
# NOTE(review): 1239 is presumably the id vocabulary size shared with the test split — confirm.
location_5_onehot = keras.utils.to_categorical(location_5, num_classes=1239)
location_5_onehot.shape

(244768, 1239)

In [19]:
# Pull the location_6 column out of the training frame.
location_6 = train.loc[:, 'location_6']
location_6.shape

(244768,)

In [20]:
# One-hot encode location_6; the width is the number of distinct values in train.
location_6_onehot = keras.utils.to_categorical(location_6, num_classes=location_6.nunique())
location_6_onehot.shape

(244768, 2088)

In [21]:
# Pull the company column out of the training frame.
company = train.loc[:, 'company']
company.shape

(244768,)

In [22]:
# One-hot encode company; the width is the number of distinct values in train.
company_onehot = keras.utils.to_categorical(company, num_classes=company.nunique())
company_onehot.shape

(244768, 3038)

In [23]:
# Pull the sourcename column out of the training frame.
sourcename = train.loc[:, 'sourcename']
sourcename.shape

(244768,)

In [24]:
# One-hot encode sourcename with a fixed width of 169 columns.
# NOTE(review): 169 is presumably the id vocabulary size shared with the test split — confirm.
sourcename_onehot = keras.utils.to_categorical(sourcename, num_classes=169)
sourcename_onehot.shape

(244768, 169)

In [25]:
# Pull the title column out of the training frame.
title = train.loc[:, 'title']
title.shape

(244768,)

In [26]:
# One-hot encode title with a fixed width of 2838 columns.
# NOTE(review): 2838 is presumably the id vocabulary size shared with the test split — confirm.
title_onehot = keras.utils.to_categorical(title, num_classes=2838)
title_onehot.shape

(244768, 2838)

# Combine features

In [33]:
# Salary data: take the SalaryNormalized column from the training frame.
salary = train['SalaryNormalized']
print(salary.shape)

(244768,)


In [None]:
# Variant WITH FullDescription: stack every one-hot block plus the description
# matrix side by side (all inputs are 2-D, so this joins along the column axis).
# The cell that follows rebuilds `features` without fullDes and overwrites this result.
features = np.hstack((
    title_onehot, fullDes,
    location_1_onehot, location_2_onehot, location_3_onehot,
    location_4_onehot, location_5_onehot, location_6_onehot,
    ContractType_onehot, ContractTime_onehot,
    company_onehot, category_onehot, sourcename_onehot,
))
features.shape

In [28]:
# Variant WITHOUT FullDescription: stack every one-hot block side by side
# (all inputs are 2-D, so this joins along the column axis).
features = np.hstack((
    title_onehot,
    location_1_onehot, location_2_onehot, location_3_onehot,
    location_4_onehot, location_5_onehot, location_6_onehot,
    ContractType_onehot, ContractTime_onehot,
    company_onehot, category_onehot, sourcename_onehot,
))
features.shape

(244768, 9736)

In [31]:
# Persist the combined training feature matrix to HDF5.
with h5py.File('features_train.h5', 'w') as h5_out:
    h5_out.create_dataset('features', data=features)

In [38]:
# Persist the salary column to HDF5.
with h5py.File('features_salary.h5', 'w') as h5_out:
    h5_out.create_dataset('salary', data=salary)

In [34]:
# Free memory: the raw training frame is no longer needed.
del train

In [27]:
# Free memory: drop the raw column Series now that their one-hot versions exist.
del (
    Category, ContractTime, ContractType,
    location_1, location_2, location_3,
    location_4, location_5, location_6,
    company, sourcename, title,
)

In [30]:
# Free memory: drop the individual one-hot blocks.
del (
    category_onehot, ContractTime_onehot, ContractType_onehot,
    location_1_onehot, location_2_onehot, location_3_onehot,
    location_4_onehot, location_5_onehot, location_6_onehot,
    company_onehot, sourcename_onehot, title_onehot,
)

**Test - other features to one-hot vector**

In [3]:
# Encode the test split with the SAME one-hot widths as the training split.
# Deriving the width from len(set(...)) of the test column would silently
# change the feature layout (or raise) whenever the test split happens to
# contain a different set of ids than train. The widths below are the ones the
# training cells produced: Category 29, ContractTime 3, ContractType 3.
Category = test['Category']

category_onehot = keras.utils.to_categorical(Category, num_classes=29)

ContractTime = test['ContractTime']

ContractTime_onehot = keras.utils.to_categorical(ContractTime, num_classes=3)

ContractType = test['ContractType']

ContractType_onehot = keras.utils.to_categorical(ContractType, num_classes=3)

In [7]:
# One-hot encode the six location levels of the test split.
# Every width is pinned to the width of the matching training block
# (2, 12, 74, 241, 1239, 2088) so train and test columns line up. location_1
# and location_2 previously took their width from the test data itself, which
# only matches train by coincidence.
location_1 = test['location_1']
location_1_onehot = keras.utils.to_categorical(location_1, num_classes=2)

location_2 = test['location_2']
location_2_onehot = keras.utils.to_categorical(location_2, num_classes=12)

location_3 = test['location_3']
location_3_onehot = keras.utils.to_categorical(location_3, num_classes=74)

location_4 = test['location_4']
location_4_onehot = keras.utils.to_categorical(location_4, num_classes=241)

location_5 = test['location_5']
location_5_onehot = keras.utils.to_categorical(location_5, num_classes=1239)

location_6 = test['location_6']
location_6_onehot = keras.utils.to_categorical(location_6, num_classes=2088)

In [17]:
# One-hot encode company, sourcename and title for the test split.
# company previously took its width from len(set(company)) of the test data;
# it is now pinned to 3038, the width of the training company block, so the
# feature layout cannot drift between the two splits.
company = test['company']
company_onehot = keras.utils.to_categorical(company, num_classes=3038)

sourcename = test['sourcename']
sourcename_onehot = keras.utils.to_categorical(sourcename, num_classes=169)

title = test['title']
title_onehot = keras.utils.to_categorical(title, num_classes=2838)

# Combine features

In [22]:
# Variant WITHOUT FullDescription: stack the test one-hot blocks side by side,
# in the same block order as the training feature matrix.
features = np.hstack((
    title_onehot,
    location_1_onehot, location_2_onehot, location_3_onehot,
    location_4_onehot, location_5_onehot, location_6_onehot,
    ContractType_onehot, ContractTime_onehot,
    company_onehot, category_onehot, sourcename_onehot,
))
features.shape

(122463, 9736)

In [23]:
# Persist the combined test feature matrix to HDF5.
with h5py.File('features_test.h5', 'w') as h5_out:
    h5_out.create_dataset('features_test', data=features)