### Skill to vector - 2. Data cleaning

In [1]:
## Import modules
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as plt
import progressbar as pp
import re
from tqdm import tqdm
import nltk
from nltk.corpus imeport stopwords
from nltk.tokenize import word_tokenize
import random
import pickle

In [2]:
## load data - 1 million in total
job_description_df = pd.read_csv("/mnt/c/Users/ruihao/Documents/Data/job_description_all_1m.csv")

In [4]:
## pre-analysis
## extract english job descriptions - 400k in total

# filter out short job description (<100 words)
job_description_en_df = pd.DataFrame(job_description_df[job_description_df.language_id==1]["job_description"].dropna())
job_description_en_df = job_description_en_df[job_description_en_df["job_description"].apply(lambda x: len(x.split())>100)]

In [20]:
## An example
n = random.randint(0, len(job_description_en_df))
example = job_description_en_df.iloc[n].values[0]
print(example)

![](https://l.gpcdn.pl/3/30/306/306063/_res/top.jpg)

.Net Developer (Full Stack)

Location: Gdańsk  

The challenges waiting for you:

  

For our international customer – specializing in producing computer hardware -
we are currently looking for an experienced .NET Developer.

The project concerns the development of web application for hardware usage
tracking. On each setup there is an application that collects and sends data
to serwers. In this web application users can display information that they
are interested in. Another part of project will be development of a system
responsible for tracking physical location of hardware



Responsibilities:

  * Developing of web based frontend solutions using JS, Angular
  * Developing with MSSQL
  * Implementing unit testing as natural part of the development process (xUnit)

Required skills:

  

  * At least 1-2 years of commercial experience
  * C# .NET 4.6+, MVC
  * JS, Angular
  * Bootstrap, MS SQL, Entity Framework
  * Experience with

In [19]:
## Pre-pocessing for the example

def paragraph_segment(text):
    text = re.sub(r"[\(\>]http.+?(\n.+?)*[\)\>]", " ", text, re.MULTILINE) ## remove url
    text = re.split("\n[\s]*\n", text) ## segment with double newlines
    text = [re.sub(r"[^\w\&]", " ", x) for x in text] ## remove special characters
    
    text = [re.sub(r"[\s]+", " ", x.strip()) for x in text] ## remove spaces
    text = list(filter(lambda x: len(x.split())>2, text)) ## remove short sentences
    return text

paragraph_segment(example)

['Send me Jobs like this',
 'Assist clients in their recruitment and hiring through Online job portal',
 'Provide Product & Recruitment Training to Clients',
 'Engage with key clients and building long term client relationships to generate repeat business and referrals',
 '_Salary _ INR 2 50 000 4 50 000 P A',
 '_Industry _ Internet Ecommerce',
 '_Functional Area _ Sales Retail Business Development',
 '_Role Category _ Channel Sales',
 '_Role _ Client Relationship Manager',
 '_Employment Type _ Permanent Job Full Time',
 'Client Relationship Manager client relationship management jobs Find all jobs matching Client Relationship Manager client servicing client servicing jobs Find all jobs matching client servicing client management client management jobs Find all jobs matching client management Client Relation client relation jobs Find all jobs matching Client Relation clients retention client retention jobs Find all jobs matching clients retention Recruitment recruitment jobs Find all j

In [13]:
clean_text = []
for text in pp.progressbar(job_description_en): 
    clean_text.append(clean_str(text))

100% (403472 of 403472) |################| Elapsed Time: 0:02:51 Time:  0:02:51


In [None]:
## save clean data
with open('clean_text_single_gram', 'wb') as ct:
    pickle.dump(clean_text, ct)

In [16]:
print(clean_text[-10:])

['Principle Objective To achieve and or exceed sales plan by creating and maintaining successful Tom Ford Beauty By Kilian and Editions de Parfums Frederic Malle Specialist teams. Create strategic partnerships with retail partners to grow the brands. Responsibilities include selection development and retention of Sales Education Coordinators and counter staff administration of point of sale initiatives and management of the account at a store level. Accountabilities 1 Lead and Execute Tom Ford Beauty By Kilian and Editions de Parfums Frederic Malle Initiatives Work with Brand Manager to plan and achieve retail sales goals by door. Communicate and cascade goals objectives and priorities to sales education coordinators counter business managers and specialists on a weekly basis. Create store- specific action plans to achieve counter goals and objectives. Collaborate with sales and education coordinators to ensure execution of those plans. Plan and execute new product launches and special

In [22]:
## train word2vec without specify skills
import gensim

tokenized_text = []
for text in pp.progressbar(clean_text):
    tokenized_text.append(gensim.utils.simple_preprocess(text))
    
print(tokenized_text[:10])

100% (170060 of 170060) |################| Elapsed Time: 0:02:16 Time:  0:02:16


[['wealth', 'management', 'credit', 'risk', 'analyst', 'wealth', 'management', 'wm', 'provides', 'advice', 'strategies', 'and', 'solutions', 'for', 'all', 'aspects', 'of', 'the', 'financial', 'asset', 'management', 'and', 'wealth', 'transfer', 'needs', 'of', 'high', 'net', 'worth', 'and', 'ultra', 'high', 'net', 'worth', 'clients', 'wm', 'is', 'dedicated', 'to', 'delivering', 'an', 'outstanding', 'client', 'experience', 'by', 'providing', 'unbiased', 'advice', 'and', 'individual', 'solutions', 'credit', 'risk', 'assesses', 'permissions', 'and', 'manages', 'credit', 'and', 'counterparty', 'risks', 'on', 'an', 'industry', 'client', 'geographic', 'and', 'transaction', 'basis', 'credit', 'risk', 'is', 'the', 'risk', 'of', 'loss', 'arising', 'from', 'the', 'default', 'of', 'client', 'or', 'counterparty', 'the', 'credit', 'risk', 'function', 'identifies', 'measures', 'limits', 'manages', 'and', 'monitors', 'credit', 'risk', 'across', 'our', 'businesses', 'credit', 'exposure', 'arises', 'thro

In [25]:
model = gensim.models.Word2Vec(tokenized_text, size=500, window=10, min_count=5)
model.save("word2vec.model")

In [29]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

model = gensim.models.Word2Vec.load("word2vec.model")
model.train(tokenized_text, total_examples=len(tokenized_text), epochs=10)

2018-09-12 11:52:50,569 : INFO : loading Word2Vec object from word2vec.model
2018-09-12 11:52:50,945 : INFO : loading wv recursively from word2vec.model.wv.* with mmap=None
2018-09-12 11:52:50,947 : INFO : loading vectors from word2vec.model.wv.vectors.npy with mmap=None
2018-09-12 11:52:51,243 : INFO : setting ignored attribute vectors_norm to None
2018-09-12 11:52:51,305 : INFO : loading vocabulary recursively from word2vec.model.vocabulary.* with mmap=None
2018-09-12 11:52:51,307 : INFO : loading trainables recursively from word2vec.model.trainables.* with mmap=None
2018-09-12 11:52:51,310 : INFO : loading syn1neg from word2vec.model.trainables.syn1neg.npy with mmap=None
2018-09-12 11:52:51,737 : INFO : setting ignored attribute cum_table to None
2018-09-12 11:52:51,742 : INFO : loaded word2vec.model
2018-09-12 11:52:51,873 : INFO : training model with 3 workers on 60417 vocabulary and 500 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2018-09-12 11:52:52,924 : INFO : E

2018-09-12 11:53:57,632 : INFO : EPOCH 1 - PROGRESS: at 56.35% examples, 542140 words/s, in_qsize 6, out_qsize 0
2018-09-12 11:53:58,657 : INFO : EPOCH 1 - PROGRESS: at 57.10% examples, 540841 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:53:59,686 : INFO : EPOCH 1 - PROGRESS: at 57.88% examples, 539708 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:54:00,710 : INFO : EPOCH 1 - PROGRESS: at 58.50% examples, 538529 words/s, in_qsize 4, out_qsize 1
2018-09-12 11:54:01,722 : INFO : EPOCH 1 - PROGRESS: at 59.07% examples, 537514 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:54:02,725 : INFO : EPOCH 1 - PROGRESS: at 59.73% examples, 536594 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:54:03,736 : INFO : EPOCH 1 - PROGRESS: at 60.46% examples, 536183 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:54:04,753 : INFO : EPOCH 1 - PROGRESS: at 61.22% examples, 536277 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:54:05,768 : INFO : EPOCH 1 - PROGRESS: at 61.89% examples, 535909 words/s, in_qsiz

2018-09-12 11:55:08,307 : INFO : EPOCH 2 - PROGRESS: at 8.52% examples, 511783 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:55:09,318 : INFO : EPOCH 2 - PROGRESS: at 9.45% examples, 512673 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:55:10,321 : INFO : EPOCH 2 - PROGRESS: at 10.28% examples, 511842 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:55:11,331 : INFO : EPOCH 2 - PROGRESS: at 11.13% examples, 510745 words/s, in_qsize 6, out_qsize 0
2018-09-12 11:55:12,358 : INFO : EPOCH 2 - PROGRESS: at 12.01% examples, 506572 words/s, in_qsize 4, out_qsize 1
2018-09-12 11:55:13,374 : INFO : EPOCH 2 - PROGRESS: at 12.75% examples, 503319 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:55:14,386 : INFO : EPOCH 2 - PROGRESS: at 13.58% examples, 504440 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:55:15,395 : INFO : EPOCH 2 - PROGRESS: at 14.48% examples, 506774 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:55:16,397 : INFO : EPOCH 2 - PROGRESS: at 15.27% examples, 502968 words/s, in_qsize 

2018-09-12 11:56:22,099 : INFO : EPOCH 2 - PROGRESS: at 65.48% examples, 499878 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:56:23,112 : INFO : EPOCH 2 - PROGRESS: at 66.30% examples, 499582 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:56:24,114 : INFO : EPOCH 2 - PROGRESS: at 67.07% examples, 499159 words/s, in_qsize 6, out_qsize 0
2018-09-12 11:56:25,125 : INFO : EPOCH 2 - PROGRESS: at 67.91% examples, 498878 words/s, in_qsize 6, out_qsize 0
2018-09-12 11:56:26,130 : INFO : EPOCH 2 - PROGRESS: at 68.53% examples, 497572 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:56:27,148 : INFO : EPOCH 2 - PROGRESS: at 69.25% examples, 496842 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:56:28,174 : INFO : EPOCH 2 - PROGRESS: at 69.96% examples, 496226 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:56:29,178 : INFO : EPOCH 2 - PROGRESS: at 70.61% examples, 495626 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:56:30,182 : INFO : EPOCH 2 - PROGRESS: at 71.22% examples, 495136 words/s, in_qsiz

2018-09-12 11:57:32,141 : INFO : EPOCH 3 - PROGRESS: at 21.37% examples, 521461 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:57:33,144 : INFO : EPOCH 3 - PROGRESS: at 22.32% examples, 522410 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:57:34,164 : INFO : EPOCH 3 - PROGRESS: at 23.29% examples, 523775 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:57:35,174 : INFO : EPOCH 3 - PROGRESS: at 24.12% examples, 523277 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:57:36,199 : INFO : EPOCH 3 - PROGRESS: at 24.96% examples, 521253 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:57:37,207 : INFO : EPOCH 3 - PROGRESS: at 25.87% examples, 520976 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:57:38,219 : INFO : EPOCH 3 - PROGRESS: at 26.64% examples, 520238 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:57:39,221 : INFO : EPOCH 3 - PROGRESS: at 27.51% examples, 520384 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:57:40,237 : INFO : EPOCH 3 - PROGRESS: at 28.41% examples, 521660 words/s, in_qsiz

2018-09-12 11:58:45,993 : INFO : EPOCH 3 - PROGRESS: at 79.96% examples, 517967 words/s, in_qsize 5, out_qsize 1
2018-09-12 11:58:46,994 : INFO : EPOCH 3 - PROGRESS: at 80.80% examples, 518545 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:58:47,999 : INFO : EPOCH 3 - PROGRESS: at 81.62% examples, 519178 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:58:49,002 : INFO : EPOCH 3 - PROGRESS: at 82.38% examples, 519633 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:58:50,012 : INFO : EPOCH 3 - PROGRESS: at 83.16% examples, 519831 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:58:51,045 : INFO : EPOCH 3 - PROGRESS: at 83.99% examples, 519996 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:58:52,045 : INFO : EPOCH 3 - PROGRESS: at 84.82% examples, 520380 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:58:53,058 : INFO : EPOCH 3 - PROGRESS: at 85.78% examples, 520637 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:58:54,059 : INFO : EPOCH 3 - PROGRESS: at 86.72% examples, 520822 words/s, in_qsiz

2018-09-12 11:59:56,603 : INFO : EPOCH 4 - PROGRESS: at 36.96% examples, 517285 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:59:57,614 : INFO : EPOCH 4 - PROGRESS: at 37.77% examples, 517646 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:59:58,619 : INFO : EPOCH 4 - PROGRESS: at 38.61% examples, 518178 words/s, in_qsize 5, out_qsize 0
2018-09-12 11:59:59,648 : INFO : EPOCH 4 - PROGRESS: at 39.51% examples, 519544 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:00:00,667 : INFO : EPOCH 4 - PROGRESS: at 40.28% examples, 519356 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:00:01,681 : INFO : EPOCH 4 - PROGRESS: at 41.18% examples, 520129 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:00:02,685 : INFO : EPOCH 4 - PROGRESS: at 42.05% examples, 520259 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:00:03,710 : INFO : EPOCH 4 - PROGRESS: at 42.99% examples, 520894 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:00:04,720 : INFO : EPOCH 4 - PROGRESS: at 43.91% examples, 521508 words/s, in_qsiz

2018-09-12 12:01:10,574 : INFO : EPOCH 4 - PROGRESS: at 91.25% examples, 490471 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:01:11,596 : INFO : EPOCH 4 - PROGRESS: at 91.65% examples, 488687 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:01:12,604 : INFO : EPOCH 4 - PROGRESS: at 92.23% examples, 487666 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:01:13,618 : INFO : EPOCH 4 - PROGRESS: at 92.78% examples, 486434 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:01:14,638 : INFO : EPOCH 4 - PROGRESS: at 93.39% examples, 485827 words/s, in_qsize 6, out_qsize 0
2018-09-12 12:01:15,641 : INFO : EPOCH 4 - PROGRESS: at 94.03% examples, 485252 words/s, in_qsize 4, out_qsize 0
2018-09-12 12:01:16,645 : INFO : EPOCH 4 - PROGRESS: at 94.68% examples, 484312 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:01:17,666 : INFO : EPOCH 4 - PROGRESS: at 95.28% examples, 483380 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:01:18,696 : INFO : EPOCH 4 - PROGRESS: at 96.02% examples, 483132 words/s, in_qsiz

2018-09-12 12:02:20,461 : INFO : EPOCH 5 - PROGRESS: at 44.76% examples, 512374 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:02:21,502 : INFO : EPOCH 5 - PROGRESS: at 45.54% examples, 511236 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:02:22,517 : INFO : EPOCH 5 - PROGRESS: at 46.37% examples, 510605 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:02:23,544 : INFO : EPOCH 5 - PROGRESS: at 47.11% examples, 509964 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:02:24,549 : INFO : EPOCH 5 - PROGRESS: at 47.82% examples, 509194 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:02:25,563 : INFO : EPOCH 5 - PROGRESS: at 48.60% examples, 509084 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:02:26,582 : INFO : EPOCH 5 - PROGRESS: at 49.41% examples, 508091 words/s, in_qsize 6, out_qsize 1
2018-09-12 12:02:27,596 : INFO : EPOCH 5 - PROGRESS: at 50.12% examples, 506750 words/s, in_qsize 6, out_qsize 0
2018-09-12 12:02:28,633 : INFO : EPOCH 5 - PROGRESS: at 50.60% examples, 503255 words/s, in_qsiz

2018-09-12 12:03:34,610 : INFO : EPOCH 5 - PROGRESS: at 96.11% examples, 475702 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:03:35,633 : INFO : EPOCH 5 - PROGRESS: at 96.81% examples, 475508 words/s, in_qsize 3, out_qsize 1
2018-09-12 12:03:36,659 : INFO : EPOCH 5 - PROGRESS: at 97.30% examples, 474820 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:03:37,663 : INFO : EPOCH 5 - PROGRESS: at 97.95% examples, 474514 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:03:38,680 : INFO : EPOCH 5 - PROGRESS: at 98.76% examples, 474619 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:03:39,686 : INFO : EPOCH 5 - PROGRESS: at 99.43% examples, 474215 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:03:40,581 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-12 12:03:40,585 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-12 12:03:40,601 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-09-12 12:03:40,604 : INFO : EPOCH - 5 : trai

2018-09-12 12:04:45,414 : INFO : EPOCH 6 - PROGRESS: at 49.99% examples, 489212 words/s, in_qsize 6, out_qsize 0
2018-09-12 12:04:46,436 : INFO : EPOCH 6 - PROGRESS: at 50.69% examples, 487919 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:04:47,445 : INFO : EPOCH 6 - PROGRESS: at 51.56% examples, 487987 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:04:48,451 : INFO : EPOCH 6 - PROGRESS: at 52.42% examples, 488184 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:04:49,461 : INFO : EPOCH 6 - PROGRESS: at 53.13% examples, 488436 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:04:50,462 : INFO : EPOCH 6 - PROGRESS: at 54.02% examples, 489081 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:04:51,469 : INFO : EPOCH 6 - PROGRESS: at 54.93% examples, 489602 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:04:52,483 : INFO : EPOCH 6 - PROGRESS: at 55.70% examples, 489967 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:04:53,488 : INFO : EPOCH 6 - PROGRESS: at 56.46% examples, 489892 words/s, in_qsiz

2018-09-12 12:05:55,968 : INFO : EPOCH 7 - PROGRESS: at 5.48% examples, 553333 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:05:56,989 : INFO : EPOCH 7 - PROGRESS: at 6.45% examples, 553005 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:05:57,998 : INFO : EPOCH 7 - PROGRESS: at 7.32% examples, 551686 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:05:59,010 : INFO : EPOCH 7 - PROGRESS: at 8.21% examples, 551434 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:06:00,058 : INFO : EPOCH 7 - PROGRESS: at 9.15% examples, 547876 words/s, in_qsize 6, out_qsize 0
2018-09-12 12:06:01,068 : INFO : EPOCH 7 - PROGRESS: at 10.04% examples, 544226 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:06:02,072 : INFO : EPOCH 7 - PROGRESS: at 10.80% examples, 537405 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:06:03,074 : INFO : EPOCH 7 - PROGRESS: at 11.69% examples, 534724 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:06:04,084 : INFO : EPOCH 7 - PROGRESS: at 12.51% examples, 529488 words/s, in_qsize 5, 

2018-09-12 12:07:09,983 : INFO : EPOCH 7 - PROGRESS: at 61.98% examples, 495338 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:07:10,984 : INFO : EPOCH 7 - PROGRESS: at 62.61% examples, 495036 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:07:11,997 : INFO : EPOCH 7 - PROGRESS: at 63.46% examples, 495507 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:07:13,021 : INFO : EPOCH 7 - PROGRESS: at 64.38% examples, 495967 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:07:14,025 : INFO : EPOCH 7 - PROGRESS: at 65.18% examples, 496321 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:07:15,032 : INFO : EPOCH 7 - PROGRESS: at 66.01% examples, 496661 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:07:16,053 : INFO : EPOCH 7 - PROGRESS: at 66.79% examples, 496380 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:07:17,069 : INFO : EPOCH 7 - PROGRESS: at 67.75% examples, 496734 words/s, in_qsize 6, out_qsize 1
2018-09-12 12:07:18,096 : INFO : EPOCH 7 - PROGRESS: at 68.57% examples, 496696 words/s, in_qsiz

2018-09-12 12:08:21,067 : INFO : EPOCH 8 - PROGRESS: at 16.09% examples, 474746 words/s, in_qsize 6, out_qsize 0
2018-09-12 12:08:22,097 : INFO : EPOCH 8 - PROGRESS: at 16.59% examples, 467710 words/s, in_qsize 4, out_qsize 1
2018-09-12 12:08:23,105 : INFO : EPOCH 8 - PROGRESS: at 17.04% examples, 461422 words/s, in_qsize 6, out_qsize 0
2018-09-12 12:08:24,121 : INFO : EPOCH 8 - PROGRESS: at 17.51% examples, 456267 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:08:25,124 : INFO : EPOCH 8 - PROGRESS: at 18.20% examples, 457499 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:08:26,125 : INFO : EPOCH 8 - PROGRESS: at 18.94% examples, 459008 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:08:27,129 : INFO : EPOCH 8 - PROGRESS: at 19.67% examples, 459710 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:08:28,157 : INFO : EPOCH 8 - PROGRESS: at 20.32% examples, 459200 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:08:29,168 : INFO : EPOCH 8 - PROGRESS: at 21.15% examples, 460031 words/s, in_qsiz

2018-09-12 12:09:34,793 : INFO : EPOCH 8 - PROGRESS: at 69.05% examples, 468751 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:09:35,851 : INFO : EPOCH 8 - PROGRESS: at 69.73% examples, 468096 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:09:36,857 : INFO : EPOCH 8 - PROGRESS: at 70.32% examples, 466990 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:09:37,894 : INFO : EPOCH 8 - PROGRESS: at 70.90% examples, 466774 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:09:38,906 : INFO : EPOCH 8 - PROGRESS: at 71.61% examples, 466896 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:09:39,918 : INFO : EPOCH 8 - PROGRESS: at 72.32% examples, 466543 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:09:40,939 : INFO : EPOCH 8 - PROGRESS: at 73.02% examples, 466499 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:09:41,971 : INFO : EPOCH 8 - PROGRESS: at 73.74% examples, 466021 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:09:42,994 : INFO : EPOCH 8 - PROGRESS: at 74.30% examples, 464780 words/s, in_qsiz

2018-09-12 12:10:45,318 : INFO : EPOCH 9 - PROGRESS: at 20.57% examples, 503626 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:10:46,331 : INFO : EPOCH 9 - PROGRESS: at 21.44% examples, 503721 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:10:47,332 : INFO : EPOCH 9 - PROGRESS: at 22.36% examples, 504711 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:10:48,336 : INFO : EPOCH 9 - PROGRESS: at 23.23% examples, 504972 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:10:49,336 : INFO : EPOCH 9 - PROGRESS: at 24.05% examples, 505291 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:10:50,368 : INFO : EPOCH 9 - PROGRESS: at 24.99% examples, 505421 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:10:51,373 : INFO : EPOCH 9 - PROGRESS: at 25.87% examples, 505461 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:10:52,381 : INFO : EPOCH 9 - PROGRESS: at 26.64% examples, 505045 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:10:53,410 : INFO : EPOCH 9 - PROGRESS: at 27.48% examples, 504282 words/s, in_qsiz

2018-09-12 12:11:59,238 : INFO : EPOCH 9 - PROGRESS: at 78.19% examples, 505000 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:12:00,242 : INFO : EPOCH 9 - PROGRESS: at 78.97% examples, 505304 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:12:01,249 : INFO : EPOCH 9 - PROGRESS: at 79.67% examples, 505468 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:12:02,257 : INFO : EPOCH 9 - PROGRESS: at 80.43% examples, 505866 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:12:03,271 : INFO : EPOCH 9 - PROGRESS: at 81.18% examples, 506321 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:12:04,280 : INFO : EPOCH 9 - PROGRESS: at 81.94% examples, 506657 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:12:05,280 : INFO : EPOCH 9 - PROGRESS: at 82.60% examples, 506020 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:12:06,286 : INFO : EPOCH 9 - PROGRESS: at 83.25% examples, 505498 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:12:07,290 : INFO : EPOCH 9 - PROGRESS: at 84.05% examples, 505709 words/s, in_qsiz

2018-09-12 12:13:09,759 : INFO : EPOCH 10 - PROGRESS: at 33.32% examples, 507577 words/s, in_qsize 6, out_qsize 0
2018-09-12 12:13:10,764 : INFO : EPOCH 10 - PROGRESS: at 34.02% examples, 507753 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:13:11,775 : INFO : EPOCH 10 - PROGRESS: at 34.77% examples, 507425 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:13:12,796 : INFO : EPOCH 10 - PROGRESS: at 35.59% examples, 507985 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:13:13,799 : INFO : EPOCH 10 - PROGRESS: at 36.37% examples, 508311 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:13:14,815 : INFO : EPOCH 10 - PROGRESS: at 37.14% examples, 507589 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:13:15,841 : INFO : EPOCH 10 - PROGRESS: at 37.92% examples, 507480 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:13:16,854 : INFO : EPOCH 10 - PROGRESS: at 38.73% examples, 507970 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:13:17,858 : INFO : EPOCH 10 - PROGRESS: at 39.54% examples, 508476 words/s

2018-09-12 12:14:23,014 : INFO : EPOCH 10 - PROGRESS: at 81.41% examples, 456509 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:14:24,030 : INFO : EPOCH 10 - PROGRESS: at 81.79% examples, 454951 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:14:25,048 : INFO : EPOCH 10 - PROGRESS: at 82.40% examples, 454672 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:14:26,057 : INFO : EPOCH 10 - PROGRESS: at 83.02% examples, 454439 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:14:27,062 : INFO : EPOCH 10 - PROGRESS: at 83.65% examples, 454261 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:14:28,082 : INFO : EPOCH 10 - PROGRESS: at 84.22% examples, 453607 words/s, in_qsize 5, out_qsize 0
2018-09-12 12:14:29,087 : INFO : EPOCH 10 - PROGRESS: at 84.92% examples, 453668 words/s, in_qsize 6, out_qsize 0
2018-09-12 12:14:30,101 : INFO : EPOCH 10 - PROGRESS: at 85.62% examples, 453193 words/s, in_qsize 6, out_qsize 0
2018-09-12 12:14:31,117 : INFO : EPOCH 10 - PROGRESS: at 86.41% examples, 453112 words/s

(643229407, 779594330)

In [97]:
test_list = ["python", "java", "engineer", "excel", "windows", "supply", "finance", "law", "volkswagen", "microsoft", "ms", "id"]

for word in test_list:
    print(word, ":\n", model.wv.most_similar(word, topn=10))

python :
 [('java', 0.5775786638259888), ('programming', 0.5024352073669434), ('scripting', 0.49300575256347656), ('scala', 0.48204708099365234), ('sql', 0.45646244287490845), ('vba', 0.45345473289489746), ('matlab', 0.4523012042045593), ('linux', 0.44909530878067017), ('erlang', 0.4474000930786133), ('programing', 0.44161373376846313)]
java :
 [('python', 0.5775786638259888), ('scala', 0.492953360080719), ('sql', 0.48523861169815063), ('fspring', 0.4649026095867157), ('web', 0.45784538984298706), ('multithreading', 0.45341354608535767), ('ror', 0.45115312933921814), ('dojo', 0.450187623500824), ('angular', 0.44846874475479126), ('nodejs', 0.4470083713531494)]
engineer :
 [('developer', 0.6307722330093384), ('technician', 0.48629868030548096), ('architect', 0.48340022563934326), ('engineering', 0.47516053915023804), ('analyst', 0.46210309863090515), ('specialist', 0.444033682346344), ('engineers', 0.38512903451919556), ('administrator', 0.38083726167678833), ('manager', 0.3735758066177

In [85]:
print(len(model.wv.vocab))

60417


In [92]:
## get skills with single word

skill_names = pd.read_csv("../SkillMatch/skill_name.csv").name
skill_names = skill_names.unique()

# remove the contents inside () and spaces
keys = []
for key in skill_names:
    key = re.sub(r'\([^)]*\)', '', key)  ## remove the contents inside ()
    key = re.sub(r'\(.*?', '', key)
    key = re.sub(r'\s+$', '', key)
    keys.append(key)

p = re.compile(r'[\s]+')
p.search(keys[0])
single_gram = [key.lower() for key in keys if not p.search(key)]

#print(single_gram)

for n, key in enumerate(single_gram):
    if key in model.wv.vocab:
        print(n, ".", key, ":\n", model.wv.most_similar(key, topn=10))

0 . aeronautics :
 [('compenstaion', 0.3155566453933716), ('oopd', 0.3140553832054138), ('aerospace', 0.29555875062942505), ('falsifying', 0.2882545292377472), ('prefd', 0.28357526659965515), ('wstern', 0.2721080780029297), ('electronical', 0.26889747381210327), ('walkerhamill', 0.2602077126502991), ('proffered', 0.25234779715538025), ('exprience', 0.2514776885509491)]
1 . aircraft :
 [('airframe', 0.44175463914871216), ('flight', 0.3870462477207184), ('overhaul', 0.3856665790081024), ('ships', 0.36923855543136597), ('satellites', 0.3688507378101349), ('airworthiness', 0.36622118949890137), ('uav', 0.36233291029930115), ('compressors', 0.3573465943336487), ('refrigeration', 0.3547694683074951), ('belts', 0.3534882068634033)]
2 . aviation :
 [('lccs', 0.3723999559879303), ('airline', 0.36982518434524536), ('aerospace', 0.35981428623199463), ('marine', 0.3429199457168579), ('ports', 0.3315504491329193), ('aeronautical', 0.3294937312602997), ('aircraft', 0.32152748107910156), ('airports',

57 . honda :
 [('volvo', 0.5496788024902344), ('hyundai', 0.521776556968689), ('vw', 0.47450122237205505), ('bugatti', 0.4211973249912262), ('nissan', 0.41895216703414917), ('jcb', 0.4126511216163635), ('suzuki', 0.40954768657684326), ('maruti', 0.4025025963783264), ('volkswagen', 0.39537572860717773), ('audi', 0.39409542083740234)]
58 . hyundai :
 [('kia', 0.6074517965316772), ('honda', 0.521776556968689), ('showroom', 0.386985719203949), ('volvo', 0.3764187693595886), ('bolands', 0.3735511004924774), ('nissan', 0.35592007637023926), ('vw', 0.3541685938835144), ('glovis', 0.35096919536590576), ('maruti', 0.34807318449020386), ('leadeing', 0.3460739254951477)]
59 . man :
 [('torrent', 0.3536461889743805), ('ahl', 0.3263595998287201), ('sqids', 0.3189420700073242), ('yog', 0.31096428632736206), ('qna', 0.2988274097442627), ('pme', 0.2973634600639343), ('consultingkrew', 0.29704415798187256), ('businesswoman', 0.2964341640472412), ('isle', 0.2949351668357849), ('deprived', 0.276814162731

101 . railways :
 [('switchgears', 0.5768593549728394), ('cement', 0.4459576904773712), ('durables', 0.4212280511856079), ('ites', 0.42110657691955566), ('biotech', 0.4201053977012634), ('hospitals', 0.4147491753101349), ('foods', 0.4028860330581665), ('quarrying', 0.3970138430595398), ('anciliary', 0.3953944146633148), ('garments', 0.3941216468811035)]
102 . service :
 [('satisfaction', 0.4305467903614044), ('services', 0.4258422553539276), ('care', 0.3496975898742676), ('swiftnet', 0.3107963502407074), ('mont', 0.3052257299423218), ('kiara', 0.29868245124816895), ('centricity', 0.2967149019241333), ('consignments', 0.28937816619873047), ('statusing', 0.2847689390182495), ('zgvlcgfrlmpoys', 0.2833629846572876)]
103 . technology :
 [('technologies', 0.4285985231399536), ('infrastructure', 0.3454377055168152), ('cloud', 0.34115153551101685), ('security', 0.34068578481674194), ('enterprise', 0.33151015639305115), ('deltron', 0.3116339445114136), ('systems', 0.3076784312725067), ('digital

156 . clarification :
 [('clarify', 0.2841337025165558), ('clarifications', 0.27905696630477905), ('definition', 0.25283747911453247), ('missing', 0.2491457611322403), ('inform', 0.24457678198814392), ('explanation', 0.23412269353866577), ('review', 0.22891424596309662), ('interpretation', 0.22629690170288086), ('doubt', 0.22561822831630707), ('questions', 0.2241244912147522)]
158 . innovative :
 [('breakthrough', 0.379488468170166), ('innovation', 0.378127783536911), ('creative', 0.3764621615409851), ('our', 0.32733577489852905), ('innovate', 0.30476561188697815), ('unconventional', 0.2956576347351074), ('sustainable', 0.2950995862483978), ('softinent', 0.2923673987388611), ('differentiated', 0.2892257273197174), ('innovations', 0.28916430473327637)]
159 . interpretation :
 [('interpreting', 0.3421461582183838), ('interpret', 0.32221361994743347), ('interpretations', 0.32029542326927185), ('analysis', 0.3025391697883606), ('evaluation', 0.27655354142189026), ('inaccurate', 0.264940559

191 . supervising :
 [('directing', 0.4231695532798767), ('managing', 0.3987138271331787), ('supervises', 0.3973972797393799), ('overseeing', 0.3968190550804138), ('assigning', 0.38381972908973694), ('supervise', 0.37448564171791077), ('coordinating', 0.35340166091918945), ('assisting', 0.3218441903591156), ('instructing', 0.31831756234169006), ('delegating', 0.31199800968170166)]
192 . objective :
 [('goal', 0.4558486342430115), ('purpose', 0.3599151372909546), ('aim', 0.3445310592651367), ('mission', 0.3367312550544739), ('objectives', 0.2915349304676056), ('intention', 0.24958649277687073), ('职位描述', 0.2433479130268097), ('rbpm', 0.21925634145736694), ('intent', 0.2160957157611847), ('remetrica', 0.21517795324325562)]
193 . patience :
 [('empathy', 0.5565260052680969), ('courtesy', 0.392360657453537), ('honesty', 0.36831966042518616), ('tact', 0.3651352524757385), ('humility', 0.3614833354949951), ('professionalism', 0.35592561960220337), ('perseverance', 0.35430729389190674), ('sinc

233 . belgium :
 [('walloon', 0.3044824004173279), ('zaventem', 0.30384770035743713), ('leuven', 0.3023214340209961), ('flemish', 0.2895016074180603), ('luxembourg', 0.2887362241744995), ('jette', 0.28432515263557434), ('ghent', 0.26944148540496826), ('tournai', 0.268685519695282), ('brussels', 0.25918227434158325), ('courtrai', 0.2581943869590759)]
234 . canada :
 [('canadian', 0.4007599651813507), ('vancouver', 0.3814491033554077), ('fisglobal', 0.3326129913330078), ('lq', 0.3245648741722107), ('brazil', 0.320095032453537), ('europe', 0.3183041214942932), ('mexico', 0.31592732667922974), ('montreal', 0.29786425828933716), ('costa', 0.2968124747276306), ('rica', 0.2966163158416748)]
235 . chile :
 [('argentina', 0.4298120141029358), ('globală', 0.4094686806201935), ('brazil', 0.34445133805274963), ('inclusiv', 0.34145262837409973), ('verde', 0.33942386507987976), ('magazinele', 0.33317747712135315), ('amma', 0.3301525115966797), ('ctac', 0.32753750681877136), ('philippines', 0.3246001

268 . teamlead :
 [('项目管理', 0.4431145191192627), ('programador', 0.3622097373008728), ('金属方向', 0.3604380488395691), ('冲压', 0.360312283039093), ('压铸', 0.3564024269580841), ('proiectant', 0.3522951006889343), ('comprador', 0.3474913537502289), ('fipasi', 0.3356752395629883), ('生产主管', 0.3207842707633972), ('engnieer', 0.3031995892524719)]
269 . ipc :
 [('modbus', 0.400274395942688), ('cucm', 0.3940383791923523), ('socket', 0.39277184009552), ('cgms', 0.38683757185935974), ('wlc', 0.3637240529060364), ('infoblox', 0.361044704914093), ('sfi', 0.35859620571136475), ('mimo', 0.35665661096572876), ('lin', 0.35520070791244507), ('uart', 0.33534252643585205)]
276 . hart :
 [('fluke', 0.3448755145072937), ('iizbaa', 0.3387955129146576), ('jsdssb', 0.3264709711074829), ('mariano', 0.3248397707939148), ('aiuvaylpqy', 0.3188033401966095), ('needham', 0.2970353662967682), ('tvnkti', 0.2965925633907318), ('mrrrmhu', 0.29473209381103516), ('worcester', 0.28782975673675537), ('zuckerman', 0.284722268581

341 . arm :
 [('powerpc', 0.3008699417114258), ('générale', 0.29997390508651733), ('société', 0.2881050109863281), ('tricore', 0.27386799454689026), ('socit', 0.2683832347393036), ('cortex', 0.26505786180496216), ('fpgas', 0.2626102864742279), ('boc', 0.2618059515953064), ('fosun', 0.257352352142334), ('longest', 0.25376254320144653)]
342 . freescale :
 [('lauterbach', 0.5514489412307739), ('nxp', 0.4990062117576599), ('rtos', 0.42972517013549805), ('renesas', 0.4131213426589966), ('debugger', 0.41083085536956787), ('fpgas', 0.40495121479034424), ('tricore', 0.3888254761695862), ('vxworks', 0.38824164867401123), ('usb', 0.3877927362918854), ('ti', 0.382171094417572)]
343 . microprocessor :
 [('freescale', 0.33220091462135315), ('cvu', 0.3250911235809326), ('soc', 0.3156239092350006), ('gpu', 0.30888140201568604), ('pcie', 0.280026376247406), ('autosar', 0.2653912305831909), ('xilinx', 0.2605140209197998), ('renesas', 0.254666268825531), ('jtag', 0.2519160211086273), ('sy', 0.2492404580

444 . blackberry :
 [('windows', 0.38771671056747437), ('xenmobile', 0.37450870871543884), ('wlans', 0.3684231638908386), ('sendmail', 0.3658212423324585), ('iphone', 0.3650723099708557), ('desktop', 0.36424094438552856), ('ipad', 0.35570257902145386), ('wdg', 0.3372267186641693), ('directory', 0.3352982997894287), ('macintosh', 0.3350314497947693)]
445 . iphone :
 [('ipad', 0.7113472819328308), ('android', 0.42388617992401123), ('ios', 0.4235610365867615), ('tablet', 0.37772154808044434), ('apple', 0.3755829334259033), ('jni', 0.370078980922699), ('blackberry', 0.3650723099708557), ('ipod', 0.36429810523986816), ('kony', 0.3413775861263275), ('xctest', 0.3317158818244934)]
446 . modem :
 [('router', 0.441471666097641), ('crcm', 0.3437823951244354), ('licensures', 0.3085628151893616), ('ptt', 0.2915927767753601), ('isaca', 0.28977257013320923), ('dsss', 0.2877383232116699), ('cams', 0.2833961844444275), ('ofdm', 0.2784578502178192), ('ccsp', 0.27190080285072327), ('comptia', 0.26659649

505 . metallurgy :
 [('mineral', 0.4906475245952606), ('chemical', 0.4654209017753601), ('beneficiation', 0.46293386816978455), ('mech', 0.4568750858306885), ('ammonia', 0.453222393989563), ('solvents', 0.45281660556793213), ('coating', 0.451283723115921), ('insecticides', 0.44455471634864807), ('vapor', 0.4373369812965393), ('coke', 0.4351193308830261)]
507 . plastic :
 [('plastics', 0.367947518825531), ('molded', 0.3614462912082672), ('garments', 0.3592539429664612), ('coating', 0.3581622242927551), ('cipet', 0.35812604427337646), ('adhesives', 0.3567819595336914), ('fabrication', 0.3463901877403259), ('moulding', 0.34126442670822144), ('blow', 0.33649519085884094), ('machined', 0.33595165610313416)]
508 . polymer :
 [('thermoset', 0.47515010833740234), ('olefin', 0.4658256471157074), ('chemical', 0.43950656056404114), ('plastics', 0.4352007508277893), ('coating', 0.41999995708465576), ('metallurgy', 0.4102233648300171), ('coatings', 0.4014185667037964), ('additives', 0.4004075527191

559 . cartography :
 [('mammalian', 0.26797541975975037), ('infinitepassion', 0.26555734872817993), ('groupwide', 0.26425573229789734), ('afara', 0.26189056038856506), ('tpci', 0.2592228055000305), ('worning', 0.25619763135910034), ('intretine', 0.2528514862060547), ('uyvdvby', 0.2510038912296295), ('embed', 0.24939092993736267), ('necesar', 0.24274194240570068)]
560 . health :
 [('medical', 0.47126930952072144), ('child', 0.38773244619369507), ('healthcare', 0.3855276107788086), ('prohealth', 0.3723698556423187), ('retardation', 0.35960036516189575), ('cutomer', 0.3434799313545227), ('canine', 0.3305301070213318), ('colostomy', 0.32746046781539917), ('patient', 0.32674098014831543), ('psychiatry', 0.324521005153656)]
562 . outplacement :
 [('symetric', 0.30114609003067017), ('urbane', 0.2828368544578552), ('levon', 0.26921430230140686), ('nytec', 0.26409342885017395), ('lordi', 0.2600770592689514), ('risesmart', 0.256143182516098), ('vinclo', 0.25326770544052124), ('internets', 0.2462

614 . weka :
 [('scipy', 0.46978670358657837), ('scikit', 0.4569120705127716), ('mxnet', 0.44710439443588257), ('numpy', 0.41166549921035767), ('pandas', 0.4070706367492676), ('tensorflow', 0.38686007261276245), ('pytorch', 0.381624311208725), ('caffe', 0.37682604789733887), ('spss', 0.33750057220458984), ('mahout', 0.33478665351867676)]
615 . chatbot :
 [('bot', 0.3507775068283081), ('content', 0.25429609417915344), ('jinie', 0.25222453474998474), ('expands', 0.2421552538871765), ('alexa', 0.23456986248493195), ('sheorey', 0.22919589281082153), ('bots', 0.2231384515762329), ('catalog', 0.22098922729492188), ('moderator', 0.21900784969329834), ('idz', 0.21581605076789856)]
616 . fred :
 [('meyer', 0.48634397983551025), ('jewelers', 0.3992038369178772), ('littman', 0.3872586190700531), ('signet', 0.30299365520477295), ('davis', 0.28747230768203735), ('guan', 0.2752573788166046), ('stores', 0.2752012312412262), ('idaho', 0.27464500069618225), ('flsa', 0.2744661569595337), ('northwest', 0

673 . jive :
 [('hearsay', 0.5890982151031494), ('yammer', 0.38557329773902893), ('rackspace', 0.3631588816642761), ('sharepoint', 0.34591084718704224), ('vasa', 0.3456575870513916), ('eloqua', 0.325941801071167), ('symphony', 0.3190332055091858), ('joomla', 0.3135949373245239), ('nhaw', 0.30385416746139526), ('qualtrics', 0.3003343343734741)]
675 . yammer :
 [('jive', 0.38557329773902893), ('webex', 0.36629801988601685), ('podcasts', 0.33262890577316284), ('slack', 0.3266337215900421), ('uznemirujući', 0.3255472183227539), ('conferencing', 0.3192102909088135), ('suda', 0.31581515073776245), ('webinars', 0.3077336847782135), ('gifs', 0.3043217957019806), ('mediums', 0.28928709030151367)]
676 . middleware :
 [('tibco', 0.34844914078712463), ('websphere', 0.31347960233688354), ('jboss', 0.31253552436828613), ('oracle', 0.3054482340812683), ('jms', 0.3023917078971863), ('bamboo', 0.2999850809574127), ('iib', 0.29873454570770264), ('caching', 0.2978839874267578), ('esb', 0.2958469688892364

796 . splunk :
 [('appdynamics', 0.5075395107269287), ('qradar', 0.4537420868873596), ('grafana', 0.448777973651886), ('dynatrace', 0.4459906220436096), ('datadog', 0.43414318561553955), ('kibana', 0.4311009347438812), ('siem', 0.42352789640426636), ('nagios', 0.4143403172492981), ('cloudwatch', 0.41128700971603394), ('solarwinds', 0.4059225916862488)]
797 . citrix :
 [('xenapp', 0.5531130433082581), ('xendesktop', 0.48227667808532715), ('vmware', 0.4666396379470825), ('vcentre', 0.4448741674423218), ('netscaler', 0.4435131549835205), ('scom', 0.44249218702316284), ('sccm', 0.40696603059768677), ('xenserver', 0.4032374620437622), ('appsense', 0.3943217992782593), ('hypervisor', 0.3787650465965271)]
799 . vmware :
 [('vsphere', 0.5989840030670166), ('netapp', 0.47245410084724426), ('virtualization', 0.4678327143192291), ('citrix', 0.4666396379470825), ('vcentre', 0.46581748127937317), ('vcenter', 0.45779669284820557), ('kvm', 0.44998717308044434), ('esxi', 0.44149038195610046), ('xen', 

862 . telnet :
 [('snmp', 0.5228667259216309), ('ssh', 0.5139473676681519), ('nfs', 0.4252588152885437), ('smtp', 0.4215199947357178), ('dns', 0.41837432980537415), ('tcp', 0.4068189263343811), ('samba', 0.3917385935783386), ('xml', 0.3888358473777771), ('jdbc', 0.38868629932403564), ('udp', 0.37814992666244507)]
863 . ansible :
 [('puppet', 0.6623389720916748), ('terraform', 0.5466092824935913), ('docker', 0.5080425143241882), ('saltstack', 0.49257931113243103), ('dockers', 0.47244924306869507), ('jenkins', 0.4690227508544922), ('kubernetes', 0.4008520245552063), ('grafana', 0.39069509506225586), ('bitbucket', 0.39037176966667175), ('devops', 0.3891891837120056)]
865 . appneta :
 [('appresponse', 0.8729091882705688), ('steelfusion', 0.8710007071495056), ('netim', 0.8163371682167053), ('appinternals', 0.5949571132659912), ('aternity', 0.5712487697601318), ('riverbed', 0.5501452684402466), ('appdynamics', 0.5035571455955505), ('dynatrace', 0.494533896446228), ('infovista', 0.49360749125

928 . yaml :
 [('vvr', 0.5783827304840088), ('terraform', 0.41933536529541016), ('golang', 0.3828461170196533), ('scons', 0.3775169551372528), ('netconf', 0.3591651916503906), ('python', 0.35876819491386414), ('scripting', 0.35590481758117676), ('flannel', 0.353662371635437), ('tcl', 0.34920769929885864), ('codeigniter', 0.34511852264404297)]
929 . uml :
 [('sysml', 0.4419407248497009), ('oop', 0.3705761134624481), ('rhapsody', 0.3503609895706177), ('ooad', 0.34798362851142883), ('programming', 0.34017717838287354), ('jsse', 0.3297401964664459), ('tcl', 0.3248317837715149), ('jce', 0.3228455185890198), ('java', 0.318415105342865), ('object', 0.31702345609664917)]
931 . yang :
 [('chloe', 0.6226190328598022), ('quan', 0.5940192937850952), ('celina', 0.5813344717025757), ('aiko', 0.5497370958328247), ('emma', 0.5433012247085571), ('sophia', 0.5426439046859741), ('mckellar', 0.5151305198669434), ('jennifer', 0.5008330345153809), ('zhang', 0.48900699615478516), ('fok', 0.47630199790000916)

976 . assembler :
 [('jtag', 0.3715067207813263), ('thread', 0.3014291524887085), ('java', 0.27575361728668213), ('mast', 0.27120140194892883), ('malay', 0.2699795663356781), ('tarkett', 0.2662290334701538), ('multithreading', 0.2627820074558258), ('ilog', 0.26205575466156006), ('jvm', 0.25749847292900085), ('activepivot', 0.2508049011230469)]
978 . leap :
 [('bazaar', 0.2744825482368469), ('bets', 0.27212056517601013), ('westwing', 0.26060599088668823), ('beamforming', 0.2603435218334198), ('responder', 0.2590844929218292), ('boredom', 0.2583785057067871), ('safer', 0.2549580931663513), ('reality', 0.2538330554962158), ('lhcmeubwnjyxj', 0.2527619004249573), ('aira', 0.2447919249534607)]
979 . nil :
 [('dues', 0.5197235345840454), ('handing', 0.3562072217464447), ('opy', 0.345854789018631), ('bead', 0.3067563474178314), ('arching', 0.30211159586906433), ('obsess', 0.29440686106681824), ('crowdsourced', 0.29145944118499756), ('ynlvdw', 0.28619760274887085), ('tub', 0.2837628126144409), 

1030 . cucumber :
 [('selenium', 0.5806012153625488), ('bdd', 0.5487393736839294), ('jbehave', 0.5217143893241882), ('spock', 0.4981018006801605), ('fitnesse', 0.493813693523407), ('junit', 0.4721371531486511), ('testng', 0.4613504409790039), ('mockito', 0.44239839911460876), ('gherkin', 0.44197481870651245), ('protractor', 0.4391521215438843)]
1032 . testbench :
 [('systemverilog', 0.4226088225841522), ('rtl', 0.42148518562316895), ('checkers', 0.42096805572509766), ('uvm', 0.420074999332428), ('testcases', 0.3826172947883606), ('transistor', 0.3769119381904602), ('systemc', 0.35509902238845825), ('targetlink', 0.3525603115558624), ('pcie', 0.34987562894821167), ('verilog', 0.3336361050605774)]
1035 . fuzzing :
 [('dmvpn', 0.34678301215171814), ('objection', 0.30755218863487244), ('pki', 0.2917717695236206), ('cellular', 0.2841653823852539), ('nessus', 0.28359389305114746), ('photodiodes', 0.27893951535224915), ('rke', 0.2759029269218445), ('exposureto', 0.27575114369392395), ('malwar

1079 . calibration :
 [('ftir', 0.2785603404045105), ('testing', 0.27404606342315674), ('xrf', 0.27215200662612915), ('test', 0.26518523693084717), ('calibrations', 0.2642897069454193), ('validation', 0.2587173581123352), ('dyno', 0.2536280155181885), ('reagent', 0.2510389983654022), ('metrology', 0.2501923143863678), ('rapports', 0.24985109269618988)]
1082 . jbuilder :
 [('eclipse', 0.40126678347587585), ('jaxb', 0.3623538911342621), ('websphare', 0.33824247121810913), ('osgi', 0.3329842686653137), ('jdeveloper', 0.3314054310321808), ('curat', 0.32924678921699524), ('clearquest', 0.3235469460487366), ('ibatis', 0.32305535674095154), ('obiee', 0.3200720250606537), ('depozitare', 0.3174628019332886)]
1083 . labview :
 [('teststand', 0.49943363666534424), ('labwindows', 0.4689145088195801), ('canoe', 0.4618479609489441), ('ni', 0.38314229249954224), ('debuggers', 0.38178950548171997), ('capl', 0.3631780445575714), ('cvi', 0.3535645008087158), ('pspice', 0.3505173325538635), ('multimeter'

1133 . irish :
 [('jameson', 0.40004175901412964), ('scotch', 0.3929086923599243), ('malibu', 0.3541156053543091), ('ireland', 0.3525160551071167), ('wicklow', 0.3513703942298889), ('whisky', 0.34021276235580444), ('destinova', 0.3249616026878357), ('gin', 0.3231210708618164), ('whiskey', 0.32276594638824463), ('cork', 0.32020097970962524)]
1134 . italian :
 [('hebrew', 0.5035979747772217), ('french', 0.4960130751132965), ('spanish', 0.45925742387771606), ('portuguese', 0.45801955461502075), ('bulgarian', 0.4472503066062927), ('openscape', 0.4456028938293457), ('german', 0.42266014218330383), ('greek', 0.4213045537471771), ('arabic', 0.41522669792175293), ('finnish', 0.41234350204467773)]
1135 . japanese :
 [('bmcunje', 0.46235164999961853), ('korean', 0.37162959575653076), ('mandarin', 0.3542388081550598), ('uscpa', 0.3521803617477417), ('chinese', 0.33798491954803467), ('portuguese', 0.3357255756855011), ('spanish', 0.32893338799476624), ('agvpzgkubgv', 0.3176521062850952), ('jlpt', 

1178 . transactions :
 [('deals', 0.47602057456970215), ('trades', 0.43309468030929565), ('transaction', 0.39414119720458984), ('entries', 0.3581496477127075), ('credit', 0.3367539346218109), ('funds', 0.32928749918937683), ('accounts', 0.32332736253738403), ('mandates', 0.32105571031570435), ('requests', 0.3159668445587158), ('contracts', 0.3130176365375519)]
1179 . reorganisation :
 [('dissolving', 0.304219126701355), ('legal', 0.2680909335613251), ('whatyou', 0.2621232569217682), ('liquidating', 0.2502190172672272), ('dropwires', 0.24985982477664948), ('hmt', 0.23574671149253845), ('iforce', 0.22675174474716187), ('insolvency', 0.22556594014167786), ('prospectus', 0.21892257034778595), ('process', 0.2177383005619049)]
1180 . sce :
 [('ccx', 0.37144723534584045), ('seasonalized', 0.28575342893600464), ('cigarette', 0.28070762753486633), ('citroen', 0.2522971034049988), ('peugeot', 0.25209134817123413), ('transcom', 0.23530001938343048), ('limtied', 0.22552281618118286), ('cse', 0.224

1235 . mechatronics :
 [('electrotechnics', 0.29585039615631104), ('rutiere', 0.290119469165802), ('electrical', 0.26682767271995544), ('mouser', 0.26677048206329346), ('appr', 0.2637588679790497), ('licenciature', 0.26178956031799316), ('autovehicule', 0.2603435814380646), ('science', 0.2563422918319702), ('eplan', 0.24174454808235168), ('enginnering', 0.23863212764263153)]
1236 . corrosion :
 [('soil', 0.45148569345474243), ('solvents', 0.41565394401550293), ('corrosive', 0.4134165048599243), ('cathodic', 0.41315436363220215), ('erosion', 0.40767228603363037), ('paints', 0.4073566496372223), ('vapor', 0.4062739312648773), ('coatings', 0.40609806776046753), ('metallurgy', 0.3934168815612793), ('vibrations', 0.3871593475341797)]
1239 . kuka :
 [('motoman', 0.47701340913772583), ('fanuc', 0.4744639992713928), ('levon', 0.4094548225402832), ('automatizare', 0.40228593349456787), ('efftronics', 0.3977532386779785), ('yaskawa', 0.38877928256988525), ('ubitech', 0.3775162696838379), ('scada

 [('stooping', 0.6368041634559631), ('kneeling', 0.5915015339851379), ('twisting', 0.5720603466033936), ('lifting', 0.5311342477798462), ('crouching', 0.5168494582176208), ('squatting', 0.45313766598701477), ('crawling', 0.4342278242111206), ('climbing', 0.4308854043483734), ('fingering', 0.41957417130470276), ('walking', 0.40745487809181213)]
1282 . casting :
 [('aluminium', 0.47808563709259033), ('machining', 0.4491662383079529), ('cipet', 0.3709501028060913), ('impregnation', 0.3616301119327545), ('machined', 0.35741013288497925), ('maldaner', 0.34148019552230835), ('stamping', 0.33119383454322815), ('aluminum', 0.32494181394577026), ('moulding', 0.3245317339897156), ('die', 0.32328397035598755)]
1284 . polishing :
 [('stripping', 0.4738568365573883), ('sinking', 0.46567970514297485), ('painting', 0.45260089635849), ('washing', 0.45147573947906494), ('waxing', 0.43898630142211914), ('rugs', 0.43153446912765503), ('trimming', 0.4235886335372925), ('shampooing', 0.42051562666893005), 

1324 . aida :
 [('aifm', 0.48830896615982056), ('mnpi', 0.31613466143608093), ('timessquare', 0.2732437252998352), ('schroder', 0.2705504596233368), ('eesti', 0.26822608709335327), ('sfs', 0.2620547413825989), ('securitised', 0.25801652669906616), ('leftpart', 0.25531214475631714), ('tradable', 0.2544821500778198), ('menopt', 0.25249946117401123)]
1325 . promotion :
 [('promotions', 0.4014246165752411), ('promotional', 0.30516305565834045), ('layoffs', 0.28261101245880127), ('natchez', 0.2643430531024933), ('branding', 0.2573978304862976), ('infotronics', 0.25605934858322144), ('layoff', 0.24890974164009094), ('incentive', 0.2410794198513031), ('btl', 0.23742786049842834), ('marketing', 0.2374027967453003)]
1326 . sponsorship :
 [('visa', 0.44370508193969727), ('visas', 0.3936161696910858), ('relocation', 0.35813838243484497), ('sponsor', 0.33637070655822754), ('authorized', 0.2903967499732971), ('tyes', 0.2808583378791809), ('restriction', 0.26875683665275574), ('micromanagement', 0.2

1384 . paramedic :
 [('emt', 0.6014850735664368), ('bcls', 0.49981689453125), ('pals', 0.46021902561187744), ('acls', 0.45614659786224365), ('phlebotomist', 0.44991636276245117), ('rvt', 0.44577446579933167), ('rn', 0.4446284770965576), ('paramedics', 0.4438907504081726), ('cot', 0.4345861077308655), ('ekg', 0.43295782804489136)]
1385 . hematology :
 [('immunology', 0.5656769871711731), ('microbiology', 0.555005669593811), ('coagulation', 0.5309491157531738), ('serology', 0.49902862310409546), ('bacteriology', 0.4871670603752136), ('hematologist', 0.48279550671577454), ('pharmacology', 0.4545583128929138), ('mycology', 0.4485095143318176), ('hematological', 0.43464264273643494), ('molecular', 0.43451806902885437)]
1386 . surgeon :
 [('neuro', 0.6980687379837036), ('neurologist', 0.6341366171836853), ('cardiologist', 0.6321398019790649), ('neurosurgery', 0.6247150301933289), ('ortho', 0.6215180158615112), ('neurology', 0.6123178601264954), ('anaesthetist', 0.5990053415298462), ('intensi

1433 . photonics :
 [('photonic', 0.41232526302337646), ('modulators', 0.390424907207489), ('optics', 0.386584997177124), ('metallurgy', 0.3794790506362915), ('optoelectronic', 0.37211742997169495), ('microscopy', 0.3599189817905426), ('dielectric', 0.3463166356086731), ('detectors', 0.34598517417907715), ('semiconductor', 0.3406631052494049), ('spectrometer', 0.3399394154548645)]
1434 . psychometrics :
 [('inaugural', 0.3363485634326935), ('defender', 0.3320438265800476), ('xnxc', 0.32876360416412354), ('shl', 0.3253658413887024), ('vidyagram', 0.3083075284957886), ('wusf', 0.30342569947242737), ('fuy', 0.2950402498245239), ('servants', 0.28448212146759033), ('nlje', 0.2766815721988678), ('hearty', 0.269339382648468)]
1435 . biostatistics :
 [('epidemiology', 0.5313064455986023), ('immunology', 0.45300203561782837), ('bioinformatics', 0.4440264105796814), ('pharmacology', 0.43389755487442017), ('microbiology', 0.4118228554725647), ('biochemistry', 0.4051993489265442), ('statistician',

1503 . anatomy :
 [('physiology', 0.7658408880233765), ('beings', 0.4616856575012207), ('rays', 0.426972895860672), ('pathology', 0.4257751703262329), ('resouce', 0.42459404468536377), ('ultrasound', 0.40503358840942383), ('ophthalmology', 0.39615654945373535), ('radiographic', 0.39594584703445435), ('histology', 0.3884315490722656), ('tissues', 0.38561415672302246)]
1505 . infections :
 [('diseases', 0.569729208946228), ('communicable', 0.5065743327140808), ('illnesses', 0.5015957355499268), ('disease', 0.4882206320762634), ('infectious', 0.4690267741680145), ('depression', 0.4689388573169708), ('newborns', 0.4661564826965332), ('hypertension', 0.45241689682006836), ('colds', 0.45182234048843384), ('asthma', 0.44920551776885986)]
1506 . pediatrics :
 [('nicu', 0.6317653059959412), ('orthopedics', 0.6109651327133179), ('endocrinology', 0.596618115901947), ('gynecology', 0.5948292016983032), ('neonatology', 0.593416154384613), ('nephrology', 0.579390287399292), ('pediatric', 0.578712701

In [116]:
## keys include special charaters, (for the phrase2vec model, ignore them first)
number_letter = re.compile(r'^[a-zA-Z0-9\s]*$')

key_special = [key.lower() for key in keys if not number_letter.search(key)]
print(len(key_special))

892


In [238]:
## Phrase2Vec

example = job_description_en[2310]

from gensim.models.phrases import Phrases, Phraser

clean_sample = [[u'the', u'mayor', u'of', u'new', u'york', u'was', u'there'], 
 [u'machine', u'learning', u'can', u'be', u'new', u'york' , u'sometimes']]
#gensim.utils.simple_preprocess(clean_str(example))

## stop list
stop_words = set(stopwords.words('english'))
words_need = ['s','and', 'of']
for word in words_need:
    stop_words.remove(word)


example = space.sub(" ", example).strip()

punct = re.compile("[.,;!?:*()=/\\\[\]•#_§\$€]+")  # heavily depends on the text
coarse_candidates = punct.split(example.lower())
## clean whitespace and empty string
candidates = [sent.strip() for sent in coarse_candidates if sent.strip()]

for sent in candidates:
    sent = re.split('\s+', sent)
    a = ["|" if w in stop_words else w for w in sent]
    
    print(a)
    

#print(candidates)

#phrases = Phrases(clean_sample, min_count=1, threshold=1)
#bigram = Phraser(phrases)
#bigram[clean_sample[1]]

['working', 'student', '-', 'virtual', 'integration', 'environment']
['|']
['f']
['description']
['|', '|', 'area', 'of', 'virtual', 'integration', 'environment', '|', '|', 'looking', '|', '|', 'working', 'student']
['|']
['f']
['|', 'lindau', 'beginning', '|', 'soon', '|', 'possible', '|', '4', '|', '6', 'months', 'and', 'max']
['20', 'hours', 'per', 'week']
['|', 'working', 'student', '|', '|', '|', '|', 'possibility', 'of', 'working', '|', '|', 'challenging', 'environment', '|', 'state-of-the-art', 'adas', 'technologies', 'and', 'enjoy', 'setting', '|', 'automated', 'testing', 'environments', '|', '|', 'critical', '|', '|', 'success', 'of', '|', 'integration', 'activities']
['|', 'tasks', 'include']
['implementation', 'of', 'automation', '|', 'testing', 'scripts']
['standardization', 'among', 'different', 'technologies']
['unit', 'testing', '|', '|', 'tooling']
['roll-out', 'of', 'scripts', 'and', 'presentation']
['technical', 'support']
['assessment', 'of', 'results']
['qualificati

In [184]:
## match and connect words

#text = "Surround View / Omniview Mobile In-Vehicle Applications Vehicle-Network Radar - ARSx, Technology, Media & Communications \
#        Aeronautics, Aircraft, Aviation, Aircraft Wing, jet Engine, DO178B, avionics A330"

def match(text):
    for key in keys:
        p = re.escape(key)
        c = re.compile(r"\b"+ p + r"\b", re.I)
        key_re = key.replace(" ", "_")
        text = c.sub(key_re, text)
    return text

#    matched_text = match(clean_text)
#    whole_text.append(matched_text)

100% (170060 of 170060) |################| Elapsed Time: 0:00:53 Time:  0:00:53


In [231]:
whole_text = " | ".join(clean_text[:500])
p = []
for key in keys:
    p.append(re.escape(key))

p = "|".join(p)
c = re.compile(r"\("+ p + r"\)", re.I)
match = c.sub(lambda match: match.group(0).replace(" ","_"), whole_text)


print(match)

"""for key in pp.progressbar(keys):
    p = re.escape(key)
    c = re.compile(r"\b"+ p + r"\b", re.I)
    key_re = key.replace(" ", "_")
    whole_text = c.sub(key_re, whole_text)  

print(whole_text)"""



'for key in pp.progressbar(keys):\n    p = re.escape(key)\n    c = re.compile(r"\x08"+ p + r"\x08", re.I)\n    key_re = key.replace(" ", "_")\n    whole_text = c.sub(key_re, whole_text)  \n\nprint(whole_text)'

In [224]:
## for test

dt = "a b, ef,c d e ,kk, c d e   , a b,"

#c = re.compile(r"\ba\sb|c\sd\se\b", re.I)
#c.sub(lambda m: m.group(0).replace(" ","_"), dt)

mass = "ABCD EFGH IJKL"
part = re.sub('(a\sb|c\sd\se).', lambda match: match.group(1).replace(" ","_"), dt)
print(part)

a_b, ef,c_d_e_,kk, c_d_e_  , a_b,


In [187]:
re.split(r"\|", whole_text)

['Wealth Management - Credit_risk Analyst Wealth Management WM provides advice strategies and solutions for all aspects of the financial asset management and wealth transfer needs of high net worth and ultra-high net worth clients. WM is dedicated to delivering an outstanding client experience by providing unbiased advice and individual solutions. Credit_risk assesses permissions and manages credit and counterparty risks on an Industry client geographic and transaction basis. Credit_risk is the risk of loss arising from the default of a client or counterparty. The Credit_risk function identifies measures limits manages and monitors Credit_risk across our businesses. Credit exposure arises through underwriting lending and trading activities with and for clients and counterparties as well as from a range of operating services such as Cash_Management and clearing activities. Responsibilities Evaluate WM consumer and Business purpose credit requests utilizing applicable Credit_risk Policie

In [109]:
text = "Surround View / Omniview Mobile In-Vehicle Applications Vehicle-Network Radar - ARSx , \
        Aeronautics, Aircraft, Aviation, Aircraft Wing, jet Engine, DO178B, avionics A330"
a = []
for key in key_sp:
    p = re.escape(key)
    c = re.compile(r"\b"+ p + r"\b", re.I)
    key_re = key.replace(" ", "_")
    text = c.sub(key_re, text)

    #a.append(key)

print(text)

#c = re.compile("[,;\'\&\(\)]+")
#    key = re.escape(key)
#    re.sub(c, r"\\\1", key) 
#    str.replace(/[<>*()?]/g, "\\$&");
#    key.replace(/[()]/g, "\\$&")
    #if c.findall(key):
        #key = re.sub(c, '\', 
#    key_special.append(key)
    
#key_special

Surround  View / Omniview Mobile_In-Vehicle_Applications Vehicle-Network Radar_-_ARSx Technology,_Media_&_Communications,         Aeronautics, Aircraft, Aviation, Aircraft Wing, jet Engine, DO178B, avionics A330


In [53]:
#for n, job_text in enumerate(job_description_df.job_description[:1000]):
#    string = "Microsoft Office"
#    skill_re = " "+string.replace(" ", "_")+" "
#    match = re.findall(string, job_text, flags=re.I)
#    if match:
#        print(n)
#        sub_text = re.sub(string, skill_re, job_text, flags=re.I)

#print(sub_text)


#c=re.compile(r"\bC\b",re.I)
#c.findall(example)

example = job_description_en[141]
#print(example)

for i, key in enumerate(skills):
    c = re.compile(r"\b"+key+r"\b",re.I)
    match = c.findall(example)
    if match:
        key_re = key.replace(" ", "_")
        print(key_re)
        print("match!", key, key_re)
        example = c.sub(key_re, example)
    #except:
    #    print(key)

#print(example)
#    if example.find(key) != -1:
#        print(i,key, "match!")
#        print(key.replace(" ", "_"))
#        example.replace(key, key.replace(" ", "_"))
    
#print(example)


#print(skills)
#f = open("../SkillMatch/skill_name.csv")
#keys = f.read()
#print(keys)

#pattern = re.compile(r'\b(' + keys + r')\b\s*', re.IGNORECASE)

#for text in job_description_en[:1000]:
#    match = re.findall(pattern, clean_str(example))
#    if match:
#        print(match)

error: multiple repeat at position 4

In [24]:
# remove stopwords

stop_words = set(stopwords.words('english'))
stop_words.remove('s')

def rm_stopwords(s):
    pattern = 
    text = pattern.sub('', s)
    return text

print(rm_stopwords(example))
#filtered_text = [w for w in word_tokenize(example) if not w in stop_words]
#filtered_text


from textacy import preprocess
#    s = preprocess.preprocess_text(text, fix_unicode=True, lowercase=True, transliterate=True, no_urls=True, no_emails=True, no_phone_numbers=True, no_numbers=True, no_currency_symbols=True)  ## remove url

SyntaxError: invalid syntax (<ipython-input-24-964b3d642a11>, line 7)

In [50]:
for text in tqdm(job_description_en[:10000]):
    clean_str(text)
    

100%|██████████| 10000/10000 [00:03<00:00, 2817.67it/s]


In [30]:
def compute_ngrams(tokens, max_len = None, min_len = 1):
    """tokens  :   iterable of string
                    a single sentence of tokens. Assumes start and stop tokens omitted
    max_len :   int
                    maximum ngram length
    min_len :   int
                    minimum ngram length

    """
    if max_len == None:
        max_len = len(tokens)

    if min_len > max_len:
        raise Exception("min_len cannot be more than max_len")

    ngrams = set()
    # unigrams
    for ngram_size in range(min_len, max_len + 1):
        for start in range(0, len(tokens) - ngram_size + 1):
            end = start + ngram_size -1
            words = []
            for i in range(start, end + 1):
                words.append(tokens[i])
            ngrams.add(tuple(words)) # make a tuple so hashable
    return ngrams

# is a valid token
__bad_chars__ = "<>{}[]~@"
__punct__ = set(".?!,;:")
def is_valid_term(term):
    # remove single char entries and only numeric
    if len(term) == 0:
        return False
    if len(term) == 1:
        #else misses c and r
        if term.isalpha():
            return True
        return False
    # no html or js terms
    for c in __bad_chars__:
        if c in term:
            return False
    if term[-1] in __punct__:
        return False
    if "function(" in term:
        return False
    if "!" in term or "?" in term:
        return False
    digit_chars = 0.0
    for c in term:
        if c.isdigit() or not c.isalnum():
            digit_chars += 1.0
    # 60% digits?
    if (digit_chars / len(term)) >= 0.75:
        return False
    return True



In [None]:
re_collapse_spaces = re.compile("\s+")

# collapse space
def collapse_spaces(s):
    return re_collapse_spaces.sub(" ", s).strip()

In [None]:
## Define variables from database

job_table = "jobs"
job_id_column = "job_id"

job_skill_table = "job_skill_mapping"
skill_id_column = "skill_id"

skill_tree_table = "dem_skill"
skill_tree_id_column = "skill_id"
skill_tree_parent_column = "skill_parent_id"
skill_tree_level_column = "skill_hierarchy_id"

skill_name_table = "language_dem_skill"
skill_name_id_column = "skill_id"
skill_name_name_column = "skill_name"
skill_name_language_id = "language_id"


# create id skill dictionary (level 5)
command = 'select skill_id, skill_name from language_dem_skill \
            where skill_id in (select skill_id from dem_skill where skill_hierarchy_id = 5)'


skills_level5 = pd.read_sql_query(text(command), con = engine)
skills_level5 = skills_level5.drop_duplicates(subset="skill_id", keep='first', inplace=False)
skills_level5

In [None]:
#values = re.findall(pattern=measurement, string=text)
#sub_text = re.sub(pattern=measurement, string=text, repl='MEASUREMENT')

for n, job_text in enumerate(job_description_df.job_description[:1000]):
    string = "Microsoft Office"
    skill_re = " "+string.replace(" ", "_")+" "
    match = re.findall(string, job_text, flags=re.I)
    if match:
        print(n)
        sub_text = re.sub(string, skill_re, job_text, flags=re.I)
        print(sub_text)

In [None]:
## find skills containing regular and special characters

key_sp = []
key_reg = []
for key in keys:
    if re.match("^[A-Za-z0-9_\s]*$", key):
        key_reg.append(key)
    else:
        key_sp.append(key)
print(key_reg[:10])   
print(key_sp[:30])