## Feature engineering for job segmentation

In [30]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
from string import punctuation

In [41]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/qiuruihao/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

### Load data

In [31]:
df_train = pd.read_csv("../data/train_data.csv")
df_test = pd.read_csv("../data/test_data.csv")

stop_words = stopwords.words('english')

In [109]:
def load_data():
    x_train = df_train["content"]
    y_train = df_train["label"]

    x_test = df_test["content"]
    y_test = df_test["label"]
    return x_train, y_train, x_test, y_test

In [110]:
x_train, y_train, x_test, y_test = load_data()

### Clean text

In [111]:
email_regex = re.compile(r"[\w.-]+@[\w.-]+")
url_regex = re.compile(r"(http|www)[^\s]+")
date_regex = re.compile(r"[\d]{2,4}[ -/:]*[\d]{2,4}([ -/:]*[\d]{2,4})?") # a way to match date

def clean_special_patterns(text):
    """Remove special patterns - email, url, date etc."""
    text = url_regex.sub("", text)
    text = email_regex.sub("", text)
    text = date_regex.sub("", text)
    return text

In [112]:
train_corpus = x_train.apply(clean_special_patterns)
test_corpus = x_test.apply(clean_special_patterns)

### Vectorization

In [113]:
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1,2),
    sublinear_tf=True
)
Vec = vectorizer.fit(train_corpus)

In [114]:
X_train = Vec.transform(train_corpus)
X_test = Vec.transform(test_corpus)

In [116]:
X_train.shape, X_test.shape

((2103, 90384), (902, 90384))

### Dimension reduction

In [117]:
n_size = 50
svd = TruncatedSVD(
  n_components=n_size,
  n_iter=10,
  random_state=42
  )

svd_fit = svd.fit(X_train)
X_train = svd_fit.transform(X_train)
X_test = svd_fit.transform(X_test)

In [121]:
feature_name = ["svd_"+str(i) for i in range(n_size)]
df_svd_train = pd.DataFrame(X_train, columns=feature_name)
df_svd_test = pd.DataFrame(X_test, columns=feature_name)

### Creat features
- Text-based features

In [85]:
def count_words(text):
    return len(str(text).split())

def count_uniquewords(text):
    return len(set(str(text).split()))

def count_chars(text):
    return len(str(text))

def word_density(text):
    return count_chars(text) / (count_words(text) + 1)

def count_stopwords(text):
    stopwords = [word for word in str(text).split() if word in stop_words]
    return len(stopwords)

def count_puncts(text):
    puncts = re.findall('[' + punctuation + ']', str(text))
    return len(puncts)

def count_upperwords(text):
    upperwords = re.findall(r"\b[A-Z0-9]+\b", str(text))
    return len(upperwords)

def count_firstwords(text):
    """count first word of sentence"""
    firstwords = re.findall(r"\b[A-Z][a-z]+\b", str(text))
    return len(firstwords)

In [99]:
def make_features(text_series):
    df_features = pd.DataFrame()

    df_features["word_count"] = text_series.apply(count_words)
    df_features["uniqueword_count"] = text_series.apply(count_uniquewords)
    df_features["char_count"] = text_series.apply(count_chars)
    df_features["word_density"] = text_series.apply(word_density)
    df_features["stopword_count"] = text_series.apply(count_stopwords)
    df_features["punct_count"] = text_series.apply(count_puncts)
    df_features["upperword_count"] = text_series.apply(count_upperwords)
    df_features["firstword_count"] = text_series.apply(count_firstwords)
    return df_features

In [100]:
df_text_train = make_features(train_corpus)
df_text_test = make_features(test_corpus)

- part of speech tagging

In [38]:
from nltk import pos_tag

pos_dic = {
    "NN" : "noun", "NNS" : "noun", "NNP": "noun", "NNPS" : "noun",
    "PRP" : "pron", "PRP$" : "pron", "WP" : "pron", "WP$" : "pron",
    "VB" : "verb", "VBD" : "verb", "VBG" : "verb", "VBN" : "verb", "VBP" : "verb", "VBZ": "verb",
    "JJ" : "adj", "JJR" : "adj", "JJS" : "adj",
    "RB"  : "adv", "RBR" : "adv", "RBS" : "adv", "WRB" : "adj"
}

def count_tag(text):
    pos_counts = {
        "noun": 0, "pron": 0, "verb": 0, "adj": 0, "adv": 0
    }
    for w, p in pos_tag(str(text).split()):
        try:
            tag = pos_dic[p]
            pos_counts[tag] = pos_counts[tag] + 1
        except KeyError:
            pass
    return pos_counts

In [102]:
pos_dict_train = train_corpus.apply(count_tag)
df_pos_train = pd.DataFrame(list(pos_dict_train))

pos_dict_test = test_corpus.apply(count_tag)
df_pos_test = pd.DataFrame(list(pos_dict_test))

In [103]:
df_features_train = pd.concat([df_text_train, df_pos_train], axis=1)
df_features_test = pd.concat([df_text_test, df_pos_test], axis=1)

### merge all features and output

In [124]:
df_train_out = pd.concat([df_svd_train, df_features_train], axis=1)
df_test_out = pd.concat([df_svd_test, df_features_test], axis=1)

df_train_out["label"] = y_train
df_test_out["label"] = y_test

In [129]:
df_train_out.to_csv("train_data.csv")
df_test_out.to_csv("test_data.csv")

### More about feature engineer

In [189]:
from nltk.tag import StanfordNERTagger

stanford_dir = "/home/ruihaoqiu/stanford-ner-2018-10-16/"
jarfile = stanford_dir + 'stanford-ner.jar'
modelfile = stanford_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'

st = StanfordNERTagger(model_filename=modelfile, path_to_jar=jarfile)

In [190]:
def count_ner(text):
    ner_counts = dict()
    ners = st.tag(str(text).split())
    print(ners)
    for _, p in ners:
        if p in ner_counts:
            ner_counts[p] = ner_counts[p] + 1
        else:
            ner_counts[p] = 1
    return ner_counts

In [196]:
text = "Christian is living in Berlin and working at BMW and google, but Amazon is a nice company"

In [199]:
ner_out = []
for doc in corpus[:10]:
    ner_out.append(count_ner(doc))

[('In', 'O'), ('cooperation', 'O'), ('with', 'O'), ('other', 'O'), ('central', 'O'), ('headquarter', 'O'), ('teams', 'O'), ('you', 'O'), ('will', 'O'), ('provide', 'O'), ('general', 'O'), ('administrative', 'O'), ('support', 'O'), ('for', 'O'), ('Communication', 'O'), ('and', 'O'), ('Deployment', 'O'), ('of', 'O'), ('Global', 'ORGANIZATION'), ('Digital', 'ORGANIZATION'), ('Marketing', 'ORGANIZATION'), ('Online', 'ORGANIZATION'), ('Campaigns', 'ORGANIZATION'), ('and', 'O'), ('Activities', 'O'), ('(as', 'O'), ('part', 'O'), ('of', 'O'), ('Global', 'O'), ('Brand', 'O'), ('Marketing).', 'O'), ('Be', 'O'), ('part', 'O'), ('of', 'O'), ('shaping', 'O'), ('the', 'O'), ('marketing', 'O'), ('future', 'O'), ('for', 'O'), ('Jägermeister', 'O'), ('in', 'O'), ('leading', 'O'), ('digital', 'O'), ('channels', 'O'), ('such', 'O'), ('as', 'O'), ('Instagram,', 'O'), ('YouTube,', 'O'), ('Facebook,', 'O'), ('our', 'O'), ('owned', 'O'), ('Websites', 'O'), ('and', 'O'), ('many', 'O'), ('more.', 'O'), ('The',

[('Assisting', 'O'), ('with', 'O'), ('the', 'O'), ('delivery', 'O'), ('of', 'O'), ('safety', 'O'), ('performance', 'O'), ('and', 'O'), ('monitoring,', 'O'), ('in', 'O'), ('line', 'O'), ('with', 'O'), ('personnel,', 'O'), ('team', 'O'), ('and', 'O'), ('functional', 'O'), ('goals', 'O'), ('and', 'O'), ('objectives', 'O'), ('Facilitating', 'O'), ('andor', 'O'), ('arranging', 'O'), ('training', 'O'), ('programs', 'O'), ('on', 'O'), ('critical', 'O'), ('EHS', 'O'), ('issues', 'O'), ('Supporting', 'O'), ('the', 'O'), ('implementation', 'O'), ('of', 'O'), ('EHS', 'O'), ('programs', 'O'), ('within', 'O'), ('Operations,', 'O'), ('while', 'O'), ('delivering', 'O'), ('key', 'O'), ('risk', 'O'), ('assessment', 'O'), ('activities', 'O'), ('Driving', 'O'), ('compliance', 'O'), ('with', 'O'), ('local', 'O'), ('and', 'O'), ('EU', 'ORGANIZATION'), ('legislation', 'O'), ('Ensuring', 'O'), ('that', 'O'), ('EHS', 'ORGANIZATION'), ('systems', 'O'), ('are', 'O'), ('kept', 'O'), ('accurate', 'O'), ('at', 'O'

[('Professional', 'O'), ('Requirements', 'O'), ('5+', 'O'), ('years', 'O'), ('of', 'O'), ('experience', 'O'), ('as', 'O'), ('automotive', 'O'), ('management', 'O'), ('consultant', 'O'), ('First-hand', 'O'), ('knowledge', 'O'), ('of', 'O'), ('auditing', 'O'), ('organizations,', 'O'), ('maturity', 'O'), ('models,', 'O'), ('and', 'O'), ('change', 'O'), ('management', 'O'), ('Know-how', 'O'), ('in', 'O'), ('information', 'O'), ('security', 'O'), ('(e.g.', 'O'), ('ISOIEC', 'O'), ('x),', 'O'), ('cyber', 'O'), ('security', 'O'), ('(e.g.', 'O'), ('ISOSAE', 'O'), ('CD', 'O'), (',', 'O'), ('SAE', 'O'), ('J),', 'O'), ('or', 'O'), ('safety', 'O'), ('(e.g.', 'O'), ('ISO', 'O'), (')', 'O'), ('Outstanding', 'O'), ('professional', 'O'), ('presence', 'O'), ('with', 'O'), ('excellent', 'O'), ('communication', 'O'), ('and', 'O'), ('presentation', 'O'), ('skills', 'O'), ('in', 'O'), ('German', 'O'), ('&amp;', 'O'), ('English', 'O'), ('Recognized', 'O'), ('by', 'O'), ('clients', 'O'), ('and', 'O'), ('peers

In [200]:
ner_out

[{'O': 149, 'ORGANIZATION': 9, 'LOCATION': 1},
 {'O': 85, 'ORGANIZATION': 2},
 {'O': 231, 'ORGANIZATION': 9},
 {'O': 164, 'ORGANIZATION': 7},
 {'O': 428, 'LOCATION': 3, 'ORGANIZATION': 6},
 {'O': 168},
 {'O': 3},
 {'O': 29},
 {'O': 57},
 {'O': 87, 'PERSON': 1, 'LOCATION': 1}]