## Feature engineering for job segmentation

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
from string import punctuation

In [None]:
import nltk

# NLTK resources used by this notebook:
#   - averaged_perceptron_tagger: model behind nltk.pos_tag (POS features)
#   - stopwords: corpus read by stopwords.words('english')
# Without the stopwords corpus a fresh environment fails with LookupError
# as soon as the stop-word list is loaded.
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

### Load data

In [None]:
# Raw train/test splits; load_data() reads their "content" and "label" columns.
df_train = pd.read_csv("../data/train_data.csv")
df_test = pd.read_csv("../data/test_data.csv")

# NLTK's English stop-word list, consumed by count_stopwords().
stop_words = stopwords.words('english')

In [None]:
def load_data():
    """Split the module-level train/test frames into text and label columns.

    Returns:
        A 4-tuple ``(x_train, y_train, x_test, y_test)``: the ``x`` items are
        the "content" columns and the ``y`` items the "label" columns of
        ``df_train`` and ``df_test`` respectively.
    """
    train_text, train_label = df_train["content"], df_train["label"]
    test_text, test_label = df_test["content"], df_test["label"]
    return train_text, train_label, test_text, test_label

In [None]:
# Text ("content") and target ("label") columns for both splits.
x_train, y_train, x_test, y_test = load_data()

### Clean text

In [None]:
email_regex = re.compile(r"[\w.-]+@[\w.-]+")
url_regex = re.compile(r"(http|www)[^\s]+")
# Heuristic date/time matcher: two or three groups of 2-4 digits, optionally
# separated by spaces, hyphens, slashes or colons (e.g. 12/05/2018, 2018-10-16,
# 10:30). The hyphen is escaped on purpose: written as " -/" it is a character
# range from space to slash, which also covers , and . and would strip plain
# numbers such as "10,000" or "12.50".
# Note that an unseparated run of four or more digits (e.g. "2018") still matches.
date_regex = re.compile(r"[\d]{2,4}[ \-/:]*[\d]{2,4}([ \-/:]*[\d]{2,4})?")


def clean_special_patterns(text):
    """Remove URLs, e-mail addresses and date-like digit groups from ``text``.

    The patterns are applied in that order; each match is replaced by the
    empty string, so surrounding whitespace is left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = url_regex.sub("", text)
    text = email_regex.sub("", text)
    text = date_regex.sub("", text)
    return text

In [None]:
# Cleaned text for both splits; every later feature is computed from these.
train_corpus = x_train.apply(clean_special_patterns)
test_corpus = x_test.apply(clean_special_patterns)

### Vectorization

In [None]:
# TF-IDF over unigrams and bigrams with English stop words removed and
# sublinear term-frequency scaling enabled.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1,2),
    sublinear_tf=True
)
# Fit on the training corpus only; Vec is then used to transform both splits.
Vec = vectorizer.fit(train_corpus)

In [None]:
# Vectorize both splits with the vectorizer fitted on the training corpus.
X_train = Vec.transform(train_corpus)
X_test = Vec.transform(test_corpus)

In [None]:
# Sanity check: display the shapes of the vectorized train and test matrices.
X_train.shape, X_test.shape

### Dimension reduction

In [None]:
# Number of SVD components kept; also drives the svd_* column names below.
n_size = 50
svd = TruncatedSVD(n_components=n_size, n_iter=10, random_state=42)

# Fit on the training matrix only, then project both splits with that fit.
svd_fit = svd.fit(X_train)
X_train, X_test = svd_fit.transform(X_train), svd_fit.transform(X_test)

In [None]:
# One column per SVD component: svd_0 ... svd_<n_size - 1>.
feature_name = ["svd_{}".format(i) for i in range(n_size)]
df_svd_train = pd.DataFrame(data=X_train, columns=feature_name)
df_svd_test = pd.DataFrame(data=X_test, columns=feature_name)

### Create features
- Text-based features

In [None]:
def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``.

    ``text`` is passed through ``str()`` first, so non-string values are
    counted by their string representation.
    """
    tokens = str(text).split()
    return len(tokens)

def count_uniquewords(text):
    """Return the number of distinct whitespace-separated tokens in ``text``.

    Tokens are compared exactly (case-sensitive); ``text`` is coerced with
    ``str()`` before splitting.
    """
    distinct_tokens = frozenset(str(text).split())
    return len(distinct_tokens)

def count_chars(text):
    """Return the length of ``str(text)``, whitespace included."""
    as_string = str(text)
    return len(as_string)

def word_density(text):
    """Return characters per word, computed as ``chars / (words + 1)``.

    The ``+ 1`` in the denominator keeps the ratio defined when ``text``
    contains no words.
    """
    n_chars = count_chars(text)
    n_words = count_words(text)
    return n_chars / (n_words + 1)

def count_stopwords(text):
    """Return how many whitespace-separated tokens of ``text`` are stop words.

    Tokens are compared exactly against the module-level ``stop_words``
    collection, so matching is case-sensitive and tokens with attached
    punctuation do not match.
    """
    # A set makes each lookup O(1) instead of scanning the whole list per token.
    known_stopwords = set(stop_words)
    return sum(1 for token in str(text).split() if token in known_stopwords)

def count_puncts(text):
    """Return the number of ASCII punctuation characters in ``text``.

    Every character listed in ``string.punctuation`` is counted; ``text`` is
    coerced with ``str()`` first.
    """
    # Test membership per character instead of pasting `punctuation` into a
    # regex character class: unescaped, its backslash-bracket pair is parsed
    # as an escaped "]", so backslashes would never be counted.
    return sum(1 for char in str(text) if char in punctuation)

def count_upperwords(text):
    """Count whole tokens made up only of ``A-Z`` and ``0-9`` characters.

    Digit-only tokens such as ``"123"`` are counted as well, because the
    pattern accepts digits alongside upper-case letters.
    """
    matches = re.finditer(r"\b[A-Z0-9]+\b", str(text))
    return sum(1 for _ in matches)

def count_firstwords(text):
    """Count capitalized words: one upper-case letter followed by lower-case ones.

    Used as a proxy for sentence-initial words. The pattern matches every such
    word wherever it appears, so proper nouns in mid-sentence are counted too.
    """
    capitalized = re.finditer(r"\b[A-Z][a-z]+\b", str(text))
    return sum(1 for _ in capitalized)

In [None]:
def make_features(text_series):
    """Build a frame of hand-crafted text statistics for each document.

    Args:
        text_series: pandas Series of documents; each extractor is applied to
            every element.

    Returns:
        DataFrame with one column per extractor, in the order listed below.
    """
    extractors = (
        ("word_count", count_words),
        ("uniqueword_count", count_uniquewords),
        ("char_count", count_chars),
        ("word_density", word_density),
        ("stopword_count", count_stopwords),
        ("punct_count", count_puncts),
        ("upperword_count", count_upperwords),
        ("firstword_count", count_firstwords),
    )

    df_features = pd.DataFrame()
    for column, extractor in extractors:
        df_features[column] = text_series.apply(extractor)
    return df_features

In [None]:
# Hand-crafted text statistics for both splits, computed on the cleaned corpus.
df_text_train = make_features(train_corpus)
df_text_test = make_features(test_corpus)

- Part-of-speech tagging

In [None]:
from nltk import pos_tag

# Maps Penn Treebank POS tags to the five coarse categories counted as features.
# Tags not listed here are ignored by the counting code.
pos_dic = {
    "NN": "noun", "NNS": "noun", "NNP": "noun", "NNPS": "noun",
    "PRP": "pron", "PRP$": "pron", "WP": "pron", "WP$": "pron",
    "VB": "verb", "VBD": "verb", "VBG": "verb", "VBN": "verb", "VBP": "verb", "VBZ": "verb",
    "JJ": "adj", "JJR": "adj", "JJS": "adj",
    # WRB is the wh-adverb tag (how, when, where, why), so it belongs with the adverbs.
    "RB": "adv", "RBR": "adv", "RBS": "adv", "WRB": "adv",
}

def count_tag(text):
    """Count coarse part-of-speech categories among the tokens of ``text``.

    ``text`` is coerced with ``str()``, split on whitespace and tagged with
    ``pos_tag``; each tag is mapped to a category through ``pos_dic``. Tags
    without a mapping are skipped.

    Returns:
        dict with the keys "noun", "pron", "verb", "adj" and "adv" mapped to
        their counts (0 when absent).
    """
    pos_counts = dict.fromkeys(("noun", "pron", "verb", "adj", "adv"), 0)
    for _, treebank_tag in pos_tag(str(text).split()):
        category = pos_dic.get(treebank_tag)
        # Unmapped tags yield None, which is never a key of pos_counts.
        if category in pos_counts:
            pos_counts[category] += 1
    return pos_counts

In [None]:
# Each element is a per-document dict of POS counts; expanding the list of
# dicts gives one column per category.
# NOTE(review): the frames are built from plain lists, so they do not inherit
# the corpus index - confirm they line up row-for-row with the text-feature
# frames before concatenating.
pos_dict_train = train_corpus.apply(count_tag)
df_pos_train = pd.DataFrame(pos_dict_train.tolist())

pos_dict_test = test_corpus.apply(count_tag)
df_pos_test = pd.DataFrame(pos_dict_test.tolist())

In [None]:
# Column-wise join of the text statistics and the POS counts for each split.
df_features_train = pd.concat([df_text_train, df_pos_train], axis=1)
df_features_test = pd.concat([df_text_test, df_pos_test], axis=1)

### Merge all features and output

In [None]:
# Final feature matrices: SVD components first, then the hand-crafted features.
df_train_out = pd.concat([df_svd_train, df_features_train], axis=1)
df_test_out = pd.concat([df_svd_test, df_features_test], axis=1)

# Attach the targets as the last column.
df_train_out["label"] = y_train
df_test_out["label"] = y_test

In [None]:
# Written to the current working directory. The file names match the raw
# inputs, which are read from ../data/.
# NOTE(review): to_csv is called without index=False, so the row index is
# presumably written as an extra first column - confirm downstream readers expect it.
df_train_out.to_csv("train_data.csv")
df_test_out.to_csv("test_data.csv")

### More about feature engineering

In [None]:
from nltk.tag import StanfordNERTagger

# Location of the Stanford NER distribution. This is a machine-specific
# absolute path and has to be adjusted before running elsewhere.
stanford_dir = "/home/ruihaoqiu/stanford-ner-2018-10-16/"
jarfile = "{}stanford-ner.jar".format(stanford_dir)
modelfile = "{}classifiers/english.all.3class.distsim.crf.ser.gz".format(stanford_dir)

st = StanfordNERTagger(path_to_jar=jarfile, model_filename=modelfile)

In [None]:
def count_ner(text):
    """Count the named-entity labels assigned to the tokens of ``text``.

    ``text`` is coerced with ``str()``, split on whitespace and tagged with the
    module-level tagger ``st``. The tagged tokens are also printed.

    Returns:
        dict mapping each label produced by the tagger to its number of
        occurrences, in order of first appearance.
    """
    ners = st.tag(str(text).split())
    print(ners)

    ner_counts = {}
    for _, label in ners:
        ner_counts[label] = ner_counts.get(label, 0) + 1
    return ner_counts

In [None]:
# Sample sentence for trying out count_ner; includes a lower-cased company name ("google").
text = "Christian is living in Berlin and working at BMW and google, but Amazon is a nice company"

In [None]:
# Prints the tagged tokens and displays the per-label counts for the sample sentence.
count_ner(text)