# Data Preprocessing

## 1 - Basic Analysis

### 1.1 - Import Packages

In [1]:
import re
import pandas as pd
from spellchecker import SpellChecker

from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.text import Tokenizer

from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer

### 1.2 - Reading Data

In [2]:
# Load the job-description training set from the local data directory.
# NOTE: project_path is a machine-specific absolute path; point it at your own copy of the data.

project_path = "/Users/apple/Documents/Xinzhi/Data/"
data = pd.read_csv(f"{project_path}meta_training_data_jd_0401.csv")

# Preview the first three rows.
data.head(3)

Unnamed: 0,Type,Code,fun1_code,fun2_code,jobname_ch,jobname_en,id,coname,poname,jd,source
0,FUN,BRO,BRO,BRO,商业研究,Business Research,279011,Aldevron,"Strategic Account Manager, Nucleic Acids Busin...",\n\nWe are currently seeking a Strategic Accou...,monster
1,FUN,BRO,BRO,BRO,商业研究,Business Research,287601,Synergy America,IT Business Analyst,\n\nWe are looking for an IT Business Analyst ...,monster
2,FUN,BRO,BRO,BRO,商业研究,Business Research,340361,A3 Smart Home,Sr. Business Analyst,"\n\nWe call our club's vision, mission, values...",monster


In [3]:
# get stop words

def get_stopwords(words_file):
    """Read a stop-word list from a text file that holds one word per line.

    Args:
        words_file: Path of the text file to read.

    Returns:
        list[str]: The words in file order, with surrounding whitespace
        removed. Blank lines are skipped, so the list never contains ''.
    """
    stopwords = []
    # Explicit encoding: the platform default differs between machines.
    with open(words_file, 'r', encoding='utf-8') as file:
        for line in file:
            # Strip all surrounding whitespace, not only '\n': a stop word with
            # a stray space could never match a whitespace-split token.
            word = line.strip()
            if word:
                stopwords.append(word)

    return stopwords


# Load the stop-word list; the relative path is resolved against the current working directory.
stopwords = get_stopwords("stop_words.txt")

### 1.3 - Data Exploration

In [4]:
# Count the distinct values of the `fun1_code` and `Code` columns.
print("The number of tier 1 job title: ", len(data["fun1_code"].unique()))
print("The number of tier 2 job title: ", len(data["Code"].unique()))

The number of tier 1 job title:  97
The number of tier 2 job title:  709


In [5]:
# 'Code' values whose group has exactly one non-null `jd` entry
jd_count = data.groupby('Code').count()
jd_count.loc[jd_count['jd'] == 1]

Unnamed: 0_level_0,Type,fun1_code,fun2_code,jobname_ch,jobname_en,id,coname,poname,jd,source
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AFB080,1,1,1,1,1,1,1,1,1,1
AFY660,1,1,1,1,1,1,1,1,1,1
AHR165,1,1,1,1,1,1,1,1,1,1
AMS900,1,1,1,1,1,1,1,1,1,1
APM080,1,1,1,1,1,1,1,1,1,1
APM900,1,1,1,1,1,1,1,1,1,1
AVO060,1,1,1,1,1,1,1,1,1,1
BNV,1,1,1,1,1,1,1,1,1,1
YHF020,1,1,1,1,1,1,1,1,1,1


### 1.4 -  JD List

In [6]:
# Collect the `jd` column into a plain Python list
jd_list = data["jd"].tolist()

## 2 - Cleaning

### 2.1 - Word segmentation

In [7]:
# Test base: `text_to_word_sequence`
def segment_text_list(text_list):
    """Split every text in ``text_list`` into a list of word tokens.

    Each text is passed to Keras' ``text_to_word_sequence`` with its default
    settings.

    Args:
        text_list: Iterable of raw text strings.

    Returns:
        list: One token list per input text, in input order.
    """
    text_list_segmented = []
    for text in text_list:
        text_list_segmented.append(text_to_word_sequence(text))

    return text_list_segmented

### 2.2 - Correct misspelled words

In [8]:
class CheckSpell():
    """Thin wrapper around ``SpellChecker`` for lists of tokenised texts."""

    def __init__(self):
        # One checker instance is created here and reused by both methods.
        self.spell = SpellChecker()

    def find_unknown(self, text_list_segmented):
        """Return, per document, the words the spell checker does not know.

        Args:
            text_list_segmented: Iterable of word lists (one per document).

        Returns:
            list[list[str]]: For each document, ``list(self.spell.unknown(...))``.
            pyspellchecker returns a set there, so the order inside each
            inner list is arbitrary.
        """
        unknown_words = [list(self.spell.unknown(word_list)) for word_list in text_list_segmented]
        return unknown_words

    def spell_correction(self, text_list_segmented):
        """Replace every word with the spell checker's suggested correction.

        Args:
            text_list_segmented: Iterable of word lists (one per document).

        Returns:
            list[list[str]]: Same shape as the input. A word for which the
            checker has no suggestion is kept unchanged.
        """
        # Per-call cache: each distinct word is looked up only once, because
        # the same words recur many times across job descriptions.
        cache = {}

        def correct(word):
            if word not in cache:
                suggestion = self.spell.correction(word)
                # Newer pyspellchecker releases return None when there is no
                # candidate; keep the original word so that later steps never
                # receive None in place of a string.
                cache[word] = word if suggestion is None else suggestion
            return cache[word]

        corrected = [[correct(word) for word in word_list] for word_list in text_list_segmented]
        return corrected

### 2.3 - Keep pure words

In [9]:
# clean the segmented text list, keep only pure english words        

def keep_pure_text(text_list_segmented):
    """Drop every token that is not made up solely of lowercase letters a-z.

    The inner lists are filtered IN PLACE and the same outer list is returned,
    so callers that kept a reference to the input see the filtered data too.

    Args:
        text_list_segmented: List of word lists (one per document).

    Returns:
        The same ``text_list_segmented`` object, with each inner list reduced
        to the tokens matching ``^[a-z]+$``; token order is preserved.
    """
    pure_word = re.compile(r'^[a-z]+$')

    for word_list in text_list_segmented:
        # Build the filtered list first, then swap the contents in with slice
        # assignment. Calling list.remove() inside a loop over the same list
        # shifts the remaining items and makes the loop skip the next one.
        word_list[:] = [word for word in word_list if pure_word.match(word)]

    return text_list_segmented

### 2.4 - Filter Stopwords

In [10]:
def filter_stopwords(text_list_segmented, stop_words):
    """Remove stop words from every word list.

    Args:
        text_list_segmented: Iterable of word lists (one per document).
        stop_words: Iterable of words to drop (e.g. the list returned by
            ``get_stopwords``).

    Returns:
        list[list[str]]: New lists, one per document, holding the words that
        are not stop words, in their original order. The input is not modified.
    """
    # Build the lookup set once: membership tests against a list cost
    # O(len(stop_words)) for every single token.
    stop_set = frozenset(stop_words)

    filtered_words = [[word for word in word_list if word not in stop_set]
                      for word_list in text_list_segmented]

    return filtered_words

### 2.5 - Stemming

In [11]:
def stemer_starter(text_list_segmented, in_stemmer='Porter'):
    """Stem every word of every word list.

    Args:
        text_list_segmented: Iterable of word lists (one per document).
        in_stemmer: 'Snowball' selects ``SnowballStemmer("english")``; any
            other value selects ``PorterStemmer()``.

    Returns:
        list[list[str]]: One list of stemmed words per document, same order.
    """
    stemmer = SnowballStemmer("english") if in_stemmer == 'Snowball' else PorterStemmer()

    result = []
    for word_list in text_list_segmented:
        result.append([stemmer.stem(word) for word in word_list])

    return result

In [12]:
def test_stem(text_list_segmented, in_stemmer):
    """Print the feature count of the input before and after stemming.

    Args:
        text_list_segmented: Iterable of word lists (one per document).
        in_stemmer: Stemmer name forwarded to ``stemer_starter``.
    """
    count_before = get_feature_count(text_list_segmented)
    print("\nFeature count before stem: ", count_before)

    stemmed = stemer_starter(text_list_segmented, in_stemmer)
    count_after = get_feature_count(stemmed)
    print("Feature count after stem: ", count_after)

In [13]:
# stem_test_text = [['caresses', 'flies', 'dies', 'mules', 'denied',
#                    'died', 'agreed', 'owned', 'humbled', 'sized',
#                    'meeting', 'stating', 'siezing', 'itemization',
#                    'sensational', 'traditional', 'reference', 'colonizer',
#                    'plotted'],
#                   ['try', 'tries', 'tring', 'apple', 'apples', 'watch', 'watches',
#                    'teeth', 'tooth', 'foot', 'feet']]

In [14]:
# print("***Porter Stemmer: ")
# print(stemer_starter(stem_test_text, 'Porter'))
# test_stem(stem_test_text, 'Porter')

# print("\n***Snowball Stemmer: ")
# print(stemer_starter(stem_test_text, 'Snowball'))
# test_stem(stem_test_text, 'Snowball')

## 3 - Tokenization

In [15]:
class TextTokenizer():
    """Convenience wrapper around a Keras ``Tokenizer``.

    ``texts`` holds whatever was passed at construction (possibly None) and
    ``tokenizer`` is the wrapped ``Tokenizer``, already fitted on ``texts``
    when they were provided.
    """

    def __init__(self, texts=None):
        self.texts = texts
        self.tokenizer = Tokenizer()
        if texts is not None:
            self.tokenizer.fit_on_texts(texts)

        # Attributes of interest on the fitted tokenizer:
        #   tokenizer.word_counts
        #   tokenizer.word_docs
        #   tokenizer.word_index
        #   tokenizer.document_count

    def train(self, train_text):
        """Fit the wrapped tokenizer on ``train_text`` (``self.texts`` is left untouched)."""
        self.tokenizer.fit_on_texts(train_text)

    def to_sequences(self, in_text=None):
        """Convert texts to sequences with the wrapped tokenizer.

        Args:
            in_text: Texts to convert. When None, the texts given at
                construction are used.

        Returns:
            The result of ``Tokenizer.texts_to_sequences``.

        Raises:
            ValueError: If ``in_text`` is None and no texts were given at
                construction.
        """
        if in_text is None:
            if self.texts is None:
                # Fail early with a clear message instead of handing None to Keras.
                raise ValueError("no texts to convert: pass in_text or construct "
                                 "TextTokenizer with texts")
            in_text = self.texts
        return self.tokenizer.texts_to_sequences(in_text)

    def _longest_sequence(self):
        """Return ``(index, length)`` of the first longest stored sequence."""
        lengths = [len(sequence) for sequence in self.to_sequences()]
        if not lengths:
            raise ValueError("no sequences to measure: the stored texts are empty")
        longest = max(lengths)
        # list.index returns the first position that holds the maximum.
        return lengths.index(longest), longest

    def max_length(self):
        """Print the index and the length of the longest stored sequence.

        Raises:
            ValueError: If there are no stored sequences.
        """
        idx, length = self._longest_sequence()
        print(idx, length)

    def max_length2(self):
        """Alternative entry point kept for compatibility; same output as ``max_length``.

        Raises:
            ValueError: If there are no stored sequences.
        """
        idx, length = self._longest_sequence()
        print(idx, length)
    

In [16]:
def get_feature_count(text_list_segmented):
    """Return the number of entries in the tokenizer's ``word_counts``.

    A fresh ``TextTokenizer`` is fitted on ``text_list_segmented`` and the
    size of its ``tokenizer.word_counts`` mapping is returned.
    """
    return len(TextTokenizer(text_list_segmented).tokenizer.word_counts)

## 4 - Evaluation

### 4.1 - Test on test data

In [17]:
# Hand-built fixture: items 1-3 are job-description snippets, item 4 probes the
# spell checker and item 5 probes the stemmer.
test_jd = ['Job DescriptionImportant Note: During the  application process, ensure your contact information (email and phone number)  is up to date. The invitation can be sent by both email and  text message. In order to receive text message invitations, your profile must  include a mobile phone number designated as “Personal Cell” or “Cellular” in  the contact information of your application.At Wells Fargo, we want to  satisfy our customers’ financial needs and help them succeed financially.  We’re looking for talented people who will put our customers at the center of  everything we do.',
           'This role will be based in Charlotte, but will consider other hub  locations.Required Qualifications10 + years of experience in compliance,  operational risk management(includes audit, legal, credit risk, market risk, or the management of a process or business with accountability for compliance or operational risk), or a combination of both; or 10 + years of IT systems  security, business process management or financial services industry  experience, of which 5 + years must include direct experience in compliance, or a combination of bothDesired  QualificationsAdvanced Microsoft Office skillsExcellent verbal, written, and interpersonal communication skillsStrong analytical skills. with high  attention to detail and accuracyAbility to interact, provide feedback/direction',
           'Min: $110,600 Mid: $158,000Street AddressNC-Charlotte: 301 S College St -  Charlotte, NCDisclaimerAll offers for employment with Wells Fargo, website: https://www.wellsfargo.com',
           'this sentnce has misspelled werds and combinedwords',
           'caresses care fly flies die dies died mules deny denied agree agreed own owned tradition traditional sensation sensational meet meeting plot plotted reference references'
          ]

**Test data explanation**  
Lines 1, 2 and 3 are actual JD snippets from the database; they contain run-together words, special characters and a website URL  
Line 4 is for testing spell checker  
Line 5 is for testing stemmer  

In [18]:
# Display the raw fixture texts.
test_jd

['Job DescriptionImportant Note: During the  application process, ensure your contact information (email and phone number)  is up to date. The invitation can be sent by both email and  text message. In order to receive text message invitations, your profile must  include a mobile phone number designated as “Personal Cell” or “Cellular” in  the contact information of your application.At Wells Fargo, we want to  satisfy our customers’ financial needs and help them succeed financially.  We’re looking for talented people who will put our customers at the center of  everything we do.',
 'This role will be based in Charlotte, but will consider other hub  locations.Required Qualifications10 + years of experience in compliance,  operational risk management(includes audit, legal, credit risk, market risk, or the management of a process or business with accountability for compliance or operational risk), or a combination of both; or 10 + years of IT systems  security, business process management

#### 4.1.1 - Word segmentation

In [19]:
# Word segmentation: split each fixture text into tokens,
# then print one token list per line for inspection.
test_jd_seg = segment_text_list(test_jd)

for seg in test_jd_seg:
    print(seg) 

['job', 'descriptionimportant', 'note', 'during', 'the', 'application', 'process', 'ensure', 'your', 'contact', 'information', 'email', 'and', 'phone', 'number', 'is', 'up', 'to', 'date', 'the', 'invitation', 'can', 'be', 'sent', 'by', 'both', 'email', 'and', 'text', 'message', 'in', 'order', 'to', 'receive', 'text', 'message', 'invitations', 'your', 'profile', 'must', 'include', 'a', 'mobile', 'phone', 'number', 'designated', 'as', '“personal', 'cell”', 'or', '“cellular”', 'in', 'the', 'contact', 'information', 'of', 'your', 'application', 'at', 'wells', 'fargo', 'we', 'want', 'to', 'satisfy', 'our', 'customers’', 'financial', 'needs', 'and', 'help', 'them', 'succeed', 'financially', 'we’re', 'looking', 'for', 'talented', 'people', 'who', 'will', 'put', 'our', 'customers', 'at', 'the', 'center', 'of', 'everything', 'we', 'do']
['this', 'role', 'will', 'be', 'based', 'in', 'charlotte', 'but', 'will', 'consider', 'other', 'hub', 'locations', 'required', 'qualifications10', 'years', 'of'

#### 4.1.2 - Correct misspelled word

In [20]:
# One CheckSpell instance, reused by the correction and unknown-word cells that follow.
cs = CheckSpell()

In [21]:
# Correct misspelled words: every token of the segmented fixture goes through
# cs.spell_correction, then the corrected token lists are printed one per line.

test_jd_seg_cs = cs.spell_correction(test_jd_seg)

for seg in test_jd_seg_cs:
    print(seg)

['job', 'descriptionimportant', 'note', 'during', 'the', 'application', 'process', 'ensure', 'your', 'contact', 'information', 'email', 'and', 'phone', 'number', 'is', 'up', 'to', 'date', 'the', 'invitation', 'can', 'be', 'sent', 'by', 'both', 'email', 'and', 'text', 'message', 'in', 'order', 'to', 'receive', 'text', 'message', 'invitations', 'your', 'profile', 'must', 'include', 'a', 'mobile', 'phone', 'number', 'designated', 'as', 'personal', 'cells', 'or', 'cellular', 'in', 'the', 'contact', 'information', 'of', 'your', 'application', 'at', 'wells', 'fargo', 'we', 'want', 'to', 'satisfy', 'our', 'customers', 'financial', 'needs', 'and', 'help', 'them', 'succeed', 'financially', 'were', 'looking', 'for', 'talented', 'people', 'who', 'will', 'put', 'our', 'customers', 'at', 'the', 'center', 'of', 'everything', 'we', 'do']
['this', 'role', 'will', 'be', 'based', 'in', 'charlotte', 'but', 'will', 'consider', 'other', 'hub', 'locations', 'required', 'qualifications', 'years', 'of', 'expe

In [22]:
# Compare the unknown words before and after correction. Some words are still
# not corrected: run-together (combined) words cannot be handled by the spell checker.
# NOTE: test_jd_seg_cs is modified in place by the later keep_pure_text cell, so
# re-running this cell after that one reports on the filtered lists instead.

print("***Unknown words before correction:" )
for word_list in cs.find_unknown(test_jd_seg):
    print(word_list)
    
print("\n***Unknown words after correction:" )
for word_list in cs.find_unknown(test_jd_seg_cs):
    print(word_list)

***Unknown words before correction:
['customers’', '“personal', 'cell”', '“cellular”', 'we’re', 'descriptionimportant']
['accuracyability', 'qualifications10', 'skillsstrong', 'skillsexcellent', 'bothdesired', 'qualificationsadvanced']
['ncdisclaimerall', 'www', 'website', '000street', 'https', 'addressnc', 'wellsfargo']
['sentnce', 'combinedwords', 'werds']
[]

***Unknown words after correction:
['descriptionimportant']
['accuracyability', 'skillsstrong', 'skillsexcellent', 'bothdesired', 'qualificationsadvanced']
['000street', 'ncdisclaimerall', 'wellsfargo']
['combinedwords']
[]


#### 4.1.3 - Keep pure words

In [23]:
# keep_pure_text edits the inner lists of test_jd_seg_cs in place and returns the
# same outer list, so test_jd_seg_cs_kp and test_jd_seg_cs refer to the same data.
test_jd_seg_cs_kp = keep_pure_text(test_jd_seg_cs)
# Workaround: the call is repeated because a single pass can leave non-alphabetic
# tokens behind if keep_pure_text removes items from a list while iterating over it.
# Two passes are not guaranteed to be enough in that case (a run of four or more
# consecutive rejected tokens still leaves one behind). The repeat is harmless
# when one pass already filters everything.
test_jd_seg_cs_kp = keep_pure_text(test_jd_seg_cs)


for seg in test_jd_seg_cs_kp:
    print(seg)

['job', 'descriptionimportant', 'note', 'during', 'the', 'application', 'process', 'ensure', 'your', 'contact', 'information', 'email', 'and', 'phone', 'number', 'is', 'up', 'to', 'date', 'the', 'invitation', 'can', 'be', 'sent', 'by', 'both', 'email', 'and', 'text', 'message', 'in', 'order', 'to', 'receive', 'text', 'message', 'invitations', 'your', 'profile', 'must', 'include', 'a', 'mobile', 'phone', 'number', 'designated', 'as', 'personal', 'cells', 'or', 'cellular', 'in', 'the', 'contact', 'information', 'of', 'your', 'application', 'at', 'wells', 'fargo', 'we', 'want', 'to', 'satisfy', 'our', 'customers', 'financial', 'needs', 'and', 'help', 'them', 'succeed', 'financially', 'were', 'looking', 'for', 'talented', 'people', 'who', 'will', 'put', 'our', 'customers', 'at', 'the', 'center', 'of', 'everything', 'we', 'do']
['this', 'role', 'will', 'be', 'based', 'in', 'charlotte', 'but', 'will', 'consider', 'other', 'hub', 'locations', 'required', 'qualifications', 'years', 'of', 'expe

#### 4.1.4 - Filter Stopwords

In [24]:
# Drop every token that appears in the `stopwords` list loaded from stop_words.txt,
# then print the remaining tokens of each document.
test_jd_seg_cs_kp_fs = filter_stopwords(test_jd_seg_cs_kp, stopwords)

for seg in test_jd_seg_cs_kp_fs:
    print(seg)

['job', 'descriptionimportant', 'note', 'application', 'process', 'ensure', 'contact', 'email', 'phone', 'invitation', 'email', 'text', 'message', 'receive', 'text', 'message', 'invitations', 'profile', 'mobile', 'phone', 'designated', 'personal', 'cells', 'cellular', 'contact', 'application', 'fargo', 'satisfy', 'customers', 'financial', 'succeed', 'financially', 'talented', 'people', 'customers', 'center']
['role', 'based', 'charlotte', 'hub', 'locations', 'required', 'qualifications', 'experience', 'compliance', 'operational', 'risk', 'management', 'includes', 'audit', 'legal', 'credit', 'risk', 'market', 'risk', 'management', 'process', 'business', 'accountability', 'compliance', 'operational', 'risk', 'combination', 'systems', 'security', 'business', 'process', 'management', 'financial', 'services', 'industry', 'experience', 'direct', 'experience', 'compliance', 'combination', 'bothdesired', 'qualificationsadvanced', 'microsoft', 'office', 'skillsexcellent', 'verbal', 'written', '

#### 4.1.5 - Stemming

In [25]:
# Stem the remaining tokens with the Porter stemmer and print each document's stems.
test_jd_seg_cs_kp_fs_s =  stemer_starter(test_jd_seg_cs_kp_fs, 'Porter')

for seg in test_jd_seg_cs_kp_fs_s:
    print(seg)

['job', 'descriptionimport', 'note', 'applic', 'process', 'ensur', 'contact', 'email', 'phone', 'invit', 'email', 'text', 'messag', 'receiv', 'text', 'messag', 'invit', 'profil', 'mobil', 'phone', 'design', 'person', 'cell', 'cellular', 'contact', 'applic', 'fargo', 'satisfi', 'custom', 'financi', 'succeed', 'financi', 'talent', 'peopl', 'custom', 'center']
['role', 'base', 'charlott', 'hub', 'locat', 'requir', 'qualif', 'experi', 'complianc', 'oper', 'risk', 'manag', 'includ', 'audit', 'legal', 'credit', 'risk', 'market', 'risk', 'manag', 'process', 'busi', 'account', 'complianc', 'oper', 'risk', 'combin', 'system', 'secur', 'busi', 'process', 'manag', 'financi', 'servic', 'industri', 'experi', 'direct', 'experi', 'complianc', 'combin', 'bothdesir', 'qualificationsadvanc', 'microsoft', 'offic', 'skillsexcel', 'verbal', 'written', 'interperson', 'commun', 'skillsstrong', 'analyt', 'skill', 'attent', 'detail', 'accuracy', 'interact', 'provid', 'feedback', 'direct']
['min', 'mid', 'addre

#### Result

In [26]:
# NOTE: the "before stem" figure is computed on test_jd_seg (the raw segmentation),
# while the "after stem" figure is computed on the fully processed tokens. The drop
# therefore reflects the whole cleaning pipeline (spell correction, pure-word filter,
# stop-word removal and stemming), not stemming alone.
print("\nFeature count before stem: ", get_feature_count(test_jd_seg))
print("Feature count after stem: ", get_feature_count(test_jd_seg_cs_kp_fs_s))


Feature count before stem:  172
Feature count after stem:  98


### 4.2 - Test on entire dataset

In [None]:
#TODO