# Data Preprocessing

## 1 - Basic Analysis

### 1.1 - Import Packages

In [1]:
import re
import pandas as pd
from spellchecker import SpellChecker
# import tensorflow as tf
# from tensorflow import keras

from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.text import Tokenizer

from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer

### 1.2 - Reading Data

In [2]:
# Read the training data from a local path.
# NOTE(review): project_path is an absolute, machine-specific directory, so the
# notebook will not run on another machine without editing it — consider a
# configurable data directory.

project_path = "/Users/apple/Documents/Xinzhi/Data/"
# Plain string concatenation: this relies on project_path ending with "/".
data = pd.read_csv(project_path + "meta_training_data_jd_0401.csv")

# Preview the first three rows.
# NOTE(review): the recorded output shows an "Unnamed: 0" column, presumably an
# index that was written into the CSV — confirm whether it should be read as the index.
data.head(3)

Unnamed: 0,Type,Code,fun1_code,fun2_code,jobname_ch,jobname_en,id,coname,poname,jd,source
0,FUN,BRO,BRO,BRO,商业研究,Business Research,279011,Aldevron,"Strategic Account Manager, Nucleic Acids Busin...",\n\nWe are currently seeking a Strategic Accou...,monster
1,FUN,BRO,BRO,BRO,商业研究,Business Research,287601,Synergy America,IT Business Analyst,\n\nWe are looking for an IT Business Analyst ...,monster
2,FUN,BRO,BRO,BRO,商业研究,Business Research,340361,A3 Smart Home,Sr. Business Analyst,"\n\nWe call our club's vision, mission, values...",monster


In [3]:
# Load the stop-word list

def get_stopwords(words_file):
    """Read a stop-word list from a text file that holds one word per line.

    Parameters
    ----------
    words_file : str or path-like
        Path of the stop-word file.

    Returns
    -------
    list of str
        The stop words in file order, with surrounding whitespace removed.
        Blank lines are skipped so the list never contains an empty string.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Explicit encoding: the platform default differs between machines.
    with open(words_file, 'r', encoding='utf-8') as file:
        # Strip all surrounding whitespace (not only '\n'); an entry with a
        # trailing space could never match a token.
        stripped = (line.strip() for line in file)
        return [word for word in stripped if word]


# Load the stop words; the relative path is resolved against the current working directory.
stopwords = get_stopwords("stop_words.txt")

### 1.3 - Data Exploration

In [4]:
# Number of distinct labels at each level: `fun1_code` is reported as tier 1, `Code` as tier 2.
print("The number of tier 1 job title: ", len(data["fun1_code"].unique()))
print("The number of tier 2 job title: ", len(data["Code"].unique()))

The number of tier 1 job title:  97
The number of tier 2 job title:  709


In [5]:
# 'Code' classes that have exactly one (non-null) job description
jd_count = data.groupby("Code").count()
jd_count.loc[jd_count["jd"] == 1]

Unnamed: 0_level_0,Type,fun1_code,fun2_code,jobname_ch,jobname_en,id,coname,poname,jd,source
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AFB080,1,1,1,1,1,1,1,1,1,1
AFY660,1,1,1,1,1,1,1,1,1,1
AHR165,1,1,1,1,1,1,1,1,1,1
AMS900,1,1,1,1,1,1,1,1,1,1
APM080,1,1,1,1,1,1,1,1,1,1
APM900,1,1,1,1,1,1,1,1,1,1
AVO060,1,1,1,1,1,1,1,1,1,1
BNV,1,1,1,1,1,1,1,1,1,1
YHF020,1,1,1,1,1,1,1,1,1,1


### 1.4 - Test Data

In [6]:
# Collect every job description into a plain Python list
jd_list = data["jd"].tolist()

In [None]:
# Two sample job descriptions (index labels 20698 and 20798) for ad-hoc testing

jd1, jd2 = data["jd"][20698], data["jd"][20798]
test_jd = [jd1, jd2]

## 2 - Tokenization

### 2.1 - Word segmentation

In [7]:
# Baseline word segmentation built on Keras' `text_to_word_sequence`
def segment_text_list(text_list):
    """Split every text into a list of word tokens.

    Parameters
    ----------
    text_list : iterable of str
        Raw texts to segment.

    Returns
    -------
    list
        One token list per input text, in input order, as produced by
        ``text_to_word_sequence`` with its default settings.
    """
    return list(map(text_to_word_sequence, text_list))

### 2.2 - Keras Tokenizer

In [8]:
class TextTokenizer():
    """Thin wrapper around a Keras ``Tokenizer`` bound to an optional corpus."""

    def __init__(self, texts=None):
        """Create the tokenizer and, when ``texts`` is given, fit it right away.

        Parameters
        ----------
        texts : list, optional
            Corpus to fit on. It is also kept as the default input of
            ``to_sequences``.
        """
        # Default corpus used by `to_sequences()` when it is called without arguments.
        self.texts = texts
        # Underlying Keras tokenizer. Once fitted it exposes `word_counts`,
        # `word_docs`, `word_index` and `document_count`.
        self.tokenizer = Tokenizer()
        if texts is not None:
            self.tokenizer.fit_on_texts(texts)

    def train(self, train_text):
        """Update the tokenizer's vocabulary with ``train_text``.

        The stored corpus (``self.texts``) is left untouched, so a later
        ``to_sequences()`` call without arguments still uses the texts given
        to the constructor.
        """
        self.tokenizer.fit_on_texts(train_text)

    def to_sequences(self, in_text=None):
        """Convert texts to sequences with the fitted tokenizer.

        Parameters
        ----------
        in_text : list, optional
            Texts to convert. Defaults to the corpus given to the constructor.

        Returns
        -------
        list
            The result of ``Tokenizer.texts_to_sequences`` for the chosen texts.

        Raises
        ------
        TypeError
            If neither ``in_text`` nor a stored corpus is available.
        """
        if in_text is None:
            in_text = self.texts
        if in_text is None:
            # Same exception type Keras raises when handed None, but with a
            # message that names the actual problem.
            raise TypeError("no texts to convert: pass in_text or construct "
                            "TextTokenizer with texts")
        return self.tokenizer.texts_to_sequences(in_text)

    def _longest_sequence(self):
        """Return ``(index, length)`` of the first longest sequence of the stored corpus."""
        sequences = self.to_sequences()
        if not sequences:
            raise ValueError("no sequences to measure: the stored corpus is empty")
        lengths = [len(sequence) for sequence in sequences]
        longest = max(lengths)
        # list.index returns the first occurrence, i.e. the earliest longest text.
        return lengths.index(longest), longest

    def max_length(self):
        """Print the index and length of the longest sequence of the stored corpus.

        Raises
        ------
        ValueError
            If the stored corpus is empty.
        """
        idx, length = self._longest_sequence()
        print(idx, length)

    def max_length2(self):
        """Alternative entry point with the same output as ``max_length``.

        Both methods share one implementation, so they also agree when every
        sequence is empty (prints ``0 0``) and avoid a list search per element.

        Raises
        ------
        ValueError
            If the stored corpus is empty.
        """
        idx, length = self._longest_sequence()
        print(idx, length)
    

In [9]:
def get_feature_count(text_list_segmented):
    """Count the distinct features (vocabulary entries) of a segmented corpus.

    Parameters
    ----------
    text_list_segmented : list
        Corpus handed unchanged to ``TextTokenizer``.

    Returns
    -------
    int
        Number of entries in the fitted tokenizer's ``word_counts``.
    """
    fitted = TextTokenizer(text_list_segmented)
    return len(fitted.tokenizer.word_counts)

## 2 - Cleaning

In [None]:
# Small hand-written corpus for trying out the cleaning steps: it mixes a
# hyphenated word, tokens with leading underscores, bare numbers and a
# digit-prefixed token.
test_2 = [
    "Data science is an inter-disciplinary field to extract knowledge and insights from data.",
    "Data science is _related to data mining and big data 666 555teststring.",
    "Data science unifies statistics, __data analysis, and machine learning.",
]

### 2.1 - Correct misspelled words

In [10]:
def find_unknown(text_list_segmented):
    """List, per text, the words reported by ``SpellChecker.unknown``.

    Parameters
    ----------
    text_list_segmented : iterable
        Token lists, one per text.

    Returns
    -------
    list of list
        One list per input text holding whatever ``unknown`` returned for it;
        the order inside each list follows the iteration order of that result.
    """
    checker = SpellChecker()
    unknown_per_text = []
    for word_list in text_list_segmented:
        unknown_per_text.append(list(checker.unknown(word_list)))

    return unknown_per_text

In [11]:
def spell_correction(text_list_segmented):
    """Replace every word with the spell checker's suggested correction.

    Parameters
    ----------
    text_list_segmented : iterable
        Token lists, one per text.

    Returns
    -------
    list of list
        New token lists with the same shape as the input. A word for which
        the checker offers no correction is kept unchanged.
    """
    spell = SpellChecker()
    # `correction` is costly and corpora repeat words heavily, so each
    # distinct word is looked up only once per call.
    cache = {}

    def correct(word):
        if word not in cache:
            suggestion = spell.correction(word)
            # Recent pyspellchecker releases return None when there is no
            # candidate; keep the original token instead of leaking None
            # into the token lists.
            cache[word] = word if suggestion is None else suggestion
        return cache[word]

    return [[correct(word) for word in word_list] for word_list in text_list_segmented]

### 2.2 - Keep pure words

In [12]:
# Clean the segmented text list: keep only pure lowercase English words

def keep_pure_text(text_list_segmented):
    """Drop every token that is not made up solely of the letters a-z.

    Tokens containing digits, underscores, punctuation or uppercase letters
    are removed. Filtering builds a fresh list per text instead of removing
    items while iterating, which used to skip the token right after each
    removed one.

    Parameters
    ----------
    text_list_segmented : iterable of list of str
        Token lists, one per text. Each inner list is filtered IN PLACE.

    Returns
    -------
    The same ``text_list_segmented`` object, now holding the filtered lists.
    """
    # fullmatch (rather than ^...$) also rejects a token with a trailing newline.
    pure_word = re.compile(r'[a-z]+')
    for word_list in text_list_segmented:
        # Slice assignment keeps the caller's list objects, preserving the
        # original in-place contract.
        word_list[:] = [word for word in word_list if pure_word.fullmatch(word)]

    return text_list_segmented

### 2.3 - Remove Stopwords

In [13]:
def filter_stopwords(text_list_segmented, stop_words):
    """Remove stop words from every token list.

    Parameters
    ----------
    text_list_segmented : iterable of iterable of str
        Token lists, one per text.
    stop_words : iterable of str
        Words to drop. Any iterable works, including one-shot iterators.

    Returns
    -------
    list of list of str
        New token lists without the stop words; the input is not modified.
    """
    # Materialise once: membership tests become O(1) instead of a scan of the
    # whole stop-word list per token, and a one-shot iterable is not exhausted
    # by the first lookups.
    stop_set = frozenset(stop_words)
    filtered_words = [[word for word in word_list if word not in stop_set]
                      for word_list in text_list_segmented]

    return filtered_words

### 2.4 - Stemming

In [14]:
def stemer_starter(stemmer, text_list_segmented):
    """Stem every word of every token list with the given stemmer.

    Parameters
    ----------
    stemmer : object
        Anything exposing a ``stem(word)`` method.
    text_list_segmented : iterable
        Token lists, one per text.

    Returns
    -------
    list of list
        New lists holding ``stemmer.stem(word)`` for each input word, in the
        same order and shape as the input.
    """
    stemmed_texts = []
    for word_list in text_list_segmented:
        stemmed_texts.append([stemmer.stem(token) for token in word_list])

    return stemmed_texts

In [15]:
def test_stem(stemmer, text_list_segmented):
    print("Before stem: ", get_feature_count(text_list_segmented))
    stemmed = stemer_starter(stemmer, text_list_segmented)
    print("After stem: ", get_feature_count(stemmed))

In [16]:
# Sample word lists for comparing stemmers. The entries are kept exactly as
# written because the recorded cell outputs depend on them.
stem_test_text = [
    # inflected and derived forms
    [
        'caresses', 'flies', 'dies', 'mules', 'denied',
        'died', 'agreed', 'owned', 'humbled', 'sized',
        'meeting', 'stating', 'siezing', 'itemization',
        'sensational', 'traditional', 'reference', 'colonizer',
        'plotted',
    ],
    # regular and irregular singular/plural pairs
    [
        'try', 'tries', 'tring', 'apple', 'apples', 'watch', 'watches',
        'teeth', 'tooth', 'foot', 'feet',
    ],
]

In [17]:
# The two NLTK stemmers compared in the cells below
ps = PorterStemmer()
ss = SnowballStemmer(language="english")

In [18]:
# Porter stemmer applied to the sample word lists
print(stemer_starter(ps, stem_test_text))

[['caress', 'fli', 'die', 'mule', 'deni', 'die', 'agre', 'own', 'humbl', 'size', 'meet', 'state', 'siez', 'item', 'sensat', 'tradit', 'refer', 'colon', 'plot'], ['tri', 'tri', 'tring', 'appl', 'appl', 'watch', 'watch', 'teeth', 'tooth', 'foot', 'feet']]


In [19]:
# Snowball (English) stemmer applied to the same sample word lists
print(stemer_starter(ss, stem_test_text))

[['caress', 'fli', 'die', 'mule', 'deni', 'die', 'agre', 'own', 'humbl', 'size', 'meet', 'state', 'siez', 'item', 'sensat', 'tradit', 'refer', 'colon', 'plot'], ['tri', 'tri', 'tring', 'appl', 'appl', 'watch', 'watch', 'teeth', 'tooth', 'foot', 'feet']]


In [20]:
# Feature count before and after stemming with the Snowball stemmer
test_stem(ss, stem_test_text)

Before stem:  30
After stem:  26


In [21]:
# Feature count before and after stemming with the Porter stemmer
test_stem(ps, stem_test_text)

Before stem:  30
After stem:  26
