In [45]:
import pandas as pd
import random
import string
import numpy as np
import pickle
import os
import json
# import matplotlib.pyplot as plt

In [11]:
from gensim.models import Word2Vec

In [2]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer
from html.parser import HTMLParser
from nltk.corpus import stopwords 
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [3]:
from enchant import Dict
# Enchant dictionary for the "en_US" tag; lemmatize_and_stem calls dictionary.check()
# on candidate lemmas/stems to decide which form of a word to keep.
dictionary = Dict("en_US")

In [4]:
# Module-level NLP helpers shared by lemmatize_and_stem and clean_text.

# Porter stemmer: the fallback normaliser in lemmatize_and_stem.
porter = PorterStemmer()
# WordNet lemmatizer: tried first in lemmatize_and_stem.
lemmatizer = WordNetLemmatizer()
# De-duplicated English stop words; clean_text drops any word found in this list.
stop_words = list(set(stopwords.words('english')))
# Parser instance used by clean_text only for h.unescape().
# NOTE(review): HTMLParser.unescape() was deprecated in Python 3.4 and removed in 3.9;
# html.unescape() is the supported replacement.
h = HTMLParser()

In [5]:
def lemmatize_and_stem(word):
    """
    Normalise a single word, using the dictionary to pick between lemma, stem and original.

    - If lemmatization changes the word: return the lemma when the dictionary accepts it,
      otherwise return the Porter stem of the original word.
    - If lemmatization leaves the word unchanged: return the Porter stem when the
      dictionary accepts it, otherwise return the word as given.
    """
    lemma = lemmatizer.lemmatize(word)

    if lemma == word:
        # The lemmatizer had nothing to do; only accept the stem if it is a known word.
        stem = porter.stem(word)
        return stem if dictionary.check(stem) else word

    # The lemmatizer changed the word; keep the lemma only if it is a known word.
    if dictionary.check(lemma):
        return lemma
    return porter.stem(word)

In [7]:
def clean_text(text, broken_sentences=False):
    """
    Clean one paragraph or a list of paragraphs, sentence by sentence.

    Parameters
    ----------
    text : str or list of str
        A paragraph, or a list of paragraphs. Anything that is not a list is wrapped
        into a one-element list.
    broken_sentences : bool
        If False (default), the result holds one cleaned string per input paragraph,
        with its cleaned sentences joined by ". ".
        If True, the result is a flat list holding every cleaned sentence separately.

    Returns
    -------
    list of str

    Steps applied to every sentence
    1. html unescape
    2. Remove Punctuations
    3. Lemmatization and Stemming
    4. Remove StopWords
    """
    # HTMLParser.unescape() was deprecated in Python 3.4 and removed in 3.9, so calling it
    # on an HTMLParser instance raises AttributeError there; html.unescape() is the
    # supported equivalent.
    from html import unescape

    paragraphs = text if isinstance(text, list) else [text]
    # Sets give constant-time membership tests instead of scanning a list/string per item.
    stop_set = set(stop_words)
    punctuation = set(string.punctuation)

    cleaned_text = []
    for paragraph in paragraphs:
        cleaned_sentences = []
        for sentence in sent_tokenize(paragraph):
            unescaped = unescape(sentence)
            without_punctuation = "".join(c for c in unescaped if c not in punctuation)
            # Splitting on a single space can yield empty strings; skip them.
            normalised = [
                lemmatize_and_stem(word)
                for word in without_punctuation.split(" ")
                if len(word) > 0
            ]
            kept = [word for word in normalised if word not in stop_set]
            cleaned_sentences.append(" ".join(kept))

        if broken_sentences:
            cleaned_text.extend(cleaned_sentences)
        else:
            cleaned_text.append(". ".join(cleaned_sentences))
    return cleaned_text

In [10]:
# Read the pickled unlabelled and labelled frames from <cwd>/glassdoor_problem/data/
data_dir = os.path.join(os.getcwd(), 'glassdoor_problem/data/')
unlabelled_data, labelled_data = (
    pd.read_pickle(os.path.join(data_dir, pickle_name))
    for pickle_name in ('unlabelled_data.pkl', 'labelled_data.pkl')
)

In [12]:
# Pool every pros / cons / labelled sentence, shuffle the pool in place, then clean it
# into a flat list of individual sentences.
all_lines = [
    *unlabelled_data.pros.tolist(),
    *unlabelled_data.cons.tolist(),
    *labelled_data.pp_sent.tolist(),
]
_ = random.shuffle(all_lines)
cleaned_sentences = clean_text(all_lines, broken_sentences=True)



In [13]:
# Helper functions to tokenize sentences and train a Word2Vec model; the trained model is saved below for use during training
def tokenize_sentences_for_model(sentence):
    """
    Split one sentence into a list of word tokens with nltk's word_tokenize.
    """
    tokens = word_tokenize(sentence)
    return tokens

def form_word2vec_model(all_sentences, tokenized, window=5, min_count=1, workers=4, sg=1):
    """
    Train and return a Word2Vec model.

    all_sentences : list of sentences
    tokenized : if True, all_sentences is handed to Word2Vec as given; if False, every
                sentence is first word-tokenized with tokenize_sentences_for_model
    window, min_count, workers, sg : forwarded unchanged to Word2Vec
    """
    if not tokenized:
        all_sentences = list(map(tokenize_sentences_for_model, all_sentences))
    return Word2Vec(all_sentences, window=window, min_count=min_count, workers=workers, sg=sg)

# cleaned_sentences holds plain strings, so tokenized=False lets the helper tokenize them.
wordvec_model = form_word2vec_model(cleaned_sentences, tokenized=False)
# Persist the trained model under <cwd>/glassdoor_problem/wordvecmodel.
_ = wordvec_model.save(os.path.join(os.getcwd(), 'glassdoor_problem/wordvecmodel'))

In [47]:
# Display the label -> index mapping.
# NOTE(review): label_map is only assigned in the "PROCESS ONE LABEL DATA" cell further
# down, so on a fresh kernel that cell has to run before this one.
label_map

{'salary_benefits': 0,
 'wlb_working_conditions': 1,
 'tech_product': 2,
 'culture_team': 3,
 'Job Security/Advancement': 4,
 'haras_discrim_sexism': 5,
 'management': 6,
 'business_vision_competitors': 7}

In [19]:
# PROCESS ONE LABEL DATA
# Keep the rows carrying exactly one label and unwrap that label from its list.
# .assign() returns a new frame, so no column is written onto a filtered slice of
# labelled_data (the chained-assignment pattern behind pandas' SettingWithCopyWarning).
one_label_data = labelled_data[labelled_data.label.apply(lambda x: len(x)==1)].assign(
    label=lambda frame: frame.label.apply(lambda x: x[0])
)
# value_counts() orders the labels from most to least frequent.
all_labels = one_label_data.label.value_counts().index.tolist()

# An index for each label. This will be used later
label_map = {label: index for index, label in enumerate(all_labels)}

with open(os.path.join(os.getcwd(), 'glassdoor_problem/label_map.pickle'), 'wb') as f:
    pickle.dump(label_map, f)

# A dictionary with one label sentences for fetching it directly while creating training data
# (label -> list of cleaned sentences, one entry per labelled row).
basic_preprocess_labeldata = {
    label: clean_text(one_label_data[one_label_data.label==label].pp_sent.tolist())
    for label in all_labels
}



In [20]:
# PROCESS MULTILABEL DATA
# Keep the rows carrying more than one label and drop exact duplicate rows.
# Lists are unhashable, so the labels are turned into tuples for drop_duplicates() and
# back into lists afterwards. Each step goes through .assign(), which returns a new
# frame instead of writing a column onto a filtered slice of labelled_data.
multilabel_data = (
    labelled_data[labelled_data.label.apply(lambda x: len(x)>1)]
    .assign(label=lambda frame: frame.label.apply(lambda x: tuple(x)))
    .drop_duplicates()
    .assign(label=lambda frame: frame.label.apply(lambda x: list(x)))
)

In [52]:
# Create the training, validation and test data from the processed unilabel and multilabel sentences above.
# 500 unilabel sentences are held out for each of validation and test, to keep the training set around 80% of the entire data.
def create_training_data_multilabel():
    """
    Build shuffled train / validation / test lists of (sentence, target_vector) pairs.

    Reads the module-level `basic_preprocess_labeldata`, `label_map` and `multilabel_data`,
    and cleans the multilabel sentences with `clean_text`.

    Every target is a float vector with one slot per entry of `label_map`; the slot of
    each label attached to the sentence holds 8 ("eight times scaling"), all others 0.

    Returns
    -------
    (all_train_data, all_validation_data, all_test_data) : tuple of three lists
        Unilabel pairs: after shuffling, the last 500 go to test, the 500 before them to
        validation and the remainder to training. Multilabel pairs are split the same
        way with 30 / 30. Each combined list is shuffled again before being returned.
    """
    n_labels = len(label_map)
    label_scale = 8           # the "eight times scaling" applied to every active label
    unilabel_holdout = 500    # size of each of the unilabel validation / test sets
    multilabel_holdout = 30   # size of each of the multilabel validation / test sets

    # unilabel data
    unilabel_data = []
    for label, label_sentences in basic_preprocess_labeldata.items():
        label_index = label_map[label]
        # set() drops duplicate sentences within a label
        for sen in set(label_sentences):
            target = np.zeros(n_labels)
            target[label_index] = label_scale
            unilabel_data.append((sen, target))

    random.shuffle(unilabel_data)
    test_unilabel = unilabel_data[-unilabel_holdout:]
    validation_unilabel = unilabel_data[-2 * unilabel_holdout:-unilabel_holdout]
    train_unilabel = unilabel_data[:-2 * unilabel_holdout]

    # multilabel data
    cleaned_multilabel_messages = clean_text(multilabel_data.pp_sent.tolist())
    multilabeldata = []
    # clean_text returns one cleaned string per input sentence, so both sequences line up.
    for message, labels in zip(cleaned_multilabel_messages, multilabel_data.label):
        target = np.zeros(n_labels)
        for label in labels:
            target[label_map[label]] += label_scale
        # Append once per message, after ALL of its labels have been accumulated.
        # Appending inside the label loop would also emit the partially built vectors,
        # i.e. the same sentence with an incomplete label set.
        multilabeldata.append((message, target))

    random.shuffle(multilabeldata)
    test_multilabel = multilabeldata[-multilabel_holdout:]
    validation_multilabel = multilabeldata[-2 * multilabel_holdout:-multilabel_holdout]
    train_multilabel = multilabeldata[:-2 * multilabel_holdout]

    all_train_data = train_unilabel + train_multilabel
    random.shuffle(all_train_data)

    all_validation_data = validation_unilabel + validation_multilabel
    random.shuffle(all_validation_data)

    all_test_data = test_unilabel + test_multilabel
    random.shuffle(all_test_data)

    return all_train_data, all_validation_data, all_test_data

# Build the train / validation / test lists of (sentence, label vector) pairs.
trd, vad, ted = create_training_data_multilabel()



In [32]:
# Pickle each split into <cwd>/glassdoor_problem/
data_dir = os.path.join(os.getcwd(), 'glassdoor_problem')
for pickle_name, split_pairs in (
    ('training_data.pkl', trd),
    ('validation_data.pkl', vad),
    ('test_data.pkl', ted),
):
    with open(os.path.join(data_dir, pickle_name), 'wb') as f:
        pickle.dump(split_pairs, f)

In [53]:
# Example: the first training pair, a (cleaned sentence, label vector) tuple
trd[0]

('benefit decent compared job market', array([0., 0., 0., 0., 0., 0., 0., 8.]))