In [1]:
import pandas as pd
import nltk
nltk.download('stopwords')
!python3 -m spacy download en
## Read file
file_name = 'Clusters_all_data/GL_clusters_final_cat.csv'
## Read file using pandas
df = pd.read_csv(file_name)
df=df[["Cleaned","Final Category"]]
df.columns=["Message","Intent"]

In [3]:
# load nltk's English stopwords as variable called 'stop' and don't find synonym of those words.
stop = nltk.corpus.stopwords.words('english')

## Tokenizing sentence into token for finding synonym.
def make_tokenizer(texts):
    from keras.preprocessing.text import Tokenizer
    t = Tokenizer()
    t.fit_on_texts(texts)
    return t

tokenizer = make_tokenizer(df['Message'])    ## Message is column name

X = tokenizer.texts_to_sequences(df['Message'])

from keras.preprocessing.sequence import pad_sequences
X = pad_sequences(X, 70)

Using TensorFlow backend.


In [4]:
## Dictionary of word index
index_word = {}
for word in tokenizer.word_index.keys():
    index_word[tokenizer.word_index[word]] = word
    
## word list
words = [value for key, value in index_word.items()]

## Function to find synonym of words 
import spacy
nlp = spacy.load('en', parser=False)
def check_lemma(t,w) :
    r = [d for d in t if (nlp(d.text)[0].lemma_ != nlp(w.text)[0].lemma_)]
    return r

def get_word_synonym(word):
  filtered_words = [w for w in word.vocab if (not w.lower_ in stop) and w.is_lower == word.is_lower and w.prob >= -15] ## (not w.lower_ in stop) and
  similarity = sorted(filtered_words, key=lambda w: word.similarity(w), reverse=True)
  filtered_similarity = check_lemma(similarity[:30], word)
  return filtered_similarity[:3]

## Synonym dictionary
synonym_dict = {}

for word in words:
        synonym_dict.update({word : tuple([w.lower_ for w in get_word_synonym(nlp.vocab[word])])})
        #print(word, " : ", [w.lower_ for w in get_word_synonym(nlp.vocab[word])])
        
## Only consider filtered synonym
import collections
value_occurrences = collections.Counter(synonym_dict.values())

filtered_synonym = {key: value for key, value in synonym_dict.items() if value_occurrences[value] == 1}

In [12]:
## Function for augmenting data by replacing words with synonym using spaCy
## This might not be best best method compared to data augmentation using language translation
import re
import random
sr = random.SystemRandom()
split_pattern = re.compile(r'\s+')
def data_augmentation(message, aug_range=1) :
    augmented_messages = []
    for j in range(0,aug_range) :
        new_message = ""
        for i in filter(None, split_pattern.split(message)) :
            new_message = new_message + " " + sr.choice(filtered_synonym.get(i,[i]))
        augmented_messages.append(new_message)
    return augmented_messages

## Dictionary for intent count
## Intent is column name
intent_count = df.Intent.value_counts().to_dict()

## Get max intent count to match other minority classes through data augmentation
import operator
max_intent_count = max(intent_count.items(), key=operator.itemgetter(1))[1]

In [15]:
## Loop to interate all messages
import numpy as np
import math
import tqdm
newdf = pd.DataFrame()
for intent, count in intent_count.items() :
    count_diff = max_intent_count - count    ## Difference to fill
    multiplication_count = math.ceil((count_diff)/count)  ## Multiplying a minority classes for multiplication_count times
    if (multiplication_count) :
        old_message_df = pd.DataFrame()
        new_message_df = pd.DataFrame()
        for message in tqdm.tqdm(df[df["Intent"] == intent]["Message"]) :
            ## Extracting existing minority class batch
            dummy1 = pd.DataFrame([message], columns=['Message'])
            dummy1["Intent"] = intent
            old_message_df = old_message_df.append(dummy1)
            
            ## Creating new augmented batch from existing minority class
            new_messages = data_augmentation(message,  multiplication_count)
            dummy2 = pd.DataFrame(new_messages, columns=['Message'])
            dummy2["Intent"] = intent
            new_message_df = new_message_df.append(dummy2)
        
        ## Select random data points from augmented data
        new_message_df=new_message_df.take(np.random.permutation(len(new_message_df))[:count_diff])
        
        ## Merge existing and augmented data points
        newdf = newdf.append([old_message_df,new_message_df])
    else :
        newdf = newdf.append(df[df["Intent"] == intent])

100%|██████████| 888/888 [00:02<00:00, 420.02it/s]
100%|██████████| 834/834 [00:02<00:00, 410.52it/s]
100%|██████████| 772/772 [00:01<00:00, 438.11it/s]
100%|██████████| 714/714 [00:01<00:00, 433.44it/s]
100%|██████████| 484/484 [00:01<00:00, 395.52it/s]
100%|██████████| 403/403 [00:01<00:00, 394.31it/s]
100%|██████████| 169/169 [00:00<00:00, 407.37it/s]
100%|██████████| 141/141 [00:00<00:00, 330.77it/s]
100%|██████████| 91/91 [00:00<00:00, 224.45it/s]
100%|██████████| 73/73 [00:00<00:00, 199.84it/s]


In [17]:
newdf

Unnamed: 0,Message,Intent
4569,claim for damage to gas station pump insured w...,Accidental
4570,claim for damage to parking deck insured struc...,Accidental
4571,claim for damage to commercial property stage ...,Accidental
4572,claim for damage to motel 6 insured struck ove...,Accidental
4573,commercial property claim for marketing materi...,Accidental
...,...,...
1,claim a f fire in customer's home.,Fire
0,claim arising from fire that broke out at ins...,Fire
0,auto liability claim for damage to fire hydra...,Fire
14,claim alleging fire in unit. allegations on o...,Fire


In [18]:
## Print count of all new data points
newdf.to_csv("GL Datasets/Upsampled_GL_AutoML.csv",index=False)