# Assignment 5: Text Preprocessing & Train/Test Data Split

## Import Libraries

In [1]:
import numpy as np
import random
import nltk
import pickle
from Assignment5_Intent2 import Intents
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

## Text Augmentation

In [2]:
def random_deletion_words(words, p):
    """Randomly drop tokens from ``words``, each with probability ``p``.

    Inputs holding zero or one token are handed back untouched (the very
    same list object). If every token ends up dropped, a single randomly
    picked token from the input is returned, so the result is never empty
    for a non-empty input.
    """
    # Nothing sensible can be removed from an empty or single-token input
    if len(words) <= 1:
        return words

    # A token survives whenever its uniform draw exceeds p
    survivors = [token for token in words if random.uniform(0, 1) > p]
    if survivors:
        return survivors

    # Everything was dropped: fall back to one randomly chosen token
    fallback_index = random.randint(0, len(words) - 1)
    return [words[fallback_index]]

## Text Preprocessing

In [3]:
# Initialize lists
words = []      # every lemmatized token seen (deduplicated at the end)
classes = []    # intent labels, in first-seen order
documents = []  # (token_list, intent_label) pairs, originals + augmented

# Count Pattern Distribution (raw pattern count per intent, before augmentation)
intent_with_pattern_cnt = [len(IntentInfo["patterns"]) for IntentInfo in Intents]
max_cnt = max(intent_with_pattern_cnt)

# Loop-invariant NLP resources: build them once instead of once per pattern
# (stopwords.words() re-reads the corpus on every call).
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

for IntentInfo in Intents:
    intent = IntentInfo["intent"]
    # Pre-processed token lists of this intent, used as augmentation seeds
    extracted_documents = []
    aug_documents = []
    for pattern in IntentInfo['patterns']:
        ## Text Preprocessing
        # Lowercasing & White-Space Removal
        pattern = pattern.lower().strip()
        # Punctuation Removal (every non-alphanumeric char becomes a space)
        alphanum_pattern = ''.join([char if char.isalnum() else ' ' for char in pattern])
        # Tokenization
        tokenized_pattern = word_tokenize(alphanum_pattern)
        # Stopword Removal
        filtered_pattern = [word for word in tokenized_pattern if word not in stop_words]
        # Lemmatization
        lemmatized_pattern = [lemmatizer.lemmatize(word) for word in filtered_pattern]

        # Adding words, documents & classes
        words.extend(lemmatized_pattern)
        documents.append((lemmatized_pattern, intent))
        if intent not in classes:
            classes.append(intent)

        # Store current pre-process text for text augmentation
        extracted_documents.append(lemmatized_pattern)

    ## Text Augmentation
    # Number of augmented samples needed to match the largest intent
    data_aug_cnt = max_cnt - len(IntentInfo["patterns"])
    # Perform augmentation through WORD RANDOM DELETION.
    # An intent without any pattern has nothing to sample from, so it is
    # skipped instead of letting random.choice raise IndexError.
    if data_aug_cnt > 0 and extracted_documents:
        for _ in range(data_aug_cnt):
            deleted_pattern = random.choice(extracted_documents)
            new_pattern = random_deletion_words(deleted_pattern, 0.5)
            aug_documents.append((new_pattern, intent))
    # Add to documents
    documents.extend(aug_documents)

## Text Preprocessing
# Duplicate Removal; sorted so the vocabulary (and therefore the BoW column
# order) does not depend on per-process string hashing.
words = sorted(set(words))

In [4]:
# Display the number of patterns per intent (counted before augmentation)
intent_with_pattern_cnt

[468, 1492]

## Generate Training & Testing Data

In [5]:
# Initialize all feature & outcome data.
# np.zeros (not np.empty) so every cell is defined and only the hits are set.
X_data = np.zeros((len(documents), len(words)))
Y_data = np.zeros((len(documents), len(classes)))

# Column lookups built once: O(1) per token instead of scanning the whole
# vocabulary (and the token list) for every document.
word_index = {word: col for col, word in enumerate(words)}
class_index = {label: col for col, label in enumerate(classes)}

for i, doc in enumerate(documents):
    # Extract pattern tokens and intent label
    pattern, label = doc

    # Feature data --> Bag of Words (binary presence per vocabulary word)
    for token in set(pattern):
        col = word_index.get(token)
        if col is not None:
            X_data[i, col] = 1

    # Outcome data --> One Hot Encoding
    Y_data[i, class_index[label]] = 1

# Shuffle + Train/Test data split.
# X and Y must be permuted with the SAME indices, otherwise features and
# labels no longer belong together. Calling random.shuffle on each array
# separately breaks that pairing, and random.shuffle on a 2-D NumPy array
# additionally overwrites rows (its swap works on row views). The shuffle is
# therefore left to train_test_split, which permutes both arrays together.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data, Y_data, test_size=0.2, random_state=1, shuffle=True
)

## Save Data

In [6]:
# Bundle the four train/test splits into a single payload
data = dict(
    X_train=X_train,
    X_test=X_test,
    Y_train=Y_train,
    Y_test=Y_test,
)

# Persist the splits, the vocabulary and the class labels, one pickle file each
for path, payload in (
    ('train_test_data.pickle', data),
    ('words.pickle', words),
    ('classes.pickle', classes),
):
    with open(path, 'wb') as f:
        pickle.dump(payload, f)