## Kiambu Kenya Chatbot for Students
### Code for Task - 1 - Data cleaning and processing
Change the file path by pointing it to the location of your data


In [1]:
""" Import necessary file modules
"""
import json #json module for reading json files
import numpy as np #tool for creating data arrays 
import nltk #Natural language processing tool
from nltk.stem.lancaster import LancasterStemmer #model for stemming words
# One Lancaster stemmer instance, used below to stem both the vocabulary and each pattern
stemmer = LancasterStemmer()

""" Read the json data and storing it in a variable named data
"""
# The encoding is given explicitly: JSON files are UTF-8, while open() would
# otherwise use the platform default, which may mis-decode non-ASCII text.
with open("job_intents.json", encoding="utf-8") as file: #change the file path to point to the location of your file
    data = json.load(file)

""" STEP - 1: Feature Extraction from the Json file
"""
# Containers filled while walking the intents:
#   ChatVocab - every token seen in any pattern (stemmed further down)
#   labels    - each distinct tag, in order of first appearance
#   docs_X    - one token list per pattern
#   docs_y    - the tag of the pattern at the same position in docs_X
ChatVocab, labels, docs_X, docs_y = [], [], [], []

# Tokenize each pattern and record it together with the tag of its intent.
for intent in data['intents']:
    for pattern in intent['patterns']:
        tokens = nltk.word_tokenize(pattern)
        ChatVocab += tokens
        docs_X.append(tokens)
        docs_y.append(intent['tag'])

    # Register the tag as an output label the first time it is seen.
    tag = intent['tag']
    if tag not in labels:
        labels.append(tag)

# Lower-case and stem every token except the question mark, then keep the
# distinct stems in sorted order as the vocabulary.
ChatVocab = [stemmer.stem(token.lower()) for token in ChatVocab if token != "?"]
ChatVocabulary = sorted(set(ChatVocab))


""" STEP - 2: Data encoding using bag of words and one hot encoding
    Treat the unique words in ChatVocabulary as columns. Stem the words in docs_X variable and represent them as rows.
    Put a numeric number ' 1 ' where the word in row is inline with the word on column and a " 0 " otherwise.
"""
# Row accumulators: one bag-of-words row and one one-hot label row per pattern.
train_matrix_list = []
output_matrix_list = []

# All-zero template with one slot per label; copied for every pattern.
output_empty_label = [0] * len(labels)

# Column position of each label, computed once instead of calling
# labels.index() for every pattern. labels holds distinct tags, so the
# mapping matches what labels.index() would return.
label_column = {label: position for position, label in enumerate(labels)}

# docs_X and docs_y are filled in lockstep, so they can be walked together.
for each_list, tag in zip(docs_X, docs_y):
    # A set makes each vocabulary membership test O(1) instead of a list scan.
    stemmed_words = {stemmer.stem(word.lower()) for word in each_list if word != "?"}

    # Bag of words: 1 where the vocabulary stem occurs in this pattern, else 0.
    bow = [1 if vocab_word in stemmed_words else 0 for vocab_word in ChatVocabulary]

    # One-hot label: copy the zero template and switch on this pattern's tag.
    output_column = output_empty_label[:]
    output_column[label_column[tag]] = 1

    train_matrix_list.append(bow) # build training matrix
    output_matrix_list.append(output_column) # build output / predicted class

# Convert the accumulated rows into numpy arrays.
training_data = np.array(train_matrix_list)
output_data = np.array(output_matrix_list)

### Visualizing Training and Output data

In [2]:
training_data.shape

(133, 101)

In [3]:
output_data.shape

(133, 15)

In [4]:
training_data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [5]:
output_data

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])