In [1]:
""" Import Necessary Modules 
"""
import pandas as pd
import numpy as np
import nltk  # Natural Language Toolkit: used below for word tokenization and stemming
# Download the 'punkt' tokenizer data at import time; presumably this is what
# nltk.word_tokenize (used further down) needs in order to run.
# NOTE(review): newer NLTK releases reportedly look for 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt')
from nltk.stem.lancaster import LancasterStemmer  # stemmer class used to reduce words to root forms
# Single shared stemmer instance; reused for both the vocabulary and the
# per-pattern bag-of-words rows further down.
stemmer = LancasterStemmer()

""" Reading the dataset and processing the data
"""
# Load the chatbot dataset; change the path to the location of your file.
df = pd.read_excel('/content/Omdena Chatbot Dataset.xlsx')

# Forward-fill the Tag column so that every pattern row carries the tag of the
# closest tagged row above it. The fill is assigned back explicitly instead of
# relying on an in-place fill on the selected column (which is deprecated via
# ``fillna(method=...)`` and does not reliably write back into ``df`` under
# pandas copy-on-write).
df['Tag'] = df['Tag'].ffill()

""" Cleaning the pattern column
"""
#A Function for cleaning the file (The Pattern column in it)
def text_clean(df):
    """Normalise the ``Pattern`` column of *df* in place.

    The column is lower-cased, then three regex substitutions are applied in
    order, each replacing its match with a single space:

    1. the punctuation characters ``( ) ! ?``
    2. any bracketed span ``[...]`` (brackets and their content)
    3. every remaining character that is not ``a-z`` or ``0-9``

    The function mutates *df* and returns ``None``.
    """
    substitutions = (
        r'[()!?]',      # selected punctuation
        r'\[.*?\]',     # bracketed spans, non-greedy
        r'[^a-z0-9]',   # anything that is not a lowercase letter or digit
    )

    cleaned = df['Pattern'].str.lower()
    for regex_pattern in substitutions:
        cleaned = cleaned.str.replace(regex_pattern, ' ', regex=True)

    df['Pattern'] = cleaned

# Normalise the Pattern column in place before tokenizing it.
text_clean(df)

""" Extracting features from the data
"""
# docs_X   : one token list per pattern sentence
# ChatVocab: every token from every pattern, flattened (duplicates kept here)
# docs_y   : the tag of each row, aligned by position with docs_X
# labels   : sorted unique tags, used later for one-hot encoding

# Tokenize each pattern exactly once, then flatten the token lists.
docs_X = [nltk.word_tokenize(sentence) for sentence in df.Pattern]
ChatVocab = [token for token_list in docs_X for token in token_list]

# Collect the tag of every row in order.
docs_y = list(df.Tag)

# Unique tags in sorted order define the output columns.
labels = sorted(set(docs_y))

"""Creating root words for our Chatbot Vocabulary
"""
# Reduce every collected token to its stem with the shared nltk stemmer,
# then keep the unique stems in sorted order as the chatbot vocabulary.
ChatVocab = [stemmer.stem(lowered) for lowered in map(str.lower, ChatVocab)]
ChatVocabulary = list(set(ChatVocab))
ChatVocabulary.sort()


""" Data encoding using bag of words and one hot encoding
    Treat the unique words in ChatVocabulary as columns. Stem the words in docs_X variable and represent them as rows by 
    putting a numeric number ' 1 ' where the word in row is inline with the word on column and a " 0 " otherwise.
"""
# Build the training matrices: exactly one bag-of-words row and one one-hot
# label row per pattern in docs_X.
#
# Fix: the rows used to be appended inside the inner vocabulary loop, which
# added each document len(ChatVocabulary) times and inflated both matrices.
# Rows are now appended once per document.

# First, set up blank variables to hold training and output data.
train_matrix_list = []
output_matrix_list = []

# Second, create a list of zeros with length = number of labels; it is the
# template copied for every one-hot output row.
output_empty_label = [0] * len(labels)

# Map each label to its column index once instead of searching the list
# for every document.
label_column = {label: column for column, label in enumerate(labels)}

# Third, encode every tokenized pattern together with its tag.
for each_list, label in zip(docs_X, docs_y):
    # Stem the pattern's tokens the same way the vocabulary was stemmed; a set
    # gives constant-time membership checks against the vocabulary.
    stemmed_words = {stemmer.stem(word.lower()) for word in each_list}

    # Bag of words: 1 where the vocabulary word occurs in this pattern, else 0.
    bow = [1 if vocab_word in stemmed_words else 0 for vocab_word in ChatVocabulary]

    # One-hot output row: copy the zero template and flag this pattern's tag.
    output_column = output_empty_label[:]
    output_column[label_column[label]] = 1

    train_matrix_list.append(bow)  # building training matrix
    output_matrix_list.append(output_column)  # building output / predicted class

# Fourth, convert the train_matrix_list and output_matrix_list into numpy arrays.
training_data = np.array(train_matrix_list)
output_data = np.array(output_matrix_list)

# Visualizing the output: expected shapes are
# (number of patterns, vocabulary size) and (number of patterns, number of labels).
print(training_data.shape)
print(output_data.shape)


ModuleNotFoundError: No module named 'pandas'