Import needed packages

In [154]:
import re
import csv
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

Function that maps Penn Treebank tags to WordNet POS tags for lemmatizing

In [120]:
# Lookup table: first letter of a Penn part-of-speech tag -> WordNet
# part-of-speech code. Only these five letters are mapped; every other
# letter is treated as a noun by convert_tag().
part = dict(N='n', V='v', J='a', S='s', R='r')


def convert_tag(penn_tag):
    '''
    Translate the **first letter** of a Penn part-of-speech tag into the
    matching WordNet tag.

    penn_tag : the leading character of a Penn tag (e.g. 'N', 'V', 'J').
    Returns the WordNet code stored in `part`, or 'n' (noun) when the
    letter has no entry there.
    '''
    # dict.get supplies the noun fallback for unmapped letters
    return part.get(penn_tag, 'n')

Function that does all the pre-processing on the given data:
- set all characters to lowercase
- apply word tokenization
- remove punctuation from the list of words
- filter out the stop words
- annotate the filtered words with part-of-speech tags
- lemmatize the words using those tags
- return the resulting list of lemmatized words

In [121]:
# Clean one input sentence and return its lemmatized content words as a list
def cleanData(senten):
    '''
    Pre-process a single sentence.

    Steps: lower-case the text, word-tokenize it, drop punctuation tokens,
    drop English stop words, part-of-speech tag what is left and lemmatize
    each word using its tag.

    senten : the sentence to clean (string).
    Returns a list of lemmatized word strings; an input with no remaining
    words yields an empty list.
    '''
    ## Normalise case, then word-tokenize the sentence
    tokens = word_tokenize(senten.lower())

    ## Remove punctuation after word tokenizing.
    # A token is dropped when *every* character in it is punctuation, so
    # multi-character tokens such as "...", "--", "``" and "''" are removed
    # as well, while words that merely contain punctuation (e.g.
    # "co-creation") are kept.
    punct = set(string.punctuation)
    real_words = [tok for tok in tokens
                  if not all(ch in punct for ch in tok)]

    ## Remove all common stop words in the English language
    stop_words = set(stopwords.words("english"))
    filtered_words = [w for w in real_words if w not in stop_words]

    ## Annotate words (get part-of-speech tags for filtered_words)
    words_tags = nltk.pos_tag(filtered_words)

    ## Lemmatize each word; convert_tag() receives the first letter of its tag
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w, pos=convert_tag(t[0]))
            for (w, t) in words_tags]


## Here is the Main body of the file

### Pre-processing

- First, let's load the data

In [171]:
# Read every CSV row up front: the context manager closes the file handle
# as soon as reading finishes, and keeping the rows in a list (rather than
# a one-shot reader) lets later cells iterate data_set more than once.
# newline='' is the file mode the csv module documentation asks for.
with open('NBB_intents.csv', newline='') as intents_file:
    data_set = list(csv.reader(intents_file, delimiter=',', quoting=csv.QUOTE_NONE, skipinitialspace=True))
# create new list for data after re-shaping
reshaped_dataset = []

- Clean and reshape input data

In [172]:
# Read all question samples and reshape each CSV row
# (column 0 = question text, column 1 = intent) into
# ["intent", ["cleaned question words"]].
for row in data_set:
    # csv.reader yields an empty list for a blank line; skip those instead
    # of failing with IndexError on row[1]
    if not row:
        continue
    reshaped_dataset.append([row[1], cleanData(row[0])])
    

- View sample of re-shaped data 

In [173]:
# Sanity check: show the first reshaped record ([intent, cleaned words]),
# then only its list of cleaned words.
print("record 1 : ",reshaped_dataset[0])
print("record 1, part 2 : ",reshaped_dataset[0][1])

record 1 :  ['about', ['tell', 'co-creation', 'room']]
record 1, part 2 :  ['tell', 'co-creation', 'room']


> Now, let's do a simple classification of each sentence

## Simple questions classifier 

In this step we build a very naive classifier that labels each question as direct or indirect.
- A `direct` question contains a word matching the `intent` name.
- An `indirect` question does not.

> Hint: `direct` questions are labelled `d`, while `indirect` questions are labelled `i`.

In [190]:
def classifyQuestion(ques):
    '''
    Label a question as direct ('d') or indirect ('i').

    ques : an [intent, words] pair, where ques[0] is the intent name
           (string) and ques[1] is an iterable of word strings, e.g. the
           list returned by cleanData().
    Returns 'd' when the lower-cased, lemmatized intent occurs inside at
    least one of the words, otherwise 'i' (including when there are no
    words at all).
    '''
    # partition input question to (intent, [body])
    ques_intent = ques[0].lower()
    ques_body = ques[1]

    # The intent is lemmatized with the lemmatizer's default part of speech;
    # the original author noted an issue with POS-tagging a single word.
    ques_intent_lemms = WordNetLemmatizer().lemmatize(ques_intent)

    # Literal substring containment instead of a regex assembled from the
    # intent text: characters such as '+', '.', '(' or '?' in an intent name
    # are matched as-is rather than being interpreted as regex syntax.
    # NOTE: words are compared as given -- presumably already lower-cased by
    # the pre-processing step; confirm for other callers.
    if any(ques_intent_lemms in word for word in ques_body):
        return 'd'  # direct
    return 'i'  # indirect

- Apply the classifier and update the data with the resulting label

In [200]:
# create new empty data set to carry labelled data
labelled_dataste = []

# Loop over the reshaped [intent, words] records built in the pre-processing
# step (the previously referenced `new_dataset` is not defined anywhere in
# this notebook, so the cell failed on a fresh kernel).
for record in reshaped_dataset:
    # Only label records that do not carry a label yet, so re-running this
    # cell does not append a second label to the same record.
    if len(record) < 3:
        record.append(classifyQuestion(record))

    # Fill new labelled set
    labelled_dataste.append(record)

print (labelled_dataste[0])

['about', ['tell', 'co-creation', 'room'], 'i']


# Section for fast testing

In [159]:
# Scratch check: wrap a sample word in a wildcard search pattern and
# display the resulting pattern string as the cell output.
ques_intent_lemms = 'dog'
reg_search_str = '.*(' + ques_intent_lemms + ').*'
reg_search_str

'.*(dog).*'