#Intent Classifier

In [0]:
#All the required imports.

import pandas as pd
import numpy as np
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [75]:
#Code for uploading the CSV file on Google Colab.
##when asked for please upload the 'TakeHome_task_data.csv' file here.

from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving TakeHome_task_data.csv to TakeHome_task_data (2).csv
User uploaded file "TakeHome_task_data.csv" with length 181157 bytes


##Data Preperation

###Raw Data

In [0]:
#Reading the Dataset file

I_data = pd.read_csv('TakeHome_task_data.csv')

In [77]:
I_data.head(n =10)

Unnamed: 0,data,label
0,"{'id': 'KG0OUA', 'data': 'Good morning', 'mess...",location
1,"{'id': 'L9DC9H', 'data': 'Location', 'message_...",whoAreYou
2,"{'id': 'ZQR6R5', 'data': 'hi', 'message_order'...",whoAreYou
3,"{'id': 'RH0M4E', 'data': 'Hi', 'message_order'...",greeting
4,"{'id': 'WLVX8I', 'data': 'Hello', 'message_ord...",greeting
5,"{'id': 'M2IRWL', 'data': 'sir', 'message_order...",whoAreYou
6,"{'id': '5MXRNL', 'data': 'K', 'message_order':...",whoAreYou
7,"{'id': '609W8Y', 'data': 'On thanks', 'message...",dontMeetRequirements
8,"{'id': 'T3V6IY', 'data': 'Hii', 'message_order...",dontMeetRequirements
9,"{'id': 'GI07N1', 'data': 'Sir I dnt have two w...",whoAreYou


In [78]:
I_data.shape #gives tuple of 'rows x columns' for 'I_data' dataframe.

(2000, 2)

In [79]:
I_data.describe()

Unnamed: 0,data,label
count,2000,2000
unique,2000,5
top,"{'id': 'MO79JO', 'data': 'but I have no bike',...",whoAreYou
freq,1,627


In [0]:
dataset = I_data

In [0]:
#Extracting Important Features from the given Dataset, that is TakeHome_task_data.csv

#creating a new feature 'id'! with all the values from the hash mapping.
dataset['id'] = dataset['data'].str.findall(r'id\': \'([A-Za-z0-9]+)\'').apply(np.squeeze)

#creating a new feature 'text'! with all the values from the hash mapping.
dataset['text'] = dataset['data'].str.findall(r'\'data\': \'(.+)\', \'message_order\'').apply(np.squeeze)
problem_indices = dataset[dataset['text'].apply(len) == 0].index.values
dataset.loc[problem_indices, 'text'] = dataset.loc[problem_indices, 'data'].str.findall(r'"(.+)"').apply(np.squeeze)

#creating a new feature 'message_order'! with all the values from the hash mapping.
dataset['message_order'] = dataset['data'].str.findall(r'\'message_order\': ([\d]+)').apply(np.squeeze)

#creating a new feature 'comments'! with all the values from the hash mapping.
dataset['comments'] = dataset['data'].str.findall(r'\'comments\': \[(.+)\]').apply(np.squeeze)

###Processed Data

In [82]:
#a view at the new altered dataframe called 'dataset'!!

dataset.head(n=20)

Unnamed: 0,data,label,id,text,message_order,comments
0,"{'id': 'KG0OUA', 'data': 'Good morning', 'mess...",location,KG0OUA,Good morning,2,''
1,"{'id': 'L9DC9H', 'data': 'Location', 'message_...",whoAreYou,L9DC9H,Location,5,''
2,"{'id': 'ZQR6R5', 'data': 'hi', 'message_order'...",whoAreYou,ZQR6R5,hi,5,''
3,"{'id': 'RH0M4E', 'data': 'Hi', 'message_order'...",greeting,RH0M4E,Hi,4,''
4,"{'id': 'WLVX8I', 'data': 'Hello', 'message_ord...",greeting,WLVX8I,Hello,1,''
5,"{'id': 'M2IRWL', 'data': 'sir', 'message_order...",whoAreYou,M2IRWL,sir,3,''
6,"{'id': '5MXRNL', 'data': 'K', 'message_order':...",whoAreYou,5MXRNL,K,2,''
7,"{'id': '609W8Y', 'data': 'On thanks', 'message...",dontMeetRequirements,609W8Y,On thanks,2,''
8,"{'id': 'T3V6IY', 'data': 'Hii', 'message_order...",dontMeetRequirements,T3V6IY,Hii,3,''
9,"{'id': 'GI07N1', 'data': 'Sir I dnt have two w...",whoAreYou,GI07N1,Sir I dnt have two wheeler,2,''


In [83]:
dataset.tail(n=10)

Unnamed: 0,data,label,id,text,message_order,comments
1990,"{'id': 'U0Z4ZF', 'data': 'Kk', 'message_order'...",notInterested,U0Z4ZF,Kk,1,''
1991,"{'id': 'CK9IU9', 'data': 'Okay', 'message_orde...",whoAreYou,CK9IU9,Okay,4,''
1992,"{'id': 'MDY1C7', 'data': 'Okay', 'message_orde...",whoAreYou,MDY1C7,Okay,0,''
1993,"{'id': 'N7Z662', 'data': 'Where work', 'messag...",greeting,N7Z662,Where work,2,''
1994,"{'id': 'O6TX8S', 'data': 'Mere paas khud ka na...",notInterested,O6TX8S,Mere paas khud ka nai hai,2,''
1995,"{'id': '2RG46Y', 'data': 'OK by', 'message_ord...",dontMeetRequirements,2RG46Y,OK by,4,''
1996,"{'id': 'HCTZ3F', 'data': 'LL', 'message_order'...",notInterested,HCTZ3F,LL,5,''
1997,"{'id': 'ITXTOW', 'data': 'Ok sir', 'message_or...",whoAreYou,ITXTOW,Ok sir,2,''
1998,"{'id': 'IOKVYD', 'data': 'Hello', 'message_ord...",greeting,IOKVYD,Hello,4,''
1999,"{'id': 'UGBYIK', 'data': 'No bike', 'message_o...",greeting,UGBYIK,No bike,0,''


##Note:- 
###The following features are not important for our model -
###1. 'id'
all the values are unique and hence won't contribute towards our model's learning.

###2. 'message_order'
couldn't really draw any conclusion over its importance in my classifier, and thus, din't use it.

###3. 'comments'
all the values are the same over the entire dataset and thus render useless for our model's learning.

In [84]:
dataset.shape  ##gives tuple of 'rows x columns' for 'dataset' dataframe.

(2000, 6)

In [85]:
dataset.describe()

Unnamed: 0,data,label,id,text,message_order,comments
count,2000,2000,2000,2000,2000,2000
unique,2000,5,1999,871,6,1
top,"{'id': 'MO79JO', 'data': 'but I have no bike',...",whoAreYou,SJZI46,Ok,0,''
freq,1,627,2,244,347,2000


In [86]:
#Finding the unique values of Intent out of the given values.

intent = dataset['label']
unique_intent = set(intent)
list(unique_intent)

['dontMeetRequirements', 'notInterested', 'location', 'whoAreYou', 'greeting']

In [87]:
sentences = dataset['text']
unique_sentences = sentences.unique()
print(len(unique_sentences))
list(unique_sentences)

871


['Good morning',
 'Location',
 'hi',
 'Hi',
 'Hello',
 'sir',
 'K',
 'On thanks',
 'Hii',
 'Sir I dnt have two wheeler',
 'Hlo',
 '\xf0\x9f\x93\xb7 Good mrng',
 'Ok',
 'Gm',
 'Yes but no bikes sorry',
 'Where it is',
 'Job placement company',
 'were',
 'Hiii',
 'No',
 'Thank you',
 'Where is the location?',
 'Nahi',
 'Marathi',
 'Iska location kya hoga',
 'Poda ...',
 'Ldhu',
 'I am not interested',
 'How r u',
 'Hi..',
 'Hiio',
 'Ok thanks \xf0\x9f\x99\x8f\xf0\x9f\x8f\xbb',
 'Learning licence hai',
 'Okkk',
 'Bhaiii',
 'Hey',
 'Msg kyu karte hai',
 'Hi sir',
 'Hi ..',
 'How are u',
 'I have learning licence',
 '\xf0\x9f\x93\xb7 G\xf0\x9f\x98\x8a\xf0\x9f\x98\x8aD Morning',
 'okay',
 'k',
 "Don't msg",
 'I have learning license',
 'Lerner license only',
 'OK bye dear t c good night',
 'Thanks',
 'Which location',
 'ohhhh sry',
 'Kidar hai job',
 'Hallo',
 'Area',
 'I have LR licence',
 'Hello Sir / Madam, sorry but I don\xe2\x80\x99t have bike \xf0\x9f\x8f\x8d',
 'No one',
 'Hy',
 'Hai 

##Building an Intent Classification Model

In [0]:
#write the 'dataset' dataframe to a CSV file for further use.
#note: it will be in the same file as the IPython notebook currently working on.

dataset.to_csv("Featured_data.csv")

In [0]:
#extract the data from the CSV file and will store it in the data frame.

def get_data():
    df = pd.read_csv("Featured_data.csv")
    return df

In [0]:
#we will prepare the data to feed it to the algorithm.

def data_prepare():
    col = ['label', 'text']
    
    #we get the complete data in ‘y’ and set the column for that data using y[col].
    y = get_data()
    y = y[col]
    
    #‘pd.notnull’ is being used for checking the data in the text column is null or not; if it’ s null the entire row is removed!
    y = y[pd.notnull(y['text'])]  #This step is important for getting the high quality clean data. Because, If we have good data we will have good results.
    y.columns = ['label', 'text']
    
    #We will make a new column ‘category_id’ which will give a number to classes. 
    #Example, say for greetings it will be 0, NotInterested 1 and so on.
    y['category_id'] = y['label'].factorize()[0]
    category_id_df = y[['label', 'category_id']].drop_duplicates().sort_values('category_id')  #removing duplicates.
    category_to_id = dict(category_id_df.values)
    id_to_category = dict(category_id_df[['category_id', 'label']].values)  #This will add the column in the dataframe.
    return y

###Using ML technique: Naive Bayes

In [0]:
#I have used the 'Multinomial Naive_Bayes' algorithm for prediction because I find it easy to implement!
#Shorcoming: since the dataset given is 'limited' Naive_bayes approach does the job effectively.
            #but for large datasets, a deep learning model will produce the desired result.
#A short overview : Here, I have divided my data into test data and train data and then feed that data into the model.
def naive_algo():
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
    df = data_prepare()
    features = tfidf.fit_transform(df.text).toarray()
    labels = df.category_id
    features.shape
    X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], random_state = 0)
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    clf = MultinomialNB().fit(X_train_tfidf, y_train)
    return clf, count_vect

In [0]:
#This function will give us the final prediction.

def predict(Input):
    clf, count_vect = naive_algo()
    intent = clf.predict(count_vect.transform([Input]))
    intent = str(intent).strip("['']")
    print(intent)
    return intent

###Predictions

In [93]:
#At last, we enter the Input and pass it to the predict function.

test_input = raw_input("Enter your Input: ")  #raw_input for python2.

Enter your Input: who are you


In [94]:
x = predict(test_input)

greeting


In [0]:
intent = str(x).strip("['']")


In [96]:
print("The predicted Intent is: ")
print(intent)

The predicted Intent is: 
greeting


In [97]:
test_input_1 = raw_input("Try something different: ")

Try something different: kk


In [98]:
x_1 = predict(test_input_1)

notInterested


In [0]:
intent_1 = str(x_1).strip("['']")

In [100]:
print("The predicted Intent is: ")
print(intent_1)

The predicted Intent is: 
notInterested


#Insites of the model

###1. 
I have used the Multinomial Naive Bayes Classifier ( a simple ML algorithm for classification problem ) because of its easy implementation. And, also the given dataset in hand is relatively small to let one build a dense network model to compute preditions.

###2. 
There are warious data pre-processing methods to impact accuracy of the model, like, Tokenization, Stemming, Vectorization, etc. but to my evaluation they had no such significant impact on the model accuracy and behaviour.

###3. 
The multinomial naive bayes model applys the multi-label classification approach for the poblem. The OneVsRest strategy can be used for multi-label learning, where a classifier is used to predict multiple labels for instance. Naive Bayes supports multi-class, but I wanted a multi-label scenario, therefore, I wraped Naive Bayes in the OneVsRestClassifier.

####OneVsRest multi-label strategy
The Multi-label algorithm accepts a binary mask over multiple labels. The result for each prediction will be an array of 0s and 1s marking which class labels apply to each row input sample.

###Note:- 
I tried an approach of stemming the 'text' feature values, using 'LancesterStemmer' to better my data. But a quick reasearch made me to conclude that this has no significant ups on the model accuracy. 

#References :-

###1. https://chatbotslife.com/know-your-intent-sota-results-in-intent-classification-8e1ca47f364c

###2. https://www.datacamp.com/community/tutorials/stemming-lemmatization-python

###3. https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

#---------------------------------------------Thank You------------------------------------