# Building classifier models for Chatbot

## 1. Importing the required libraries

In [1]:
# Data manipulation libraries
import re
import time
import pickle
import numpy as np
import pandas as pd

# Text Processing libraries
import nltk
import gensim
import gensim.downloader as api
from nltk.corpus import stopwords

# Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import pairwise_distances_argmin

## 2. Reading the Data

In [2]:
# Loading the dialogues dataset
dialogues = pd.read_csv("dialogues.tsv",sep="\t")

In [3]:
# Loading the posts dataset
posts = pd.read_csv("tagged_posts.tsv",sep="\t")

In [4]:
# Inspecting the dialogues dataset
dialogues.head()

Unnamed: 0,text,tag
0,Okay -- you're gonna need to learn how to lie.,dialogue
1,I'm kidding. You know how sometimes you just ...,dialogue
2,Like my fear of wearing pastels?,dialogue
3,I figured you'd get to the good stuff eventually.,dialogue
4,Thank God! If I had to hear one more story ab...,dialogue


In [5]:
# Inspecting the posts dataset
posts.head()

Unnamed: 0,post_id,title,tag
0,9,Calculate age in C#,c#
1,16,Filling a DataSet or DataTable from a LINQ que...,c#
2,39,Reliable timer in a console application,c#
3,42,Best way to allow plugins for a PHP application,php
4,59,"How do I get a distinct, ordered list of names...",c#


In [6]:
# Checking the no of entries in each of the dataset
print("Number of Posts:",len(posts))
print("Number of Dialogues:",len(dialogues))

Number of Posts: 2171575
Number of Dialogues: 218609


## 3. Creating training data for intent classifier

In [7]:
# Fetching the first 0.2 million rows from either of the dataset
texts  =  list(dialogues[:200000].text.values) + list(posts[:200000].title.values)
labels =  ['dialogue']*200000 + ['stackoverflow']*200000

In [8]:
# Creating a dataframe with texts and labels as columns
data = pd.DataFrame({'text':texts,'target':labels})

### Creating a function which performs basic text preprocessing tasks

**The tasks are as follows :-**

a) Replace the characterts such as [/(){}\[\]\|@,;] with space.  
b) Keep only the alphanumeric ones (and some special ones).  
c) Removing the english stopwords as these do not serve any purpose.  

In [9]:
def text_prepare(text):
    """Performs tokenization and simple preprocessing."""
    
    replace_by_space_re = re.compile('[/(){}\[\]\|@,;]')
    bad_symbols_re = re.compile('[^0-9a-z #+_]')
    stopwords_set = set(stopwords.words('english'))

    text = text.lower()
    text = replace_by_space_re.sub(' ', text)
    text = bad_symbols_re.sub('', text)
    text = ' '.join([x for x in text.split() if x and x not in stopwords_set])

    return text.strip()

In [10]:
# Performing data cleaning task on each of the text present in the data 
current_time = time.time()
data['text'] = data['text'].apply(lambda x : text_prepare(x))
print('Time Taken (in sec) to process the above operation : {}'.format(time.time() - current_time))

Time Taken (in sec) to process the above operation : 298.77839374542236


In [11]:
# Splitting the dataset into training and testing for model building and prediction validation purposes

X_train, X_test, y_train, y_test = train_test_split(data['text'],data['target'],test_size = .1 , random_state=0)

# Printing the sizes of training and testing dataset
print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))

Train size = 360000, test size = 40000


## 4. Creating an Intent classifier 

### Intent classifier classifies the question in two categories namely :- Dialogue or Stack Overflow

In [14]:
# Making a new directory named as resources to keep our models and vectorizers
!mkdir resources

### Creating a function which returns the TF-IDF matrix for each of the training and testing datasets.

TF-IDF Matrix stands for Term Frequency and the Inverse Document frequency matrix . TF is described as the no of occurences of a word in a document  whereas IDF is the inverse of the document frequency where document frequency is the no of times the word has occured across all the documents divided by the no of documents.TF-IDF is the product of TF and IDF reflecting each term's presumed importance.

In [15]:
def tfidf_features(X_train, X_test, vectorizer_path):
    """Performs TF-IDF transformation and dumps the model."""
    tfv = TfidfVectorizer(dtype=np.float32, min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
    
    X_train = tfv.fit_transform(X_train)
    X_test = tfv.transform(X_test)
    
    pickle.dump(tfv,vectorizer_path)
    return X_train, X_test

In [17]:
# Splitting the TF-IDF matrix into training and testing for model building and prediction validation purposes
current_time = time.time()
X_train_tfidf, X_test_tfidf = tfidf_features(X_train, X_test, open("resources/tfidf.pkl",'wb'))
print('Time Taken (in sec) to process the above operation : {}'.format(time.time() - current_time))

Time Taken (in sec) to process the above operation : 37.037238359451294


In [18]:
# Building a Logistic Regression model to classify the intent of the question (Dialogue or StackOverflow)
import warnings
warnings.filterwarnings('ignore')
intent_recognizer = LogisticRegression(C=10,random_state=0)
intent_recognizer.fit(X_train_tfidf,y_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
# Checking test accuracy
y_test_pred = intent_recognizer.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test accuracy = {}'.format(test_accuracy))

Test accuracy = 0.989625


In [20]:
y_test_pred

array(['stackoverflow', 'dialogue', 'stackoverflow', ..., 'dialogue',
       'stackoverflow', 'stackoverflow'], dtype=object)

In [21]:
# Dumping the intent recognizer model in the resources directory for future use
pickle.dump(intent_recognizer, open("resources/intent_clf.pkl" , 'wb'))

## 5. Creating a Programming Language Classifier

### Classifier classifies the question into one of the below programming language based on the question's text.

**c# ,php , c_cpp , python , ruby , java , javascript , vb , r , swift**

In [22]:
# Identifying unique programming languages
posts['tag'].unique()

array(['c#', 'php', 'c_cpp', 'python', 'ruby', 'java', 'javascript', 'vb',
       'r', 'swift'], dtype=object)

In [23]:
# Performing data cleaning on each of the title present in the data
X = posts['title'].apply(lambda x : text_prepare(x))
y = posts['tag'].values

In [24]:
# Splitting the dataset into training and testing for model building and prediction validation purposes

X_train_plc, X_test_plc, y_train_plc, y_test_plc = train_test_split(X, y, test_size=0.2, random_state=0)

# Printing the sizes of training and testing dataset
print('Train size = {}, test size = {}'.format(len(X_train_plc), len(X_test_plc)))

Train size = 1737260, test size = 434315


In [25]:
# Loading the stored TF_IDF_vectorizer object to build a new TF-IDF matrix
current_time = time.time()
vectorizer = pickle.load(open("resources/tfidf.pkl", 'rb'))
X_train_plc_tfidf, X_test_plc_tfidf = vectorizer.transform(X_train_plc), vectorizer.transform(X_test_plc)
print('Time Taken (in sec) to process the above operation : {}'.format(time.time() - current_time))

Time Taken (in sec) to process the above operation : 98.4446370601654


In [26]:
# Building a Logistic Regression model to classify the question into one of the programming language(for StackOverflow only)
current_time = time.time()
tag_classifier = OneVsRestClassifier(LogisticRegression(C=5 , random_state=0 , max_iter = 100))
tag_classifier.fit(X_train_plc_tfidf,y_train_plc)
print('Time Taken (in sec) to process the above operation : {}'.format(time.time() - current_time))

Time Taken (in sec) to process the above operation : 566.328616142273


In [28]:
# Checking test accuracy
y_test_plc_pred = tag_classifier.predict(X_test_plc_tfidf)
test_accuracy = accuracy_score(y_test_plc, y_test_plc_pred)
print('Test accuracy = {}'.format(test_accuracy))

Test accuracy = 0.8043079331821373


In [30]:
# Dumping the tag classifier model in the resources directory for future use
pickle.dump(tag_classifier, open("resources/tag_clf.pkl", 'wb'))

# 6. Building and Storing title database embeddings/word vectors

Here we are intending to build word vectors corresponding to each of the titles using the [pre-trained word vectors](https://code.google.com/archive/p/word2vec/) from Google.  

Word Vectors are basically representation of words in a high dimensional space here the no of dimensions being 300 for a word.  

Word Vectors are useful since it enables us to know how similar two sentences are.

In [31]:
# Loading the google's pre trained Word2 Vec model.
current_time = time.time()
import gensim.downloader as api
path = api.load("word2vec-google-news-300", return_path=True)
model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True) 
print(time.time() - current_time)

263.8938066959381


### Building a function which will convert the input text to an embedding. 

These embeddings will be used to give an answer to user's stack overflow question by getting the most similar question using the cosine similarity metric .

In [34]:
def question_to_vec(question, embeddings, dim=300):
    """
        question: a string
        embeddings: dict where the key is a word and a value is its' embedding
        dim: size of the representation

        result: vector representation for the question
    """
    word_tokens = question.split(" ")
    question_len = len(word_tokens)
    
    # Initializing an array with the rows as no of words in the text and 300 as the columns
    question_mat = np.zeros((question_len,dim), dtype = np.float32)
    
    # Checking if the word exists in the google pre-trained model 
    for idx, word in enumerate(word_tokens):
        if word in embeddings:
            # If word is there then populate the array with the embeddings of the word 
            question_mat[idx,:] = embeddings[word]
            
    # remove zero-rows which stand for OOV words       
    question_mat = question_mat[~np.all(question_mat == 0, axis = 1)]
    
    # Compute the mean of each word along the sentence
    if question_mat.shape[0] > 0:
        vec = np.array(np.mean(question_mat, axis = 0), dtype = np.float32).reshape((1,dim))
    else:
        vec = np.zeros((1,dim), dtype = np.float32)
        
    return vec

In [35]:
# Making a new directory named as resources
!mkdir embeddings_folder

In [36]:
# Populating an array with the title's embeddings 
current_time = time.time()

counts_by_tag = posts.groupby(by=['tag'])["tag"].count().reset_index(name = 'count').sort_values(['count'], ascending = False)

# Creating a list of tupes where first element of tuple is the tag an second is the count
counts_by_tag = list(zip(counts_by_tag['tag'],counts_by_tag['count']))

# Iterating over each tag
for tag, count in counts_by_tag:
    tag_posts = posts[posts['tag'] == tag]
    tag_post_ids = tag_posts['post_id'].values
    
    # Intializng an array with the rows as titles belonging to a tag and columns as 300 
    tag_vectors = np.zeros((count, 300), dtype=np.float32)
    
    # Ierating over each title belonging to a tag
    for i, title in enumerate(tag_posts['title']):
        tag_vectors[i, :] = question_to_vec(title, model, 300)
        
    # Dump post ids and vectors to a file.
    filename = 'resources/embeddings_folder/'+ tag + '.pkl'
    pickle.dump((tag_post_ids, tag_vectors), open(filename, 'wb'))
print('Time Taken (in sec) to process the above operation : {}'.format(time.time() - current_time))

Time Taken (in sec) to process the above operation : 578.8078963756561


## Building a function which when given a question and tag will retrieve the most similar question post_id

In [37]:
def get_similar_question(question,tag):
    current_time = time.time()
    # Get the path where all the question embeddings are kept and also loading the post_ids and title_embeddings
    embeddings_path = 'resources/embeddings_folder/' + tag + ".pkl"
    post_ids, post_embeddings = pickle.load(open(embeddings_path, 'rb'))
    # Get the embeddings for the question
    question_vec = question_to_vec(question, model, 300)
    # Find index of most similar post
    best_post_index = pairwise_distances_argmin(question_vec,post_embeddings)
    print('Time Taken (in sec) to process the above operation : {}'.format(time.time() - current_time))
    # Return best post id
    return post_ids[best_post_index]

answer_post_id =get_similar_question("how to use list comprehension in python?",'python')[0]

print("We can find the answer to the question at : https://stackoverflow.com/questions/{}".format(answer_post_id))

Time Taken (in sec) to process the above operation : 2.009498357772827
We can find the answer to the question at : https://stackoverflow.com/questions/5947137
