# The Task

In [1]:
# Mount Google Drive into the Colab runtime; every data/model path used below
# lives under '/content/gdrive/My Drive/...'.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Based on a user query, return an existing question which most closely resembles the user's query

In [2]:
# import libraries
import os
import numpy as np
import pandas as pd
import tensorflow as tf
# import tensorflow_hub as hub
from nltk import RegexpTokenizer
from sklearn import preprocessing

import spacy
from spacy.lang.en import English
# NOTE(review): the next line repeats the `import spacy` two lines above; harmless but redundant.
import spacy
# Load spaCy's small English pipeline; tokenize_text() further down uses EN.tokenizer.
EN = spacy.load('en_core_web_sm')

from IPython.display import HTML, display
import logging
logging.getLogger('tensorflow').disabled = True #OPTIONAL - to disable outputs from Tensorflow

## Import the Data

In [3]:
# Load the preprocessed StackOverflow questions from Drive. Columns read later in this
# notebook: processed_title, original_title, question_url, question_content,
# overall_scores and sentiment_polarity.
data = pd.read_csv('/content/gdrive/My Drive/Stackoverflow_VS_extension/Preprocessed_data.csv')
# Bare expression so the frame is rendered as the cell output.
data

Unnamed: 0,original_title,post_corpus,question_content,question_url,tags,overall_scores,answers_content,sentiment_polarity,sentiment_subjectivity,processed_title
0,Using 'in' to match an attribute of Python obj...,using match attribute python objects array nt ...,using match attribute python objects array nt ...,https://stackoverflow.com/questions/683,python|arrays|iteration,0.011301,Using a list comprehension would build a tempo...,0.163567,0.568209,using match attribute python objects array
1,Python version of PHP's stripslashes,python version php stripslashes wrote piece co...,python version php stripslashes wrote piece co...,https://stackoverflow.com/questions/13454,python|string|escaping,0.001115,Python has a built-in escape() function analog...,0.195000,0.519274,python version php stripslashes
2,Unicode vs UTF-8 confusion in Python / Django?,unicode vs utf8 confusion python django stumbl...,unicode vs utf8 confusion python django stumbl...,https://stackoverflow.com/questions/22149,python|django|unicode,0.006997,From Wikipedia on UTF-8:,0.082857,0.403250,unicode vs utf8 confusion python django
3,Using Django time/date widgets in custom form,using django time date widgets custom form use...,using django time date widgets custom form use...,https://stackoverflow.com/questions/38601,python|django,0.041431,"Starting in Django 1.2 RC1, if you're using th...",0.302423,0.599938,using django time date widgets custom form
4,Can parallel traversals be done in MATLAB just...,parallel traversals done matlab python using f...,parallel traversals done matlab python using f...,https://stackoverflow.com/questions/49307,python|arrays|matlab|for-loop,0.002837,should be for example:,0.358333,0.752381,parallel traversals done matlab python
...,...,...,...,...,...,...,...,...,...,...
147054,How can I insert spaces between words given a ...,insert spaces words given list lists coded let...,insert spaces words given list lists coded let...,https://stackoverflow.com/questions/63758260,python|list|dictionary|spacing,-0.000607,Just append another whitespace in array:,0.243050,0.873782,insert spaces words given list lists coded let...
147055,Django creates another media folder inside med...,django creates another media folder inside med...,django creates another media folder inside med...,https://stackoverflow.com/questions/63758482,python|django|python-imaging-library,-0.000176,The parameter [Django-doc] is relative to the...,0.055556,0.444444,django creates another media folder inside med...
147056,Options for deploying Flask app that continuou...,options deploying flask app continuously web s...,options deploying flask app continuously web s...,https://stackoverflow.com/questions/63758866,python|flask|heroku|web-scraping|web-applications,-0.000607,Have you tried using Cron? There is no cost no...,0.044898,0.545003,options deploying flask app continuously web s...
147057,"Delete ""nan"" in python list",delete nan python list new python simple quest...,delete nan python list new python simple quest...,https://stackoverflow.com/questions/63758902,python|list,-0.000750,You can ... get creative:my_list = ['experienc...,0.138095,0.636310,delete nan python list


## Import saved WordEmbeddings

In [4]:
# Import saved Wordvec Embeddings
# Loads a previously saved gensim Word2Vec model from Drive. It is used below both to
# average word vectors into title/query embeddings and to fill the Keras embedding matrix.
import gensim
w2v_model = gensim.models.word2vec.Word2Vec.load('/content/gdrive/My Drive/Stackoverflow_VS_extension/SO_word2vec_embeddings.bin')

## Calculate Sentence Embeddings
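In order to calculate the embedding for an entire sentence, I defined the following function, which averages the embeddings of all valid tokens (a token is valid if it appears in the Word2Vec vocabulary). If no token is valid, the zero vector is returned.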

In [5]:
def question_to_vec(question, embeddings, dim=300):
    """Return the mean word vector of the whitespace-separated tokens in ``question``.

    Parameters
    ----------
    question : str
        Pre-processed text. Anything that is not a string (for example the ``NaN``
        pandas uses for a missing title) is treated as containing no tokens.
    embeddings : mapping or model
        Either a token -> vector lookup supporting ``in`` and ``[]`` (a dict or
        gensim ``KeyedVectors``), or a full gensim ``Word2Vec`` model, in which
        case its ``.wv`` lookup is used.
    dim : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(dim,)``: the average of the vectors of all known
        tokens, or all zeros when no token is known.
    """
    # `word in model` / `model[word]` on a Word2Vec model is deprecated in gensim 3.x
    # and removed in 4.x; the supported lookup lives on `model.wv`. Plain mappings
    # have no `wv` attribute and are used as-is.
    vectors = getattr(embeddings, 'wv', embeddings)

    question_embedding = np.zeros(dim)
    if not isinstance(question, str):
        return question_embedding

    valid_words = 0
    # split() (no argument) also collapses repeated/odd whitespace, so no empty
    # tokens are looked up.
    for word in question.split():
        if word in vectors:
            valid_words += 1
            question_embedding += vectors[word]

    if valid_words > 0:
        return question_embedding / valid_words
    return question_embedding

In [6]:
# Embed every pre-processed title and stack the vectors into one 2-D array
# (one row per title, in the same order as `data`).
all_title_embeddings = np.array(
    [question_to_vec(title, w2v_model) for title in data.processed_title]
)

# Persist the embeddings so later runs can reload them instead of recomputing.
embeddings = pd.DataFrame(data=all_title_embeddings)
embeddings.to_csv(
    '/content/gdrive/My Drive/Stackoverflow_VS_extension/title_embeddings.csv',
    index=False,
)


  """
  import sys


Since the set of titles is fixed, I saved the sentence embeddings for all titles in a .csv file to save computation time on future runs.

In [7]:
# Reload the cached title embeddings from Drive; `.values` converts the frame to a plain
# NumPy array, which is what cosine_similarity() is given in the search cell.
all_title_embeddings = pd.read_csv('/content/gdrive/My Drive/Stackoverflow_VS_extension/title_embeddings.csv').values

## Import the saved model

In [8]:
import keras.backend as K

# Custom loss function to handle multilabel classification task
def multitask_loss(y_true, y_pred):
    """Binary cross-entropy summed across axis 1, then averaged over the remaining axis.

    Parameters
    ----------
    y_true : tensor
        Target indicators (1 where a label applies, 0 otherwise).
    y_pred : tensor
        Predicted probabilities, same shape as ``y_true``.

    Returns
    -------
    tensor
        Scalar loss value.
    """
    # Keep predictions strictly inside (0, 1) so neither logarithm receives 0.
    lower = K.epsilon()
    upper = 1 - K.epsilon()
    clipped = K.clip(y_pred, lower, upper)

    # Per-label cross-entropy: the first term penalises missed positives, the second
    # penalises false positives.
    per_label = - y_true * K.log(clipped) - (1 - y_true) * K.log(1 - clipped)

    per_sample = K.sum(per_label, axis=1)
    return K.mean(per_sample)

In [17]:
# loading tokenizer
import pickle

W2V_SIZE = 300
MAX_SEQUENCE_LENGTH = 300

# NOTE(review): pickle.load runs arbitrary code stored in the file -- only load a pickle
# you created yourself.
with open('/content/gdrive/My Drive/Stackoverflow_VS_extension/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
word_index = tokenizer.word_index
vocab_size = len(word_index)
print('Found %s unique tokens.' % vocab_size)

# Embedding matrix for the embedding layer: row i holds the Word2Vec vector of the word
# whose tokenizer index is i. Words missing from the Word2Vec vocabulary keep a zero row.
# The matrix has one more row than there are words (presumably because tokenizer
# indices start at 1 -- TODO confirm).
embedding_matrix = np.zeros((vocab_size + 1, W2V_SIZE))
for word, i in word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)

Found 469467 unique tokens.
(469468, 300)


In [18]:
# from keras.models import load_model
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout,Conv1D,GlobalMaxPool1D,GRU
from keras.layers.normalization import BatchNormalization
from datetime import datetime
from keras.callbacks import ReduceLROnPlateau, EarlyStopping,TensorBoard,ModelCheckpoint
from time import time

# Rebuild the tag-prediction network layer by layer so that the saved weights can be
# loaded into it at the end of this cell. Changing any layer here would make the
# architecture differ from the one the weight file is loaded into.
model = Sequential()
# Frozen embedding layer initialised from the Word2Vec-based embedding_matrix.
model.add(Embedding(vocab_size+1, W2V_SIZE, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
model.add(GRU(300, activation='relu',kernel_initializer='he_normal'))
model.add(Dense(400,activation='relu',kernel_initializer="he_normal"))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(150,activation='relu'))
# 500 independent sigmoid outputs, i.e. one score in (0, 1) per output unit.
model.add(Dense(500, activation='sigmoid'))
model.summary()

# Compiled with the custom multitask_loss defined earlier in this notebook.
model.compile(loss=multitask_loss,
              optimizer="adam")

# NOTE(review): load_model and keras.losses are imported but not used in this cell.
from tensorflow.keras.models import load_model
import keras.losses

# Only the weights are restored from Drive; the architecture comes from the code above.
model.load_weights('/content/gdrive/My Drive/Stackoverflow_VS_extension/Tag_predictor_weights.h5')

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 300)          140840400 
_________________________________________________________________
gru (GRU)                    (None, 300)               541800    
_________________________________________________________________
dense (Dense)                (None, 400)               120400    
_________________________________________________________________
dropout (Dropout)            (None, 400)               0         
_________________________________________________________________
batch_normalization (BatchNo (None, 400)               1600      
_________________________________________________________________
dense_1 (Dense)              (None, 150)               60150     
_________________________________________________________________
dense_2 (Dense)              (None, 500)              

In [19]:
from sklearn.preprocessing import MultiLabelBinarizer
# NOTE(review): the binarizer is created here, but no fit()/fit_transform() call on it is
# visible in this notebook. predict_tags() calls tag_encoder.inverse_transform(), which
# needs a fitted encoder -- TODO confirm where the fitted encoder is meant to come from.
tag_encoder = MultiLabelBinarizer()
def predict_tags(text, include_neutral=True, threshold=0.5):
    """Predict the tags for a single piece of text.

    Relies on the module-level ``tokenizer``, ``pad_sequences``,
    ``MAX_SEQUENCE_LENGTH``, ``model`` and ``tag_encoder``.

    Parameters
    ----------
    text : str
        Text to tag; it is passed to ``tokenizer.texts_to_sequences`` as-is.
    include_neutral : bool
        Not used by this function; kept so existing callers keep working.
    threshold : float
        A label is selected when its predicted score is strictly greater than
        this value. Defaults to 0.5, the cut-off that was previously hard-coded.

    Returns
    -------
    Whatever ``tag_encoder.inverse_transform`` returns for the single-row
    indicator matrix built from the thresholded prediction.
    """
    # Tokenize text
    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=MAX_SEQUENCE_LENGTH)
    # Predict: the model is given a batch of one, so take the first (only) row.
    prediction = np.asarray(model.predict([x_test])[0])
    # Vectorised thresholding (replaces the element-by-element loop). The dtype is kept
    # so the encoder receives the same kind of 0/1 array as before.
    indicator = (prediction > threshold).astype(prediction.dtype)
    tags = tag_encoder.inverse_transform(indicator[np.newaxis, :])
    return tags

In [20]:
import re
import nltk
# import inflect
from nltk.corpus import stopwords

def tokenize_text(text):
    """Tokenize ``text`` with the spaCy tokenizer and return lower-cased token strings.

    Tokens that consist only of whitespace are dropped.
    """
    lowered = []
    for token in EN.tokenizer(text):
        if token.is_space:
            continue
        lowered.append(token.text.lower())
    return lowered

def to_lowercase(words):
    """Return a new list in which every token of ``words`` is lower-cased."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens that end up empty.

    Every character that is neither a word character nor whitespace is removed.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Parameters
    ----------
    words : iterable of str
        Tokens to filter; comparison is exact (case-sensitive), so callers
        should lower-case first.

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop-word list, in their
        original order.
    """
    words = list(words)
    if not words:
        # Nothing to filter: avoid touching the stop-word corpus at all.
        return []
    # Fetch the stop-word list once instead of once per token, and use a set so each
    # membership test is O(1) rather than a scan of the whole list.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def normalize(words):
    """Lower-case the tokens, strip punctuation, then drop English stop words (in that order)."""
    return remove_stopwords(remove_punctuation(to_lowercase(words)))

def tokenize_code(text):
    """Split a code string into runs of word characters; everything else acts as a separator."""
    word_tokenizer = RegexpTokenizer(r'\w+')
    return word_tokenizer.tokenize(text)

def preprocess_text(text):
    """Tokenize and normalize ``text``, returning the surviving tokens joined by single spaces."""
    tokens = tokenize_text(text)
    cleaned = normalize(tokens)
    return ' '.join(cleaned)

## Import the saved Tokenizer

In [21]:
from keras.preprocessing.text import Tokenizer
# pad_sequences is the helper predict_tags() calls to pad the tokenized input.
from keras.preprocessing.sequence import pad_sequences

# NOTE(review): MAX_SEQUENCE_LENGTH and the tokenizer are already set, with the same value
# and from the same file, in the "loading tokenizer" cell earlier in this notebook; this
# cell repeats that work.
MAX_SEQUENCE_LENGTH = 300
import pickle
# NOTE(review): pickle.load runs arbitrary code stored in the file -- only load a pickle
# you created yourself.
with open('/content/gdrive/My Drive/Stackoverflow_VS_extension/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

## Getting the most similar results
We find the most similar results by measuring how far each candidate is from the query. This can only be done if both the query and the candidates live in a shared vector space. **Fortunately, that is exactly what our word embeddings are for**. They let us represent each sentence as a vector in the embedding space, which makes it easy to compare sentences with one another.

After we have those vectors, we can assign a **Similarity Measure** as a metric which measures the closeness of two vectors. Common examples are Cosine distance, Euclidean distance and more.

**However, for this specific task, I decided to assign a custom similarity measure**. It is defined as follows:

![Similarity Measure](jupyter_imgs/similaritymeasure.png)

- It considers the cosine distance as a base measure
- It takes into account the popularity of the post based on the votes it has received by users at StackOverflow
- It takes into account the overall sentiment of the responses that people have made. A positive sentiment entails that the answers were helpful and thus is a good post 

In [24]:
from IPython.display import HTML
import html
import logging
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('stopwords')

# --- Query -------------------------------------------------------------------------
search_string = "Combine lists of lists"
# Apply the same preprocessing as the titles so the query lands in the same token space.
search_string = ' '.join(normalize(tokenize_text(search_string)))
results_returned = "5"
search_vect = np.array([question_to_vec(search_string, w2v_model)])    # Vectorize the user query

# --- Scoring -----------------------------------------------------------------------
# Calculate Cosine similarites for the query and all titles
cosine_similarities = pd.Series(cosine_similarity(search_vect, all_title_embeddings)[0])

# Custom Similarity Measure: cosine similarity boosted by the post's score and by the
# sentiment polarity stored for it.
cosine_similarities = cosine_similarities*(1 + 0.4*data.overall_scores + 0.1*(data.sentiment_polarity))

# --- Rendering ---------------------------------------------------------------------
# Highlight whole query tokens only. Testing `word in search_string` would be a
# substring test and would also highlight fragments such as "list" or "in".
query_terms = set(search_string.split())

output = ""
# Series.items() replaces iteritems(), which was removed in pandas 2.0.
for row, score in cosine_similarities.nlargest(int(results_returned)).items():
    # Titles, URLs and snippets come from scraped posts and may contain '<', '&' or
    # quotes, so escape them before inserting them into the markup.
    url = html.escape(str(data.question_url[row]), quote=True)
    title = html.escape(str(data.original_title[row]))
    output += '<a target="_blank" href="' + url + '"><h2>' + title + '</h2></a>'
    output += '<h3> Similarity Score: ' + str(score) + '</h3>'
    output += '<h3> Stackover Votes: ' + str(data.overall_scores[row]) + '</h3>'
    output +='<p style="font-family:verdana; font-size:110%;"> '
    # Preview: the first 50 characters of the question body, split into words.
    for word in str(data.question_content[row])[:50].split():
        shown = html.escape(word)
        if word.lower() in query_terms:
            output += " <b>" + shown + "</b>"
        else:
            output += " " + shown
    output += "</p><hr>"

output = '<h3>Results:</h3>'+output
display(HTML(output))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  """
  import sys
