**Question Text Embedding with word vectors learnt from model**

Question text = question title + question body + the bodies of all its answers


A TF-IDF weighted average of the word vectors is used as the embedding of the entire question text.

In [None]:
import pickle
import warnings
from collections import Counter
from collections import OrderedDict

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

warnings.filterwarnings("ignore")

In [None]:
#Get preprocessed Question data from saved pickle file
# NOTE(review): pd.read_pickle deserialises arbitrary Python objects - only load files from a trusted source.
question_data = pd.read_pickle("/content/drive/MyDrive/StackOverflow_CaseStudy/Preprocessed_data/W2V_Model_Preprocessed_selected_data.pkl")

In [None]:
# List the available columns; 'Id', 'Title' and 'Ques_Text' are the ones used in the cells below.
question_data.columns

Index(['Id', 'Score', 'Title', 'Body', 'Processed_Ques_Title',
       'Processed_Ques_Body', 'UNIX_CreationDate', 'SummedScore',
       'Processed_Ans_Body', 'Ques_Text'],
      dtype='object')

In [None]:
# Plain array of the question texts; it is used to fit the TF-IDF vectorizer and to build the Word2Vec-based embeddings.
questions = question_data['Ques_Text'].values

In [None]:
# Take an explicit copy of the three columns we need. Later cells add temporary
# columns to questions_df; doing that on a slice of question_data raises pandas'
# SettingWithCopyWarning (hidden here by the global warnings filter) and leaves it
# ambiguous whether the parent frame is modified. The copy makes the frame independent.
questions_df = question_data[['Id','Title','Ques_Text']].copy()
questions_df

Unnamed: 0,Id,Title,Ques_Text
4,260,Adding scripting functionality to .NET applica...,adding scripting functionality net application...
8,650,Automatically update version number,automatically update version number would like...
10,930,How do I connect to a database and loop over a...,connect database loop recordset c simplest way...
11,1010,"How to get the value of built, encoded ViewState?",get value built encoded viewstate need grab ba...
12,1040,How do I delete a file which is locked by anot...,delete file locked another process c looking w...
...,...,...,...
1102549,40141980,How to convert a txt stream web request contai...,convert txt stream web request containing json...
1102550,40142000,c# outlook addin xml dynamic menu not populating,c outlook addin xml dynamic menu not populatin...
1102555,40142220,Setting up Derby on MacOS Sierra,setting derby macos sierra following java prog...
1102559,40142450,C# thread report found items,c thread report found items lets say want make...


**Question Embedding using Word2Vec model's word vectors**

In [None]:
#Get vocabulary vectors learnt from Word2Vec model
# vectors.tsv: one row per vocabulary word, tab-separated vector components.
word2vec_vectors = np.array(pd.read_csv('/content/drive/MyDrive/StackOverflow_CaseStudy/Saved_Model/vectors.tsv',
                           sep = '\t', header=None))

# metadata.tsv: the vocabulary words, read strictly as text.
# With pandas' defaults, tokens such as "null" or "nan" are parsed as missing
# values (NaN) and purely numeric tokens may be parsed as numbers, so those words
# could never be found again when looking them up by their string form.
word2vec_vocab = pd.read_csv('/content/drive/MyDrive/StackOverflow_CaseStudy/Saved_Model/metadata.tsv',
                           sep = '\t', header=None, dtype=str,
                           keep_default_na=False, na_filter=False)

# Only the first column holds the words.
word2vec_vocab = word2vec_vocab[0].values

# zip() silently truncates to the shorter input, which would hide a vocabulary
# that is out of step with the vector file - fail loudly instead.
if len(word2vec_vocab) != len(word2vec_vectors):
    raise ValueError(
        "Vocabulary size (%d) does not match number of vectors (%d)"
        % (len(word2vec_vocab), len(word2vec_vectors))
    )

# word -> vector lookup used when embedding the questions
word2vec_dict = dict(zip(word2vec_vocab, word2vec_vectors))

In [None]:
#TF-IDF vectorization on Question corpus to get word idf values
# min_df=10: terms that occur in fewer than 10 questions are left out of the vocabulary,
# so they get no idf value and are skipped when the embeddings are built.
ques_tfidf = TfidfVectorizer(min_df=10)
# Only fit() is called here; the cells below read the learnt vocabulary and idf_ weights.
ques_tfidf.fit(questions)


TfidfVectorizer(min_df=10)

In [None]:
#save the tf-idf vectorization
# Use a context manager so the file is flushed and closed even if pickling fails
# (the bare open() call left the handle open until garbage collection).
with open("/content/drive/MyDrive/StackOverflow_CaseStudy/Saved_Model/tfidf_model_ques.pickle", "wb") as tfidf_model_file:
    pickle.dump(ques_tfidf, tfidf_model_file)

In [None]:
# Reload the fitted vectorizer from the pickle written in the previous cell,
# so a later session can skip the fit step.
# NOTE(review): read_pickle deserialises arbitrary objects - only load trusted files.
ques_tfidf = pd.read_pickle("/content/drive/MyDrive/StackOverflow_CaseStudy/Saved_Model/tfidf_model_ques.pickle")

In [None]:
# Build two lookups from the fitted vectorizer:
#   idf_dict    - word -> idf weight
#   tfidf_vocab - set of all words known to the vectorizer (for fast membership tests)
tfidf_feature_names = ques_tfidf.get_feature_names_out()
idf_dict = {}
for feature_name, idf_value in zip(tfidf_feature_names, ques_tfidf.idf_):
    idf_dict[feature_name] = idf_value
tfidf_vocab = set(tfidf_feature_names)

Create a TF-IDF weighted embedding for each question

In [None]:
#performing this step in batches to avoid running out of memory
#
# For every question the embedding is the tf-idf weighted average of the Word2Vec
# vectors of its words. Only words that have both a Word2Vec vector and an idf
# value take part; a question without any such word keeps an all-zero vector.
# Each batch is written to its own .npy file. Questions beyond
# no_of_batches*batch_size (the division remainder) are NOT covered by this loop.

word_embedding_size=128
no_of_batches=5
batch_size = len(questions) // no_of_batches
for i in range(0, no_of_batches):
    # the tfidf-w2v for each question of this batch is stored in this array
    questions_tfidf_w2vec = np.zeros(shape=(batch_size, word_embedding_size), dtype = np.float32)
    for j, ques in enumerate(tqdm(questions[i*batch_size : (i+1)*batch_size])): # for each question
        words = ques.split()
        # Membership is tested on the dict / set (O(1)) rather than on the numpy
        # vocabulary array, which is scanned linearly for every word.
        embeddable_words = [w for w in words if w in word2vec_dict and w in tfidf_vocab]
        if not embeddable_words:
            continue  # leave the zero vector in place
        # Term frequency must count whole words: str.count() counts substrings,
        # so e.g. "net" would also be counted inside "internet".
        word_counts = Counter(words)
        #get vector for each word in Question Text
        vector = np.array([word2vec_dict[w] for w in embeddable_words])
        #get tf-idf value for each word in Question text
        tf_idf = np.array([idf_dict[w]*(word_counts[w]/len(words)) for w in embeddable_words])
        #tf-idf weighted Word2Vec
        tf_idf_vec = np.sum(vector*tf_idf[:,None], axis=0)
        tf_idf_total = np.sum(tf_idf)
        if tf_idf_total != 0:
            tf_idf_vec = tf_idf_vec/tf_idf_total
        #add to the array
        questions_tfidf_w2vec[j] = tf_idf_vec
    np.save('/content/drive/MyDrive/StackOverflow_CaseStudy/Saved_Model/tfIdfW2VQues_'+str(i)+'_new.npy', questions_tfidf_w2vec)

  0%|          | 0/59219 [00:00<?, ?it/s]

In [None]:
# Embed the questions left over after the equal-sized batches of the previous cell
# (the remainder of len(questions) / no_of_batches). The slice is derived from
# no_of_batches and batch_size instead of hard-coded row numbers, so it stays
# correct if the dataset size changes. The result stays in memory and is stacked
# with the saved batches further down.
word_embedding_size=128
remaining_questions = questions[no_of_batches*batch_size:]
# the tfidf-w2v for each remaining question is stored in this array
questions_tfidf_w2vec = np.zeros(shape=(len(remaining_questions), word_embedding_size), dtype = np.float32)
for j, ques in enumerate(tqdm(remaining_questions)):
    words = ques.split()
    # dict / set membership is O(1); the numpy vocabulary array would be scanned per word
    embeddable_words = [w for w in words if w in word2vec_dict and w in tfidf_vocab]
    if not embeddable_words:
        continue  # no usable word: keep the zero vector
    # count whole words (str.count() would count substrings such as "net" in "internet")
    word_counts = Counter(words)
    vector = np.array([word2vec_dict[w] for w in embeddable_words])
    tf_idf = np.array([idf_dict[w]*(word_counts[w]/len(words)) for w in embeddable_words])
    tf_idf_vec = np.sum(vector*tf_idf[:,None], axis=0)
    tf_idf_total = np.sum(tf_idf)
    if tf_idf_total != 0:
        tf_idf_vec = tf_idf_vec/tf_idf_total
    questions_tfidf_w2vec[j] = tf_idf_vec

  0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
#get all the saved tf-idf weighted Word2Vec Questions
# The five batch files share one naming scheme and differ only in the batch number 0..4.
saved_batch_prefix = '/content/drive/MyDrive/StackOverflow_CaseStudy/Saved_Model/tfIdfW2VQues_'
w2vemb_1, w2vemb_2, w2vemb_3, w2vemb_4, w2vemb_5 = [
    np.load(saved_batch_prefix + str(batch_no) + '_new.npy')
    for batch_no in range(5)
]

In [None]:
# Sanity check: print the shape of every loaded batch, in batch order.
for batch_embeddings in (w2vemb_1, w2vemb_2, w2vemb_3, w2vemb_4, w2vemb_5):
    print(batch_embeddings.shape)

(59219, 128)
(59219, 128)
(59219, 128)
(59219, 128)
(59219, 128)


In [None]:
#combine all into single array
# Row order: the five saved batches first, then the in-memory array of the remaining questions.
embedding_parts = [w2vemb_1, w2vemb_2, w2vemb_3, w2vemb_4, w2vemb_5, questions_tfidf_w2vec]
word2vec_ques_embeddings = np.concatenate(embedding_parts, axis=0)
word2vec_ques_embeddings.shape

(296099, 128)

In [None]:
#save the embedded questions data
# One row per question: the five batches followed by the remaining questions, as stacked in the previous cell.
np.save('/content/drive/MyDrive/StackOverflow_CaseStudy/Saved_Model/tfIdf_Wtd_W2V_QuestionEmbeddings.npy', word2vec_ques_embeddings)

**Question Embedding using LSTM model based word vectors**

Building the TF-IDF weighted embedding with an explicit for loop (as done above for Word2Vec) was very time-consuming, so this section uses pandas' `apply` with a few temporary columns instead. This was much faster than the loop.

In [None]:
#get lstm based vocab-vector dictionary
# NOTE(review): pickle.load can execute arbitrary code - only load this file from a trusted source.
# The context manager closes the file handle as soon as loading is done
# (the bare open() call left it open until garbage collection).
with open("/content/drive/MyDrive/StackOverflow_CaseStudy/Model2_data/Best_LSTM_Model_Vocab_Vector_dict.pkl", "rb") as lstm_dict_file:
    lstm_vocab_vector_dict = pickle.load(lstm_dict_file)

In [None]:
#get vocab
# Keys of the vocab-vector dictionary; the dictionary is indexed by word further below.
lstm_vocab = lstm_vocab_vector_dict.keys()

In [None]:
# Vocabulary as a set, so it can be intersected with each question's word set in the following cells.
lstm_all_words = set(lstm_vocab)

In [None]:
#https://stackoverflow.com/questions/70967869/keep-only-matched-words-in-pandas-column
#create set of all words in a question
# A set drops duplicates and word order; it is only used for the vocabulary filtering in the next cells.
questions_df['ques_words'] = questions_df['Ques_Text'].apply(lambda x: set(x.split()))

In [None]:
# Restrict each question's word set to the words that have an LSTM vector.
questions_df['ques_words_with_vector'] = questions_df['ques_words'].apply(
    lambda word_set: word_set & lstm_all_words
)

In [None]:
# Of those, keep only the words that also have an idf value (i.e. are in the tf-idf vocabulary).
questions_df['ques_words_with_vector_and_tfidf'] = questions_df['ques_words_with_vector'].apply(
    lambda word_set: word_set & tfidf_vocab
)

In [None]:
# Words of the question that are missing from the vector vocabulary or from the
# tf-idf vocabulary; these need to be removed from the question text.
questions_df['ques_words_to_remove'] = questions_df.apply(
    lambda row: list(row['ques_words'] - row['ques_words_with_vector_and_tfidf']),
    axis=1,
)

In [None]:
#remove the above found words from question text
def _keep_embeddable_words(row):
    """Return the row's question text without the words listed in 'ques_words_to_remove'.

    Word order and repeated words are preserved. The words to remove are put into
    a set first: testing every token against the stored list made this step
    quadratic for long questions.
    """
    words_to_remove = set(row['ques_words_to_remove'])
    return ' '.join(word for word in row['Ques_Text'].split() if word not in words_to_remove)

questions_df['Ques_Text_embedded_words'] = questions_df.apply(_keep_embeddable_words, axis=1)


In [None]:
# Tokenise the filtered question text: one list of words (duplicates kept, order preserved) per question.
questions_df['ques_all_words_to_embed_lstm'] = questions_df['Ques_Text_embedded_words'].map(str.split)

In [None]:
# Look up the LSTM vector of every word to embed; the list is aligned with the question's word list.
questions_df['Ques_Text_embedded_lstm_vector'] = questions_df['ques_all_words_to_embed_lstm'].apply(
    lambda tokens: [lstm_vocab_vector_dict[token] for token in tokens]
)


In [None]:
#get tf-idf value for each word to be embedded in question
def _tfidf_weights_for_tokens(tokens):
    """Return the tf-idf weight of every token, aligned with ``tokens``.

    weight = idf(word) * (occurrences of word in tokens) / len(tokens)

    Occurrences are counted once with a Counter; calling list.count() for every
    token made this step quadratic in the question length. An empty token list
    yields an empty list.
    """
    token_counts = Counter(tokens)
    n_tokens = len(tokens)
    return [idf_dict[word] * token_counts[word] / n_tokens for word in tokens]

questions_df['Ques_tfidf_val_words_to_embed_lstm'] = questions_df['ques_all_words_to_embed_lstm'].apply(_tfidf_weights_for_tokens)


In [None]:
#get tf-idf weighted vector embedding for entire question text
# Vector length is read from the vocabulary so the zero-vector fallback has the same size.
# assumes every value in lstm_vocab_vector_dict is a 1-D vector of equal length - TODO confirm
lstm_embedding_size = len(next(iter(lstm_vocab_vector_dict.values())))

def _tfidf_weighted_embedding(row):
    """Return the tf-idf weighted average of the row's word vectors.

    Reads 'Ques_Text_embedded_lstm_vector' (list of word vectors) and
    'Ques_tfidf_val_words_to_embed_lstm' (matching list of weights).
    A question without any embeddable word gets an all-zero vector, as in the
    Word2Vec section; previously the division produced an empty array of
    shape (0,), which cannot be stacked with the other embeddings.
    """
    weights = np.array(row['Ques_tfidf_val_words_to_embed_lstm'])
    weight_total = np.sum(weights)
    if weights.size == 0 or weight_total == 0:
        return np.zeros(lstm_embedding_size)
    vectors = np.array(row['Ques_Text_embedded_lstm_vector'])
    return np.sum(vectors * weights[:, None], axis=0) / weight_total

questions_df['tfidf_wtd_lstm_embed_questions'] = questions_df.apply(_tfidf_weighted_embedding, axis=1)


In [None]:
# Remove the temporary helper columns from the dataframe; only the original
# columns and the final embedding column remain.
temporary_columns = [
    'ques_words',
    'ques_words_with_vector',
    'ques_words_with_vector_and_tfidf',
    'ques_words_to_remove',
    'Ques_Text_embedded_words',
    'ques_all_words_to_embed_lstm',
    'Ques_Text_embedded_lstm_vector',
    'Ques_tfidf_val_words_to_embed_lstm',
]
questions_df = questions_df.drop(columns=temporary_columns)

In [None]:
#save final embeddings to a pickle file
# The frame now holds Id, Title, Ques_Text and tfidf_wtd_lstm_embed_questions
# (the temporary columns were dropped in the previous cell).
questions_df.to_pickle("/content/drive/MyDrive/StackOverflow_CaseStudy/Questions_lstm_embeddings_dataset.pkl")