# Word Mover Implementation

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import warnings
from nltk import word_tokenize
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from gensim.similarities import WmdSimilarity
from time import time
from tqdm import tqdm
stop_words = stopwords.words('english')
warnings.filterwarnings('ignore')
root_path = '../input/data-science-for-good-careervillage/'
print('The csv files provided are:\n')
print(os.listdir(root_path))
model = KeyedVectors.load_word2vec_format('../input/word2vec-google/GoogleNews-vectors-negative300.bin', binary=True)

The csv files provided are:

['emails.csv', 'questions.csv', 'professionals.csv', 'comments.csv', 'tag_users.csv', 'group_memberships.csv', 'tags.csv', 'answer_scores.csv', 'students.csv', 'groups.csv', 'tag_questions.csv', 'question_scores.csv', 'matches.csv', 'answers.csv', 'school_memberships.csv']


In [2]:
df_emails = pd.read_csv(root_path + 'emails.csv')
df_questions = pd.read_csv(root_path + 'questions.csv')
df_professionals = pd.read_csv(root_path + 'professionals.csv')
df_comments = pd.read_csv(root_path + 'comments.csv')
df_tag_users = pd.read_csv(root_path + 'tag_users.csv')
df_group_memberships = pd.read_csv(root_path + 'group_memberships.csv')
df_tags = pd.read_csv(root_path + 'tags.csv')
df_answer_scores = pd.read_csv(root_path + 'answer_scores.csv')
df_students = pd.read_csv(root_path + 'students.csv')
df_groups = pd.read_csv(root_path + 'groups.csv')
df_tag_questions = pd.read_csv(root_path + 'tag_questions.csv')
df_question_scores = pd.read_csv(root_path + 'question_scores.csv')
df_matches = pd.read_csv(root_path + 'matches.csv')
df_answers = pd.read_csv(root_path + 'answers.csv')
df_school_memberships = pd.read_csv(root_path + 'school_memberships.csv')

## Preprocessing and computing WMD Similarity

In [3]:
def preprocess(doc):
    doc = doc.lower()  # Lower the text.
    doc = word_tokenize(doc)  # Split into words.
    doc = [w for w in doc if not w in stop_words]  # Remove stopwords.
    doc = [w for w in doc if w.isalpha()]  # Remove numbers and punctuation.
    return(doc)

In [4]:
clean_corpus = [] 
documents = df_questions['questions_title'].tolist()  #wmd_corpus, with no pre-processing (so we can see the original documents).
for text in tqdm(df_questions['questions_title']):
    text = preprocess(text)   
    clean_corpus.append(text)        

100%|██████████| 23931/23931 [00:04<00:00, 5115.14it/s]


In [5]:
start = time()
#We won't be training our own Word2Vec model and we'll instead use the pretrained vectors
num_best = 10
instance = WmdSimilarity(clean_corpus, model, num_best=num_best)  
print('Cell took %.2f seconds to run.' % (time() - start))

Cell took 7.79 seconds to run.


## Recommender System 

To make recommendations based on a question:

- Find questions that are similar to the one under consideration
- Find if the similar questions have been answered
- If yes, find if the professional is active. Active professional has to have answered a question within the last 1 year
- If multiple professionals fit the criteria, rank them based on the proportion of questions they have answered within 24-48 hours [since that is a key metric](https://www.kaggle.com/c/data-science-for-good-careervillage/discussion/84845#latest-496046)

In [6]:
#To see the profile of the volunteers and the questions that they have answered
df_questions['questions_date_added'] = pd.to_datetime(df_questions['questions_date_added'])
df_answers['answers_date_added'] = pd.to_datetime(df_answers['answers_date_added'])
df_answers_professionals = pd.merge(df_answers, df_professionals, left_on='answers_author_id', right_on='professionals_id', how='outer')
df_questions_answers_professionals = pd.merge(df_questions, df_answers_professionals, left_on='questions_id', right_on='answers_question_id')
df_qap_time_taken = df_questions_answers_professionals.groupby(['professionals_id','questions_id']).agg({'questions_date_added':min, 'answers_date_added':min})
df_qap_time_taken['less_than_2_days'] = df_qap_time_taken['answers_date_added'] - df_qap_time_taken['questions_date_added'] < '2 days'
df_qap_time_taken = df_qap_time_taken.reset_index().groupby('professionals_id', as_index=False).agg({'less_than_2_days':np.mean})
last_date = df_questions['questions_date_added'].max() #date of the last question asked on the platform
df_ap_grouped = df_answers_professionals.groupby('professionals_id').agg({'answers_date_added':max}).apply(lambda x:
                                                                                          (last_date-x).dt.days)
df_ap_grouped.rename(columns={'answers_date_added':'days_since_answered'}, inplace=True)
active_professionals = df_ap_grouped[df_ap_grouped['days_since_answered']<365].index

We'll give the same examples we did in the second notebook

### Example Recommendation 1

In [7]:
sent = 'Should I declare a minor during undergrad if I want to be a lawyer?'
topk = 5
query = preprocess(sent)
sims = instance[query]  #A query is simply a "look-up" in the similarity class.
#Print the query and the retrieved documents, together with their similarities.
print('Question:')
print(sent)
#We won't consider the first index since that is the question itself
for i in range(1,topk+1): 
    print('\nsim = %.4f' % sims[i][1])
    print(documents[sims[i][0]])

Question:
Should I declare a minor during undergrad if I want to be a lawyer?

sim = 0.6067
If i want to be a lawyer, what should i major in once i get into college?

sim = 0.5749
what is the best college if you want to be a lawyer in Alabama

sim = 0.5707
How do you decide what type of lawyer you want to become?

sim = 0.5665
How do I decide what I want to minor in?

sim = 0.5664
What should I study if I want to become a lawyer?


In [8]:
idx = [tup[0] for tup in sims][:5]
author_id = df_answers[df_answers['answers_question_id'].isin(df_questions.iloc[idx]['questions_id'])]['answers_author_id']
active_author_id = author_id[author_id.isin(active_professionals)]
df_recommended_pros = df_qap_time_taken[df_qap_time_taken['professionals_id'].isin(active_author_id)].sort_values('less_than_2_days', ascending=False)
print('The recommended professionals ranked by the proportion of questions answered within 48 hours:', df_recommended_pros['professionals_id'].tolist())
print('The profile of the professionals:')
df_professionals[df_professionals['professionals_id'].isin(df_recommended_pros['professionals_id'])]

The recommended professionals ranked by the proportion of questions answered within 48 hours: ['4dc61581ec7b409bbd037e483f53ba0a', 'be5d23056fcb4f1287c823beec5291e1', '209fcd55fefa4fe29ccedcdc26bd5d89', 'e1d39b665987455fbcfbec3fc6df6056', '4c1333590d234f9bb0f4cf0eac4c6efd', '43251a6879ef46dbb1a1b3399ce8229b']
The profile of the professionals:


Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined
3581,be5d23056fcb4f1287c823beec5291e1,"San Antonio, Texas",Legal Services,Employment Counselor | Open Records Specialist,2016-01-21 03:23:22 UTC+0000
5876,e1d39b665987455fbcfbec3fc6df6056,Greater Philadelphia Area,Professional Training,Industrial-Organizational Psychology & HR Cons...,2016-05-04 18:12:23 UTC+0000
13603,4c1333590d234f9bb0f4cf0eac4c6efd,United States,Insurance law,Associate General Counsel at American General ...,2017-10-11 14:42:55 UTC+0000
16582,4dc61581ec7b409bbd037e483f53ba0a,"Washington, Washington",,"PwC Experienced Associate, Advisory",2018-02-27 20:08:33 UTC+0000
17841,209fcd55fefa4fe29ccedcdc26bd5d89,"Raleigh-Durham, North Carolina Area",Management Consulting,"Vice President, Life Sciences at Treximo",2018-04-02 18:26:12 UTC+0000
25181,43251a6879ef46dbb1a1b3399ce8229b,"Atlanta, Georgia",Computer Software,Client Account Director - Enterprise Business,2018-11-13 14:16:26 UTC+0000


### Example Recommendation 2

In [9]:
sent = 'My current plan is to go to a one year film college to get a certificate in screenwriting. Many people have mentioned that you really don\'t need a film degree to get into film, so a certificate is fine. Is this true?'
topk = 5
query = preprocess(sent)
sims = instance[query]  #A query is simply a "look-up" in the similarity class.
#Print the query and the retrieved documents, together with their similarities.
print('Question:')
print(sent)
for i in range(1,topk+1):
    print('\nsim = %.4f' % sims[i][1])
    print(documents[sims[i][0]])

Question:
My current plan is to go to a one year film college to get a certificate in screenwriting. Many people have mentioned that you really don't need a film degree to get into film, so a certificate is fine. Is this true?

sim = 0.5554
I would really like to get into Video production and editing. Many people tell me I don’t need a college degree but I think that’s wrong.  Is college the right answer?

sim = 0.5514
I would like to get a certificate in the film Making (online)

sim = 0.5297
Should I get a film degree or find an film internship?

sim = 0.5296
Do you get residency in the state you go to college in after one year?

sim = 0.5285
How hard is it to get one of your own projects up and running in the Film industry ?


In [10]:
idx = [tup[0] for tup in sims][:5]
author_id = df_answers[df_answers['answers_question_id'].isin(df_questions.iloc[idx]['questions_id'])]['answers_author_id']
active_author_id = author_id[author_id.isin(active_professionals)]
df_recommended_pros = df_qap_time_taken[df_qap_time_taken['professionals_id'].isin(active_author_id)].sort_values('less_than_2_days', ascending=False)
print('The recommended professionals ranked by the proportion of questions answered within 48 hours:', df_recommended_pros['professionals_id'].tolist())
print('The profile of the professionals:')
df_professionals[df_professionals['professionals_id'].isin(df_recommended_pros['professionals_id'])]

The recommended professionals ranked by the proportion of questions answered within 48 hours: ['bf89b0d1af0f49468ad3e047fe71013f', 'e27c43e8671242e1bfb80829744ee3ad', '9a5aead62c344207b2624dba90985dc5', '460fd56dcb694172bce9ed71f48af804', '7dbe5b68c32b4e308a9a2b191a73f0c7']
The profile of the professionals:


Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined
10284,460fd56dcb694172bce9ed71f48af804,"Atlanta, Georgia",Broadcast Media,Video Production Manager at Coca-Cola Studios,2017-04-24 18:53:59 UTC+0000
17758,7dbe5b68c32b4e308a9a2b191a73f0c7,"Los Angeles, California",Entertainment,Director of Photography at Freelance,2018-03-30 17:46:39 UTC+0000
24392,bf89b0d1af0f49468ad3e047fe71013f,"Jefferson, Iowa",,Graduate Programs Assistant,2018-10-19 15:52:03 UTC+0000
24748,e27c43e8671242e1bfb80829744ee3ad,"Chicago, Illinois",Entertainment,Theatre Professional and Consultant,2018-10-30 18:11:23 UTC+0000
25379,9a5aead62c344207b2624dba90985dc5,"Newark, New Jersey",Education,Either fall or grow!,2018-11-15 19:16:05 UTC+0000


### Active-Passive Indifference 

One particular advantage of WMD is that it doesn't matter if the question is in the active or the passive voice

In [11]:
sent = 'If I want to be a lawyer, should I declare a minor during undergrad?' 
query = preprocess(sent)
sims = instance[query]  #A query is simply a "look-up" in the similarity class.
#Print the query and the retrieved documents, together with their similarities.
print('Question:')
print(sent)
for i in range(0,topk+1): 
    print('\nsim = %.4f' % sims[i][1])
    print(documents[sims[i][0]])

Question:
If I want to be a lawyer, should I declare a minor during undergrad?

sim = 1.0000
Should I declare a minor during undergrad if I want to be a lawyer?

sim = 0.6067
If i want to be a lawyer, what should i major in once i get into college?

sim = 0.5749
what is the best college if you want to be a lawyer in Alabama

sim = 0.5707
How do you decide what type of lawyer you want to become?

sim = 0.5665
How do I decide what I want to minor in?

sim = 0.5664
What should I study if I want to become a lawyer?


The most similar question is the first example we used for demonstration purposes. Thus WMD doesn't care about the order in which words appear in the two strings.