# Week 4 - Word Embeddings Supplemental

This notebook contains two additional uses for word embeddings

For this notebook we will be using the following packages

In [27]:
#Special module written for this class
#This provides access to data and to helper functions from previous weeks
#Make sure you update it before starting this notebook
import lucem_illud #pip install -U git+git://github.com/Computational-Content-Analysis-2018/lucem_illud.git

#All these packages need to be installed from pip
import gensim#For word2vec, etc
import requests #For downloading our datasets
import nltk #For stop words and stemmers
import numpy as np #For arrays
import pandas #Gives us DataFrames
import matplotlib.pyplot as plt #For graphics
import seaborn #Makes the graphics look nicer
import sklearn.metrics.pairwise #For cosine similarity
import sklearn.manifold #For T-SNE
import sklearn.decomposition #For PCA
import copy

#gensim uses a couple of deprecated features
#we can't do anything about them so lets ignore them 
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

#This 'magic' command makes the plots work better
#in the notebook, don't use it outside of a notebook.
#Also you can ignore the warning
%matplotlib inline

import os #For looking through files
import os.path #For managing file paths

In [28]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 
import docx 

import re 
import urllib.parse 
import io 
import json
import os.path
import os 
import time
import nltk 
import numpy as np
import jieba
import jieba.posseg as pseg

import _pickle as cPickle

import sklearn
import sklearn.feature_extraction.text
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.datasets
import sklearn.cluster
import sklearn.decomposition
import sklearn.metrics

import scipy 
import scipy.cluster.hierarchy
import gensim

import matplotlib.pyplot as plt #For graphics
import matplotlib.cm #Still for graphics
import seaborn as sns #Makes the graphics look nicer

#This 'magic' command makes the plots work better
#in the notebook, don't use it outside of a notebook.
#Also you can ignore the warning, it
%matplotlib inline

import itertools
import json

# The Score Function

The score function is a simple calculation developed by [Matt Taddy](https://arxiv.org/pdf/1504.07295.pdf) to calculate the likelihood that a given text would have been generated by a word-embedding model by summing the inner product between each pair of the text's word vectors. 

Here, we explore this using a model trained with millions of resumes from the CareerBuilder website (we can't share the private resumes...but we can share a model built with them :-):

In [25]:
resume_model  = gensim.models.word2vec.Word2Vec.load('../data/resumeAll.model')

In [26]:
type(resume_model)

gensim.models.word2vec.Word2Vec

We can examine the vacabularies of this model by building a word-index map:

In [31]:
vocab = resume_model.index2word


Let's just load the sample and take a look at it. The sentences in each job description are already tokenized and normalized.

In [13]:
sampleDF = pandas.read_csv('../data/SampleJobAds.csv', index_col = False)
#We need to convert the last couple columns from strings to lists
sampleDF['tokenized_sents'] = sampleDF['tokenized_sents'].apply(lambda x: eval(x))
sampleDF['normalized_sents'] = sampleDF['normalized_sents'].apply(lambda x: eval(x))
sampleDF

Unnamed: 0.1,Unnamed: 0,hiringOrganization_organizationName,jobDescription,jobLocation_address_region,jobLocation_geo_latitude,jobLocation_geo_longitude,qualifications,responsibilities,tokenized_sents,normalized_sents
0,158844,"Golfsmith International, Inc.","""Sales Associate Tracking Code 220425-971 Job ...",California,33.91918,-118.41647,,"""Ensure each Customer receives exceptional ser...","[[``, Sales, Associate, Tracking, Code, 220425...","[[sales, associate, tracking, code, job, descr..."
1,257645,Intel,For PHY system engineering team within the Wir...,,,,,,"[[For, PHY, system, engineering, team, within,...","[[for, phy, system, engineering, team, within,..."
2,107875,Florida Hospital,*RN Medical Oncology PCU Orlando - Nights* Flo...,Florida,28.53834,-81.37924,,,"[[*RN, Medical, Oncology, PCU, Orlando, -, Nig...","[[medical, oncology, pcu, orlando, florida, ho..."
3,202394,Hitachi Data Systems,Title: Specialist Sales Account Representative...,,,,,,"[[Title, :, Specialist, Sales, Account, Repres...","[[title, specialist, sales, account, represent..."
4,109675,Footprint Retail Services,**Footprint Retail Services** **Job Descriptio...,,,,,A Merchandiser must complete all assigned merc...,"[[**Footprint, Retail, Services**, **Job, Desc...","[[retail, job, title, retail, merchandiser, re..."
5,215973,Home Depot,Position Purpose: Provide outstanding service ...,Indiana,41.13060,-85.12886,,Provide outstanding service to ensure efficien...,"[[Position, Purpose, :, Provide, outstanding, ...","[[position, purpose, provide, outstanding, ser..."
6,207524,Home Depot,The Asset Protection Specialist is primarily r...,New Jersey,40.21455,-74.61932,Must be eighteen years of age or older. Must p...,,"[[The, Asset, Protection, Specialist, is, prim...","[[the, asset, protection, specialist, is, prim..."
7,64426,East West Bank,# Job Description East West Bank is one of the...,California,34.06862,-118.02757,,We are currently seeking a Customer Service Ce...,"[[#, Job, Description, East, West, Bank, is, o...","[[job, description, east, west, bank, is, one,..."
8,245192,IBM,Job Description IBM is seeking to hire a Senio...,,,,,,"[[Job, Description, IBM, is, seeking, to, hire...","[[job, description, ibm, is, seeking, to, hire..."
9,202429,Hitachi Data Systems,Title: Field Solutions Engineer Location: New ...,,,,,Job Functions;Specific duties in this role wil...,"[[Title, :, Field, Solutions, Engineer, Locati...","[[title, field, solutions, engineer, location,..."


Let's define a function to calculate the likelihood of each job description. The idea is borrowed from [Matt Taddy](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/deepir.ipynb), who shows how a document can be characterized as the inner product of the distance between its words. In other words, this analysis will show which job ads are most likely to find an appropriate pool of workers in the resume bank that generated our word embedding.  

In [15]:
def adprob(ad, model):
    sen_scores = model.score(ad, len(ad))
    ad_score = sen_scores.mean()
    return ad_score

Let's apply this function to every job description.

In [16]:
sampleDF['likelihood'] = sampleDF['normalized_sents'].apply(lambda x: adprob(x, resume_model))

Let's take a look at the top 5 job descriptions that have the highest likelihood.

In [17]:
for ad in sampleDF.sort_values(by = 'likelihood', ascending = False)['jobDescription'][:5]:
    print (ad + '\n\n')

Project Engineering including below jobs: 1. Hardware designing of DCS 2. Software configurations, programming, testing of DCS/PLC 3. Testing and FAT 4. Installation and commissioning. 5. Material ordering, approvals of datasheets. 6. HSE compliance as per HSE directives of HON. BE / B. Tech - Instrumentation / Control / Electronics. **Job:** **Engineering* **Title:** *Systems Engineer* **Location:** *IND-MH-Pune* **Requisition ID:** *00302235*


Like talking on the phone? Enjoy giving great customer service? Use those skills while working flexible,part time hours.


*# Positions:* 2 *Location:* US - UT - Orem *Category:* Engineering


Title: Respiratory Therapy, Intern Location: XX-XX-XX


Title: Position Opening at Illinois Wesleyan University Location: US-IL-Bloomington




Let's take a look at the bottom 5 job descriptions that have the lowest likelihood to be matched by the resumes.

In [None]:
for ad in sampleDF.sort_values(by = 'likelihood')['jobDescription'][:5]:
    print (ad + '\n\n')

We can do the same for phrases corresponding to job skills.

In [None]:
adprob([["python", "programming"]], resume_model)

In [None]:
adprob([["basic", "programming"]], resume_model)

Basic programming appears to be more likely in this pool of resumes than python programming. 

We can also do some simple statistics. Unfortunately, we don't have a large sample here. Nevertheless, let's first look at the mean likelihood score of each hiring organization. Some organizations will do well to hire on CareerBuilder...while others will not.

In [None]:
sampleDF.groupby("hiringOrganization_organizationName")[['likelihood']].mean().sort_values('likelihood', ascending = False)

We can also look at the mean likelihood of each state.

In [None]:
sampleDF.groupby("jobLocation_address_region")[['likelihood']].mean().sort_values('likelihood', ascending = False)

You would increase the sample size if you want to do a more serious study.

## <span style="color:red">*Exercise 1a*</span>

<span style="color:red">**Do only 1a or 1b.** Construct cells immediately below this that calculate the scores for a small sample of documents from outside your corpus to identify which are *closest* to your corpus. Then calculate the scores for a few phrases or sentences to identify the ones most likely to have appeared in your corpus. Interrogate patterns associated with these document/phrase scores (e.g., which companies produced job ads most or least likely to find jobseekers in the resume corpus?) What do these patterns suggest about the boundaries of your corpus?

<span style="color:green"> My dataset contains the most popular 1000 novels. For this exercise, I look for the least popular novels. I scraped the second last page of the male and female ranking board to make sure that there are 50 novels on each of the page (so it is more structured).



In [5]:
#Scraping more books 
BookList_male= []

#I take the second last page to make sure that there are 50 books on the page
rank_url= "https://www.qidian.com/all?size=2&orderId=&style=2&pageSize=50&siteid=1&pubflag=0&hiddenField=0&page=280"

page_request= requests.get(rank_url)
page_soup= BeautifulSoup(page_request.text, "lxml")

url_list= page_soup.body.find("div", attrs= {"class": "all-book-list"})\
           .find_all("a", attrs= {"class": "name"}) #Continuing the previous line!!!
for i in range(50): 
    a_book= [url_list[i]["href"]]
    BookList_male.append(a_book)

#Output of this loop is links of the books on the ranking broad.
#Output is list within list
#[[book1's url, number of being bookmarked], [book2's url, number of being bookmarked], etc]
print(len(BookList_male))
print(BookList_male[:10])

#=====================================================================================
BookList_female= []

#I take the second last page to make sure that there are 50 books on the page
rank_url= "https://www.qidian.com/mm/all?size=2&orderId=&style=2&pageSize=50&siteid=0&pubflag=0&hiddenField=0&page=195"

page_request= requests.get(rank_url)
page_soup= BeautifulSoup(page_request.text, "lxml")

url_list= page_soup.body.find("div", attrs= {"class": "all-book-list"})\
           .find_all("a", attrs= {"class": "name"}) #Continuing the previous line!!!
for i in range(50): 
    a_book= [url_list[i]["href"]]
    BookList_female.append(a_book)

#Output of this loop is links of the books on the ranking broad.
#Output is list within list
#[[book1's url, number of being bookmarked], [book2's url, number of being bookmarked], etc]
print(len(BookList_female))
print(BookList_female[:10])

50
[['//book.qidian.com/info/1003683686'], ['//book.qidian.com/info/1003690577'], ['//book.qidian.com/info/1003692354'], ['//book.qidian.com/info/1003692242'], ['//book.qidian.com/info/1003708434'], ['//book.qidian.com/info/1003707566'], ['//book.qidian.com/info/1003710138'], ['//book.qidian.com/info/1003712764'], ['//book.qidian.com/info/1003713093'], ['//book.qidian.com/info/1003713099']]
50
[['//book.qidian.com/info/1004120556'], ['//book.qidian.com/info/1004120722'], ['//book.qidian.com/info/1004120871'], ['//book.qidian.com/info/1004121114'], ['//book.qidian.com/info/1004121151'], ['//book.qidian.com/info/1004121656'], ['//book.qidian.com/info/1004122213'], ['//book.qidian.com/info/1004122422'], ['//book.qidian.com/info/1004122571'], ['//book.qidian.com/info/1004124134']]


In [6]:
def soup_a_book(a_book_url):
    ##Extracting book introduction from a webpage+ Using Beautifulsoup to parse a book's web page
    a_book_request= requests.get(a_book_url)
    a_book_soup= BeautifulSoup(a_book_request.text, 'lxml')
    return a_book_soup



def get_text(a_book_soup):
    #Input is the soup of the introduction page of the book 
    #Obtain 10 chapters as example writing, 10 chapters are stored in one dictionary
    
    #Using the introduction page to get the the "read for free (= first page)"
    def get_content(for_read_url):
        read_url= "https:"+ for_read_url

        ch= requests.get(read_url)
        ch_soup= BeautifulSoup(ch.text, 'lxml')
        #Strange!!! I followed the teacher's code and use "html.parser",
        #and I get a TypeError called:
        #TypeError: 'NoneType' object is not subscriptable
        #This problem is no longer there, if I somehow use "lxml"
        #I find this out completely by accident! Pure luck!

        chP= ch_soup.body.find("div", attrs= {"class": "read-content"}).find_all("p")
        chP_joined= ""
        for p in chP:
            chP_joined+= str(p)   
        chP_joined= chP_joined.replace("\u3000", "")
        #print(for_chP)
        ch_content= re.sub(r"(<\/*p>)(\1*)", "", chP_joined) #(Regular Expression 5)
        ch_content= ch_content.strip()
        return ch_content, ch_soup
    
    content_dic= {}
    i= 1 
    while i< 31:
        if i== 1:
            for_read_url= a_book_soup.body.find("a", text= "免费试读")["href"] 
            ch_content, ch_soup = get_content(for_read_url)
            content_dic[i]= ch_content
            i+= 1
        else:
            for_read_url= ch_soup.body.find("a", text= "下一章")["href"]
            #print(i, for_read_url)
            ch_content, ch_soup = get_content(for_read_url)
            content_dic[i]= ch_content
            i+= 1
    
    return content_dic

def deal_with_a_book(a_book_url):
    a_book_soup= soup_a_book(a_book_url)

    content_dic= get_text(a_book_soup)
    
    a_book_dic= {"example_text": content_dic}
    helper= [a_book_dic]
    bookDF= pd.DataFrame(data= helper)
    return bookDF, a_book_soup

In [8]:
t0 = time.time()

test_male= pd.DataFrame({})

for book in BookList_male[:10]:
    book_url= "https:"+ book[0]
    a_bookDF, a_book_soup= deal_with_a_book(book_url)
    #The "number of being bookmarked" information is not available on the
    #book introduction page. This information is available on the book ranking page. 
    #Thus, I add a column for each book about its number of being bookmarked:

    test_male= test_male.append(a_bookDF)
    test_male.to_pickle("test_male.pickle")
        
t1 = time.time()
total = t1-t0
print(total)

531.9532141685486


In [15]:
t0 = time.time()

test_female= pd.DataFrame({})

for book in BookList_female[:10]:
    book_url= "https:"+ book[0]
    a_bookDF, a_book_soup= deal_with_a_book(book_url)
    #The "number of being bookmarked" information is not available on the
    #book introduction page. This information is available on the book ranking page. 
    #Thus, I add a column for each book about its number of being bookmarked:

    test_female= test_female.append(a_bookDF)
    test_female.to_pickle("test_female.pickle")
        
t1 = time.time()
total = t1-t0
print(total)

528.1364879608154


In [16]:
def tokenize_ch(thirty_ch):
    #Take the 30-chapter dictionary as input
    #Tokenize the 30 chapters, combine them and output a list 
    thirty_ch_list= []
    for ch in thirty_ch.keys():
        ch_list= jieba.lcut(thirty_ch[ch], cut_all= False)
        thirty_ch_list+= ch_list
    return thirty_ch_list


test_male["example_token"]= test_male["example_text"].apply(lambda x: tokenize_ch(x))
test_male["example_token"]

test_female["example_token"]= test_female["example_text"].apply(lambda x: tokenize_ch(x))
test_female["example_token"]


0    [回忆, ，, 如果, 带给, 人, 的, 是, 难以, 抹, 去, 的, 痛苦, ，, 这...
0    [微风, 徐徐, 的, 吹, 过, 校园, ，, 我, 抬头, 望, 着, 天空, ，, 秋...
0    [毛玉, ，, M, 市, 第一, 人民, 医院, 最, 年轻漂亮, 的, 副, 主任医师,...
0    [落红, 还, 没, 完全, 化作, 春泥, ，, 石榴, 已有, 了, 笑意, 。, 接踵...
0    [二十一, 世纪, ，, A, 市, 一个, 都市, 村庄, 内, ，, 叶, 清雅, 就,...
0    [如果, 你, 要, 问, 最近, 有没有, 什么, 热门话题, ，, 那, 必定, 是, ...
0    [身上, 很, 疼, ，, 尤其, 是, 脑袋, 。,  ,  , 周, 浅浅, 吧嗒, 着...
0    [这个, 是, 什么, ？, 这个, 又, 是, 什么, ？, 这个, ！, “, 终于, ...
0    [“, 现在, 可以, 说, 是, 怎么回事, 了, 吧, ？, ”, 飞儿, 坐在, 车里...
0    [“, 雨笑, ，, 喂, …, …, ！, 雨笑, …, …, …, …, …, …, ”...
Name: example_token, dtype: object

<span style="color:green"> I did not save the stop words, and I really do not want to run everything again just to construct the stop word list. So I decide to just copy and paste them here...

In [17]:
stop_words= ['$', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '?', '_', '“', 
             '”', '、', '。', '《', '》', '一', '一些', '一何', '一则', '一方面', '一旦', 
             '一来', '一样', '一般', '万一', '上', '上下', '下', '不', '不仅', '不但', '不光', 
             '不单', '不只', '不外乎', '不如', '不妨', '不尽', '不尽然', '不得', '不怕', '不惟', 
             '不成', '不拘', '不料', '不是', '不比', '不然', '不特', '不独', '不管', '不至于', 
             '不若', '不论', '不过', '不问', '与', '与其', '与其说', '与否', '与此同时', '且', 
             '且不说', '且说', '两者', '个', '个别', '临', '为', '为了', '为什么', '为何', '为止', 
             '为此', '为着', '乃', '乃至', '乃至于', '么', '之', '之一', '之所以', '之类', '乌乎', 
             '乎', '乘', '也', '也好', '也罢', '了', '二来', '于', '于是', '于是乎', '云云', '云尔', 
             '些', '亦', '人', '人们', '人家', '什么', '什么样', '今', '介于', '仍', '仍旧', '从', 
             '从此', '从而', '他人', '以', '以上', '以为', '以便', '以免', '以及', '以故', '以期', 
             '以来', '以至', '以至于', '以致', '们', '任', '任何', '任凭', '似的', '但', '但凡', 
             '但是', '何', '何以', '何况', '何处', '何时', '余外', '作为', '你', '你们', '使', 
             '使得', '例如', '依', '依据', '依照', '便于', '俺', '俺们', '倘', '倘使', '倘或', 
             '倘然', '倘若', '借', '假使', '假如', '假若', '傥然', '像', '儿', '先不先', '光是', 
             '全体', '全部', '兮', '关于', '其', '其一', '其中', '其二', '其他', '其余', '其它', 
             '其次', '具体地说', '具体说来', '兼之', '内', '再', '再其次', '再则', '再有', '再者', 
             '再者说', '再说', '冒', '冲', '况且', '几', '几时', '凡', '凡是', '凭', '凭借', '出于', 
             '出来', '分别', '则', '则甚', '别', '别人', '别处', '别是', '别的', '别管', '别说', '到', 
             '前后', '前此', '前者', '加之', '加以', '即', '即令', '即使', '即便', '即如', '即或', 
             '即若', '却', '去', '又', '又及', '及', '及其', '及至', '反之', '反而', '反过来', 
             '反过来说', '受到', '另', '另一方面', '另外', '另悉', '只', '只当', '只怕', '只是', 
             '只有', '只消', '只要', '只限', '叫', '可', '可以', '可是', '可见', '各', '各个', 
             '各位', '各种', '各自', '同', '同时', '后', '后者', '向', '向使', '向着', '吓', '吗', 
             '否则', '吧', '吧哒', '吱', '呀', '呃', '呕', '呗', '呜', '呜呼', '呢', '呵', '呵呵', 
             '呸', '呼哧', '咋', '和', '咚', '咦', '咧', '咱', '咱们', '咳', '哇', '哈', '哈哈', '哉', 
             '哎', '哎呀', '哎哟', '哗', '哟', '哦', '哩', '哪', '哪个', '哪些', '哪儿', '哪天', 
             '哪年', '哪怕', '哪样', '哪边', '哪里', '哼', '哼唷', '唉', '唯有', '啊', '啐', '啥', 
             '啦', '啪达', '啷当', '喂', '喏', '喔唷', '喽', '嗡', '嗡嗡', '嗬', '嗯', '嗳', '嘎', 
             '嘎登', '嘘', '嘛', '嘻', '嘿', '嘿嘿', '因', '因为', '因了', '因此', '因着', '因而', 
             '固然', '在', '在下', '在于', '地', '基于', '处在', '多', '多么', '多少', '大', '大家', 
             '好', '如', '如上', '如上所述', '如下', '如何', '如其', '如同', '如是', '如果', '如此', 
             '如若', '始而', '孰料', '孰知', '宁', '宁可', '宁愿', '宁肯', '它', '它们', '对', '对于', 
             '对待', '对方', '对比', '将', '小', '尔', '尔后', '尔尔', '尚且', '就', '就是', '就是了', 
             '就是说', '就算', '就要', '尽', '尽管', '尽管如此', '岂但', '己', '已', '已矣', '巴', 
             '巴巴', '并', '并且', '并非', '庶乎', '庶几', '开外', '开始', '归', '归齐', '当', 
             '当地', '当然', '当着', '彼', '彼时', '彼此', '往', '待', '很', '得', '得了', '怎', 
             '怎么', '怎么办', '怎么样', '怎奈', '怎样', '总之', '总的来看', '总的来说', '总的说来', 
             '总而言之', '恰恰相反', '您', '惟其', '慢说', '我', '我们', '或', '或则', '或是', 
             '或曰', '或者', '截至', '所', '所以', '所在', '所幸', '所有', '才', '才能', '打', 
             '打从', '把', '抑或', '拿', '按', '按照', '换句话说', '换言之', '据', '据此', '接着', 
             '故', '故此', '故而', '旁人', '无', '无宁', '无论', '既', '既往', '既是', '既然', 
             '时候', '是', '是以', '是的', '曾', '替', '替代', '有', '有些', '有关', '有及', '有时', 
             '有的', '望', '朝', '朝着', '本', '本人', '本地', '本着', '本身', '来', '来着', '来自', 
             '来说', '极了', '果然', '果真', '某', '某个', '某些', '某某', '根据', '欤', '正值', '正如', 
             '正巧', '正是', '此', '此地', '此处', '此外', '此时', '此次', '此间', '毋宁', '每', '每当', 
             '比', '比及', '比如', '比方', '没奈何', '沿', '沿着', '漫说', '焉', '然则', '然后', '然而', 
             '照', '照着', '犹且', '犹自', '甚且', '甚么', '甚或', '甚而', '甚至', '甚至于', '用', '用来', 
             '由', '由于', '由是', '由此', '由此可见', '的', '的确', '的话', '直到', '相对而言', '省得', 
             '看', '眨眼', '着', '着呢', '矣', '矣乎', '矣哉', '离', '竟而', '第', '等', '等到', '等等', 
             '简言之', '管', '类如', '紧接着', '纵', '纵令', '纵使', '纵然', '经', '经过', 
             '结果', '给', '继之', '继后', '继而', '综上所述', '罢了', '者', '而', '而且', '而况', 
             '而后', '而外', '而已', '而是', '而言', '能', '能否', '腾', '自', '自个儿', '自从', 
             '自各儿', '自后', '自家', '自己', '自打', '自身', '至', '至于', '至今', '至若', '致', 
             '般的', '若', '若夫', '若是', '若果', '若非', '莫不然', '莫如', '莫若', '虽', '虽则', 
             '虽然', '虽说', '被', '要', '要不', '要不是', '要不然', '要么', '要是', '譬喻', '譬如', 
             '让', '许多', '论', '设使', '设或', '设若', '诚如', '诚然', '该', '说来', '诸', '诸位', 
             '诸如', '谁', '谁人', '谁料', '谁知', '贼死', '赖以', '赶', '起', '起见', '趁', '趁着', 
             '越是', '距', '跟', '较', '较之', '边', '过', '还', '还是', '还有', '还要', '这', 
             '这一来', '这个', '这么', '这么些', '这么样', '这么点儿', '这些', '这会儿', '这儿', 
             '这就是说', '这时', '这样', '这次', '这般', '这边', '这里', '进而', '连', '连同', 
             '逐步', '通过', '遵循', '遵照', '那', '那个', '那么', '那么些', '那么样', '那些', 
             '那会儿', '那儿', '那时', '那样', '那般', '那边', '那里', '都', '鄙人', '鉴于', '针对',
             '阿', '除', '除了', '除外', '除开', '除此之外', '除非', '随', '随后', '随时', '随着', 
             '难道说', '非但', '非徒', '非特', '非独', '靠', '顺', '顺着', '首先', '！', '，', '：',
             '；', '？'
             '，', '。', ' ', '…', '《', '》', '！', '？', '‘', '’', '、', '：', ':', '２', 
             '（', '）', '—', '*', '；', '“', '”', '\r', '!', '?', ',', '=', '【', '】', '~', 
             'V', '％', '-', 'ｃ', 'X', 'ｅ', '^', '_', '<', 'a', '"', '/', '#', '>', 'Ｐ', 'Ｓ', 
             '＜', '＞', '+', '%', '～', '.', '##########', '[', ']', '＝', '·', 'ㄟ', '(', '▔', ')', 
             'ㄏ', '－', '2', 'ｍ', '@', '８', '６', '０', 'の', '剣', 'を', '喰', 'ら', 'え', '啾', 
             '|', '▍', '1', '╰', '☆', '＋', '∽', '\xa0', '3', '0', '&#', '�', '╄', '\\', 'Ⅱ', 
             'u', '①', '②', '③', '④', '．', 'Ｙ', '{', '}', '###########################', '4', 
             '6', '7', '8', 'A', 'B', '&', '┃', 'ゞ', '「', '」', '狷', '→', ';', '※', 
             '+++++++++++++++++++++++++++++++++++++++++++++++', 
              '∝', '灞', '＊', 'Ｑ', '+++++++++++++++++++++++++++++++++++++']

In [18]:
def remove_stopw(wordlist, stopw):
    cleaned_list= []
    for w in wordlist:
        if w not in stopw:
            cleaned_list+= [w]
    return cleaned_list

In [34]:
test_male["example_token_clean"]= test_male["example_token"].apply(lambda x: remove_stopw(x, stop_words))
test_female["example_token_clean"]= test_female["example_token"].apply(lambda x: remove_stopw(x, stop_words))
test_DF= pd.concat([test_male[["example_token_clean", "example_text"]], 
                    test_female[["example_token_clean", "example_text"]]],
                  axis= 0)

<span style="color:green"> For the errors below... I think this exercise is not going to work for me right now.

<span style="color:green"> I do not understand the first error. The model of the example is also of the class  'gensim.models.word2vec.Word2Vec', so is my model. But then "'Word2Vec' object has no attribute 'index2word'" when it comes to my model. Confused...

<span style="color:green"> For the second error, I choose not to fix it. The error tells me to set hs= 1 in the model training for this operation to work. But Linzhuo noticed that there is some problems with the model when hs= 1: The training loss continuously increases rather than decreases as the training epoch increases. I do not want to set hs to 1 since I don't know what is wrong and do not trust the hs= 1 model. If the trained model is suspecious (by setting hs= 1), then it doesn't make sense to compare the new data with the original model? 

In [43]:
book_model  = gensim.models.word2vec.Word2Vec.load('../bookW2V_loss.model')
print(type(book_model))
vocab = book_model.index2word

<class 'gensim.models.word2vec.Word2Vec'>


AttributeError: 'Word2Vec' object has no attribute 'index2word'

In [42]:
#This is from the example above. 
#Do not understand why it works for the example but not for mine. 
resume_model  = gensim.models.word2vec.Word2Vec.load('../data/resumeAll.model')
print(type(resume_model))
testing_vocab= resume_model.index2word

<class 'gensim.models.word2vec.Word2Vec'>


In [36]:
def adprob(ad, model):
    sen_scores = model.score(ad, len(ad))
    ad_score = sen_scores.mean()
    return ad_score

In [37]:
test_DF['likelihood'] = test_DF['example_token_clean'].apply(lambda x: adprob(x, book_model))

RuntimeError: We have currently only implemented score for the hierarchical softmax scheme, so you need to have run word2vec with hs=1 and negative=0 for this to work.

In [None]:
for ad in test_DF.sort_values(by = 'likelihood', ascending = False)['example_text'][:5]:
    print (ad + '\n\n')

In [None]:
for ad in test_DF.sort_values(by = 'likelihood')['example_text'][:5]:
    print (ad + '\n\n')

# Linguistic Change

Below is code that aligns the dimensions of multiple embeddings arrayed over time or some other dimension and allow identification of semantic chanage as the word vectors change their loadings for focal words. This code comes from the approach piloted at Stanford by William Hamilton, Daniel Jurafsky and Jure Lescovec [here](https://arxiv.org/pdf/1605.09096.pdf). 

In [None]:
def calc_syn0norm(model):
    """since syn0norm is now depricated"""
    return (model.wv.syn0 / np.sqrt((model.wv.syn0 ** 2).sum(-1))[..., np.newaxis]).astype(np.float32)

def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """Procrustes align two gensim word2vec models (to allow for comparison between same word across models).
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.
    (With help from William. Thank you!)
    First, intersect the vocabularies (see `intersection_align_gensim` documentation).
    Then do the alignment on the other_embed model.
    Replace the other_embed model's syn0 and syn0norm numpy matrices with the aligned version.
    Return other_embed.
    If `words` is set, intersect the two models' vocabulary with the vocabulary in words (see `intersection_align_gensim` documentation).
    """
    base_embed = copy.copy(base_embed)
    other_embed = copy.copy(other_embed)
    # make sure vocabulary and indices are aligned
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)

    # get the embedding matrices
    base_vecs = calc_syn0norm(in_base_embed)
    other_vecs = calc_syn0norm(in_other_embed)

    # just a matrix dot product with numpy
    m = other_vecs.T.dot(base_vecs) 
    # SVD method from numpy
    u, _, v = np.linalg.svd(m)
    # another matrix operation
    ortho = u.dot(v) 
    # Replace original array with modified one
    # i.e. multiplying the embedding matrix (syn0norm)by "ortho"
    other_embed.wv.syn0norm = other_embed.wv.syn0 = (calc_syn0norm(other_embed)).dot(ortho)
    return other_embed
    
def intersection_align_gensim(m1,m2, words=None):
    """
    Intersect two gensim word2vec models, m1 and m2.
    Only the shared vocabulary between them is kept.
    If 'words' is set (as list or set), then the vocabulary is intersected with this list as well.
    Indices are re-organized from 0..N in order of descending frequency (=sum of counts from both m1 and m2).
    These indices correspond to the new syn0 and syn0norm objects in both gensim models:
        -- so that Row 0 of m1.syn0 will be for the same word as Row 0 of m2.syn0
        -- you can find the index of any word on the .index2word list: model.index2word.index(word) => 2
    The .vocab dictionary is also updated for each model, preserving the count but updating the index.
    """

    # Get the vocab for each model
    vocab_m1 = set(m1.wv.vocab.keys())
    vocab_m2 = set(m2.wv.vocab.keys())

    # Find the common vocabulary
    common_vocab = vocab_m1&vocab_m2
    if words: common_vocab&=set(words)

    # If no alignment necessary because vocab is identical...
    if not vocab_m1-common_vocab and not vocab_m2-common_vocab:
        return (m1,m2)

    # Otherwise sort by frequency (summed for both)
    common_vocab = list(common_vocab)
    common_vocab.sort(key=lambda w: m1.wv.vocab[w].count + m2.wv.vocab[w].count,reverse=True)

    # Then for each model...
    for m in [m1,m2]:
        # Replace old syn0norm array with new one (with common vocab)
        indices = [m.wv.vocab[w].index for w in common_vocab]
        old_arr = calc_syn0norm(m)
        new_arr = np.array([old_arr[index] for index in indices])
        m.wv.syn0norm = m.wv.syn0 = new_arr

        # Replace old vocab dictionary with new one (with common vocab)
        # and old index2word with new one
        m.index2word = common_vocab
        old_vocab = m.wv.vocab
        new_vocab = {}
        for new_index,word in enumerate(common_vocab):
            old_vocab_obj=old_vocab[word]
            new_vocab[word] = gensim.models.word2vec.Vocab(index=new_index, count=old_vocab_obj.count)
        m.wv.vocab = new_vocab

    return (m1,m2)

In order to explore this, let's get some data that follows a time trend. We'll look at conference proceedings from the American Society for Clinical Oncologists.

In [None]:
ascoDF = pandas.read_csv("../data/ASCO_abstracts.csv", index_col=0)

Prepare for wor2vec

In [None]:
ascoDF['tokenized_sents'] = ascoDF['Body'].apply(lambda x: [nltk.word_tokenize(s) for s in nltk.sent_tokenize(x)])
ascoDF['normalized_sents'] = ascoDF['tokenized_sents'].apply(lambda x: [lucem_illud.normalizeTokens(s, stopwordLst = lucem_illud.stop_words_basic) for s in x])

We will be creating many embeddings so we have created this function to do most of the work. It creates two collections of embeddings, one the original and one the aligned.

In [None]:
def compareModels(df, category, sort = True):
    """If you are using time as your category sorting is important"""
    embeddings_raw = {}
    cats = sorted(set(df[category]))
    for cat in cats:
        #This can take a while
        print("Embedding {}".format(cat), end = '\r')
        subsetDF = df[df[category] == cat]
        #You might want to change the W2V parameters
        embeddings_raw[cat] = gensim.models.word2vec.Word2Vec(subsetDF['normalized_sents'].sum())
    #These are much quicker
    embeddings_aligned = {}
    for catOuter in cats:
        embeddings_aligned[catOuter] = [embeddings_raw[catOuter]]
        for catInner in cats:
            embeddings_aligned[catOuter].append(smart_procrustes_align_gensim(embeddings_aligned[catOuter][-1], embeddings_raw[catInner]))
    return embeddings_raw, embeddings_aligned

Now we generate the models

In [None]:
rawEmbeddings, comparedEmbeddings = compareModels(ascoDF, 'Year')

We need to compare them across all permutions so we will define another function to help, we will be using 1 - cosine similarity as that gives a more intitive range of 0-2 with low values meaning little change and high meaning lots of change

In [None]:
def getDivergenceDF(word, embeddingsDict):
    dists = []
    cats = sorted(set(embeddingsDict.keys()))
    dists = {}
    for cat in cats:
        dists[cat] = []
        for embed in embeddingsDict[cat][1:]:
            dists[cat].append(np.abs(1 - sklearn.metrics.pairwise.cosine_similarity(np.expand_dims(embeddingsDict[cat][0][word], axis = 0),
                                                                             np.expand_dims(embed[word], axis = 0))[0,0]))
    return pandas.DataFrame(dists, index = cats)

Lets look at a couple words

In [None]:
targetWord = 'breast'

pltDF = getDivergenceDF(targetWord, comparedEmbeddings)
fig, ax = plt.subplots(figsize = (10, 7))
seaborn.heatmap(pltDF, ax = ax, annot = False) #set annot True for a lot more information
ax.set_xlabel("Starting year")
ax.set_ylabel("Final year")
ax.set_ylabel("Final year")
ax.set_title("Yearly linguistic change for: '{}'".format(targetWord))
plt.show()

In [None]:
targetWord = 'triple'

pltDF = getDivergenceDF(targetWord, comparedEmbeddings)
fig, ax = plt.subplots(figsize = (10, 7))
seaborn.heatmap(pltDF, ax = ax, annot = False) #set annot True for a lot more information
ax.set_xlabel("Starting year")
ax.set_ylabel("Final year")
ax.set_ylabel("Final year")
ax.set_title("Yearly linguistic change for: '{}'".format(targetWord))
plt.show()

We can also ask which words changed the most

In [None]:
def findDiverence(word, embeddingsDict):
    cats = sorted(set(embeddingsDict.keys()))
    
    dists = []
    for embed in embeddingsDict[cats[0]][1:]:
        dists.append(1 - sklearn.metrics.pairwise.cosine_similarity(np.expand_dims(embeddingsDict[cats[0]][0][word], axis = 0), np.expand_dims(embed[word], axis = 0))[0,0])
    return sum(dists)

def findMostDivergent(embeddingsDict):
    words = []
    for embeds in embeddingsDict.values():
        for embed in embeds:
            words += list(embed.wv.vocab.keys())
    words = set(words)
    print("Found {} words to compare".format(len(words)))
    return sorted([(w, findDiverence(w, embeddingsDict)) for w in words], key = lambda x: x[1], reverse=True)
    

In [None]:
wordDivergences = findMostDivergent(comparedEmbeddings)

The most divergent words are:

In [None]:
wordDivergences[:10]

And the least

In [None]:
wordDivergences[-10:]

In [None]:
targetWord = wordDivergences[0][0]

pltDF = getDivergenceDF(targetWord, comparedEmbeddings)
fig, ax = plt.subplots(figsize = (10, 7))
seaborn.heatmap(pltDF, ax = ax, annot = False) #set annot True for a lot more information
ax.set_xlabel("Starting year")
ax.set_ylabel("Final year")
ax.set_ylabel("Final year")
ax.set_title("Yearly linguistic change for: '{}'".format(targetWord))
plt.show()

In [None]:
targetWord = wordDivergences[-1][0]

pltDF = getDivergenceDF(targetWord, comparedEmbeddings)
fig, ax = plt.subplots(figsize = (10, 7))
seaborn.heatmap(pltDF, ax = ax, annot = False) #set annot True for a lot more information
ax.set_xlabel("Starting year")
ax.set_ylabel("Final year")
ax.set_ylabel("Final year")
ax.set_title("Yearly linguistic change for: '{}'".format(targetWord))
plt.show()

## <span style="color:red">*Exercise 1b*</span>

<span style="color:red">**Do only 3a or 3b.** Construct cells immediately below this that align word embeddings over time. Interrogate the spaces that result and ask which words change most of the whole period. What does this reveal about the social game underlying your space?