# Calculating Semantic Relatedness using Wikipedia
# Pre-Embedding Version

This is a simple, step-by-step explanation of calculating semantic relatedness using Wikipedia. We start by preprocessing the data and building the API; you need to have a MySQL server up and running.

# Using Prepared Tables
## Download

Download the following files and decompress them to a dir (we assume the path to be ~/Downloads/wikidumps):

http://cgm6.research.cs.dal.ca/~sajadi/downloads/wikisim/enwiki-20160305-page.main.tsv.gz

http://cgm6.research.cs.dal.ca/~sajadi/downloads/wikisim/enwiki-20160305-redirect.main.tsv.gz

http://cgm6.research.cs.dal.ca/~sajadi/downloads/wikisim/enwiki-20160305-pagelinksorderedin.main.tsv.gz

http://cgm6.research.cs.dal.ca/~sajadi/downloads/wikisim/enwiki-20160305-pagelinksorderedout.main.tsv.gz

### Preparing mysql
Running the following script will set some variables in MySQL for maximum performance (if you have enough physical memory). Replace `<user>` and `<pass>` (the script's \$1 and \$2 arguments) with the actual user and password:

`bash setupmysql.sh <user> <pass>`


## Actual Importing
Run:

```mysql -u <user> -p<pass> -e 'CREATE SCHEMA `enwiki20160305` DEFAULT CHARACTER SET binary;'```

`./importall  ~/Downloads/wikidumps enwiki20160305 <user> <pass>`

This might take several hours.



# Wikipedia Interface
This is the main interface to the Wikipedia database. It provides basic functions for a given page, such as its:

* id or title
* synonym ring
* linkage
* in or out neighborhood. 

**You might need to modify the user, password and port number.**


In [1]:
%%writefile wikismall.py 
"""A General Class to interact with Wiki datasets"""
# uncomment

import MySQLdb
import sys;
import os
import scipy as sp
import pandas as pd
#from collections import defaultdict
import cPickle as pickle

__author__ = "Armin Sajadi"
__copyright__ = "Copyright 215, The Wikisim Project"
__credits__ = ["Armin Sajadi", "Evangelo Milios", "Armin Sajadi"]
__license__ = "GPL"
__version__ = "1.0.1"
__maintainer__ = "Armin Sajadi"
__email__ = "sajadi@cs.dal.ca"
__status__ = "Development"


# When True, checkcache() returns None without querying the database.
DISABLE_CACHE=False;
# NOTE(review): not referenced anywhere in the visible part of this file.
MAX_GRAPH_SIZE=1000000

# Direction codes: 0 selects the in-link table, 1 the out-link table
# (see checkcache); 2 is the code for using both directions.
DIR_IN=0;
DIR_OUT=1;
DIR_BOTH=2;
# Module-level connection and cursor shared by every query helper in this
# module; they are opened as a side effect of importing it.
# NOTE(review): host, port and credentials are hard-coded here (and again in
# reopen()); adjust them to the local MySQL setup and avoid keeping a real
# password in source.
_db = MySQLdb.connect(host="127.0.0.1",port=3307,user='root',passwd="emilios",db="enwiki20160305")
_cursor = _db.cursor()
# Alternative values left commented out by the author.
#WIKI_SIZE = 10216236;
#WIKI_SIZE = 13670498; #2016
# Presumably the number of pages in the 2016 dump, redirects excluded.
WIKI_SIZE = 5576365; #no redirect, 2016
def close():
    """Close the shared cursor and connection and reset both globals.

    Nothing is closed when the cursor is already None; in either case
    ``_cursor`` and ``_db`` are None afterwards.
    """
    global _db, _cursor
    cursor_is_open = _cursor is not None
    if cursor_is_open:
        _cursor.close()
        _db.close()
    _db = None
    _cursor = None
def reopen():
    """Re-create the shared connection and cursor if the connection is closed.

    Does nothing when ``_db`` is already set.
    """
    global _db, _cursor
    if _db is not None:
        return
    _db = MySQLdb.connect(host="127.0.0.1",port=3307,user='root',passwd="emilios",db="enwiki20160305")
    _cursor = _db.cursor()
        

def id2title(wid):
    """ Looks up the title of the page with the given id

    Args:
        wid: Wikipedia id
    Returns:
        The third column of the matching `page` row (presumably the
        title; this relies on the table's column order), or None when no
        row matches
    """
    _cursor.execute("""SELECT * FROM `page` where page_id = %s""", (wid,))
    record = _cursor.fetchone()
    return None if record is None else record[2]

def ids2title(wids):
    """ Returns the titles for a given collection of wikipedia ids

    Args:
        wids: An iterable of Wikipedia ids (anything ``int()`` accepts)
    Returns:
        The list of titles, in the same order as the given ids. An empty
        input yields an empty list without querying the database.
    Raises:
        KeyError: if one of the ids has no row in the `page` table
        ValueError, TypeError: if an id cannot be converted to an integer
    """
    # Materialise once so that generators and other one-shot iterables work,
    # and force every id to an integer before it gets near the query.
    wid_list = [int(wid) for wid in wids]
    if not wid_list:
        # "page_id in ()" is not valid SQL, so short-circuit.
        return []

    # One %s placeholder per id; the driver escapes the values itself.
    placeholders = ",".join(["%s"] * len(wid_list))
    query = "SELECT page_id, page_title FROM `page` where page_id in ({0})" \
        .format(placeholders)
    _cursor.execute(query, tuple(wid_list))

    # The database returns rows in arbitrary order; restore the input order.
    title_by_id = dict(_cursor.fetchall())
    return [title_by_id[wid] for wid in wid_list]

def encode_for_db(instr):
    """Return *instr* in the form passed to database queries.

    ``unicode`` objects are encoded as UTF-8; any other value is returned
    unchanged.
    """
    return instr.encode('utf-8') if isinstance(instr, unicode) else instr
        
def normalize_str(title):
    """Convert a title to the form used in title lookups.

    The title is passed through encode_for_db and every space is replaced
    by an underscore.
    """
    return encode_for_db(title).replace(' ', '_')
def title2id(title):
    """ Returns the id for a given title

    Args:
        title: page title; it is normalized with normalize_str first
    Returns:
        The id taken from the first matching `page` row in namespace 0, or
        None when no row matches. When the fourth column of that row is
        truthy the id is resolved through getredir_id instead, so the
        result is None if no redirect target is found.
    """
    _cursor.execute("""SELECT * FROM `page` where page_title=%s and page_namespace=0""", (normalize_str(title),))
    record = _cursor.fetchone()
    if record is None:
        return None
    if record[3]:
        return getredir_id(record[0])
    return record[0]


def getredir_id(wid):
    """ Looks up where a redirect page points to

    Args:
        wid: wikipedia id of the (redirect) page
    Returns:
        The second column of the `redirect` row whose rd_from equals wid,
        or None when there is no such row
    """
    _cursor.execute("""select * from redirect where rd_from=%s;""", (wid,))
    record = _cursor.fetchone()
    return record[1] if record is not None else None

def resolveredir(wid):
    """Return the redirect target of wid, or wid itself when getredir_id finds none."""
    target = getredir_id(wid)
    return wid if target is None else target

def getredir_title(wid):
    """ Returns the target title of a redirected page

    Args:
        wid: wikipedia id of the page
    Returns:
        The title of the page that wid redirects to, or None when wid has
        no row in the `redirect` table
    """
    # The arguments must be a real one-element tuple: "(wid)" is just wid,
    # which not every driver version accepts as a parameter sequence.
    _cursor.execute(""" select page_title from redirect INNER JOIN page
                  on redirect.rd_to = page.page_id
                  where redirect.rd_from =%s;""", (wid,))
    row = _cursor.fetchone()
    if row is None:
        return None
    return row[0]

def synonymring_titles(wid):
    """ Returns the synonym ring of a page

    The ring is the title of the page itself (after resolving a redirect)
    together with the titles of all pages that redirect to it.

    Example (from the demo cell of this notebook): for id 765 the ring
    starts with ('Abortion', 'Termination_of_pregnancy', 'Abortions', ...)

    Args:
        wid: the wikipedia id
    Returns:
        a tuple with all the titles in the synonym ring; when the query
        yields no rows, the empty result of fetchall() is returned as is
    """
    page_id = resolveredir(wid)
    _cursor.execute("""(select page_title from page where page_id=%s) union 
                 (select page_title from redirect INNER JOIN page
                    on redirect.rd_from = page.page_id 
                    where redirect.rd_to =%s);""", (page_id, page_id))
    fetched = _cursor.fetchall()
    if not fetched:
        return fetched
    return tuple(record[0] for record in fetched)



def checkcache(wid, direction):
    """ Fetches the stored neighbourhood vector of a page

    Args:
        wid: id of the page, matched against the ``cache_id`` column
        direction: DIR_IN (table ``pagelinksorderedin``) or
                   DIR_OUT (table ``pagelinksorderedout``)
    Returns:
        A pandas Series built from the stored (values, index) pair, or None
        when the cache is disabled or the page has no row in the table
    Raises:
        ValueError: if direction is neither DIR_IN nor DIR_OUT
    """
    if DISABLE_CACHE:
        return None

    if direction == DIR_IN:
        tablename, colname = 'pagelinksorderedin', 'in_neighb'
    elif direction == DIR_OUT:
        tablename, colname = 'pagelinksorderedout', 'out_neighb'
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on tablename.
        raise ValueError("direction must be DIR_IN or DIR_OUT, got %r" % (direction,))

    # Table and column names come from the fixed choices above, so formatting
    # them in is safe; the id itself is bound as a query parameter.
    query = "select {0} from {1} where cache_id=%s".format(colname, tablename)
    _cursor.execute(query, (wid,))
    row = _cursor.fetchone()
    if row is None:
        return None

    # NOTE(review): pickle.loads can execute arbitrary code; this is only
    # acceptable as long as the cache tables hold trusted data.
    values, index = pickle.loads(row[0])
    return pd.Series(values, index=index)


Overwriting wikismall.py


In [2]:
%%writefile calcsimsmall.py 
"""Calculating Relatedness."""
# uncomment

from __future__ import division
import scipy.spatial


#from collections import defaultdict
from scipy import stats
import json
import math

from wikismall import * # uncomment

__author__ = "Armin Sajadi"
__copyright__ = "Copyright 215, The Wikisim Project"
__credits__ = ["Armin Sajadi", "Evangelo Milios", "Armin Sajadi"]
__license__ = "GPL"
__version__ = "1.0.1"
__maintainer__ = "Armin Sajadi"
__email__ = "sajadi@cs.dal.ca"
__status__ = "Development"


def concept_embedding(wid, direction):
    """ Calculates concept embedding to be used in relatedness

    Args:
        wid: wikipedia id
        direction: 0 for in, 1 for out, 2 for all

    Returns:
        Whatever the direction-specific helper returns: the embedding of
        the page, or None when it is not available

    Raises:
        ValueError: if direction is not one of DIR_IN, DIR_OUT, DIR_BOTH

    """
    if direction == DIR_IN or direction == DIR_OUT:
        return _concept_embedding_io(wid, direction)
    if direction == DIR_BOTH:
        return _concept_embedding_both(wid, direction)
    # Previously an unknown direction surfaced as an UnboundLocalError.
    raise ValueError("unknown direction: %r" % (direction,))
    
def _concept_embedding_io(wid, direction):
    """Resolve a possible redirect, then return the checkcache() result for that page and direction."""
    resolved_id = resolveredir(wid)
    return checkcache(resolved_id, direction)
            

def _concept_embedding_both(wid, direction):
    """Average the in- and out-direction embeddings of a page.

    Entries present in only one of the two embeddings are added to 0 before
    halving. Returns None if either embedding is None. The ``direction``
    argument is not used.
    """
    incoming = _concept_embedding_io(wid, DIR_IN)
    outgoing = _concept_embedding_io(wid, DIR_OUT)
    if incoming is None or outgoing is None:
        return None
    combined = incoming.add(outgoing, fill_value=0)
    return combined / 2


def getsim(id1,id2, direction):
    """ Calculates the similarity between two concepts
    Arg:
        id1, id2: the two concepts
        direction: 0 for in, 1 for out, 2 for all

    Returns:
        The similarity score: 1 minus the cosine distance between the two
        embeddings after aligning them on their index (missing entries
        count as 0). Returns 0 when either embedding is unavailable (None)
        or empty.
    """
    em1 = concept_embedding(id1, direction)
    em2 = concept_embedding(id2, direction)
    # concept_embedding may return None; treat that like an empty embedding
    # instead of failing on ".empty".
    if em1 is None or em2 is None or em1.empty or em2.empty:
        return 0

    em1, em2 = em1.align(em2, fill_value=0)
    return 1 - sp.spatial.distance.cosine(em1.values, em2.values)

    

def conceptrep(wid, direction, get_titles=True, cutoff=None):
    """ Finds a representation for a concept

        Concept Representation is a vector of concepts with their score
    Arg:
        wid: Wikipedia id
        direction: 0 for in, 1 for out, 2 for all
        get_titles: include titles in the embedding (not needed for mere calculations)
        cutoff: the first top cutoff dimensions (None for all)

    Returns:
        A pandas Series indexed by concept id. Each value is the score, or
        a (title, score) pair when get_titles is true. An empty Series is
        returned when the concept has no embedding.
    """
    em = concept_embedding(wid, direction)
    if em is None:
        # No embedding available: hand back an empty Series so callers can
        # treat this exactly like an empty embedding.
        return pd.Series(dtype=float)
    if em.empty:
        return em

    if cutoff is not None:
        # Keep only the `cutoff` highest-scoring dimensions.
        em = em.sort_values(ascending=False)[:cutoff]
    if get_titles:
        # list(...) keeps this working where zip returns an iterator.
        labelled = list(zip(ids2title(em.index), em.values.tolist()))
        em = pd.Series(labelled, index=em.index)
    return em
    



Overwriting calcsimsmall.py


In [3]:
%load_ext autoreload
%autoreload 2
%aimport wikismall
%aimport calcsimsmall

from wikismall import * # uncomment
from calcsimsmall import *   # uncomment
# Examples
# Make sure the shared database connection is open (no-op if it already is).
reopen()
# Direction used for the concept representations printed below.
direction = DIR_IN

# --- First concept: title -> id -> synonym ring -> top-5 representation ---
page_title1 = 'Abortion' 
print ('page_title: ', page_title1)

page_id1 = title2id(page_title1)
print ("id: ", page_id1)

sr1 = synonymring_titles(page_id1)
# Only the first five titles of the ring are shown.
print ("synonym ring: %s\n " % str(sr1[:5]))

rep1=conceptrep(page_id1, direction,  get_titles=True, cutoff=5)
print ("Concept Representation:  %s\n" % rep1.to_json())

print ("\n")

# --- Second concept, same steps ---
page_title2 = 'Miscarriage' 
print ('page_title: ', page_title2)

page_id2 = title2id(page_title2)
print ("id: ", page_id2)

sr2 = synonymring_titles(page_id2)
print ("synonym ring: %s\n " % str(sr2[:5]))

rep2=conceptrep(page_id2, direction,  get_titles=True, cutoff=5)
print ("Concept Representation: %s\n" % rep2.to_json())



# NOTE(review): the similarity is computed with DIR_OUT, while the
# representations above use `direction` (DIR_IN) -- confirm this is intended.
sim = getsim(page_id1, page_id2,DIR_OUT)
print ("similarity", sim)



('page_title: ', 'Abortion')
('id: ', 765L)
synonym ring: ('Abortion', 'Termination_of_pregnancy', 'Abortions', 'Induced_abortion', 'Abortionist')
 
Concept Representation:  {"256024":["Abortion_in_the_United_States",0.0023123941],"2375414":["Anti-abortion_violence",0.0020914555],"1294820":["Feminists_for_Life",0.0020757316],"697618":["Abortion_law",0.0020078264],"295191":["List_of_people_from_Arkansas",0.0019970349]}



('page_title: ', 'Miscarriage')
('id: ', 144147L)
synonym ring: ('Miscarriage', 'Miscarreage', 'Threatened_abortion', 'Inevitable_abortion', 'Incomplete_abortion')
 
Concept Representation: {"144147":["Miscarriage",0.0059268791],"1771587":["Pregnancy",0.0056077155],"3528092":["ICD-10_Chapter_XV:_Pregnancy,_childbirth_and_the_puerperium",0.0052465678],"256024":["Abortion_in_the_United_States",0.0045073997],"765":["Abortion",0.0044663864]}

('similarity', 0.10109798220577459)


In [4]:
%load_ext autoreload
%autoreload

from calcsimsmall import *

import json
from IPython.display import Javascript

# Top-5 representations (with titles) of two pages, using the out direction.
cre1 = conceptrep(title2id('Tehran'), DIR_OUT, get_titles=True, cutoff=5);
cre2 = conceptrep(title2id('Sanandaj'), DIR_OUT, get_titles=True, cutoff=5);


# Builds a JavaScript snippet that assigns the two representations, serialised
# as JSON, to window.vizObj1 and window.vizObj2; the two {} placeholders are
# filled by format() in that order.
Javascript("""
           window.vizObj1={};window.vizObj2={};
           """.format(cre1.to_json(), cre2.to_json()))


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<IPython.core.display.Javascript object>

In [5]:
%%javascript

// Register the module ids used by the drawing cell with RequireJS. Each id
// maps to a script URL given without a scheme (protocol-relative), so it is
// fetched with the same protocol as the page.
require.config({
    paths: {
        d3:'//cgm6.research.cs.dal.ca/~sajadi/wikisim/js/d3',
        d3_cloud:'//cgm6.research.cs.dal.ca/~sajadi/wikisim/js/d3.layout.cloud',
        simple_draw:'//cgm6.research.cs.dal.ca/~sajadi/wikisim/js/simpledraw'

    }
});

<IPython.core.display.Javascript object>

In [6]:
%%javascript

/**
 * Turn a concept representation into word-cloud entries.
 *
 * @param cp object whose own properties each hold a [title, score] pair
 * @returns array of {text, size} objects, one per property; size is the
 *          score's share of the total score scaled to (score/sum)*90+20.
 *          An empty or missing representation gives an empty array, and a
 *          total of 0 gives every word the base size 20.
 */
function createWords(cp){

    var titles=[];
    var scores=[];

    for (var key in cp){
        if (cp.hasOwnProperty(key)) {
            titles.push(cp[key][0]);
            scores.push(cp[key][1]);
        }
    }
    // The initial value 0 keeps reduce from throwing on an empty array.
    var sum = scores.reduce(function(a, b) {return a + b;}, 0);

    var words=[];
    for (var i = 0; i<titles.length; i++) {
        // Guard the division so a zero total does not produce NaN sizes.
        var share = sum ? scores[i]/sum : 0;
        words.push({"text":titles[i], "size": share*90+20});
    }
    return words;
}

// Build the word lists from the objects that the previous Python cell
// stored on window.
var words1=createWords(window.vizObj1);
//element.text(JSON.stringify(words1));
var words2=createWords(window.vizObj2);
// Load the drawing scripts, then (re)create two side-by-side containers in
// this cell's output and draw one cloud in each.
// NOTE(review): `element` is presumably the output area the notebook provides
// to %%javascript cells, and `simpledraw` a global defined by the simple_draw
// script -- neither is defined in this file.
require(['d3','d3_cloud', 'simple_draw'], function(d3,d3_cloud, simple_draw){
    $("#chart1").remove();
    element.append("<div id='chart1' style='width:49%; height:500px; float:left; border-style:solid'> </div>");
    // Pass the selector string, as the second call does, instead of the
    // undeclared identifier chart1.
    simpledraw(words1, '#chart1');

    $("#chart2").remove();
    element.append("<div id='chart2' style='width:49%; margin-left:2%; height:500px; float:left; border-style:solid'> </div>");
    simpledraw(words2, '#chart2');

});
    


<IPython.core.display.Javascript object>