In [1]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pathlib import Path


basename = "enwiki-2016"
cat_file = Path("/data/graphs") / basename / "page2cat.tsv"
cat_list = []
ps = PorterStemmer()
# Fetch the stop words once; calling stopwords.words('english') inside the
# comprehension repeats the same lookup for every single token.
stop_words = set(stopwords.words('english'))
# Peek at the first four lines: print the raw line, then its stemmed tokens.
# The context manager closes the file even though the loop breaks early.
with cat_file.open(encoding="utf-8") as cat_lines:
    for pos, line in enumerate(cat_lines):
        temp = line.rstrip()
        words = word_tokenize(temp)
        final = [ps.stem(i) for i in words if i not in stop_words]
        print(temp)
        print(final)
        if pos > 2:
            break

Anarchism	Political ideologies	Social theories	Political culture	Anti-capitalism	Far-left politics	Anarchism
['anarch', 'polit', 'ideolog', 'social', 'theori', 'polit', 'cultur', 'anti-capit', 'far-left', 'polit', 'anarch']
Değnek	Villages by country	Regions of Turkey	Populated places in Turkey by province
['değnek', 'villag', 'countri', 'region', 'turkey', 'popul', 'place', 'turkey', 'provinc']
Queensland Conservatorium Griffith University	Universities by country	Entertainment in Australia	Australian capital cities
['queensland', 'conservatorium', 'griffith', 'univers', 'univers', 'countri', 'entertain', 'australia', 'australian', 'capit', 'citi']
Octagon Chapel, Liverpool	Churches	Buildings and structures in England by city
['octagon', 'chapel', ',', 'liverpool', 'church', 'build', 'structur', 'england', 'citi']


In [3]:
# Load the line_profiler IPython extension; it provides the %lprun magic used below.
%load_ext line_profiler

In [11]:
from collections import defaultdict


def build_cat_dict(cat_file):
    """Build a key -> preprocessed-categories mapping from a small sample.

    Each line of ``cat_file`` is stripped of trailing whitespace and split at
    the first tab: the left part is the key, the remainder is passed to
    ``preprocess_categs``. Lines without a tab are skipped. Reading stops
    after the first line whose zero-based position exceeds 100.

    Args:
        cat_file: object with an ``open(encoding=...)`` method (a
            ``pathlib.Path`` in this notebook).

    Returns:
        A ``defaultdict(list)`` mapping each key to whatever
        ``preprocess_categs`` returned for it.
    """
    categories = defaultdict(list)
    # The context manager closes the file even when the loop breaks early.
    with cat_file.open(encoding="utf-8") as lines:
        for pos, line in enumerate(lines):
            key, sep, raw_cat = line.rstrip().partition("\t")
            if not sep:
                # No tab on this line: there is a key but no categories.
                continue
            categories[key] = preprocess_categs(raw_cat)
            # Only a sample is needed for profiling.
            if pos > 100:
                break
    return categories


def preprocess_categs(raw_categs):
    """Tokenize a raw category string and stem every token that is not a stop word.

    This is the baseline that the following cells profile: the English
    stop-word list is fetched again for every token.

    Args:
        raw_categs: the category text of one page as a single string.

    Returns:
        List of stemmed tokens, in their original order.
    """
    return [
        ps.stem(token)
        for token in word_tokenize(raw_categs)
        if token not in stopwords.words('english')
    ]

In [12]:
# Line-by-line profile of build_cat_dict while it processes the sample.
%lprun -f build_cat_dict build_cat_dict(cat_file)

In [13]:
# Same run, but profile the lines of preprocess_categs instead.
%lprun -f preprocess_categs build_cat_dict(cat_file)

In [14]:
def preprocess_categs(raw_categs):
    """Tokenize ``raw_categs`` and return the stem of every token.

    This variant applies no stop-word filtering at all.
    """
    stems = []
    for token in word_tokenize(raw_categs):
        stems.append(ps.stem(token))
    return stems

In [15]:
# Re-profile preprocess_categs now that the stop-word filter has been dropped.
%lprun -f preprocess_categs build_cat_dict(cat_file)

In [16]:
def build_cat_dict(cat_file):
    """Build a key -> preprocessed-categories mapping from a small sample.

    The English stop words are loaded once into a set and passed to
    ``preprocess_categs`` for every line. Each line is stripped of trailing
    whitespace and split at the first tab: the left part is the key, the
    remainder is the raw category text. Lines without a tab are skipped.
    Reading stops after the first line whose zero-based position exceeds 100.

    Args:
        cat_file: object with an ``open(encoding=...)`` method (a
            ``pathlib.Path`` in this notebook).

    Returns:
        A ``defaultdict(list)`` mapping each key to whatever
        ``preprocess_categs`` returned for it.
    """
    categories = defaultdict(list)
    stopw = set(stopwords.words('english'))
    # The context manager closes the file even when the loop breaks early.
    with cat_file.open(encoding="utf-8") as lines:
        for pos, line in enumerate(lines):
            key, sep, raw_cat = line.rstrip().partition("\t")
            if not sep:
                # No tab on this line: there is a key but no categories.
                continue
            categories[key] = preprocess_categs(raw_cat, stopw)
            # Only a sample is needed for profiling.
            if pos > 100:
                break
    return categories


def preprocess_categs(raw_categs, stopwords):
    """Tokenize ``raw_categs`` and stem every token not found in ``stopwords``.

    Args:
        raw_categs: the category text of one page as a single string.
        stopwords: container supporting ``in`` (the caller passes a set).
            Inside this function the name hides the ``stopwords`` corpus
            imported at the top of the notebook.

    Returns:
        List of stemmed tokens, in their original order.
    """
    stems = []
    for token in word_tokenize(raw_categs):
        if token in stopwords:
            continue
        stems.append(ps.stem(token))
    return stems

In [17]:
# Re-profile with the stop words built into a set once per build_cat_dict call.
%lprun -f preprocess_categs build_cat_dict(cat_file)

In [20]:
from tqdm import tqdm


def build_cat_dict(cat_file):
    """Build the key -> preprocessed-categories mapping for the whole file.

    The file is read twice: once to count its lines (so the progress bar has
    a total) and once to process them. Each line is stripped of trailing
    whitespace and split at the first tab: the left part is the key, the
    remainder is passed to ``preprocess_categs`` together with the English
    stop-word set. Lines without a tab are skipped. If a key occurs more than
    once, the last occurrence wins.

    Args:
        cat_file: object with an ``open(encoding=...)`` method (a
            ``pathlib.Path`` in this notebook).

    Returns:
        A ``defaultdict(list)`` mapping each key to whatever
        ``preprocess_categs`` returned for it.
    """
    categories = defaultdict(list)
    stopw = set(stopwords.words('english'))
    # First pass: count the lines so tqdm can display progress against a total.
    with cat_file.open(encoding="utf-8") as lines:
        num_lines = sum(1 for _ in lines)
    # Second pass: both handles are closed deterministically by the with-blocks.
    with cat_file.open(encoding="utf-8") as lines:
        for line in tqdm(lines, total=num_lines):
            key, sep, raw_cat = line.rstrip().partition("\t")
            if not sep:
                # No tab on this line: there is a key but no categories.
                continue
            categories[key] = preprocess_categs(raw_cat, stopw)
    return categories

In [None]:
# Full pass over the category file. The recorded progress bar shows about
# 2,000 lines/s over ~4.3M lines, so this cell takes tens of minutes.
categs = build_cat_dict(cat_file)

 18%|█▊        | 792668/4289768 [06:31<28:31, 2042.79it/s]

In [25]:
# Number of distinct keys kept (lines without categories were skipped).
len(categs)

3192605

In [27]:
# Spot-check one entry. NOTE: categs is a defaultdict(list), so indexing a
# key that is missing would silently insert an empty list for it.
categs['Anarchism']

['polit',
 'ideolog',
 'social',
 'theori',
 'polit',
 'cultur',
 'anti-capit',
 'far-left',
 'polit',
 'anarch']

# TF-IDF

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
# One space-joined string per entry of categs, in the dictionary's iteration order.
docs = list(map(' '.join, categs.values()))
len(docs)

3192605

In [37]:
# Each entry of docs is already one space-joined string, so print it as is;
# joining it a second time would put a space between every character.
# The pos > 5 check stops after the seventh document.
for pos, value in enumerate(docs):
    print(value)
    if pos > 5:
        break

polit ideolog social theori polit cultur anti-capit far-left polit anarch
villag countri region turkey popul place turkey provinc
univers countri entertain australia australian capit citi
church build structur england citi
peopl statu peopl ethnic peopl ethnic occup indian peopl film director
scottish societi alumni univers colleg europ poetri nation languag scottish peopl occup
plant


In [44]:
# min_df=1 (the library default) keeps every term that occurs in at least one
# document, which is exactly what the integer 0 selected before; recent
# scikit-learn releases reject an integer min_df below 1 during parameter
# validation, so 1 keeps the cell runnable without changing the result.
# NOTE(review): the default token pattern matches runs of word characters
# only, so hyphenated stems such as 'anti-capit' are split into two terms.
sklearn_tfidf = TfidfVectorizer(norm='l2', min_df=1, use_idf=True, smooth_idf=False, sublinear_tf=True)
sklearn_representation = sklearn_tfidf.fit_transform(docs)
type(sklearn_representation)

In [47]:
# (number of documents, vocabulary size) of the TF-IDF matrix.
sklearn_representation.shape

(3192605, 2461)

In [48]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the TF-IDF rows of the first two documents.
x1, x2 = sklearn_representation[0], sklearn_representation[1]
cosine_similarity(x1, x2)

array([[0.]])

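Doing it right: build the keys and the documents in parallel, so that row `i` of the TF-IDF matrix corresponds to `key_list[i]`: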

In [87]:
# Parallel lists: docs_list[i] is the space-joined token string of key_list[i].
key_list = list(categs.keys())
docs_list = [' '.join(tokens) for tokens in categs.values()]

In [88]:
# Refit the vectorizer on docs_list; row i of doc_repr belongs to key_list[i].
doc_repr = sklearn_tfidf.fit_transform(docs_list)

# Plots

In [50]:
import sys

# Add the parent of the current working directory to sys.path before
# importing the project helper (presumably that is where `utils` lives).
p = Path('.').resolve()
sys.path.append(str(p.parent))
from utils.data_utils import load_data

In [51]:
# load_data is a project helper. x is used below as the embedding matrix
# (len(x), x[subset]); NOTE(review): the meaning of y is not visible here.
x, y = load_data(Path("/data/models/enwiki-2016"))

Labels not defined
Labels not defined
Labels not defined
Labels not defined
Labels not defined
Labels not defined
Labels not defined
Labels not defined
Labels not defined
Labels not defined
Labels not defined
Labels not defined
Labels not defined
Labels not defined
Labels not defined
Labels not defined
Labels not defined
Labels not defined
Labels not defined
Labels not defined


In [121]:
import numpy as np
import scipy
from scipy.spatial.distance import pdist


def get_id(basename, idx, ent_list, graphs_dir='/data/graphs/'):
    """Yield the identifier string of each requested node.

    For every ``node`` in ``idx``, ``ent_list[node]`` gives a zero-based line
    number in the graph's id file, and the content of that line (without
    trailing whitespace) is yielded. The id file is
    ``<graphs_dir>/<basename>/<basename>.urls`` or, when that does not exist,
    ``<basename>.ids`` in the same directory.

    Args:
        basename: graph name, used for both the directory and the file stem.
        idx: iterable of positions into ``ent_list``.
        ent_list: indexable object mapping a position to a line number.
        graphs_dir: directory containing one sub-directory per graph.

    Yields:
        str: the id-file line for each node, in the order of ``idx``.

    Raises:
        AssertionError: if neither id file exists. Because this is a
            generator, the check runs when iteration starts, not at call time.
    """
    # Imported locally: no visible cell of the notebook imports linecache.
    import linecache

    graph_dir = Path(graphs_dir) / basename
    ids_file = graph_dir / (basename + '.urls')
    if not ids_file.exists():
        ids_file = graph_dir / (basename + '.ids')
    assert ids_file.exists(), "File not found!"
    f = ids_file.as_posix()
    for node in idx:
        line = ent_list[node]
        # linecache numbers lines from 1, ent_list from 0.
        yield linecache.getline(f, line + 1).rstrip()



def cosineSim_vs_distance(x, categories, entities, key_list, doc_list, n=100,
                          basename="enwiki-2016", seed=None):
    """Compare category similarity with embedding distance on a random sample.

    Draws ``n`` distinct row indices of ``x``, resolves them to ids with
    ``get_id``, keeps only the ids that have a non-empty entry in
    ``categories``, and then computes, for the surviving nodes, the pairwise
    cosine similarity of their rows in ``doc_list`` and the pairwise
    Euclidean distance of their rows in ``x``.

    Args:
        x: indexable embedding matrix; ``len(x)`` is the population size and
            ``x[list_of_rows]`` must select rows.
        categories: mapping from id to its list of category tokens.
        entities: passed to ``get_id`` as the position -> line-number lookup.
        key_list: sequence of ids; ``key_list[i]`` labels ``doc_list[i]``.
        doc_list: row-indexable document representations (the sparse TF-IDF
            matrix in this notebook).
        n: number of nodes to sample before filtering.
        basename: graph name forwarded to ``get_id``.
        seed: optional seed for a private random generator; ``None`` keeps
            using NumPy's global random state.

    Returns:
        Tuple ``(similarities_sparse, dist, ids)``: the square sparse matrix
        of pairwise cosine similarities, the condensed vector of pairwise
        Euclidean distances returned by ``pdist``, and the list of ids that
        survived filtering (same order as the matrix rows).

    Raises:
        ValueError: if no sampled node has categories, or if a surviving id
            is not present in ``key_list``.
    """
    # Import the submodule explicitly instead of relying on another package
    # having loaded scipy.sparse as a side effect.
    from scipy import sparse

    sampler = np.random if seed is None else np.random.RandomState(seed)
    subset = sampler.choice(len(x), n, replace=False)
    ids = list(get_id(basename, subset, entities))
    # polishing
    print("Removing nodes not contained in the categories dictionary..")
    indices = [
        pos for pos, value in enumerate(ids)
        # the membership test comes first so a defaultdict is never grown
        if value in categories and len(categories[value]) > 0
    ]
    # keep only relevant ids and associated embeddings
    ids = [ids[i] for i in indices]
    subset = [subset[i] for i in indices]
    print("Effective number of nodes: {}".format(len(ids)))
    if not ids:
        raise ValueError("None of the sampled nodes has categories; nothing to compare.")
    # One pass over key_list replaces a linear key_list.index() scan per id.
    # setdefault keeps the first position of a key, exactly as index() would.
    key_to_row = {}
    for row, key in enumerate(key_list):
        key_to_row.setdefault(key, row)
    # compute
    mat = []
    for i in ids:
        if i not in key_to_row:
            raise ValueError("{!r} is not in key_list".format(i))
        mat.append(doc_list[key_to_row[i]])
    print(len(mat))
    mat = sparse.vstack(mat)
    similarities_sparse = cosine_similarity(mat, dense_output=False)
    dist = pdist(x[subset], 'euclidean')
    return similarities_sparse, dist, ids


In [122]:
# Small trial run on 10 sampled nodes: temp1 = sparse cosine similarities,
# temp2 = condensed Euclidean distances, temp3 = the ids that were kept.
# NOTE(review): `entities` is not defined in any visible cell; confirm where it comes from.
temp1, temp2, temp3 = cosineSim_vs_distance(x, categs, entities, key_list, doc_repr, n=10)

Removing nodes not contained in the categories dictionary..
Effective number of nodes: 7
7


In [117]:
# Pairwise cosine-similarity matrix of the kept nodes (sparse).
temp1

<7x7 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [124]:
# Strict upper triangle (k=1 excludes the diagonal). Printing a sparse matrix
# lists only its stored entries, so pairs with zero similarity do not appear.
print(scipy.sparse.triu(temp1, k=1))

  (0, 6)	0.1757733968820497
  (2, 5)	0.13781724510952068


In [120]:
# Number of pairwise distances returned by pdist for the kept nodes.
len(temp2)

21

In [None]:
# NOTE(review): this cell relies on `plt` (matplotlib.pyplot); no visible cell
# imports it, so add `import matplotlib.pyplot as plt` to the imports.
# pdist returns one distance per pair (i, j) with i < j, ordered row by row;
# np.triu_indices(k=1) enumerates the same pairs in the same order, so the
# two vectors line up element by element. The sparse triu() view cannot be
# used directly because it stores only the non-zero similarities.
pair_rows, pair_cols = np.triu_indices(temp1.shape[0], k=1)
pair_sims = temp1.toarray()[pair_rows, pair_cols]
assert len(pair_sims) == len(temp2), "one similarity per pairwise distance expected"
fig, ax = plt.subplots()
ax.scatter(temp2, pair_sims, s=1)
ax.set_xlabel("L2 distance")
# The plotted values come from cosine_similarity, not from a Jaccard index.
ax.set_ylabel("Cosine similarity")