In [120]:
import threading
import functools
import string
import queue
import time
import json
import numpy as np
import pandas as pd

import itertools

from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn import mixture
from sklearn.neighbors import NearestNeighbors

import os

import requests
import bs4

In [8]:
def sort_dict(mydict):
    for k, v in mydict.items():
        if v is None:
            mydict[k] = 0
    return sorted(mydict.items(), key=lambda k: k[1], reverse=True)

In [74]:
def get_categories(links):
    S = requests.Session()
    URL = "https://en.wikipedia.org/w/api.php"
    categories = {}
    cat_names = set()
    for link in links:
        name = link[0].replace("/wiki/", "")
        name = name.replace("_", " ")
        PARAMS = {
          "action": "query",
          "format": "json",
          "titles": name,
          "prop": "categories"
        }

        response = S.get(url=URL, params=PARAMS).json()
        page = next(iter(response['query']['pages'].values()))
        L = []
        if ('categories' in page):
            for category in page['categories']:
                cat = category['title'].replace("Category:", "")
                L.append(cat)
                cat_names.add(cat)
            categories[name] = L
        
    return categories, cat_names

In [129]:
# tokenize all catagories and find how words connects
# remove a, an, the, as, of, from, at, with, people, to, who, in, under, for
# remove categories with 'Articles' or 'Wikidata'
def tokenize_categories(categories):
    trivial = ['a', 'an', 'the', 'as', 'of', 'from', 'at', 'with', 'people', 'to', 'who', 'in', 'under', 'for', '&', 'A', 'on', 'or', 'by']
    saved_categories = {}
    all_words = {} # map word with number of people with that word
    tokenized_categories = {}
    for name in categories:
        words = set()
        L = []
        for c in categories[name]:
            if 'Articles' not in c and 'Wikidata' not in c and 'CS1' not in c and 'Pages' not in c and 'articles' not in c and 'pages' not in c:
                cleaned_str = str(c).replace("(", " ").replace(")", " ").replace("'s", "").replace(" births", "-born").lower()
                words |= set(cleaned_str.split())
                L.append(c)
        for i in trivial:
            if i in words:
                words.remove(i)
        tokenized_categories[name] = [w for w in words]
        saved_categories[name] = L
        for w in words:
            if w in all_words:
                all_words[w] += 1
            else:
                all_words[w] = 1
        
    return tokenized_categories, all_words, saved_categories

In [57]:
with open('name-visit-count-all.json', 'r') as f:
    visit = json.load(f)

s = sort_dict(visit)

In [65]:
top = s[:10000]

In [75]:
# categories, cat_names = get_categories(top)

In [85]:
"""with open('categories.json', 'w') as f1:
    json.dump(categories, f1)
with open('category_names.txt', 'w', encoding='utf-8') as f2:
    for line in cat_names:
       f2.write(line+"\n")"""

In [125]:
with open('categories.json', 'r') as f1:
    categories = json.load(f1)

In [130]:
tokenized_categories, all_words, saved_categories = tokenize_categories(categories)

In [117]:
with open('tokenized_categories.json', 'w') as f1:
    json.dump(tokenized_categories, f1)
with open('all_words.json', 'w') as f2:
    json.dump(all_words, f2)

In [70]:
with open('tokenized_categories.json', 'r') as f1:
   tokenized_categories = json.load(f1)
with open('all_words.json', 'r') as f2:
    all_words = json.load(f2)

In [73]:
selected_words = {key:val for key, val in all_words.items() if val > 6}

In [74]:
print(len(selected_words))
print(sort_dict(selected_words))

1633
[('american', 5752), ('21st-century', 5023), ('20th-century', 4180), ('male', 3349), ('film', 2808), ('actors', 2413), ('descent', 2359), ('deaths', 2175), ('actresses', 2065), ('television', 2041), ('english', 1341), ('singers', 1337), ('alumni', 1333), ('living', 1242), ('women', 1193), ('writers', 1082), ('players', 1004), ('african-american', 924), ('united', 723), ('elements', 718), ('ac', 718), ('british', 699), ('child', 680), ('female', 668), ('indian', 637), ('musicians', 607), ('new', 606), ('states', 605), ('football', 576), ('comedians', 530), ('winners', 511), ('university', 506), ('world', 494), ('and', 491), ('stage', 482), ('irish', 477), ('german', 435), ('voice', 431), ('california', 429), ('york', 417), ('businesspeople', 415), ('artists', 404), ('italian', 401), ('award', 399), ('politicians', 382), ('college', 378), ('roman', 376), ('activists', 360), ('catholics', 344), ('producers', 342), ('cup', 340), ('academy', 330), ('guitarists', 329), ('school', 327), 

In [87]:
name2id = {name:i for i, name in enumerate(list(tokenized_categories))}
id2name = {i:name for i, name in enumerate(list(tokenized_categories))}

In [178]:
def prepare_data(tokenized_categories, vocab):
    word2id = {w:i for i, w in enumerate(list(vocab))}
    data = np.zeros((len(tokenized_categories), len(vocab)))
    for i, name in enumerate(tokenized_categories):
        for w in tokenized_categories[name]:
            if w in word2id:
                data[name2id[name], word2id[w]] = 1
    return data

In [89]:
training_data = prepare_data(tokenized_categories, selected_words)

In [90]:
print(training_data.shape)

(9620, 1633)


In [114]:
def knn(one_hots, k=3):
    #pca = PCA(0.5)
    #one_hots = pca.fit_transform(one_hots)
    knn = NearestNeighbors(algorithm='kd_tree')
    knn.fit(one_hots)
    distances, indices = knn.kneighbors(one_hots, n_neighbors=k)
    zip_similar = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())))
    return np.array(zip_similar)

In [115]:
zip_knn = knn(training_data)

In [117]:
np.save('knn.npy', zip_knn)  

In [153]:
def get_closest_neighbors(knn_data):
    similar = pd.DataFrame([[name, ", ".join(map(str, saved_categories[name]))] for i, name in enumerate(list(tokenized_categories))])
    similar.columns = ['name', 'categories']
    
    similar['most similar'] = '0'
    similar['categories 1'] = '0'
    similar['distance 1'] = 0.000
    similar['second most similar'] = '0'
    similar['categories 2'] = '0'
    similar['distance 2'] = 0.000
    for i in range(len(tokenized_categories)):
        each = knn_data[i]
        name1 = id2name[each[0][1]]
        similar.at[i, 'most similar'] = name1
        similar.at[i, 'distance 1'] = each[1][1]
        similar['categories 1'] = ", ".join(map(str, saved_categories[name1]))
        name2 = id2name[each[0][2]]
        similar.at[i, 'second most similar'] = name2
        similar.at[i, 'distance 2'] = each[1][2]
        similar['categories 2'] = ", ".join(map(str, saved_categories[name2]))
    #similar.to_csv('similar.csv')
    return similar

In [154]:
similar = get_closest_neighbors(zip_knn)

In [155]:
similar.head(40)

Unnamed: 0,name,categories,most similar,categories 1,distance 1,second most similar,categories 2,distance 2
0,Louis Tomlinson,"1991 births, 21st-century English male actors,...",Jack Huston,"1987 births, 20th-century English male actors,...",3.0,Burak Deniz,"1972 births, 20th-century English male actors,...",3.0
1,Freddie Mercury,"1946 births, 1991 deaths, 20th-century British...",Seal (musician),"1987 births, 20th-century English male actors,...",3.741657,Akshay Kumar,"1972 births, 20th-century English male actors,...",3.872983
2,Elizabeth II,"1926 births, 20th-century British monarchs, 20...",Nigel Farage,"1987 births, 20th-century English male actors,...",2.645751,"Prince Philip, Duke of Edinburgh","1972 births, 20th-century English male actors,...",2.828427
3,Stephen Hawking,"1942 births, 2018 deaths, 20th-century English...",Jamal Khashoggi,"1987 births, 20th-century English male actors,...",3.316625,Vladimir Lenin,"1972 births, 20th-century English male actors,...",3.316625
4,Donald Trump,"1946 births, 2000 United States presidential c...",John Kasich,"1987 births, 20th-century English male actors,...",3.316625,Elizabeth Dole,"1972 births, 20th-century English male actors,...",3.464102
5,Cristiano Ronaldo,"1985 births, 2006 FIFA World Cup players, 2010...","Pepe (footballer, born 1983)","1987 births, 20th-century English male actors,...",1.732051,Gary Cahill,"1972 births, 20th-century English male actors,...",2.0
6,Cardi B,"1992 births, 21st-century American actresses, ...",Nicki Minaj,"1987 births, 20th-century English male actors,...",2.645751,Jennifer Lopez,"1972 births, 20th-century English male actors,...",2.645751
7,Elon Musk,"1971 births, 20th-century American businesspeo...",Israel Houghton,"1987 births, 20th-century English male actors,...",2.44949,Ellen Page,"1972 births, 20th-century English male actors,...",2.645751
8,XXXTentacion,"1998 births, 2018 deaths, 21st-century America...",Remy Ma,"1987 births, 20th-century English male actors,...",3.741657,Daddy Yankee,"1972 births, 20th-century English male actors,...",3.741657
9,Lionel Messi,"1987 births, 2006 FIFA World Cup players, 2007...",Javier Mascherano,"1987 births, 20th-century English male actors,...",2.44949,"Willian (footballer, born 1988)","1972 births, 20th-century English male actors,...",3.0


In [156]:
print(saved_categories['Ariana Grande'])
print(saved_categories['AnnaSophia Robb'])

['1993 births', '21st-century American actresses', '21st-century American singers', '21st-century women singers', 'Actresses from Florida', 'American child actresses', 'American contemporary R&B singers', 'American dance musicians']
['1993 births', '21st-century American actresses', '21st-century American singers', '21st-century women singers', 'Actresses from Denver', 'American child actresses', 'American child singers']


In [168]:
def count_category_frequency(categories):
    f = {}
    for name in categories:
        for c in categories[name]:
            if 'Articles' not in c and 'Wikidata' not in c and 'CS1' not in c and 'Pages' not in c and 'articles' not in c and 'pages' not in c:
                if c in f:
                    f[c] += 1
                else:
                    f[c] = 1
    return f

In [186]:
frequency = count_category_frequency(categories)

In [187]:
with open('category_frequency.json', 'w') as f10:
    json.dump(frequency, f10)

In [180]:
selected_categories = {key:val for key, val in frequency.items() if val > 2}

In [181]:
print(len(selected_categories))
print(sort_dict(selected_categories))

12918
3471
[('Living people', 1240), ('21st-century American male actors', 1127), ('21st-century American actresses', 1121), ('American male film actors', 1117), ('20th-century American male actors', 1046), ('American male television actors', 902), ('American film actresses', 861), ('20th-century American actresses', 853), ('21st-century American singers', 598), ('American television actresses', 536), ('21st-century women singers', 431), ('20th-century American singers', 389), ('21st-century American comedians', 373), ('American people of Irish descent', 344), ('American people of English descent', 318), ('American male voice actors', 297), ('American child actresses', 297), ('20th-century American comedians', 280), ('American Roman Catholics', 246), ('American people of German descent', 240), ('American male stage actors', 236), ('Infobox person using alma mater', 226), ('21st-century American rappers', 225), ('21st-century English male actors', 222), ('1986 births', 212), ('21st-cent

In [182]:
cat_data = prepare_data(categories, selected_categories)

In [183]:
zip_knn1 = knn(cat_data)

In [185]:
np.save('zip_knn_cat.npy',zip_knn1)

In [188]:
similar2 = get_closest_neighbors(zip_knn1)
similar2.head(40)

# not as accurate as that with tokenized categories

Unnamed: 0,name,categories,most similar,categories 1,distance 1,second most similar,categories 2,distance 2
0,Louis Tomlinson,"1991 births, 21st-century English male actors,...",Caster Semenya,"1944 births, 20th-century English musicians, 2...",1.732051,Grigor Dimitrov,"1972 births, 20th-century English male actors,...",1.732051
1,Freddie Mercury,"1946 births, 1991 deaths, 20th-century British...",Hassanal Bolkiah,"1944 births, 20th-century English musicians, 2...",2.0,Carl XVI Gustaf of Sweden,"1972 births, 20th-century English male actors,...",2.0
2,Elizabeth II,"1926 births, 20th-century British monarchs, 20...",Louis XIV of France,"1944 births, 20th-century English musicians, 2...",2.236068,Alauddin Khalji,"1972 births, 20th-century English male actors,...",2.236068
3,Stephen Hawking,"1942 births, 2018 deaths, 20th-century English...",Jeremy Clarkson,"1944 births, 20th-century English musicians, 2...",2.828427,Alexander the Great,"1972 births, 20th-century English male actors,...",2.828427
4,Donald Trump,"1946 births, 2000 United States presidential c...",Alexander the Great,"1944 births, 20th-century English musicians, 2...",2.828427,Elon Musk,"1972 births, 20th-century English male actors,...",2.828427
5,Cristiano Ronaldo,"1985 births, 2006 FIFA World Cup players, 2010...",Gary Cahill,"1944 births, 20th-century English musicians, 2...",1.732051,"Pepe (footballer, born 1983)","1972 births, 20th-century English male actors,...",1.732051
6,Cardi B,"1992 births, 21st-century American actresses, ...",Emmy Rossum,"1944 births, 20th-century English musicians, 2...",2.645751,Nicki Minaj,"1972 births, 20th-century English male actors,...",2.645751
7,Elon Musk,"1971 births, 20th-century American businesspeo...",Peter Madsen,"1944 births, 20th-century English musicians, 2...",1.414214,Luca Guadagnino,"1972 births, 20th-century English male actors,...",1.414214
8,XXXTentacion,"1998 births, 2018 deaths, 21st-century America...",Louis XIV of France,"1944 births, 20th-century English musicians, 2...",2.828427,Alauddin Khalji,"1972 births, 20th-century English male actors,...",2.828427
9,Lionel Messi,"1987 births, 2006 FIFA World Cup players, 2007...",Javier Mascherano,"1944 births, 20th-century English musicians, 2...",2.0,Edinson Cavani,"1972 births, 20th-century English male actors,...",2.236068
