### Get answer from DuckDuckGo


In [1]:
import requests
def query_duckduckgo(query, no_html=1, skip_disambig=1, **kwargs):
    params = {
        "q": query,
        "no_redirect": 1,
        "no_html": no_html,
        "skip_disambig": skip_disambig
    }
    params.update(kwargs)
    url = 'https://api.duckduckgo.com/'
    resp = requests.get(url, params=params)
    return resp.json()

In [2]:
info = query_duckduckgo("kool shen", format="json")
print(json.dumps(info, indent=4, sort_keys=True))

{
    "Abstract": "Bruno Lopes, alias Kool Shen, is a French rapper, actor and producer, with Portuguese origins. He is also a break dancer and a graffiti artist. He is a co-founder of Supr\u00eame NTM and one of the major figures of French rap. He was featured on Enhancer's album Electrochoc in the song \"Hot\". On 8 November 2009 it was announced that he was the first ever rapper to have an album featured on a football shirt, when Lyon accepted the deal with BetClic. Lyon will wear the shirt once, against Marseille.",
    "AbstractSource": "Wikipedia",
    "AbstractText": "Bruno Lopes, alias Kool Shen, is a French rapper, actor and producer, with Portuguese origins. He is also a break dancer and a graffiti artist. He is a co-founder of Supr\u00eame NTM and one of the major figures of French rap. He was featured on Enhancer's album Electrochoc in the song \"Hot\". On 8 November 2009 it was announced that he was the first ever rapper to have an album featured on a football shirt, when 

#### Get answers from duckduckgo for all (real, fake) names

In [3]:
import pandas as pd
import itertools

pseudonym_df = pd.read_csv("pseudonym.csv", names=["nickname"])
realname_df = pd.read_csv("real_name.csv", names=["realname"])
#print(pseudonym_df)
#print(realname_df)
pseudonym_names = pseudonym_df["nickname"].tolist()
real_names = realname_df["realname"].tolist()
all_names = list(itertools.chain(pseudonym_names, real_names))
print(all_names)
answers_from_duckduckgo = [query_duckduckgo(name, format="json") for name in all_names] 

['Booba', 'Doc Gyneco', 'Dr. Dre', 'Eminem', 'GZA', 'Jay-Z', 'Joey Starr', 'Kool Shen', 'Patrick Bruel', "Rockin' Squat", 'The  Rock', 'Andre Romelle Young', 'Bruno Beausir', 'Bruno Lopes', 'Didier Morville', 'Dwayne Johnson', 'Elie Yaffa', 'Gary Grice', 'Marshal Mathers', 'Mathias Crochon', 'Patrick Benguigui', 'Shawn Corey Carter']


In [4]:
import re
from unidecode import unidecode

TOKEN_PATTERN = r"([a-zA-Z0-9]+)"
TOKEN_RE = re.compile(TOKEN_PATTERN)
def regularize_name(name):
    name = unidecode(name)
    return " ".join([t.lower() for t in re.findall(TOKEN_RE, name)])
all_names = [regularize_name(n) for n in itertools.chain(pseudonym_names, real_names)]
print(all_names)

['booba', 'doc gyneco', 'dr dre', 'eminem', 'gza', 'jay z', 'joey starr', 'kool shen', 'patrick bruel', 'rockin squat', 'the rock', 'andre romelle young', 'bruno beausir', 'bruno lopes', 'didier morville', 'dwayne johnson', 'elie yaffa', 'gary grice', 'marshal mathers', 'mathias crochon', 'patrick benguigui', 'shawn corey carter']


In [5]:
print(regularize_name("Doc Gynéco"))
print(regularize_name("Jay_z"))
print(regularize_name("Jay-z"))

doc gyneco
jay z
jay z


### Feature engineering

#### Headings

In [6]:
for name, answer in zip(all_names, answers_from_duckduckgo):
    print(("%s --> %s" % (name, answer["Heading"])))

booba --> Booba
doc gyneco --> Doc Gynéco
dr dre --> Dr. Dre
eminem --> Eminem
gza --> GZA
jay z --> Jay-Z
joey starr --> Joeystarr
kool shen --> Kool Shen
patrick bruel --> Patrick Bruel
rockin squat --> Rockin' Squat
the rock --> The Rock
andre romelle young --> Dr. Dre
bruno beausir --> 
bruno lopes --> Bruno Lopes
didier morville --> Joeystarr
dwayne johnson --> Dwayne Johnson
elie yaffa --> 
gary grice --> 
marshal mathers --> 
mathias crochon --> 
patrick benguigui --> 
shawn corey carter --> Jay Z


We can see the (name, headings) give us information for some (real name, fake name) maps, 

if the headings of a real name or fake name are **the same**, it very likely they are a **match**.

Let's encoder this feature.

**Idea**:

Treat all headings (exclude these empty headings) as a categorical label using OneHotEncoder.

If a real name or a fake name maps to the **same headings**, the dot product of the onehot vector is 1, otherwise, the dot product should be zero

In [7]:
from sklearn.preprocessing import LabelBinarizer

headings = [regularize_name(h["Heading"]) for h in answers_from_duckduckgo]
heading_label_binarizer = LabelBinarizer()
heading_label_binarizer.fit(list(filter(lambda x: x, headings)))
headings_features = heading_label_binarizer.transform(headings)
print(headings_features.shape)
print(headings_features)
print(heading_label_binarizer.classes_)

(22, 13)
[[1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0]]
['booba' 'bruno lopes' 'doc gyneco' 'dr dre' 'dwayne johnson' 'eminem'
 'gza' 'jay z' 'joeystarr' 'kool shen' 'patrick bruel' 'rockin squat'
 'the rock']


#### abstract url feature

In [8]:
for name, answer in zip(all_names, answers_from_duckduckgo):
    print(("%s --> %s" % (name, answer["AbstractURL"])))

booba --> https://en.wikipedia.org/wiki/Booba
doc gyneco --> https://en.wikipedia.org/wiki/Doc_Gyn%C3%A9co
dr dre --> https://en.wikipedia.org/wiki/Dr._Dre
eminem --> https://en.wikipedia.org/wiki/Eminem
gza --> https://en.wikipedia.org/wiki/GZA
jay z --> https://en.wikipedia.org/wiki/Jay-Z
joey starr --> https://en.wikipedia.org/wiki/Joeystarr
kool shen --> https://en.wikipedia.org/wiki/Kool_Shen
patrick bruel --> https://en.wikipedia.org/wiki/Patrick_Bruel
rockin squat --> https://en.wikipedia.org/wiki/Rockin'_Squat
the rock --> https://en.wikipedia.org/wiki/The_Rock
andre romelle young --> https://en.wikipedia.org/wiki/Dr._Dre
bruno beausir --> 
bruno lopes --> https://en.wikipedia.org/wiki/Bruno_Lopes
didier morville --> https://en.wikipedia.org/wiki/Joeystarr
dwayne johnson --> https://en.wikipedia.org/wiki/Dwayne_Johnson
elie yaffa --> 
gary grice --> 
marshal mathers --> 
mathias crochon --> 
patrick benguigui --> 
shawn corey carter --> https://en.wikipedia.org/wiki/Jay_Z


**Same idea as headings**,

if the abstract_urls are the same for a real name and a fake name, 
they are very likely a match

In [9]:
from sklearn.preprocessing import LabelBinarizer
from urllib.parse import unquote

abstract_urls = [unquote(h["AbstractURL"]) for h in answers_from_duckduckgo]
abstracturl_label_binarizer = LabelBinarizer()
abstracturl_label_binarizer.fit(list(filter(lambda x: x, abstract_urls)))
abstracturl_features = abstracturl_label_binarizer.transform(abstract_urls)
print(abstracturl_features.shape)
print(abstracturl_features)
print(abstracturl_label_binarizer.classes_)

(22, 14)
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0]]
['https://en.wikipedia.org/wiki/Booba'
 'https://en.wikipedia.org/wiki/Bruno_Lopes'
 'https://en.wikipedia.org/wiki/Doc_Gynéco'
 'https://en.wikipedia.org/wiki/Dr._Dre'
 'https://en.wikipedia.org/wiki/Dwayne_Johnson'
 'https://en.wikipedia.org/wiki/Eminem'
 'https://en.wikipedia.org/wiki/GZA' 'https://en.wi

#### Images

In [10]:
for name, answer in zip(all_names, answers_from_duckduckgo):
    print(("%s --> %s" % (name, answer["Image"])))

booba --> https://duckduckgo.com/i/0167cf4f.jpg
doc gyneco --> https://duckduckgo.com/i/d8964f42.jpg
dr dre --> https://duckduckgo.com/i/3862a395.jpg
eminem --> https://duckduckgo.com/i/eca98bbd.jpg
gza --> https://duckduckgo.com/i/cd060632.jpg
jay z --> https://duckduckgo.com/i/5fbc8a9e.png
joey starr --> https://duckduckgo.com/i/9c228ee7.jpg
kool shen --> 
patrick bruel --> https://duckduckgo.com/i/7e788845.jpg
rockin squat --> 
the rock --> 
andre romelle young --> https://duckduckgo.com/i/3862a395.jpg
bruno beausir --> 
bruno lopes --> 
didier morville --> https://duckduckgo.com/i/9c228ee7.jpg
dwayne johnson --> https://duckduckgo.com/i/639d3832.jpg
elie yaffa --> 
gary grice --> 
marshal mathers --> 
mathias crochon --> 
patrick benguigui --> 
shawn corey carter --> 


**Same idea as headings**

if the **image url** are the same for a real name and a fake name, they are very likely a match

In [11]:
image_urls = [unquote(h["Image"]) for h in answers_from_duckduckgo]
imageurl_label_binarizer = LabelBinarizer()
imageurl_label_binarizer.fit(list(filter(lambda x: x, image_urls)))
image_urls_features = imageurl_label_binarizer.transform(image_urls)
print(image_urls_features.shape)
print(image_urls_features)
print(imageurl_label_binarizer.classes_)

(22, 9)
[[1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0]
 [0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 1 0 0]
 [0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]]
['https://duckduckgo.com/i/0167cf4f.jpg'
 'https://duckduckgo.com/i/3862a395.jpg'
 'https://duckduckgo.com/i/5fbc8a9e.png'
 'https://duckduckgo.com/i/639d3832.jpg'
 'https://duckduckgo.com/i/7e788845.jpg'
 'https://duckduckgo.com/i/9c228ee7.jpg'
 'https://duckduckgo.com/i/cd060632.jpg'
 'https://duckduckgo.com/i/d8964f42.jpg'
 'https://duckduckgo.com/i/eca98bbd.jpg']


**Thinking on this image features**

the image url feature we build above is very weak.

High chances that the image url of a match (fake name, real name) are not the same, we need to judge from the **image content**, not **image url**.

In that case, we could assume there is **a face** for every image, then we could calculate the **hog features vectors** for every images for similiarity checks

between faces, or, we could use **CNN** based networks to extract **face features** for use, but all these features are computing intensive, thus, we will ignore here

**let's see what we could achieve using the three features that we build: headings, abstract url, image_url**

In [12]:
import numpy as np

def get_similarity_scores(features_df, pseudonyms, realnames):
    pseudonyms = [regularize_name(x) for x in pseudonyms]
    realnames = [regularize_name(x) for x in realnames]
    score_matrix = np.array([np.dot(np.array(features_df.loc[x]), np.array(features_df.loc[y]))
                             for x, y in itertools.product(pseudonyms, realnames)])
    score_matrix = score_matrix.reshape(len(pseudonyms), len(realnames))
    score_df = pd.DataFrame(score_matrix, index=pseudonyms, columns=realnames)                                                                                                          
    #find the best match                                                                                                                                                                
    score_df["match"] = [score_df.loc[name].idxmax() for name in score_df.index]                                                                                                       
    return score_df

features = np.concatenate((headings_features, abstracturl_features, image_urls_features), axis=1)
features_df = pd.DataFrame.from_records(features, index=all_names)
scores_df = get_similarity_scores(features_df, pseudonym_names, real_names)
scores_df

Unnamed: 0,andre romelle young,bruno beausir,bruno lopes,didier morville,dwayne johnson,elie yaffa,gary grice,marshal mathers,mathias crochon,patrick benguigui,shawn corey carter,match
booba,0,0,0,0,0,0,0,0,0,0,0,andre romelle young
doc gyneco,0,0,0,0,0,0,0,0,0,0,0,andre romelle young
dr dre,3,0,0,0,0,0,0,0,0,0,0,andre romelle young
eminem,0,0,0,0,0,0,0,0,0,0,0,andre romelle young
gza,0,0,0,0,0,0,0,0,0,0,0,andre romelle young
jay z,0,0,0,0,0,0,0,0,0,0,1,shawn corey carter
joey starr,0,0,0,3,0,0,0,0,0,0,0,didier morville
kool shen,0,0,0,0,0,0,0,0,0,0,0,andre romelle young
patrick bruel,0,0,0,0,0,0,0,0,0,0,0,andre romelle young
rockin squat,0,0,0,0,0,0,0,0,0,0,0,andre romelle young


as we can see, using the three week features we build above, we could only found **three matches**

we need more features

#### Abstract

In [13]:
for name, answer in zip(all_names, answers_from_duckduckgo):
    print(("%s --> %s" % (name, answer["Abstract"])))

booba --> Elie Yaffa, better known under his stage name Booba is a French rapper. After a brief stint as a break dancer in the early 1990s, Booba partnered with his friend Ali to form Lunatic. The duo released a critically acclaimed album in 2000 but disbanded in 2003. Booba has since embarked on a successful solo career, selling more than 10 million discs over his career and becoming the most legally downloaded artist in French history. Booba is praised for the quality of his flow and beats but often criticized because of controversial nature of his lyrics. He has also established the rap label Tallac Records, and developed a line of jewellery.
doc gyneco --> Doc Gynéco is the stage name of Bruno Beausir, a French hip hop musician. His music is typically characterized as a ragga/rap style, that has found its fan base in France.
dr dre --> Andre Romelle Young, better known by his stage name Dr. Dre, is an American rapper, record producer, and entrepreneur. He is the founder and current

**Idea**
_______________________

If a real name and a fake name appears in **the same abstract**, there are likely a match.

We could build a **tfidf matrix** from the abstract corpus, then, for a chosen real/fake name, we sum the tfidf vector of its composing words for every abstract as its feature vector.

If a real/fake names **co-exists** in one or more abstract, the dot product should be positive. High dot product means high probability of match.


In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def process_text_features(names, text_corpus):                                                                                                                                              
    vectorizer  = CountVectorizer(strip_accents="unicode", analyzer="word",                                                                                                                 
            token_pattern = TOKEN_PATTERN, lowercase=True)
    
    transformer = TfidfTransformer()                                                                                                                                                        
    tfidf = transformer.fit_transform(vectorizer.fit_transform(text_corpus))
                                                                                                                                                                                            
    feature_dim      = tfidf.shape[0]                                                                                                                                                       
    vocalbulary_size = tfidf.shape[1]                                                                                                                                                       
                                                                                                                                                                                            
    vocalbulary_map  = vectorizer.vocabulary_                                                                                                                                              
                                                                                                                                                                                            
    def get_coexistence_feature_for_a_single_name(names):                                                                                                                                   
        feature_vec = np.zeros((1, feature_dim))                                                                                                                                            
        tokens = names.split(" ")                                                                                                                                                           
        for token in tokens:
            #print(token)
            idx = vocalbulary_map.get(token, None)                                                                                                                                          
            if idx is not None:
                feature_vec += tfidf[:, idx].toarray().reshape(1, feature_dim)
        return feature_vec
                                                                                                                                                                                            
    features = []                                                                                                                                                                           
    for name in names:                                                                                                                                                                      
        features.append(get_coexistence_feature_for_a_single_name(name))                                                                                                                    
                                                                                                                                                                                            
    return np.array(features).reshape(len(names), feature_dim)

abstracts = [answer["Abstract"] for answer in answers_from_duckduckgo]
abstract_features = process_text_features(all_names, abstracts)

In [15]:
abstract_features.shape

(22, 22)

In [16]:
features = np.concatenate((headings_features, abstracturl_features, image_urls_features, abstract_features), axis=1)
features_df = pd.DataFrame.from_records(features, index=all_names)
scores_df = get_similarity_scores(features_df, pseudonym_names, real_names)
scores_df

Unnamed: 0,andre romelle young,bruno beausir,bruno lopes,didier morville,dwayne johnson,elie yaffa,gary grice,marshal mathers,mathias crochon,patrick benguigui,shawn corey carter,match
booba,0.0,0.0,0.0,0.0,0.0,0.068162,0.0,0.0,0.0,0.0,0.0,elie yaffa
doc gyneco,0.0,0.18577,0.087073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,bruno beausir
dr dre,3.121587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,andre romelle young
eminem,0.03669,0.0,0.0,0.0,0.0,0.0,0.0,0.08269,0.0,0.0,0.0,marshal mathers
gza,0.0,0.0,0.0,0.0,0.0,0.0,0.047581,0.0,0.0,0.0,0.0,gary grice
jay z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.051789,shawn corey carter
joey starr,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,didier morville
kool shen,0.0,0.015889,0.0339,0.199363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,didier morville
patrick bruel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.274079,0.0,patrick benguigui
rockin squat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051133,0.0,0.0,mathias crochon



#### Infobox

In [17]:
def process_info_box(infobox):
    if isinstance(infobox, str):
        return infobox
    else:
        values_list = infobox.get("content", "")
        content_corpus = ",".join(map(lambda x: x["value"], values_list[0:-1]))
        return content_corpus
    
for name, answer in zip(all_names, answers_from_duckduckgo):
    print(("%s --> %s" % (name, process_info_box(answer["Infobox"]))))

booba --> Élie Yaffa,Dec 9, 1976,Boulogne-Billancourt, France,French rap, gangsta rap, trap,Rapper,1994–present,Tallac, Universal, Barclay, Because, 45 Scientific, Musicast l'Autreprod,Ali, Diddy, Ryan Leslie, Akon, Bushido, T-Pain, Nessbeal, La Fouine, 92I, Tony Parker, 113, Rock City, Alborosie, 2 Chainz, Rick Ross, Kaaris, Lacrim,booba
doc gyneco --> 
dr dre --> Andre Romelle Young, Feb 18, 1965, Compton, California, U.S.,Rapper, songwriter, record producer, entrepreneur,1984–present,$740 million,Warren G (step-brother),[drdre.com],drdre,DrDre,nm0236564
eminem --> Marshall Bruce Mathers III, Oct 17, 1972, St. Joseph, Missouri, U.S.,Rochester Hills, Michigan, U.S.,Double M, M&M,Rapper, record producer, songwriter, actor,1988–present,3,eminem,eminem,eminem,UCfM3zsQsOnfWNUppiycmBuw,nm0004896,7dGJo4pcD2V6oG8kP0tJRR,eminemofficial,As2n6c7fmjt6fyysxuwgjgynzp4,111051
gza --> Gary Grice,The Genius,Aug 22, 1966, Brooklyn, New York, U.S.,Hip hop,Rapper, songwriter,1987–present,Cold Chillin', 

**born** and **birth name** in infobox content is a good feature

if a real/fake appears in the **two sides** of (name, born and birth name), we are almost certain that it is a match

In [18]:
def get_infobox_key_info(infobox, key):
    if isinstance(infobox, str):
        return infobox
    else:
        values_list = infobox.get("content", None)
        if values_list:
            for value in values_list:
                if value["label"] == key:
                    return value["value"]
        return ""
    
for name, answer in zip(all_names, answers_from_duckduckgo):
    print(("%s --> %s" % (name, get_infobox_key_info(answer["Infobox"], "Born"))))
    print(("%s --> %s" % (name, get_infobox_key_info(answer["Infobox"], "Birth name"))))

booba --> Dec 9, 1976
booba --> Élie Yaffa
doc gyneco --> 
doc gyneco --> 
dr dre --> Andre Romelle Young, Feb 18, 1965, Compton, California, U.S.
dr dre --> 
eminem --> Marshall Bruce Mathers III, Oct 17, 1972, St. Joseph, Missouri, U.S.
eminem --> 
gza --> Aug 22, 1966, Brooklyn, New York, U.S.
gza --> Gary Grice
jay z --> Shawn Corey Carter, Dec 4, 1969, New York City, New York, U.S.
jay z --> 
joey starr --> Oct 27, 1967, Saint-Denis, Seine-Saint-Denis, France
joey starr --> Didier Morville
kool shen --> Bruno Lopes, February 9, 1966, Saint-Denis, Paris, France
kool shen --> 
patrick bruel --> May 14, 1959, Tlemcen, French Algeria
patrick bruel --> 
rockin squat --> 
rockin squat --> 
the rock --> 
the rock --> 
andre romelle young --> Andre Romelle Young, Feb 18, 1965, Compton, California, U.S.
andre romelle young --> 
bruno beausir --> 
bruno beausir --> 
bruno lopes --> 
bruno lopes --> 
didier morville --> Oct 27, 1967, Saint-Denis, Seine-Saint-Denis, France
didier morville -->

In [19]:
#print(all_names)
all_names_regex = r"(" + "|".join(all_names) + r")"
#print(all_names_regex)
all_names_pat = re.compile(all_names_regex)
all_names_map = dict([(name, idx) for idx, name in enumerate(all_names)])
#print(all_names_map)
born_or_birthname_features = np.identity((len(all_names)))

for name, answer in zip(all_names, answers_from_duckduckgo):
    born = regularize_name(get_infobox_key_info(answer["Infobox"], "Born"))
    born_names = all_names_pat.findall(born)
    if len(born_names) != 0:
        #print(born_names[0])
        born_or_birthname_features[all_names_map[name], all_names_map[born_names[0]]] += 1  
        
    birth_name  = regularize_name(get_infobox_key_info(answer["Infobox"], "Birth name"))
    birth_names = all_names_pat.findall(birth_name)
    if len(birth_names) != 0:
        #print(birth_names[0])
        born_or_birthname_features[all_names_map[name], all_names_map[birth_names[0]]] += 1

print(all_names)
#print(born_or_birthname_features)
print(born_or_birthname_features.shape)
print(born_or_birthname_features[7,:])
print(born_or_birthname_features[13,:])

['booba', 'doc gyneco', 'dr dre', 'eminem', 'gza', 'jay z', 'joey starr', 'kool shen', 'patrick bruel', 'rockin squat', 'the rock', 'andre romelle young', 'bruno beausir', 'bruno lopes', 'didier morville', 'dwayne johnson', 'elie yaffa', 'gary grice', 'marshal mathers', 'mathias crochon', 'patrick benguigui', 'shawn corey carter']
(22, 22)
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]


In [21]:
features = np.concatenate((headings_features, abstracturl_features, image_urls_features, abstract_features, born_or_birthname_features), axis=1)
features_df = pd.DataFrame.from_records(features, index=all_names)
scores_df = get_similarity_scores(features_df, pseudonym_names, real_names)
scores_df

Unnamed: 0,andre romelle young,bruno beausir,bruno lopes,didier morville,dwayne johnson,elie yaffa,gary grice,marshal mathers,mathias crochon,patrick benguigui,shawn corey carter,match
booba,0.0,0.0,0.0,0.0,0.0,1.068162,0.0,0.0,0.0,0.0,0.0,elie yaffa
doc gyneco,0.0,0.18577,0.087073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,bruno beausir
dr dre,5.121587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,andre romelle young
eminem,0.03669,0.0,0.0,0.0,0.0,0.0,0.0,0.08269,0.0,0.0,0.0,marshal mathers
gza,0.0,0.0,0.0,0.0,0.0,0.0,1.047581,0.0,0.0,0.0,0.0,gary grice
jay z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.051789,shawn corey carter
joey starr,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,didier morville
kool shen,0.0,0.015889,1.0339,0.199363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,bruno lopes
patrick bruel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.274079,0.0,patrick benguigui
rockin squat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051133,0.0,0.0,mathias crochon
