In [36]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy
import warnings
warnings.filterwarnings('ignore')

# Display settings: show every column, never wrap a wide frame onto several lines,
# and never truncate long cell contents (useful for article text).
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# None is the supported "no limit" value; the old -1 sentinel is deprecated and
# rejected by current pandas. The fully qualified key avoids ambiguous option matching.
pd.set_option('display.max_colwidth', None)

from datetime import date
import re
from datetime import datetime

# Entity Resolution Problem Statement

Journalists, academics, and businesses work hard to get big masses of data to learn about what people or organizations are doing. Unfortunately, once we get the data, we often can’t answer our questions because we can’t tell who is who.

In much real-world data, we have no way of deciding with certainty whether two records, say David Guy-Brizan and D Guy-Brizan, refer to the same person. If these were records of professors at the University of San Francisco, did David Guy-Brizan give two lectures on two separate topics, or did a Desmond Guy-Brizan give the lecture on the second topic? Perhaps the lectures could even be on completely separate topics.

People are pretty good at making these calls, if they have enough information. For example, I would be pretty confident that the following two records are about the same person.

|first name | last name | Topic                   | hours   |
| --- | --- | --- | --- |
|David      | Guy-Brizan   | Machine Learning   | 2pm - 4pm Thurs |
|D          | Guy-Brizan   | Algorithms |   2pm - 4pm Tuesdays|

If we have to decide which records in our data are about the same person or organization, then we could just go through by hand, compare every record, and decide which records are about the same entity.

This is very, very boring and can take a long time.

# Exploring the data & Preprocessing

### We can do a combination of the following for data normalization: 
1. Convert all entities to upper or lower case, and remove whitespace
2. Run a spell checker to remove known typographical errors 
3. Replace nicknames, and expand abbreviations 
4. Perform lookups in lexicons 
5. Tokenize, Stem, or Lemmatize words 

### We can do a combination of the following for missing values: 
1. Set to NaN, Null, or remove 
2. Missing entries can also be filled by aggregating other fields or taking means 

In [37]:
# Read in the data.
# The data directory defaults to the original location but can be overridden with the
# ENTITY_RESOLUTION_DATA_DIR environment variable, so the notebook also runs on
# machines where that absolute path does not exist.
DATA_DIR = os.environ.get(
    "ENTITY_RESOLUTION_DATA_DIR",
    "/Users/Rong/Documents/USF/EntityResolution/Model1",
)
entities = pd.read_csv(os.path.join(DATA_DIR, "named_resolution.csv"))
articles = pd.read_json(os.path.join(DATA_DIR, "articles.json"))


In [38]:
# Peek at the first five rows of the raw entity table
first_rows = entities.head(5)
print(first_rows)

# Same five rows, restricted to the columns that matter for entity resolution
relevant_columns = ['article_id', 'name', 'type', 'paragraph']
print(first_rows[relevant_columns])

         id  article_id                               model         name      type  entity_id  paragraph  sentence
0  27303856  331842      english.all.3class.distsim.crf.ser  Broadway     LOCATION  2551       1          1       
1  27303857  331842      english.all.3class.distsim.crf.ser  Daniel Fish  PERSON    85567      1          2       
2  27303858  331842      english.all.3class.distsim.crf.ser  Rodgers      PERSON    70833      1          2       
3  27303859  331842      english.all.3class.distsim.crf.ser  Hammerstein  PERSON    98182      1          2       
4  27303860  331842      english.all.3class.distsim.crf.ser  Oklahoma     LOCATION  1332       1          2       
   article_id         name      type  paragraph
0  331842      Broadway     LOCATION  1        
1  331842      Daniel Fish  PERSON    1        
2  331842      Rodgers      PERSON    1        
3  331842      Hammerstein  PERSON    1        
4  331842      Oklahoma     LOCATION  1        


In [39]:
# print(articles.head(5)[['content', 'id']])

# Keep only the entities whose type is PERSON.
# .copy() makes this an independent frame, so columns added to it later do not
# write into a slice of `entities` (which triggers SettingWithCopyWarning).
entity_people = entities[entities['type'] == 'PERSON'].copy()
print(entity_people.head(10)[['article_id','name','type','paragraph']])

# Upper-case every name and strip all non-word characters (whitespace, punctuation).
# Note that \W keeps digits and underscores. regex=True must be explicit: recent pandas
# treats the pattern as a literal string by default, which would strip nothing.
# The raw string avoids the invalid '\W' escape-sequence warning.
entity_people_names_CAPITALS = (
    entity_people['name'].str.upper().str.replace(r'\W+', '', regex=True)
)
entity_people_names_CAPITALS.head(10)

    article_id                  name    type  paragraph
1   331842      Daniel Fish           PERSON  1        
2   331842      Rodgers               PERSON  1        
3   331842      Hammerstein           PERSON  1        
6   331842      Damon Daunno          PERSON  3        
9   331842      Fish                  PERSON  6        
11  331842      Oscar Hammerstein II  PERSON  7        
12  331842      Fish                  PERSON  8        
13  331842      Trevor Nunn           PERSON  8        
14  331842      Susan Stroman         PERSON  8        
17  331842      Lynn Riggs            PERSON  9        


1     DANIELFISH        
2     RODGERS           
3     HAMMERSTEIN       
6     DAMONDAUNNO       
9     FISH              
11    OSCARHAMMERSTEINII
12    FISH              
13    TREVORNUNN        
14    SUSANSTROMAN      
17    LYNNRIGGS         
Name: name, dtype: object

In [40]:
# Check whether the normalised names contain duplicates: duplicated() flags a value
# as True when the same value already appeared at an earlier row.
entity_people_names_CAPITALS.duplicated().head(20)  

## YES, WE DO! (e.g. the second occurrence of FISH at index 12 is flagged True)

1     False
2     False
3     False
6     False
9     False
11    False
12    True 
13    False
14    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
Name: name, dtype: bool

# Libraries for distributed representations of words 

There are several methods to compute the DRs of words: 
1. word2Vec https://github.com/maxoodf/word2vec
2. GloVe https://nlp.stanford.edu/projects/glove/
3. fastText https://fasttext.cc/




# Model 0 - Clustering entities with GloVe

#### GloVe = Global Vectors for Word Representation

GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.




#### Inputs: Entity 1, Context String Entity 1, Entity 2, Context String Entity 2 
#### Outputs: True/False 


#### Preprocessing 
Convert all Entities into lower or upper case format
Select only entities that are categorised as 'People'

#### Clustering
for each (Entity + Context String) in the dataset 
        if we find a match 
            add it to the Map 
        else 
            create new entry in the Map 
        
#### Post Processing 
Select the entity representative of each bucket

    

# FuzzyWuzzy Playground
FuzzyWuzzy performs fuzzy string matching: it uses Levenshtein distance to calculate the differences between sequences, in a simple-to-use package.



In [41]:
# fuzz is used to compare TWO strings
from fuzzywuzzy import fuzz

# process is used to compare a string to MULTIPLE other strings
from fuzzywuzzy import process


In [42]:
# for each name, check for similar entries
def get_ratio(row): 
    """Return the unweighted mean of three fuzzywuzzy token_set_ratio scores for one row.

    The three scores compare the row's 'name', 'type' and 'article_id' values against
    the corresponding columns of the global `entity_people` frame.

    NOTE(review): the first argument of every comparison is an entire column (a pandas
    Series), not a single value. The row is therefore not compared against any specific
    other record; presumably fuzzywuzzy falls back to the column's string representation.
    Confirm, and rework if a record-to-record similarity was intended.

    Parameters
    ----------
    row : a row of the entities table exposing 'name', 'type' and 'article_id'.

    Returns
    -------
    The average of the three scores, as computed by np.average.
    """
    name = fuzz.token_set_ratio(entity_people['name'], row['name'])
    entity_type = fuzz.token_set_ratio(entity_people['type'], row['type'])
    article_id = fuzz.token_set_ratio(entity_people['article_id'], row['article_id'])
    
    # Alternative that was tried: weight the name score more heavily than the type score.
#     weights = [8.0, 2.0]
        
#     return np.average(np.array([name, entity_type]), weights=weights)
    return np.average(np.array([name, entity_type, article_id]))

In [43]:
# Add the SCORE column.
# Only the PERSON rows are scored: scoring every row of `entities` computed values
# for non-person rows that index alignment then discarded. assign() returns a new
# frame, so nothing is written into a slice of `entities`.
entity_people = entity_people.assign(SCORE=entity_people.apply(get_ratio, axis=1))

entity_people.head()

# What does this score tell us?


Unnamed: 0,id,article_id,model,name,type,entity_id,paragraph,sentence,SCORE
1,27303857,331842,english.all.3class.distsim.crf.ser,Daniel Fish,PERSON,85567,1,2,100.0
2,27303858,331842,english.all.3class.distsim.crf.ser,Rodgers,PERSON,70833,1,2,100.0
3,27303859,331842,english.all.3class.distsim.crf.ser,Hammerstein,PERSON,98182,1,2,100.0
6,27303863,331842,english.all.3class.distsim.crf.ser,Damon Daunno,PERSON,546335,3,1,100.0
9,27303870,331842,english.all.3class.distsim.crf.ser,Fish,PERSON,85567,6,2,100.0


In [44]:


# Convert all names to upper case and remove non-word characters (whitespace, punctuation).
# regex=True must be explicit: recent pandas treats the pattern as a literal string by
# default. The values are aligned to entity_people by row index.
entity_people = entity_people.assign(
    name=entities['name'].str.upper().str.replace(r'\W+', '', regex=True)
)

# Show the five LOWEST-scoring rows (ascending sort). entity_people already contains
# only PERSON rows, so no additional type filter is needed.
entity_people.sort_values('SCORE', ascending=True).head(5)
# Highest scores first:
# entity_people.sort_values('SCORE', ascending=False).head(70)


Unnamed: 0,id,article_id,model,name,type,entity_id,paragraph,sentence,SCORE
1089,45270731,423929,english.all.3class.distsim.crf.ser,DOUBT,PERSON,886957,12,1,35.0
1059,45269526,423928,english.all.3class.distsim.crf.ser,RIVAS,PERSON,886939,9,1,35.0
2035,45386277,424545,english.all.3class.distsim.crf.ser,ABRAMS,PERSON,888323,5,1,35.0
2042,45041895,423294,english.all.3class.distsim.crf.ser,OBAMA,PERSON,319,3,2,35.0
1022,45272338,423931,english.all.3class.distsim.crf.ser,RIVERA,PERSON,178524,7,1,35.0


In [45]:
# Find the normalised names closest to "DONALDTRUMP", scored with token_sort_ratio.
# Each result is a (matched name, score, row index) tuple.
process.extract("DONALDTRUMP", entity_people.name, scorer=fuzz.token_sort_ratio)

[('DONALDTRUMP', 100, 528),
 ('DONALDTRUMP', 100, 864),
 ('DONALDTRUMP', 100, 1112),
 ('DONALDTRUMP', 100, 2104),
 ('DONALDJTRUMP', 96, 1893)]

# Applying Dedupe

In [46]:
# Get all data related to people only.
# Re-deriving from `entities` starts again from the raw rows (original-case names,
# no SCORE column). .copy() makes the result an independent frame, so columns added
# later do not write into a slice of `entities`.
entity_people = entities[entities['type'] == 'PERSON'].copy()

# print(entity_people[entity_people.name == 'HAMMERSTEIN'])
entity_people.head()

Unnamed: 0,id,article_id,model,name,type,entity_id,paragraph,sentence
1,27303857,331842,english.all.3class.distsim.crf.ser,Daniel Fish,PERSON,85567,1,2
2,27303858,331842,english.all.3class.distsim.crf.ser,Rodgers,PERSON,70833,1,2
3,27303859,331842,english.all.3class.distsim.crf.ser,Hammerstein,PERSON,98182,1,2
6,27303863,331842,english.all.3class.distsim.crf.ser,Damon Daunno,PERSON,546335,3,1
9,27303870,331842,english.all.3class.distsim.crf.ser,Fish,PERSON,85567,6,2


In [47]:
# Compare the size of the full entity table with the PERSON-only subset.
print("Shape of entities : {}".format(entities.shape))
print("Shape of entity_people : {}".format(entity_people.shape))

# Here we see that out of the 2391 data entries that we have, 1222 of those are classified as a PERSON.
# For our first model, we will use the dedupe library on type=PERSON


Shape of entities : (2391, 8)
Shape of entities : (1222, 8)


In [48]:
# NOTE(review): per the logging docs, disable(NOTSET) removes any overriding disable
# level, i.e. it re-enables logging rather than silencing it. Confirm this is the intent.
import logging; logging.disable(level=logging.NOTSET)

# Entity Resolution with Dedupe.io

Dedupe is a library that uses machine learning to perform de-duplication and entity resolution quickly on structured data.

Dedupe will help us remove duplicate entries from our people dataset of name entities, whilst taking into account the paragraph information which provides context to the named entities. 

Dedupe takes in human training data and comes up with the best rules for your dataset to quickly and automatically find similar records, even with very large databases.


### Variable Types
A variable definition describes the records that you want to match. It is a dictionary where the keys are the fields and the values are the field specification. For our example, we will use the following:

#### String Types
String types are compared using affine gap string distance.

#### Text Types
Fields containing long blocks of text e.g. product descriptions or article abstracts we use the Text type fields. These are compared using the cosine similarity metric.

This is a measurement of the number of words that two documents have in common. This measure can be made more useful by letting the overlap of rare words count more than the overlap of common words.

If provided a sequence of example fields (i.e. a corpus) then dedupe will learn these weights for you.

#### Name Types
A Name variable should be used for a field that contains American names, corporations and households. It uses the probablepeople package to split apart a name string into components like given name, surname, generational suffix, for people names, and abbreviation, company type, and legal form for corporations.

## Building blocks


#### Record similarity
We make the assumption that records that are more similar are more likely to be duplicates. 

The default way that this is done in Dedupe is to use what’s called a string metric. A string metric is a way of taking two strings and returning a number that is low if the strings are similar and high if they are dissimilar. 

There are lots of different string metrics, and Dedupe actually uses one called the Affine Gap Distance. Like the familiar Levenshtein distance used in FuzzyWuzzy, it measures how many edits are needed to turn one string into another, but it penalizes a run of consecutive insertions or deletions (a gap) less than the same number of isolated ones.


#### Regularized logistic regression
If we supply pairs of records that we label as either being duplicates or distinct, then Dedupe will learn a set of weights such that the record distance can easily be transformed into our best estimate of the probability that a pair of records are duplicates.

Once we have learned these good weights, we want to use them to find which records are duplicates. But turns out that doing this the naive way will usually not work, and we’ll have to do something smarter.


#### Active Learning
In order to learn those weights, Dedupe needs example pairs with labels. Most of the time, we will need to supply those labels.

But the whole point of Dedupe is to save people’s time, and that includes making good use of your labeling time so we use an approach called Active Learning.

To do this, we maintain a set of the pairs where there is disagreement: that is, pairs which the classifier believes are duplicates but which are not covered by the current blocking rules, and pairs which the classifier believes are distinct but which are blocked together.

Dedupe picks, at random from this disagreement set, a pair of records and asks the user to decide. Once it gets this label, it relearns the weights and blocking rules. We then recalculate the disagreement set.






# Model 1 - Dedupe.io with name, article_id, and abs_paragraph

In [49]:
# Dedupe variable definition for Model 1: one dictionary per compared field.
fields = [
    # Name-typed field; also carries the "crf" flag and a "log file" path.
    {"crf": True, "type": "Name", "field": "abs_name", "log file": "/tmp/name.csv"},
    # Numeric fields.
    {"type": "PositiveNumber", "field": "abs_article_id"},
    {"type": "PositiveNumber", "field": "abs_paragraph"},
]

# Our results for Model 1


In [50]:
# Load the clustered person records (each row carries a cluster_id).
deduped_people_path = "/Users/Rong/Documents/USF/EntityResolution/Model1/DedupedPersons.csv"
deduped_people = pd.read_csv(deduped_people_path)

# Two views of the same records: grouped by surface name and by assigned cluster.
grouped_name = deduped_people.groupby('name')
grouped_clusterId = deduped_people.groupby('cluster_id')

# Every record whose name is exactly 'Trump'.
grouped_name.get_group('Trump')

Unnamed: 0,cluster_id,id,article_id,model,name,type,entity_id,paragraph,sentence
373,63758edf-0cdc-4c6e-bc20-ec1823f54dfb,45273599,423932,english.all.3class.distsim.crf.ser,Trump,PERSON,167,5,2
374,63758edf-0cdc-4c6e-bc20-ec1823f54dfb,45289264,424080,english.all.3class.distsim.crf.ser,Trump,PERSON,167,13,3
375,63758edf-0cdc-4c6e-bc20-ec1823f54dfb,45289328,424080,english.all.3class.distsim.crf.ser,Trump,PERSON,167,27,3
376,63758edf-0cdc-4c6e-bc20-ec1823f54dfb,45289377,424080,english.all.3class.distsim.crf.ser,Trump,PERSON,167,38,2
377,63758edf-0cdc-4c6e-bc20-ec1823f54dfb,31202403,320418,english.all.3class.distsim.crf.ser,Trump,PERSON,167,4,1
378,63758edf-0cdc-4c6e-bc20-ec1823f54dfb,45345139,424388,english.all.3class.distsim.crf.ser,Trump,PERSON,167,7,1
379,63758edf-0cdc-4c6e-bc20-ec1823f54dfb,45349381,424392,english.all.3class.distsim.crf.ser,Trump,PERSON,167,2,2
380,63758edf-0cdc-4c6e-bc20-ec1823f54dfb,45349405,424392,english.all.3class.distsim.crf.ser,Trump,PERSON,167,8,2
381,63758edf-0cdc-4c6e-bc20-ec1823f54dfb,45350342,424393,english.all.3class.distsim.crf.ser,Trump,PERSON,167,1,1
382,63758edf-0cdc-4c6e-bc20-ec1823f54dfb,45350353,424393,english.all.3class.distsim.crf.ser,Trump,PERSON,167,3,1


In [51]:
# TODO: for each name calculate the number of errors in each of its groupings.
# (Not implemented yet: the line below only displays the first rows of every name group.)

grouped_name.head()

Unnamed: 0,cluster_id,id,article_id,model,name,type,entity_id,paragraph,sentence
0,0074c072-db24-469b-ae55-cd37b0637e14,31202397,320418,english.all.3class.distsim.crf.ser,Miriam Buether,PERSON,763107,3,2
1,02146eca-2281-4d98-b7f7-3838e3eb08f7,45286564,424079,english.all.3class.distsim.crf.ser,John Durham,PERSON,39439,3,1
2,02146eca-2281-4d98-b7f7-3838e3eb08f7,45295121,424082,english.all.3class.distsim.crf.ser,Connecticut John Durham,PERSON,828645,2,1
3,02146eca-2281-4d98-b7f7-3838e3eb08f7,45286592,424079,english.all.3class.distsim.crf.ser,Durham,PERSON,39439,8,1
4,02146eca-2281-4d98-b7f7-3838e3eb08f7,45286582,424079,english.all.3class.distsim.crf.ser,John Durham,PERSON,39439,6,1
5,02146eca-2281-4d98-b7f7-3838e3eb08f7,45286560,424079,english.all.3class.distsim.crf.ser,John Durham,PERSON,39439,2,1
6,031da3d0-179b-43be-a47f-d73dd6d2367f,45117349,423455,english.all.3class.distsim.crf.ser,Jackie,PERSON,4462,3,1
7,054341de-275b-4c30-b6a0-9d3c262bdbb7,45295149,424082,english.all.3class.distsim.crf.ser,Tucker Carlson,PERSON,5097,5,1
8,054341de-275b-4c30-b6a0-9d3c262bdbb7,45295118,424082,english.all.3class.distsim.crf.ser,Tucker Carlson,PERSON,5097,2,1
9,0547d4c7-8c90-477b-bbac-be43c60ad5fd,45158684,423617,english.all.3class.distsim.crf.ser,Holly Metcalf Kinyon,PERSON,885179,3,2


# Model 2 - Dedupe.io with the associated Article as input

For our second model, we include the article content associated with the entity to give us greater context when applying Machine Learning.

In [101]:

# Optional preview of the article columns (commented out: only a cell's last
# expression is displayed, so it had no visible effect here).
# articles.head(10)[['content','description','id']]

# Build a dictionary mapping each article id to its content.
# When an id occurs more than once, the first article seen wins.
unique_articles = articles.drop_duplicates(subset='id', keep='first')
article_id = dict(zip(unique_articles['id'], unique_articles['content']))

count = len(article_id)              # number of distinct article ids stored
duplicate = len(articles) - count    # rows skipped because their id was already stored

print(count)
print(duplicate)
len(article_id)


100
0


100

In [102]:
# NOTE(review): the saved output of this cell already shows a 'content' column, so it
# was last executed after that column was added, not in top-to-bottom order.
entity_people.head()

Unnamed: 0,id,article_id,model,name,type,entity_id,paragraph,sentence,content
1,27303857,331842,english.all.3class.distsim.crf.ser,Daniel Fish,PERSON,85567,1,2,
2,27303858,331842,english.all.3class.distsim.crf.ser,Rodgers,PERSON,70833,1,2,
3,27303859,331842,english.all.3class.distsim.crf.ser,Hammerstein,PERSON,98182,1,2,
6,27303863,331842,english.all.3class.distsim.crf.ser,Damon Daunno,PERSON,546335,3,1,
9,27303870,331842,english.all.3class.distsim.crf.ser,Fish,PERSON,85567,6,2,


In [140]:
# Attach the text of each person's source article as a new 'content' column.
# Series.map looks every article_id up in the `article_id` dictionary in one vectorised
# pass, replacing a row-by-row loop. Ids missing from the dictionary become NaN (the
# loop stored None); both are written as empty fields by to_csv. assign() returns a
# new frame, so nothing is written into a slice of `entities`.
entity_people = entity_people.assign(
    content=entity_people['article_id'].map(article_id)
)

# entity_people.head()

In [142]:
# entity_people.head()

In [144]:
# Persist the people + article-content table; index=False leaves the row index out of the file.
# NOTE(review): absolute, user-specific path. Consider a configurable data directory.
entity_people.to_csv(r'/Users/Rong/Documents/USF/EntityResolution/Model1/people_articles.csv', index = False)

In [148]:
# Sanity check: same 1222 PERSON rows, now with 9 columns (the original 8 plus 'content').
print(entity_people.shape)

(1222, 9)


In [151]:
# Reload the people + article-content table from people_articles.csv.
people_articles = pd.read_csv("/Users/Rong/Documents/USF/EntityResolution/Model1/people_articles.csv")


## Applying Dedupe

In [152]:
# Dedupe variable definition for Model 2: the Model 1 style fields plus the sentence
# number and the full article text.
fields = [
    {"type": "Name", "field": "name"},
    # Numeric position fields.
    {"type": "PositiveNumber", "field": "article_id"},
    {"type": "PositiveNumber", "field": "paragraph"},
    {"type": "PositiveNumber", "field": "sentence"},
    # Long free text, declared with the Text variable type.
    {"type": "Text", "field": "content"},
]