In [208]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)

from datetime import date
import re
from datetime import datetime

# Exploring the data & Preprocessing

### We can do a combination of the following for data normalization: 
1. Convert all entities to upper or lower case, and remove whitespace
2. Run a spell checker to remove known typographical errors 
3. Replace nicknames, and expand abbreviations 
4. Perform looksups in lexicons 
5. Tokenize, Stem, or Lemmatize words 

### We can do a combination of the following for missing values: 
1. Set to Nan, Null, or remove 
2. Missing entries can also be filled by aggregating other fields or taking means 

In [209]:
# Read in the data
entities = pd.read_csv("/Users/Rong/Documents/USF/EntityResolution/Model1/named_resolution.csv")
articles = pd.read_json("/Users/Rong/Documents/USF/EntityResolution/Model1/articles.json")


In [210]:
# View data head
print(entities.head(5))
# select only relevant columns 
print(entities.head()[['article_id','name','type','paragraph']])

         id  article_id                               model         name      type  entity_id  paragraph  sentence
0  27303856  331842      english.all.3class.distsim.crf.ser  Broadway     LOCATION  2551       1          1       
1  27303857  331842      english.all.3class.distsim.crf.ser  Daniel Fish  PERSON    85567      1          2       
2  27303858  331842      english.all.3class.distsim.crf.ser  Rodgers      PERSON    70833      1          2       
3  27303859  331842      english.all.3class.distsim.crf.ser  Hammerstein  PERSON    98182      1          2       
4  27303860  331842      english.all.3class.distsim.crf.ser  Oklahoma     LOCATION  1332       1          2       
   article_id         name      type  paragraph
0  331842      Broadway     LOCATION  1        
1  331842      Daniel Fish  PERSON    1        
2  331842      Rodgers      PERSON    1        
3  331842      Hammerstein  PERSON    1        
4  331842      Oklahoma     LOCATION  1        


In [211]:
# print(articles.head(5)[['content', 'id']])


# Get all entities that are type PEOPLE only 
entity_people = entities[entities['type'] == 'PERSON']
print(entity_people.head(10)[['article_id','name','type','paragraph']])

# Capitalise all names and remove spaces and non alphabetical characters
entity_people_names_CAPITALS = entity_people['name'].str.upper().str.replace('\W+', '')
entity_people_names_CAPITALS.head(10)

    article_id                  name    type  paragraph
1   331842      Daniel Fish           PERSON  1        
2   331842      Rodgers               PERSON  1        
3   331842      Hammerstein           PERSON  1        
6   331842      Damon Daunno          PERSON  3        
9   331842      Fish                  PERSON  6        
11  331842      Oscar Hammerstein II  PERSON  7        
12  331842      Fish                  PERSON  8        
13  331842      Trevor Nunn           PERSON  8        
14  331842      Susan Stroman         PERSON  8        
17  331842      Lynn Riggs            PERSON  9        


1     DANIELFISH        
2     RODGERS           
3     HAMMERSTEIN       
6     DAMONDAUNNO       
9     FISH              
11    OSCARHAMMERSTEINII
12    FISH              
13    TREVORNUNN        
14    SUSANSTROMAN      
17    LYNNRIGGS         
Name: name, dtype: object

In [212]:
# check to see if we have duplicate Names
entity_people_names_CAPITALS.duplicated().head(20)  

## YES, WE DO !

1     False
2     False
3     False
6     False
9     False
11    False
12    True 
13    False
14    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
Name: name, dtype: bool

# Libraries for distributed representations of words 

There are several methods to compute the DRs of words: 
1. word2Vec https://github.com/maxoodf/word2vec
2. GloVe https://nlp.stanford.edu/projects/glove/
3. fastText https://fasttext.cc/




# Model 1 - Clustering entities with GloVe

#### GloVe = Global Vectors for Word Representation

GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.




#### Inputs: Entity 1, Context String Entity 1, Entity 2, Context String Entity 2 
#### Outputs: True/False 


#### Preprocessing 
Convert all Entities into lower or upper case format
Select only entities that are categorised as 'People'

#### Clustering
for each (Entity + Context String) in the dataset 
        if we find a match 
            add it to the Map 
        else 
            create new entry in the Map 
        
#### Post Processing 
Select the the entity representative of each bucket

    

# Model 2 - Entity Resolution with Dedupe



In [244]:
entities.head()

entity_people = entities[entities['type'] == 'PERSON']


In [248]:
print("Shape of entities : {}".format(entities.shape))
print("Shape of entities : {}".format(entity_people.shape))


Shape of entities : (2391, 9)
Shape of entities : (1222, 9)


### FuzzyWuzzy
Fuzzy string matching like a boss. It uses Levenshtein Distance to calculate the differences between sequences in a simple-to-use package.

In [249]:
# fuzz is used to compare TWO strings
from fuzzywuzzy import fuzz

# process is used to compare a string to MULTIPLE other strings
from fuzzywuzzy import process

In [256]:
#check for similar entries
def get_ratio(row): 
    name         = fuzz.token_set_ratio(entity_people['name'], row['name'])
    entity_type     = fuzz.token_set_ratio(entity_people['type'], row['type'])
    article_id = fuzz.token_set_ratio(entity_people['article_id'], row['article_id'])
    
    weights = [8.0, 0.0, 2.0]
    
    print(np.array([name, entity_type, article_id]))
    
    return np.average(np.array([name, entity_type, article_id]), weights=weights)

In [257]:
# add score column 
entity_people['SCORE']  = entities.apply(get_ratio, axis=1)

entity_people.head()


[  2   4 100]
[100 100 100]
[100 100 100]
[100 100 100]
[  2   4 100]
[  3   4 100]
[100 100 100]
[  5   4 100]
[  2   4 100]
[100 100 100]
[  2   4 100]
[100 100 100]
[100 100 100]
[100 100 100]
[100 100 100]
[  1   4 100]
[  2   4 100]
[100 100 100]
[100 100 100]
[100 100 100]
[100 100 100]
[100 100 100]
[100 100 100]
[100 100 100]
[100 100 100]
[100 100 100]
[100 100 100]
[100 100 100]
[100 100 100]
[100 100 100]
[100 100 100]
[100 100 100]
[100 100 100]
[100 100 100]
[100 100 100]
[  1   4 100]
[100 100 100]
[100 100 100]
[  2   4 100]
[100 100 100]
[  2 100 100]
[100 100 100]
[  1 100 100]
[  1 100 100]
[100 100 100]
[  1 100 100]
[  1 100 100]
[  1   4 100]
[100 100 100]
[100 100 100]
[100 100 100]
[  3   4 100]
[  4   4 100]
[  2   4 100]
[  4 100 100]
[  3 100 100]
[  2 100 100]
[3 4 4]
[1 4 4]
[  3 100   4]
[  1 100   4]
[  3 100   4]
[2 4 4]
[4 4 4]
[2 4 4]
[  3 100   4]
[2 4 4]
[5 4 4]
[2 4 4]
[  3 100   4]
[  2 100   4]
[3 4 4]
[  1 100   4]
[  1 100   4]
[  2 100   4]
[  2

[4 4 4]
[3 4 4]
[4 4 4]
[1 4 4]
[3 4 4]
[  2 100   4]
[  3 100   4]
[3 4 4]
[2 4 4]
[6 4 4]
[  1 100   4]
[2 4 4]
[3 4 4]
[  1 100   4]
[  1 100   4]
[  1 100   4]
[  1 100   4]
[  2 100   4]
[  2 100   4]
[2 4 4]
[  1 100   4]
[  2 100   4]
[4 4 4]
[4 4 4]
[  2 100   4]
[  1 100   4]
[  1 100   4]
[  2 100   4]
[  1 100   4]
[  2 100   4]
[3 4 4]
[  1 100   4]
[  1 100   4]
[6 4 4]
[1 4 4]
[  3 100   4]
[4 4 4]
[  3 100   4]
[1 4 4]
[2 4 4]
[2 4 4]
[3 4 4]
[  1 100   4]
[  3 100   4]
[  3 100   4]
[  1 100   4]
[2 4 4]
[100 100   4]
[2 4 4]
[3 4 4]
[100 100   4]
[  1 100   4]
[1 4 4]
[  1 100   4]
[100 100   4]
[100 100   4]
[  2 100   4]
[100 100   4]
[100 100   4]
[100 100   4]
[  1 100   4]
[  2 100   4]
[100 100   4]
[  2 100   4]
[  2 100   4]
[100 100   4]
[  2 100   4]
[100 100   4]
[  2 100   4]
[  2 100   4]
[1 4 4]
[1 4 4]
[5 4 4]
[100 100   4]
[100 100   4]
[1 4 4]
[  3 100   4]
[0 4 4]
[2 4 4]
[  3 100   4]
[0 4 4]
[0 4 4]
[1 4 4]
[3 4 4]
[1 4 4]
[1 4 4]
[0 4 4]
[1 4 4]
[1

Unnamed: 0,id,article_id,model,name,type,entity_id,paragraph,sentence,SCORE
1,27303857,331842,english.all.3class.distsim.crf.ser,DANIELFISH,PERSON,85567,1,2,100.0
2,27303858,331842,english.all.3class.distsim.crf.ser,RODGERS,PERSON,70833,1,2,100.0
3,27303859,331842,english.all.3class.distsim.crf.ser,HAMMERSTEIN,PERSON,98182,1,2,100.0
6,27303863,331842,english.all.3class.distsim.crf.ser,DAMONDAUNNO,PERSON,546335,3,1,100.0
9,27303870,331842,english.all.3class.distsim.crf.ser,FISH,PERSON,85567,6,2,100.0


In [240]:
# entities.head(50)

In [253]:
entity_people.head()
entity_people = entity_people.assign(name=entities['name'].str.upper().str.replace('\W+', ''))
# entities.loc[entities['type'] == 'PERSON'].sort_values('SCORE',ascending=True).head(70)
entity_people.loc[entities['type']=='PERSON'].sort_values('SCORE',ascending=False).head(70)


Unnamed: 0,id,article_id,model,name,type,entity_id,paragraph,sentence,SCORE
1,27303857,331842,english.all.3class.distsim.crf.ser,DANIELFISH,PERSON,85567,1,2,100.0
2339,45035693,423290,english.all.3class.distsim.crf.ser,TRUMP,PERSON,167,15,2,100.0
2353,44986718,423139,english.all.3class.distsim.crf.ser,MUSK,PERSON,1899,6,1,100.0
2350,44986713,423139,english.all.3class.distsim.crf.ser,ELONMUSK,PERSON,1899,2,1,100.0
2348,45035711,423290,english.all.3class.distsim.crf.ser,ADAMSHAW,PERSON,27998,19,1,100.0
2345,45035706,423290,english.all.3class.distsim.crf.ser,MORGAN,PERSON,82222,17,1,100.0
2344,45035705,423290,english.all.3class.distsim.crf.ser,JEANINE,PERSON,46794,17,1,100.0
2342,45035703,423290,english.all.3class.distsim.crf.ser,JEANINEPIRRO,PERSON,46794,17,1,100.0
2340,45035696,423290,english.all.3class.distsim.crf.ser,TRUMP,PERSON,167,16,1,100.0
2335,45035685,423290,english.all.3class.distsim.crf.ser,MCALEENAN,PERSON,21100,12,1,100.0
