In [48]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from pandas import DataFrame


# Exploring the data & Preprocessing

### We can do a combination of the following for data normalization: 
1. Convert all entities to upper or lower case, and remove whitespace
2. Run a spell checker to remove known typographical errors 
3. Replace nicknames, and expand abbreviations 
4. Perform lookups in lexicons 
5. Tokenize, Stem, or Lemmatize words 

### We can do a combination of the following for missing values: 
1. Set to NaN, Null, or remove 
2. Missing entries can also be filled by aggregating other fields or taking means 


# Directory that holds the input files. It defaults to the original author's
# local path; set ENTITY_RESOLUTION_DATA_DIR to run the notebook elsewhere
# without editing this cell.
DATA_DIR = os.environ.get(
    "ENTITY_RESOLUTION_DATA_DIR",
    "/Users/Rong/Documents/USF/EntityResolution/Model1",
)

# Table of named entities (used below via its article_id / name / type /
# paragraph columns).
entities = pd.read_csv(os.path.join(DATA_DIR, "named_resolution.csv"))
# Articles the entities refer to.
articles = pd.read_json(os.path.join(DATA_DIR, "articles.json"))


In [49]:
# Preview the first rows of the entity table, restricted to the key columns.
preview_columns = ['article_id', 'name', 'type', 'paragraph']
print(entities.head()[preview_columns])

   article_id         name      type  paragraph
0      331842     Broadway  LOCATION          1
1      331842  Daniel Fish    PERSON          1
2      331842      Rodgers    PERSON          1
3      331842  Hammerstein    PERSON          1
4      331842     Oklahoma  LOCATION          1


In [50]:
# print(articles.head(5)[['content', 'id']])

# Get all entities that are people only
entity_people = entities[entities['type'] == 'PERSON']
print(entity_people.head(10)[['article_id','name','type','paragraph']])

# Upper-case every name and strip all non-word characters (whitespace and
# punctuation; letters, digits and underscores are kept).
# `regex=True` is passed explicitly: newer pandas versions treat the pattern as
# a literal string by default, which would silently leave the names unchanged.
# The raw string avoids the invalid-escape warning for `\W`.
entity_people_names_CAPITALS = (
    entity_people['name']
    .str.upper()
    .str.replace(r'\W+', '', regex=True)
)
entity_people_names_CAPITALS.head(10)

    article_id                  name    type  paragraph
1       331842           Daniel Fish  PERSON          1
2       331842               Rodgers  PERSON          1
3       331842           Hammerstein  PERSON          1
6       331842          Damon Daunno  PERSON          3
9       331842                  Fish  PERSON          6
11      331842  Oscar Hammerstein II  PERSON          7
12      331842                  Fish  PERSON          8
13      331842           Trevor Nunn  PERSON          8
14      331842         Susan Stroman  PERSON          8
17      331842            Lynn Riggs  PERSON          9


1             DANIELFISH
2                RODGERS
3            HAMMERSTEIN
6            DAMONDAUNNO
9                   FISH
11    OSCARHAMMERSTEINII
12                  FISH
13            TREVORNUNN
14          SUSANSTROMAN
17             LYNNRIGGS
Name: name, dtype: object

In [71]:
# Flag every name whose normalised form already appeared earlier in the series.
is_repeat_name = entity_people_names_CAPITALS.duplicated()
is_repeat_name.head(20)

1     False
2     False
3     False
6     False
9     False
11    False
12     True
13    False
14    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
Name: name, dtype: bool

# How to obtain distributed representations of words 

There are several methods to compute the DRs of words: 
1. word2Vec https://github.com/maxoodf/word2vec
2. GloVe https://nlp.stanford.edu/projects/glove/
3. fastText https://fasttext.cc/




# Model 1 - Clustering entities with GloVe

#### GloVe = Global Vectors for Word Representation

GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.




#### Inputs: Entity 1, Context String Entity 1, Entity 2, Context String Entity 2 
#### Outputs: True/False 


#### Preprocessing 
- Convert all entities into lower or upper case format
- Select only entities whose type is 'PERSON'

#### Clustering
for each (Entity + Context String) in the dataset 
        if we find a match 
            add it to the Map 
        else 
            create new entry in the Map 
        
#### Post Processing 
Select the representative entity of each bucket

    

# Model 2 - Clustering entities in the Naive Approach

#### 1. Levenshtein 
#### 2. Affine Gap Distance 
#### 3. Jaro–Winkler distance

In [72]:
# NOTE: `levenshtein` is defined in the "Feature 1 - Levenshtein" cell of this
# notebook; that cell has to be executed before this one.

# Largest edit distance at which two distinct names are reported as a match.
MAX_EDIT_DISTANCE = 2

# Count how often each normalised name occurs. The mapping is called
# `name_counts` (not `dict`) so the built-in type is not shadowed for the rest
# of the kernel session.
name_counts = {}
for word in entity_people_names_CAPITALS:
    name_counts[word] = name_counts.get(word, 0) + 1

# column1 = normalised name, column2 = number of occurrences
df = DataFrame(list(name_counts.items()), columns = ['column1','column2'])

# Compare every unique name with every other one and print the near matches.
unique_names = df['column1'].tolist()
for word1 in unique_names:
    for word2 in unique_names:
        # Identical names have distance 0 and are never reported. The edit
        # distance is also at least the difference in length, so pairs whose
        # lengths differ by more than the threshold cannot match. Skipping
        # both cases avoids most of the expensive distance computations
        # without changing what is printed.
        if word1 == word2 or abs(len(word1) - len(word2)) > MAX_EDIT_DISTANCE:
            continue
        value = levenshtein(word1, word2)
        if 0 < value <= MAX_EDIT_DISTANCE:
            print(word1 + " " + word2)


JUD DAD
VAILL BRILL
VAILL HILL
VAILL BILL
DAVIS DAISY
BRILL VAILL
BRILL HILL
BRILL BILL
JONES NUNES
WILLIAMS WILLIAM
STEVEMNUCHIN STEVENMNUCHIN
FOX ROE
FOX COX
HOYER DYER
HOYER HAYES
CORMAN HERMAN
CORMAN JORDAN
CORMAN MORGAN
CORMAN HOMAN
DAD JUD
DAD WADE
LEAR LEARS
LEAR AZAR
LEAR LARA
VIC TIM
VIC TIA
VIC ERIC
LEARS LEAR
LEARS LARA
REGAN MEGHAN


KeyboardInterrupt: 

#### Feature 1 - Levenshtein

In [25]:
def levenshtein(seq1, seq2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``seq1`` into ``seq2``.

    Parameters
    ----------
    seq1, seq2 : sequences supporting ``len()`` and iteration (e.g. ``str``)
        The two sequences to compare; elements are compared with ``==``.

    Returns
    -------
    float
        The edit distance. It is always a whole number but is returned as a
        float, as earlier versions of this function did, so printed output and
        numeric comparisons in existing cells are unchanged.
    """
    # The distance is symmetric, so keep the shorter sequence along the row to
    # make the two row buffers as small as possible.
    if len(seq1) < len(seq2):
        seq1, seq2 = seq2, seq1

    # previous_row[y] = distance between the processed prefix of seq1 and the
    # first y elements of seq2. It starts with the empty prefix of seq1.
    previous_row = list(range(len(seq2) + 1))

    for x, item1 in enumerate(seq1, start=1):
        # Turning the first x elements of seq1 into an empty sequence costs x.
        current_row = [x]
        for y, item2 in enumerate(seq2, start=1):
            substitution_cost = 0 if item1 == item2 else 1
            current_row.append(min(
                previous_row[y] + 1,                      # delete item1
                previous_row[y - 1] + substitution_cost,  # match / substitute
                current_row[y - 1] + 1,                   # insert item2
            ))
        previous_row = current_row

    return float(previous_row[-1])

In [27]:
print(levenshtein("Hello", "Bebbo"))

3.0


#### Feature 2 - Affine Gap Distance