In [13]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy
import warnings
warnings.filterwarnings('ignore')

# Display configuration: show every column, keep wide frames on one line,
# and never truncate the text inside a cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# `None` means "no limit". The former sentinel -1 is deprecated and is
# rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

from datetime import date
import re
from datetime import datetime

# Entity Resolution Problem Statement

Journalists, academics, and businesses work hard to get big masses of data to learn about what people or organizations are doing. Unfortunately, once we get the data, we often can’t answer our questions because we can’t tell who is who.

In much real-world data, we have no way of deciding with certainty whether two records, say David Guy-Brizan and D Guy-Brizan, refer to the same person. If these were records of professors at the University of San Francisco, did David Guy-Brizan give two lectures on two separate topics, or did a Desmond Guy-Brizan give the lecture on the second topic? The two lectures could even be on completely unrelated topics.

People are pretty good at making these calls, if they have enough information. For example, I would be pretty confident that the following two records are about the same person.

|first name | last name | Topic                   | hours   |
| --- | --- | --- | --- |
|David      | Guy-Brizan   | Machine Learning   | 2pm - 4pm Thurs |
|D          | Guy-Brizan   | Algorithms |   2pm - 4pm Tuesdays|

If we have to decide which records in our data are about the same person or organization, then we could just go through by hand, compare every record, and decide which records are about the same entity.

This is very, very boring and can take a long time. 

## What is Entity Resolution

Entity Resolution is the task of disambiguating manifestations of real world entities in various records or mentions by linking and grouping. For example, there could be different ways of addressing the same person in text, different addresses for businesses, or photos of a particular object.

# Exploring the data & Preprocessing

### We can do a combination of the following for data normalization: 
1. Convert all entities to upper or lower case, and remove whitespace
2. Run a spell checker to remove known typographical errors 
3. Replace nicknames, and expand abbreviations 
4. Perform lookups in lexicons 
5. Tokenize, Stem, or Lemmatize words 
6. Break up words to multiple columns

### We can do a combination of the following for missing values: 
1. Set to NaN, Null, or remove 
2. Missing entries can also be filled by aggregating other fields or taking means 

# Reading in the data

In [14]:
# Read in the data.
# The directory defaults to the original local path; set the
# ENTITY_RESOLUTION_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = os.environ.get(
    "ENTITY_RESOLUTION_DATA_DIR",
    "/Users/Rong/Documents/USF/EntityResolution/Model1/sample_data_from_Anant",
)
entities = pd.read_csv(os.path.join(DATA_DIR, "named_resolution_original.csv"))
articles = pd.read_json(os.path.join(DATA_DIR, "articles.json"))


In [15]:
# Preview the first five rows of the raw entity table.
print(entities.head(5))

# Same preview, restricted to the columns that matter for Model 1.
model1_columns = ['article_id', 'name', 'type', 'paragraph']
print(entities[model1_columns].head())

         id  article_id                               model         name      type  entity_id  paragraph  sentence
0  27303856  331842      english.all.3class.distsim.crf.ser  Broadway     LOCATION  2551       1          1       
1  27303857  331842      english.all.3class.distsim.crf.ser  Daniel Fish  PERSON    85567      1          2       
2  27303858  331842      english.all.3class.distsim.crf.ser  Rodgers      PERSON    70833      1          2       
3  27303859  331842      english.all.3class.distsim.crf.ser  Hammerstein  PERSON    98182      1          2       
4  27303860  331842      english.all.3class.distsim.crf.ser  Oklahoma     LOCATION  1332       1          2       
   article_id         name      type  paragraph
0  331842      Broadway     LOCATION  1        
1  331842      Daniel Fish  PERSON    1        
2  331842      Rodgers      PERSON    1        
3  331842      Hammerstein  PERSON    1        
4  331842      Oklahoma     LOCATION  1        


# Work with PERSON entries

In [16]:
# Keep only the entities tagged as PERSON. `.copy()` makes this an
# independent frame, so columns added to it later are not written into a
# slice of `entities`.
entity_people = entities[entities['type'] == 'PERSON'].copy()
print(entity_people.head(10)[['article_id','name','type','paragraph']])

# Upper-case every name and strip all non-word characters (whitespace,
# punctuation). The pattern is a raw string and `regex=True` is explicit:
# recent pandas versions treat the pattern as a literal string by default,
# which would silently leave the names unchanged.
entity_people_names_CAPITALS = (
    entity_people['name']
    .str.upper()
    .str.replace(r'\W+', '', regex=True)
)
entity_people_names_CAPITALS.head(10)

    article_id                  name    type  paragraph
1   331842      Daniel Fish           PERSON  1        
2   331842      Rodgers               PERSON  1        
3   331842      Hammerstein           PERSON  1        
6   331842      Damon Daunno          PERSON  3        
9   331842      Fish                  PERSON  6        
11  331842      Oscar Hammerstein II  PERSON  7        
12  331842      Fish                  PERSON  8        
13  331842      Trevor Nunn           PERSON  8        
14  331842      Susan Stroman         PERSON  8        
17  331842      Lynn Riggs            PERSON  9        


1     DANIELFISH        
2     RODGERS           
3     HAMMERSTEIN       
6     DAMONDAUNNO       
9     FISH              
11    OSCARHAMMERSTEINII
12    FISH              
13    TREVORNUNN        
14    SUSANSTROMAN      
17    LYNNRIGGS         
Name: name, dtype: object

In [17]:
# Flag each normalised name that has already appeared earlier in the series.
is_repeated_name = entity_people_names_CAPITALS.duplicated()
is_repeated_name.head(20)

## YES, WE DO!

1     False
2     False
3     False
6     False
9     False
11    False
12    True 
13    False
14    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
Name: name, dtype: bool

# Word2Vec: Libraries for distributed representations of words 

There are several methods to compute the DRs of words: 
1. word2Vec https://github.com/maxoodf/word2vec
2. GloVe https://nlp.stanford.edu/projects/glove/
3. fastText https://fasttext.cc/




# Clustering entities with GloVe

#### GloVe = Global Vectors for Word Representation

GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.




#### Inputs: Entity 1, Context String Entity 1, Entity 2, Context String Entity 2 
#### Outputs: True/False 


#### Preprocessing 
Convert all Entities into lower or upper case format
Select only entities that are categorised as 'People'

#### Clustering
for each (Entity + Context String) in the dataset 
        if we find a match 
            add it to the Map 
        else 
            create new entry in the Map 
        
#### Post Processing 
Select the entity representative of each bucket

    

Code Deprecated

# FuzzyWuzzy Playground
FuzzyWuzzy is a simple-to-use package that applies string matching algorithms such as Levenshtein distance to calculate the differences between sequences.

The fuzzywuzzy library gives a similarity score out of 100 for a pair of strings, where 100 means the two strings are considered equal. The article linked below explains how to get started with the library.


- https://www.geeksforgeeks.org/fuzzywuzzy-python-library/

- https://pypi.org/project/fuzzywuzzy/

In [18]:
# fuzz is used to compare TWO strings
from fuzzywuzzy import fuzz

# process is used to compare a string to MULTIPLE other strings
from fuzzywuzzy import process


In [19]:
# for each name, check for similar entries
def get_ratio(row):
    """Return the unweighted mean of three fuzzy scores for one entity row.

    For each of the ``name``, ``type`` and ``article_id`` fields,
    ``fuzz.token_set_ratio`` is called with the matching ``entity_people``
    column as its first argument and the row's own value as its second.

    NOTE(review): the first argument is the entire column rather than a
    single value; confirm that this is the comparison that was intended.
    """
    compared_fields = ('name', 'type', 'article_id')
    field_scores = [
        fuzz.token_set_ratio(entity_people[field], row[field])
        for field in compared_fields
    ]

    # A weighted variant was tried and left disabled:
    #     weights = [8.0, 2.0]
    #     return np.average(np.array([name, entity_type]), weights=weights)
    return np.average(np.array(field_scores))

In [20]:
# Add the SCORE column.
# Only the PERSON rows are scored: applying get_ratio to every row of
# `entities` computed values for rows that were then dropped when the result
# was aligned to entity_people's index. `assign` returns a new frame, so no
# value is written into a filtered slice.
entity_people = entity_people.assign(
    SCORE=entity_people.apply(get_ratio, axis=1)
)

entity_people.head()

Unnamed: 0,id,article_id,model,name,type,entity_id,paragraph,sentence,SCORE
1,27303857,331842,english.all.3class.distsim.crf.ser,Daniel Fish,PERSON,85567,1,2,100.0
2,27303858,331842,english.all.3class.distsim.crf.ser,Rodgers,PERSON,70833,1,2,100.0
3,27303859,331842,english.all.3class.distsim.crf.ser,Hammerstein,PERSON,98182,1,2,100.0
6,27303863,331842,english.all.3class.distsim.crf.ser,Damon Daunno,PERSON,546335,3,1,100.0
9,27303870,331842,english.all.3class.distsim.crf.ser,Fish,PERSON,85567,6,2,100.0


In [21]:
# Upper-case the names and strip all non-word characters (whitespace,
# punctuation). Raw-string pattern with explicit `regex=True`: recent pandas
# versions otherwise treat the pattern as a literal string and remove nothing.
# `assign` aligns on the index, so only entity_people's rows are updated.
entity_people = entity_people.assign(
    name=entities['name'].str.upper().str.replace(r'\W+', '', regex=True)
)

# Show the five LOWEST scores (ascending order). entity_people already holds
# PERSON rows only, so no additional type filter is needed.
entity_people.sort_values('SCORE', ascending=True).head(5)


Unnamed: 0,id,article_id,model,name,type,entity_id,paragraph,sentence,SCORE
1089,45270731,423929,english.all.3class.distsim.crf.ser,DOUBT,PERSON,886957,12,1,35.0
1059,45269526,423928,english.all.3class.distsim.crf.ser,RIVAS,PERSON,886939,9,1,35.0
2035,45386277,424545,english.all.3class.distsim.crf.ser,ABRAMS,PERSON,888323,5,1,35.0
2042,45041895,423294,english.all.3class.distsim.crf.ser,OBAMA,PERSON,319,3,2,35.0
1022,45272338,423931,english.all.3class.distsim.crf.ser,RIVERA,PERSON,178524,7,1,35.0


In [22]:
# Look up the closest matches for one sample entry: DonaldTrump
sample_query = "DONALDTRUMP"
process.extract(sample_query, entity_people['name'], scorer=fuzz.token_sort_ratio)

[('DONALDTRUMP', 100, 528),
 ('DONALDTRUMP', 100, 864),
 ('DONALDTRUMP', 100, 1112),
 ('DONALDTRUMP', 100, 2104),
 ('DONALDJTRUMP', 96, 1893)]

# Entity Resolution with Dedupe.io

In [111]:
# Get all data related to people only.
# `.copy()` makes this an independent frame, so columns added to it later are
# not written into a slice of `entities`.
# NOTE(review): this rebinds entity_people from the raw table, discarding the
# SCORE column and the upper-cased names produced by the FuzzyWuzzy cells
# above, yet a later cell's saved output still shows a SCORE column. Confirm
# the intended execution order.
entity_people = entities[entities['type'] == 'PERSON'].copy()

entity_people.head()

Unnamed: 0,id,article_id,model,name,type,entity_id,paragraph,sentence
1,27303857,331842,english.all.3class.distsim.crf.ser,Daniel Fish,PERSON,85567,1,2
2,27303858,331842,english.all.3class.distsim.crf.ser,Rodgers,PERSON,70833,1,2
3,27303859,331842,english.all.3class.distsim.crf.ser,Hammerstein,PERSON,98182,1,2
6,27303863,331842,english.all.3class.distsim.crf.ser,Damon Daunno,PERSON,546335,3,1
9,27303870,331842,english.all.3class.distsim.crf.ser,Fish,PERSON,85567,6,2


In [112]:
# Compare the size of the full entity table with the PERSON-only subset.
print("Shape of entities : {}".format(entities.shape))
print("Shape of entity_people : {}".format(entity_people.shape))

# Here we see that out of the 2391 data entries that we have, 1222 of those are classified as a PERSON.
# For our first model, we will use the dedupe library on type=PERSON


Shape of entities : (2391, 8)
Shape of entities : (1222, 8)


In [113]:
# Call logging.disable with the NOTSET level.
# NOTE(review): per the standard-library docs, NOTSET removes any previously
# set global override rather than silencing messages; confirm that is the
# intent here.
import logging; logging.disable(level=logging.NOTSET)

## About Dedupe
dedupe is a python library that uses machine learning to perform fuzzy matching, deduplication and entity resolution quickly on structured data.

dedupe will help you:

- remove duplicate entries from a spreadsheet of names and addresses
- link a list with customer information to another with order history, even without unique customer IDs
- take a database of campaign contributions and figure out which ones were made by the same person, even if the names were entered slightly differently for each record

dedupe takes in human training data and comes up with the best rules for your dataset to quickly and automatically find similar records, even with very large databases.

The dedupe open source python library is what we will use to work with our dataset:
https://github.com/dedupeio/dedupe




## How we are using Dedupe
Dedupe will help us remove duplicate entries from our people dataset of name entities, whilst taking into account the paragraph information which provides context to the named entities. 

Dedupe takes in human training data and comes up with the best rules for your dataset to quickly and automatically find similar records, even with very large databases.


## Variable Types
A variable definition describes the records that you want to match. It is a dictionary where the keys are the fields and the values are the field specification. For our example, we will use the following:

#### String Types
String types are compared using affine gap string distance.

#### Text Types
Fields containing long blocks of text e.g. product descriptions or article abstracts we use the Text type fields. These are compared using the cosine similarity metric.

This is a measurement of the number of words that two documents have in common. This measure can be made more useful by letting the overlap of rare words count more than the overlap of common words.

If provided a sequence of example fields (i.e. a corpus) then dedupe will learn these weights for you.

#### Name Types
A Name variable should be used for a field that contains American names, corporations and households. It uses the probablepeople package to split apart a name string into components like given name, surname, generational suffix, for people names, and abbreviation, company type, and legal form for corporations.

## How it works 

Choose column types --> Train Model --> Review Clusters --> Add to clusters --> Polish Clusters

## Building blocks


#### Record similarity
We make the assumption that records that are more similar are more likely to be duplicates. 

The default way that this is done in Dedupe is to use what’s called a string metric. A string metric is a way of taking two strings and returning a number that is low if the strings are similar and high if they are dissimilar. 

There are lots of different string metrics, and we actually use a metric called the Affine Gap Distance. It counts the number of substitutions that must be made to turn one string into another. It is similar to our familiar Levenshtein distance, that is used in Fuzzy Wuzzy.


#### Regularized logistic regression
If we supply pairs of records that we label as either being duplicates or distinct, then Dedupe will learn a set of weights such that the record distance can easily be transformed into our best estimate of the probability that a pair of records are duplicates.

Once we have learned these good weights, we want to use them to find which records are duplicates. But it turns out that doing this the naive way will usually not work, and we’ll have to do something smarter.


#### Active Learning
In order to learn those weights, Dedupe needs example pairs with labels. Most of the time, we will need to supply those labels.

But the whole point of Dedupe is to save people’s time, and that includes making good use of labeling time so we use an approach called Active Learning.

To do this, we maintain a set of the pairs where there is disagreement: that is, pairs which the classifier believes are duplicates but which are not covered by the current blocking rules, and pairs which the classifier believes are distinct but which are blocked together.

Dedupe picks, at random from this disagreement set, a pair of records and asks the user to decide. Once it gets this label, it relearns the weights and blocking rules. We then recalculate the disagreement set.






# Measuring Entity Resolution Results 

Accuracy
- Precise record clusters -- we don’t want to link records that don’t belong together
- Clusters with high or complete recall -- we want everything associated with that entity

Auditability
When linking records together into clusters, we also want to be able to trace our steps, so that if a problem appears later we can go back, figure out what we did, and correct it.

I've included the training data where useful in the Github repository of the project.




# Model 1 - Dedupe.io with fields: name, article_id, and abs_paragraph

In [23]:
# Field definitions for Model 1: one Name field and two numeric fields.
fields = [
    # Name field; also carries the "crf" flag and a "log file" path.
    {"crf": True, "type": "Name", "field": "abs_name", "log file": "/tmp/name.csv"},
    # Numeric article identifier.
    {"type": "PositiveNumber", "field": "abs_article_id"},
    # Numeric paragraph position.
    {"type": "PositiveNumber", "field": "abs_paragraph"},
]

# Our results for Model 1


In [28]:
# Load the clustered output of Model 1 and group its rows by cluster id.
deduped_people = pd.read_csv("/Users/Rong/Documents/USF/EntityResolution/Model1/persons/DedupedPersons.csv")
grouped_clusterId_model1 = deduped_people.groupby('cluster_id')

# Pull out one cluster by id and print its size followed by its rows.
trump_cluster_id = '63758edf-0cdc-4c6e-bc20-ec1823f54dfb'
trump = grouped_clusterId_model1.get_group(trump_cluster_id)

print(len(trump))
print(trump)

56
                               cluster_id        id  article_id                               model          name    type  entity_id  paragraph  sentence
373  63758edf-0cdc-4c6e-bc20-ec1823f54dfb  45273599  423932      english.all.3class.distsim.crf.ser  Trump         PERSON  167        5          2       
374  63758edf-0cdc-4c6e-bc20-ec1823f54dfb  45289264  424080      english.all.3class.distsim.crf.ser  Trump         PERSON  167        13         3       
375  63758edf-0cdc-4c6e-bc20-ec1823f54dfb  45289328  424080      english.all.3class.distsim.crf.ser  Trump         PERSON  167        27         3       
376  63758edf-0cdc-4c6e-bc20-ec1823f54dfb  45289377  424080      english.all.3class.distsim.crf.ser  Trump         PERSON  167        38         2       
377  63758edf-0cdc-4c6e-bc20-ec1823f54dfb  31202403  320418      english.all.3class.distsim.crf.ser  Trump         PERSON  167        4          1       
378  63758edf-0cdc-4c6e-bc20-ec1823f54dfb  45345139  424388      english.

In [29]:
# Find the total number of names relative to number of clusters
def _first_seen_index(values):
    """Map each distinct value to the 0-based order of its first appearance."""
    # dict.fromkeys keeps one key per distinct value, in first-seen order.
    return {value: position for position, value in enumerate(dict.fromkeys(values))}


# Distinct names, numbered in order of first appearance.
all_names = deduped_people['name']
names_dict = _first_seen_index(all_names)

# Distinct cluster ids, numbered in order of first appearance.
clusters = deduped_people['cluster_id']
cluster_dict = _first_seen_index(clusters)

print("Number of names", len(names_dict))
print("Number of clusters", len(cluster_dict))
    

Number of names 452
Number of clusters 294


## Model 1 Overall data: 

#### Number of names: 452

#### Number of clusters: 294


In [44]:
# A function to calculate the number of entries in each cluster
def clusterIdToCount( dataframe ):
    """Count how many rows of ``dataframe`` belong to each cluster.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``cluster_id`` column; all other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per distinct cluster id, in order of first appearance.
        Column ``0`` holds the cluster id and column ``1`` its row count.
        A frame with no rows yields an empty DataFrame.

    Raises
    ------
    KeyError
        If ``dataframe`` has rows but no ``cluster_id`` column.
    """
    # A frame without rows has nothing to count (and needs no column lookup).
    if len(dataframe) == 0:
        return pd.DataFrame([])

    mappedItems = {}
    # Walk the single column that is needed instead of materialising every
    # row with iterrows(); the counts and their order are unchanged.
    for clusterId in dataframe['cluster_id']:
        mappedItems[clusterId] = mappedItems.get(clusterId, 0) + 1

    return pd.DataFrame(list(mappedItems.items()))


In [35]:
# Build a table of Model 1 cluster ids and the number of rows in each,
# starting from the two columns of interest.
clusterIdWithName = deduped_people.loc[:, ['cluster_id', 'name']]
clusterToCount_Model1 = clusterIdToCount(clusterIdWithName)

clusterToCount_Model1.head(10)

Unnamed: 0,0,1
0,0074c072-db24-469b-ae55-cd37b0637e14,1
1,02146eca-2281-4d98-b7f7-3838e3eb08f7,5
2,031da3d0-179b-43be-a47f-d73dd6d2367f,1
3,054341de-275b-4c30-b6a0-9d3c262bdbb7,2
4,0547d4c7-8c90-477b-bbac-be43c60ad5fd,2
5,066e8640-3995-4436-8a87-cbf3274dce3c,1
6,06840a8a-ba96-48d7-a8cc-0299498be89b,1
7,068b4c90-8e50-4e4d-b510-42e9b54762e3,1
8,06aa896b-ed2b-414e-a293-b5c54a02b6c3,3
9,06b17667-dc3b-4405-b7dc-60aecedbe850,16


# Model 2 - Person data with the associated Article as input

For our second model, we include the article content associated with the entity to give us greater context when applying Machine Learning.

In [37]:
# Preprocess and prepare article data with persons data:
# build a lookup from article id to article text.
# NOTE: despite its name, `article_id` is a dict {id: content}; the name is
# kept because the cell that attaches article text reads from it.
article_id = {}
duplicate = 0
# Walk the two needed columns in step rather than materialising every row.
for a_id, content in zip(articles['id'], articles['content']):
    if a_id in article_id:
        # Repeated id: keep the content seen first and only count the repeat.
        duplicate = duplicate + 1
    else:
        article_id[a_id] = content
count = len(article_id)

print(count)
print(duplicate)
len(article_id)


100
0


100

In [38]:
# Show the first five PERSON rows before the article text is attached.
entity_people.head(5)

Unnamed: 0,id,article_id,model,name,type,entity_id,paragraph,sentence,SCORE
1,27303857,331842,english.all.3class.distsim.crf.ser,DANIELFISH,PERSON,85567,1,2,100.0
2,27303858,331842,english.all.3class.distsim.crf.ser,RODGERS,PERSON,70833,1,2,100.0
3,27303859,331842,english.all.3class.distsim.crf.ser,HAMMERSTEIN,PERSON,98182,1,2,100.0
6,27303863,331842,english.all.3class.distsim.crf.ser,DAMONDAUNNO,PERSON,546335,3,1,100.0
9,27303870,331842,english.all.3class.distsim.crf.ser,FISH,PERSON,85567,6,2,100.0


In [39]:
# Combine article data into the entity_people sample data.
print(type(entity_people))

# Vectorised lookup: each row's article_id is resolved through the
# `article_id` dict. `dict.get` is used so that an id without an article
# yields None, exactly as the former row-by-row fill did. `assign` returns a
# new frame, so nothing is written into a filtered slice.
entity_people = entity_people.assign(
    content=entity_people['article_id'].map(article_id.get)
)


<class 'pandas.core.frame.DataFrame'>


In [40]:
# Write the combined frame to CSV, leaving out the index column.
people_articles_path = r'/Users/Rong/Documents/USF/EntityResolution/Model1/people_articles.csv'
entity_people.to_csv(people_articles_path, index=False)

In [41]:
# Report (rows, columns) of the combined frame.
people_shape = entity_people.shape
print(people_shape)

(1222, 10)


In [43]:
# Read the combined people/article CSV back in and show its first row.
people_articles_csv = "/Users/Rong/Documents/USF/EntityResolution/Model1/people_articles.csv"
people_articles = pd.read_csv(people_articles_csv)

people_articles.head(1)


Unnamed: 0,id,article_id,model,name,type,entity_id,paragraph,sentence,SCORE,content
0,27303857,331842,english.all.3class.distsim.crf.ser,DANIELFISH,PERSON,85567,1,2,100.0,"How is it that the coolest new show on Broadway in 2019 is a 1943 musical usually regarded as a very square slice of American pie? The answer arrives before the first song is over in Daniel Fish's wide-awake, jolting and altogether wonderful production of ""Rodgers and Hammerstein's Oklahoma!,"" which opened on Sunday night at the Circle in the Square Theater.\n""Oh, What a Beautiful Mornin'"" is the title and the opening line of this familiar number, a paean to a land of promisingly blue skies and open spaces. But Curly, the cowboy who sings it, isn't cushioned by the expected lush orchestrations. Nor is the actor playing him your usual solid slab of beefcake with a strapping tenor.\nAs embodied by the excellent Damon Daunno, this lad of the prairies is wiry and wired, so full of unchanneled sexual energy you expect him to implode. There's the hint of a wobble in his cocky strut and voice.\nDoing his best to project a confidence he doesn't entirely feel, to the accompaniment of a down-home guitar, he seems so palpably young. As is often true of big boys with unsettled hormones, he also reads as just a little dangerous.\nHe's a lot like the feisty, ever-evolving nation he's so proud to belong to. That would be the United States of America, then and now.\nMaking his Broadway debut as a director, Mr. Fish has reconceived a work often seen as a byword for can-do optimism as a mirror for our age of doubt and anxiety. This is ""Oklahoma!"" for an era in which longstanding American legacies are being examined with newly skeptical eyes.\nSuch a metamorphosis has been realized with scarcely a changed word of Oscar Hammerstein II's original book and lyrics. This isn't an act of plunder, but of reclamation. And a cozy old friend starts to seem like a figure of disturbing -- and exciting -- depth and complexity.\nMr. 
Fish's version isn't the first ""Oklahoma!"" to elicit the shadows from within the play's sunshine. Trevor Nunn and Susan Stroman's interpretation for London's National Theater of nearly two decades ago, while more traditionally staged, also scaled up the disquieting erotic elements.\nBut this latest incarnation goes much further in digging to a core of fraught ambivalence. To do so, it strips ""Oklahoma!"" down to its skivvies, discarding the picturesque costumes and swirling orchestrations, and revealing a very human body that belongs to our conflicted present as much as it did to 1943 or to 1906, the year in which the show (based on Lynn Riggs's ""Green Grow the Lilacs"") takes place.\nLaura Jellinek's set suggests a small-town community center that might double as a polling station, decorated with festive banners, colored lights -- and a full arsenal of guns on the walls. It's made clear that we the audience are part of this community. The house lights stay on for much of the show, in a homogenizing brightness, that is occasionally and abruptly changed for pitch darkness. (Scott Zielinski is the first-rate lighting designer.)\nThere's chili cooking on the refectory tables onstage, for the audience's consumption at intermission. A seven-member hootenanny-style band sits in plain view. The well-known melodies they play have been reimagined -- by the brilliant orchestrator and arranger Daniel Kluger -- with the vernacular throb and straightforwardness of country and western ballads.\nThe cast members -- wearing a lot of good old, form-fitting denim (Terese Wadden did the costumes) -- are just plain folks. Singing with conversational ease, they occasionally flirt and joke with the audience seated on either side of the stage. We are all, it would appear, in this together.\nThough the cast has been whittled down to 11 speaking parts (and one dancer), the key characters are very much present. 
They include our scrapping leading lovers, Curly McLain and Laurey Williams (Rebecca Naomi Jones); their comic counterparts, Will Parker (James Davis) and Ado Annie (Ali Stroker); that bastion of homespun wisdom and stoicism, Aunt Eller (Mary Testa) and the womanizing peddler Ali Hakim (Will Brill).\nOh, I almost forgot poor old Jud Fry (Patrick Vaill), the slightly, well, weird handyman who's sweet, in a sour way, on Laurey. Everybody forgets Jud, or tries to. Not that this is possible, with Mr. Vaill lending a charismatic, hungry loneliness to the part that's guaranteed to haunt your nightmares.\nThese people -- in some cases nontraditionally yet always perfectly cast -- intersect much as they usually do in ""Oklahoma!"" They court and spark, fight and reunite. They also show off by picking up guitars and microphones and dancing like prairie bacchantes. (John Heginbotham did the spontaneous-feeling choreography.) They use household chores, like shucking corn, to memorably annotative effect.\nMs. Stroker's boy-crazy, country siren-voiced Ado Annie, who rides a wheelchair as if it were a prize bronco, and Mr. Davis's deliciously dumb Will emanate a blissful endorphin haze. Mr. Brill is a refreshingly unmannered Ali Hakim, and Ms. Testa is a splendid, wryly authoritative Aunt Eller.\nBut there's an abiding tension. This is especially evident in Ms. Jones's affectingly wary Laurey, who regards her very different suitors, Curly and Jud, with a confused combination of desire and terror.\nThat her fears are not misplaced becomes clear in an encounter in Jud's dank hovel of a home. Curly sings ""Pore Jud,"" in which he teasingly imagines his rival's funeral with an ominous breathiness.\nThe scene occurs in darkness, with a simulcast video in black and white of the two men face to face. And the lines between sex and violence, already blurred in this gun-toting universe, melt altogether.\nI first saw Mr. Fish's ""Oklahoma!"" at Bard College in 2015, and again at St. 
Ann's Warehouse in Brooklyn last year. It was an exciting work from the get-go, but it just keeps getting better. The performances are looser and bigger; they're Broadway-size now, with all the infectious exuberance you expect from a great musical.\nAt the same time, though, this production reminds us that such raw energy can be harnessed to different ends, for ill as well as for good. In the earlier versions, I had problems with its truly shocking conclusion -- the scene that takes the most liberties with the original. In its carefully retooled rendering, it's disturbing for all the right reasons.\nThe other significant change here involves the dream ballet, which in this version begins the second act and has been newly varied and paced. It is performed by one dancer (the exquisite Gabrielle Hamilton) with a shaved head and a glittering T-shirt that reads ""Dream Baby Dream.""\nWhat she does is a far cry from the same sequence as immortalized by Agnes de Mille, the show's legendary original choreographer. But on its own, radically reconceptualized terms, it achieves the same effect.\nAs she gallops, slithers and crawls the length of the stage, casting wondering and seductive glances at the front row, Ms. Hamilton comes to seem like undiluted id incarnate, a force that has always been rippling beneath the surface here.\nShe's as stimulating and frightening -- and as fresh -- as last night's fever dream. So is this astonishing show.\n"


## Applying Dedupe with additional fields

In [126]:
# Field definitions for Model 2: the entity name, three numeric fields and
# the full article text.
fields = [
    {"type": "Name", "field": "name"},
    {"type": "PositiveNumber", "field": "article_id"},
    {"type": "PositiveNumber", "field": "paragraph"},
    {"type": "PositiveNumber", "field": "sentence"},
    # Long free text, hence the Text type.
    {"type": "Text", "field": "content"},
]

# Our results for Model 2 

In [46]:
# Load the clustered output of Model 2 and show its first rows.
model2_results_csv = "/Users/Rong/Documents/USF/EntityResolution/Model1/persons/people_articles_results.csv"
deduped_people_with_articles = pd.read_csv(model2_results_csv)

deduped_people_with_articles.head()

Unnamed: 0,cluster_id,id,article_id,model,name,type,entity_id,paragraph,sentence,content
0,00285e55-b46b-4cf1-8cb6-5c4393cb34f5,45318247,424195,english.all.3class.distsim.crf.ser,Mrs Webster,PERSON,887499,12,1,"An autistic boy who had his leg amputated is battling for a secondary school place after ""falling through the cracks"" of education provision.\nDaniel Webster, 11, has missed school for two years due to illness and school refusal - also known as school phobia. \nTrina Webster said she had letters from mainstream and special schools saying they could not cater for his needs. \nDudley Council said it was committed to giving children ""the means to reach their full potential.""\nMrs Webster, of Halesowen, spoke to the BBC after seeing Newsnight's investigation into why 1,500 children with special educational needs and disabilities (Send) are without a school place in England.\nDaniel was six when doctors found he had a tumour in his hip and thigh, said Mrs Webster. \nHe was signed off from primary school in 2017 as medically unfit when he was given morphine and other drugs to manage the pain. \nIn 2018, he had to have his leg amputated. \n""He had the operation in March but he was doing so well by May we wanted to see if he was ready to go back to school, but it was too soon. He started 'school refusing',"" said Mrs Webster. \nSchool refusal is a recognised psychological condition in which children develop an inherent fear of attending school. Daniel also suffers with selective mutism, an anxiety disorder that affects the ability to speak and communicate.\nHe did not return to primary school but has been been receiving tuition from Cherry Tree Learning Centre, which helps children who have emotional and physical needs.\nNow 11, he is due to start secondary school in September, and Mrs Webster said she applied to several special needs schools.\n""Mainstream schools will be noisy and too much pressure for him,"" she said. \n""But they all turned us down. One said he would be isolated because there was no peer group for him. 
Another said his needs were too complex.""\nInstead, the Websters were told Daniel had a place at Earls High School in Dudley. Mrs Webster said the school had previously told her it did not have the facilities to cater for her son. \nThe family will now attend a tribunal in July at Dudley Council to try to find Daniel a suitable place at a special school. \n""I'm not sending him to a school where he's going to fail, where he's not going to cope,"" said Mrs Webster. \n""It's so frustrating. It's not just his physical needs, it's emotional too. We don't want him in a noisy, pressure-filled environment.""\nCouncillor Ruth Buttery, cabinet member for children and young people, said: ""Working in conjunction with health and education providers, we are determined to provide the best possible outcomes for pupils with Send or additional health needs and we have recently appointed two specialist education officers to support and advice parents and carers."""
1,00285e55-b46b-4cf1-8cb6-5c4393cb34f5,45318255,424195,english.all.3class.distsim.crf.ser,Mrs Webster,PERSON,887499,15,2,"An autistic boy who had his leg amputated is battling for a secondary school place after ""falling through the cracks"" of education provision.\nDaniel Webster, 11, has missed school for two years due to illness and school refusal - also known as school phobia. \nTrina Webster said she had letters from mainstream and special schools saying they could not cater for his needs. \nDudley Council said it was committed to giving children ""the means to reach their full potential.""\nMrs Webster, of Halesowen, spoke to the BBC after seeing Newsnight's investigation into why 1,500 children with special educational needs and disabilities (Send) are without a school place in England.\nDaniel was six when doctors found he had a tumour in his hip and thigh, said Mrs Webster. \nHe was signed off from primary school in 2017 as medically unfit when he was given morphine and other drugs to manage the pain. \nIn 2018, he had to have his leg amputated. \n""He had the operation in March but he was doing so well by May we wanted to see if he was ready to go back to school, but it was too soon. He started 'school refusing',"" said Mrs Webster. \nSchool refusal is a recognised psychological condition in which children develop an inherent fear of attending school. Daniel also suffers with selective mutism, an anxiety disorder that affects the ability to speak and communicate.\nHe did not return to primary school but has been been receiving tuition from Cherry Tree Learning Centre, which helps children who have emotional and physical needs.\nNow 11, he is due to start secondary school in September, and Mrs Webster said she applied to several special needs schools.\n""Mainstream schools will be noisy and too much pressure for him,"" she said. \n""But they all turned us down. One said he would be isolated because there was no peer group for him. 
Another said his needs were too complex.""\nInstead, the Websters were told Daniel had a place at Earls High School in Dudley. Mrs Webster said the school had previously told her it did not have the facilities to cater for her son. \nThe family will now attend a tribunal in July at Dudley Council to try to find Daniel a suitable place at a special school. \n""I'm not sending him to a school where he's going to fail, where he's not going to cope,"" said Mrs Webster. \n""It's so frustrating. It's not just his physical needs, it's emotional too. We don't want him in a noisy, pressure-filled environment.""\nCouncillor Ruth Buttery, cabinet member for children and young people, said: ""Working in conjunction with health and education providers, we are determined to provide the best possible outcomes for pupils with Send or additional health needs and we have recently appointed two specialist education officers to support and advice parents and carers."""
2,00285e55-b46b-4cf1-8cb6-5c4393cb34f5,45318233,424195,english.all.3class.distsim.crf.ser,Trina Webster,PERSON,887497,3,1,"An autistic boy who had his leg amputated is battling for a secondary school place after ""falling through the cracks"" of education provision.\nDaniel Webster, 11, has missed school for two years due to illness and school refusal - also known as school phobia. \nTrina Webster said she had letters from mainstream and special schools saying they could not cater for his needs. \nDudley Council said it was committed to giving children ""the means to reach their full potential.""\nMrs Webster, of Halesowen, spoke to the BBC after seeing Newsnight's investigation into why 1,500 children with special educational needs and disabilities (Send) are without a school place in England.\nDaniel was six when doctors found he had a tumour in his hip and thigh, said Mrs Webster. \nHe was signed off from primary school in 2017 as medically unfit when he was given morphine and other drugs to manage the pain. \nIn 2018, he had to have his leg amputated. \n""He had the operation in March but he was doing so well by May we wanted to see if he was ready to go back to school, but it was too soon. He started 'school refusing',"" said Mrs Webster. \nSchool refusal is a recognised psychological condition in which children develop an inherent fear of attending school. Daniel also suffers with selective mutism, an anxiety disorder that affects the ability to speak and communicate.\nHe did not return to primary school but has been been receiving tuition from Cherry Tree Learning Centre, which helps children who have emotional and physical needs.\nNow 11, he is due to start secondary school in September, and Mrs Webster said she applied to several special needs schools.\n""Mainstream schools will be noisy and too much pressure for him,"" she said. \n""But they all turned us down. One said he would be isolated because there was no peer group for him. 
Another said his needs were too complex.""\nInstead, the Websters were told Daniel had a place at Earls High School in Dudley. Mrs Webster said the school had previously told her it did not have the facilities to cater for her son. \nThe family will now attend a tribunal in July at Dudley Council to try to find Daniel a suitable place at a special school. \n""I'm not sending him to a school where he's going to fail, where he's not going to cope,"" said Mrs Webster. \n""It's so frustrating. It's not just his physical needs, it's emotional too. We don't want him in a noisy, pressure-filled environment.""\nCouncillor Ruth Buttery, cabinet member for children and young people, said: ""Working in conjunction with health and education providers, we are determined to provide the best possible outcomes for pupils with Send or additional health needs and we have recently appointed two specialist education officers to support and advice parents and carers."""
3,00285e55-b46b-4cf1-8cb6-5c4393cb34f5,45318231,424195,english.all.3class.distsim.crf.ser,Daniel Webster,PERSON,165485,2,1,"An autistic boy who had his leg amputated is battling for a secondary school place after ""falling through the cracks"" of education provision.\nDaniel Webster, 11, has missed school for two years due to illness and school refusal - also known as school phobia. \nTrina Webster said she had letters from mainstream and special schools saying they could not cater for his needs. \nDudley Council said it was committed to giving children ""the means to reach their full potential.""\nMrs Webster, of Halesowen, spoke to the BBC after seeing Newsnight's investigation into why 1,500 children with special educational needs and disabilities (Send) are without a school place in England.\nDaniel was six when doctors found he had a tumour in his hip and thigh, said Mrs Webster. \nHe was signed off from primary school in 2017 as medically unfit when he was given morphine and other drugs to manage the pain. \nIn 2018, he had to have his leg amputated. \n""He had the operation in March but he was doing so well by May we wanted to see if he was ready to go back to school, but it was too soon. He started 'school refusing',"" said Mrs Webster. \nSchool refusal is a recognised psychological condition in which children develop an inherent fear of attending school. Daniel also suffers with selective mutism, an anxiety disorder that affects the ability to speak and communicate.\nHe did not return to primary school but has been been receiving tuition from Cherry Tree Learning Centre, which helps children who have emotional and physical needs.\nNow 11, he is due to start secondary school in September, and Mrs Webster said she applied to several special needs schools.\n""Mainstream schools will be noisy and too much pressure for him,"" she said. \n""But they all turned us down. One said he would be isolated because there was no peer group for him. 
Another said his needs were too complex.""\nInstead, the Websters were told Daniel had a place at Earls High School in Dudley. Mrs Webster said the school had previously told her it did not have the facilities to cater for her son. \nThe family will now attend a tribunal in July at Dudley Council to try to find Daniel a suitable place at a special school. \n""I'm not sending him to a school where he's going to fail, where he's not going to cope,"" said Mrs Webster. \n""It's so frustrating. It's not just his physical needs, it's emotional too. We don't want him in a noisy, pressure-filled environment.""\nCouncillor Ruth Buttery, cabinet member for children and young people, said: ""Working in conjunction with health and education providers, we are determined to provide the best possible outcomes for pupils with Send or additional health needs and we have recently appointed two specialist education officers to support and advice parents and carers."""
4,01f2271f-ed0b-43cc-8868-2e925a734106,45301815,424087,english.all.3class.distsim.crf.ser,Trump,PERSON,167,1,1,"MSNBC's Chris Matthews has been one of the president's most unhinged critics, but even he couldn't resist praising Trump for refraining from a military strike on Iran, saying he's ""glad"" he didn't go through with it.\nPresident Trump revealed last week that he called off the strike against Iran following them shooting down a U.S. drone roughly ten minutes before the strike was set to launch after he learned that roughly 150 Iranians would have been killed.\n""If Trump had taken military action, I would have come down hard on him,"" Matthews told the panel Monday evening. ""I think it's good that he hesitated. I think hesitation before using military action and asking how many casualties there are going to be on the other side is a good question.""\nThe ""Hardball"" host invoked the lives lost during the Iraq War and how those that died ""weren't the bad guys.""\n""The fact he found out it was 150 or so people -- we went into Iraq, everybody said it was going to be quick and easy, it was going to be a slam dunk,"" Matthews elaborated.\n""We may have killed somewhere between 150,000 people and a million people, Iraqis, who were just there, they were just Iraqis, they weren't the bad guys. We only make that calculation now. It's time that -- I like the fact that the president asked up front to his NSC people: how many people will die if we take this step? I think it's good and I'm glad he did it.""\nThe Trump administration announced on Monday that the president approved ""hard-hitting"" sanctions on Iran, hoping to specifically target Supreme Leader Ayatollah Ali Khamenei and his associates."


In [47]:
# Find the number of names in each Model 2 cluster.
# Only the cluster_id / name columns are handed to clusterIdToCount
# (helper defined earlier in the notebook).
clusterIdWithName_Model2 = deduped_people_with_articles.loc[:, ['cluster_id', 'name']]
clusterToCount_Model2 = clusterIdToCount(clusterIdWithName_Model2)

print(clusterToCount_Model2.head(5))


                                      0   1
0  00285e55-b46b-4cf1-8cb6-5c4393cb34f5  4 
1  01f2271f-ed0b-43cc-8868-2e925a734106  74
2  024f034a-bb3c-4b22-8b9d-25e5609785f2  1 
3  02983e86-efa0-47b7-97f9-ed0b4a42bda6  1 
4  03952352-84e9-4c76-bee5-2e31d3652a95  1 


In [48]:
# Compare the number of distinct names with the number of distinct clusters
# in the Model 2 output.
all_names = deduped_people_with_articles['name']
clusters = deduped_people_with_articles['cluster_id']

# Map every distinct value to the position at which it first appears.
# dict.fromkeys keeps first-occurrence order and drops repeats, so the ids are
# 0, 1, 2, ... exactly as a manual "if not seen yet" loop would assign them,
# without leaving scratch counters or loop variables behind in the namespace.
names_dict = {value: position for position, value in enumerate(dict.fromkeys(all_names))}
cluster_dict = {value: position for position, value in enumerate(dict.fromkeys(clusters))}

print("Number of names", len(names_dict))
print("Number of clusters", len(cluster_dict))
    

Number of names 448
Number of clusters 305


## Model 2 Overall data: 

#### Number of names: 448

#### Number of clusters: 305


In [130]:
# Group the Model 2 output by cluster and pull out the rows of the cluster
# that the "Trump" mentions were assigned to.
grouped_clusterId_model2 = deduped_people_with_articles.groupby('cluster_id')
trump_cluster_id = '01f2271f-ed0b-43cc-8868-2e925a734106'
trump = grouped_clusterId_model2.get_group(trump_cluster_id)

# Compare Model 1 & Model 2

## True number of clusters = ?

In [49]:
def compareModels(name):
    """Return the names each model clustered together with `name`.

    Reads the notebook-level frames ``deduped_people`` (Model 1) and
    ``deduped_people_with_articles`` (Model 2). For each frame, the first row
    whose ``name`` column equals `name` is located, its ``cluster_id`` is
    read, and the ``name`` column of every row in that cluster is collected.
    If a name occurs in several clusters, only the cluster of its first
    occurrence is used.

    Parameters
    ----------
    name
        Value to look up in the ``name`` column of both frames.

    Returns
    -------
    tuple
        ``(model1, model2)``: the ``name`` Series of the matching cluster in
        Model 1 and in Model 2, in that order.

    Raises
    ------
    KeyError
        If `name` does not occur in one of the frames (raised by ``get_group``).
    """
    print("Compare values for: ", name)

    # Model 2
    # Rows whose name matches the input.
    grouped_name_model2 = deduped_people_with_articles.groupby('name')
    clusterId2 = grouped_name_model2.get_group(name)[['cluster_id', 'name']]

    # Keep the first match as a one-row frame so the printed id keeps its
    # tabular layout.
    model2Id = clusterId2[0:1]
    print("Model 2 Id:", model2Id)

    # Use the id that was just looked up. It used to be hard-coded to one
    # specific cluster, which made the result independent of `name`.
    grouped_clusterId_model2 = deduped_people_with_articles.groupby('cluster_id')
    model2 = grouped_clusterId_model2.get_group(model2Id['cluster_id'].iloc[0])['name']

    # Model 1
    grouped_name_model1 = deduped_people.groupby('name')
    clusterId1 = grouped_name_model1.get_group(name)[['cluster_id', 'name']]

    model1Id = clusterId1[0:1]
    print("Model 1 Id:", model1Id)

    grouped_clusterId_model1 = deduped_people.groupby('cluster_id')
    model1 = grouped_clusterId_model1.get_group(model1Id['cluster_id'].iloc[0])['name']

    return model1, model2
    

In [50]:
# Inspect the clusters that both models built around the mention "Trump".
name = 'Trump'
result1, result2 = compareModels(name)

# Report the cluster sizes, Model 1 first.
for label, members in (("Size of Model 1:", result1), ("Size of Model 2:", result2)):
    print(label, len(members))

# Only the Model 1 members are listed; uncomment to list Model 2 as well.
print(result1)
# print(result2)

Compare values for:  Trump
Model 2 Id:                              cluster_id   name
4  01f2271f-ed0b-43cc-8868-2e925a734106  Trump
Model 1 Id:                                cluster_id   name
373  63758edf-0cdc-4c6e-bc20-ec1823f54dfb  Trump
Size of Model 1: 56
Size of Model 2: 74
373    Trump       
374    Trump       
375    Trump       
376    Trump       
377    Trump       
378    Trump       
379    Trump       
380    Trump       
381    Trump       
382    Trump       
383    Trump       
384    Trump       
385    Trump       
386    Trump       
387    Trump       
388    Donald Trump
389    Trump       
390    Trump       
391    Trump       
392    Donald Trump
393    Trump       
394    Trump       
395    Trump       
396    Trump       
397    Trump       
398    Trump       
399    Trump       
400    Trump       
401    Trump       
402    Trump       
403    Donald Trump
404    Trump       
405    Trump       
406    Trump       
407    Trump       
408    Trump     

# Model 1 - Location data

In [51]:
# Field definitions shown for the Model 1 location run: "abs_name" is declared
# as a "Name" field with crf enabled and a log file under /tmp, and
# "abs_entity_id" as a "PositiveNumber" field.
# NOTE(review): presumably the matching configuration passed to the
# deduplication tool -- confirm; nothing in this notebook consumes the list,
# it is only displayed as the cell's output.
[
{
"crf": True,
"type": "Name",
"field": "abs_name",
"log file": "/tmp/name.csv"
},
{
"type": "PositiveNumber",
"field": "abs_entity_id"
}
]

[{'crf': True,
  'field': 'abs_name',
  'log file': '/tmp/name.csv',
  'type': 'Name'},
 {'field': 'abs_entity_id', 'type': 'PositiveNumber'}]

## Model 1 - Location Data Results

In [52]:
# Load the two Model 1 location CSVs (original sheet and clustered result,
# going by their file names).
location_original = pd.read_csv(
    "/Users/Rong/Documents/USF/EntityResolution/Model1/location/Sheet 1-location.csv"
)
location_clustered = pd.read_csv(
    "/Users/Rong/Documents/USF/EntityResolution/Model1/location/ClusteredLocationData.csv"
)

# Show (rows, columns) for each frame, original first.
for frame in (location_original, location_clustered):
    print(frame.shape)


(652, 8)
(652, 9)


In [53]:
# Find clusters

def findClusters(data_with_clusters):
    """Number the distinct items of an iterable in order of first appearance.

    Parameters
    ----------
    data_with_clusters
        Iterable of hashable values (the notebook passes a ``cluster_id``
        column).

    Returns
    -------
    dict
        Maps each distinct item to a 0-based integer: 0 for the first distinct
        item seen, 1 for the next, and so on. Its length is therefore the
        number of distinct items.
    """
    # dict.fromkeys drops repeats while keeping first-occurrence order.
    first_seen = dict.fromkeys(data_with_clusters)
    return {value: position for position, value in enumerate(first_seen)}
    
# cluster_dict_locations = {}
# location_clusters = location_clustered['cluster_id']

# total = 0
# for name in location_clusters:
#         if name not in cluster_dict_locations:
#             cluster_dict_locations[name] = total
#             total = total + 1

# print("# of clusters:", len(cluster_dict_locations))

# print(cluster_dict_locations)
# print(location_clusters)

In [54]:
# Index the distinct cluster ids of the clustered location data and report
# how many there are.
location_cluster_ids = location_clustered['cluster_id']
cluster_dict_locations = findClusters(location_cluster_ids)

print("# of clusters:", len(cluster_dict_locations))


# of clusters: 256


# Model 2 - Location data with Articles 

## Create dataset

In [55]:
def addArticlesToData(inputDataFrame):
    """Add a ``content`` column holding the article looked up for each row.

    Each row's ``article_id`` value is looked up with ``.get`` in the
    notebook-level ``article_id`` mapping (defined outside this cell;
    presumably article id -> article text -- confirm). Ids missing from the
    mapping yield a missing value (``None``) in ``content``.

    Parameters
    ----------
    inputDataFrame : pandas.DataFrame
        Frame with an ``article_id`` column. It is modified in place: a
        ``content`` column is created, or overwritten if already present.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying ``content``.
    """
    # Build the whole column in one pass and assign it by position. A per-row
    # ``.at[index, ...]`` write is label-based, so rows sharing an index label
    # would overwrite one another; it is also much slower than a single
    # column assignment.
    inputDataFrame['content'] = [
        article_id.get(a_id) for a_id in inputDataFrame['article_id']
    ]

    return inputDataFrame



In [56]:
# addArticlesToData writes the 'content' column into the frame it receives and
# returns that same frame, so location_original gains the column as well.
location_with_articles = addArticlesToData(location_original)

n_rows, n_cols = location_with_articles.shape
print((n_rows, n_cols))

(652, 9)


In [57]:
# Write the location rows, including their 'content' column, to CSV without
# the index column.
location_with_articles_path = r'/Users/Rong/Documents/USF/EntityResolution/Model1/location/location_with_articles.csv'
location_with_articles.to_csv(location_with_articles_path, index=False)

## Output results for Location data with Articles

In [58]:
def compareModels(name, deduped_model1, deduped_model2):
    """Return the names each model clustered together with `name`.

    For each frame, the first row whose ``name`` column equals `name` is
    located, its ``cluster_id`` is read, and the ``name`` column of every row
    in that cluster is collected. If a name occurs in several clusters, only
    the cluster of its first occurrence is used.

    Note: this definition replaces the one-argument ``compareModels(name)``
    defined earlier in the notebook once this cell has run.

    Parameters
    ----------
    name
        Value to look up in the ``name`` column of both frames.
    deduped_model1, deduped_model2 : pandas.DataFrame
        Clustered outputs of Model 1 and Model 2; each needs ``cluster_id``
        and ``name`` columns.

    Returns
    -------
    tuple
        ``(model1, model2)``: the ``name`` Series of the matching cluster in
        `deduped_model1` and in `deduped_model2`, in that order.

    Raises
    ------
    KeyError
        If `name` does not occur in one of the frames (raised by ``get_group``).
    """
    print("Compare values for: ", name)

    # Model 2
    # Rows whose name matches the input.
    grouped_name_model2 = deduped_model2.groupby('name')
    clusterId2 = grouped_name_model2.get_group(name)[['cluster_id', 'name']]

    # Keep the first match as a one-row frame so the printed id keeps its
    # tabular layout.
    model2Id = clusterId2[0:1]
    print("Model 2 Id:", model2Id)

    # Use the id that was just looked up. It used to be hard-coded to one
    # specific cluster, which ignored both `name` and the frames passed in.
    grouped_clusterId_model2 = deduped_model2.groupby('cluster_id')
    model2 = grouped_clusterId_model2.get_group(model2Id['cluster_id'].iloc[0])['name']

    # Model 1
    grouped_name_model1 = deduped_model1.groupby('name')
    clusterId1 = grouped_name_model1.get_group(name)[['cluster_id', 'name']]

    model1Id = clusterId1[0:1]
    print("Model 1 Id:", model1Id)

    grouped_clusterId_model1 = deduped_model1.groupby('cluster_id')
    model1 = grouped_clusterId_model1.get_group(model1Id['cluster_id'].iloc[0])['name']

    return model1, model2

# Model 1 - Organization data

In [59]:
# Field definitions shown for the Model 1 organization run: "abs_name" is
# declared as a "Name" field with crf enabled and a log file under /tmp, and
# "abs_article_id" as a "PositiveNumber" field.
# NOTE(review): presumably the matching configuration passed to the
# deduplication tool -- confirm; nothing in this notebook consumes the list,
# it is only displayed as the cell's output.
[{"crf": True, 
  "type": "Name", 
  "field": "abs_name", 
  "log file": "/tmp/name.csv"}, 
 {"type": "PositiveNumber", "field": "abs_article_id"}
]

[{'crf': True,
  'field': 'abs_name',
  'log file': '/tmp/name.csv',
  'type': 'Name'},
 {'field': 'abs_article_id', 'type': 'PositiveNumber'}]

## Model 1 - Output results Organization data

In [60]:
# Load the two Model 1 organization CSVs (original sheet and deduplicated
# output, going by their file names).
organization_original = pd.read_csv(
    "/Users/Rong/Documents/USF/EntityResolution/Model1/organizations/Sheet 1-organisation.csv"
)
organization_deduped = pd.read_csv(
    "/Users/Rong/Documents/USF/EntityResolution/Model1/organizations/OrgaData.csv"
)

# Index the distinct cluster ids of the deduplicated output and show how many
# there are.
organization_cluster_ids = organization_deduped['cluster_id']
cluster_dict_organization = findClusters(organization_cluster_ids)

print(len(cluster_dict_organization))

249


# Model 2 - Organization data with Articles

In [61]:
# addArticlesToData writes the 'content' column into organization_original
# itself and returns that same frame.
organization_with_articles = addArticlesToData(organization_original)

# Save the augmented rows to CSV without the index column.
organization_with_articles_path = r'/Users/Rong/Documents/USF/EntityResolution/Model1/organizations/organizations_with_articles.csv'
organization_with_articles.to_csv(organization_with_articles_path, index=False)

## Model 2 - Output results Organization data with Articles

In [62]:
# Load the clustered organization-with-articles CSV and display it as the
# cell's output.
org_with_articles_path = "/Users/Rong/Documents/USF/EntityResolution/Model1/organizations/OrgWithArticles.csv"
organization_with_articles_deduped = pd.read_csv(org_with_articles_path)

organization_with_articles_deduped

Unnamed: 0,cluster_id,id,article_id,model,name,type,entity_id,paragraph,sentence,content
0,01b0a8ee-8587-4442-b116-83e9b5ef2db2,45159555,423618,english.all.3class.distsim.crf.ser,FBI,ORGANIZATION,156,14,1,"A former University of Illinois doctoral student was convicted Monday of killing a visiting scholar from China after abducting her at a bus stop as she headed to sign an off-campus apartment lease.\nJurors deliberated less than 90 minutes at the federal death-penalty trial in Peoria, Ill.\nThe swift conviction was expected because Brendt Christensen's attorneys acknowledged from the start that he raped and stabbed Yingying Zhang in June 2017.\nProsecutors said he beat her to death with a baseball bat and decapitated her.\nJurors found Christensen guilty of kidnapping resulting in death, which carries a possible death sentence. Prosecutors are expected in the penalty phase to focus on Christensen's brutality, with the defense broaching mental health issues.\nThe judge has said there will be a break of a week or more before the penalty phase, a sort of mini-trial that could last several weeks.\nIllinois no longer has capital punishment, but he could be sentenced to death because he was convicted in federal court.\nThe federal death-penalty case is the first in Illinois since the state struck capital punishment from its books on grounds that death-penalty processes were too error-prone. Some Illinois anti-death penalty activists criticized what they said was the government's imposition of a death-penalty case on a non-death penalty state.\nThe defense began the trial with the rare admission that their client killed Zhang, but said they said they disagreed with prosecutors over how and why. The surprising strategy was a bid to start immediately trying to persuade jurors to spare Christensen's life.\nJurors heard evidence that Christensen boasted he killed 12 others before killing Zhang, starting when the Stevens Point, Wisconsin native was 19 and still living in Wisconsin. 
He began his studies in Champaign at the university's prestigious doctoral program in physics in 2013.\nHis lawyers said he made the claim about being a serial killer when he was drunk and that it was not true, but the FBI did not rule it out.\nChristensen, now 29, lured Zhang into his car posing as an undercover officer when she was running late to sign the apartment lease on June 9, 2017. The muscular Christensen forced the 5-foot-4 Zhang into his apartment in Urbana, Champaign's sister city 140 miles southwest of Chicago, where he raped and killed her.\nZhang was unlucky enough to be in the wrong place at the wrong time, prosecutors said, adding Christensen -- who had fantasized about killing -- determined to kill someone that day and had been cruising in his car looking for a victim. Earlier, he approached a different young woman posing as an officer, but she refused to get in the car.\nHe and his girlfriend, Terra Bullis, attended a vigil for Zhang on June 29, during which Bullis wore an FBI wire recording him detailing how he killed Zhang. As they left at night, she said she'd rather not call a ride-sharing service, telling him: ""My version of safer is walking at night with a serial killer."" He responds: ""Yeah. That's me.""\nChristensen was arrested on June 30, 2017, his birthday.\nChristensen sought help from mental-health counselors at the school for homicidal and suicidal thoughts in the months before Zhang vanished, according to his lawyers, who said his life was spinning out of control. In his first few semesters as a doctoral student, Christensen was making straight As but by late 2016, was getting Fs in all his classes.\nThere are more than 5,000 Chinese among the 45,000 students attending the University of Illinois in Champaign, one of the largest such enrollments in the nation.\nZhang had been in Illinois for just three months -- her only time living outside China. 
The daughter of working-class parents, she aspired to become a professor in crop sciences to help her family financially. Friends and family described her as caring and fun-loving.\nThe Associated Press contributed to this report. \n"
1,01b0a8ee-8587-4442-b116-83e9b5ef2db2,45295165,424082,english.all.3class.distsim.crf.ser,FBI,ORGANIZATION,156,6,2,"House Intelligence Committee Ranking Member Devin Nunes has called on British ex-spy Christopher Steele to be interviewed to answer key questions on the anti-Trump dossier.\nNunes, R-Calif., said Monday on ""Tucker Carlson Tonight"" that Steele also must be interviewed by U.S. Attorney for Connecticut John Durham, whom Attorney General William Barr selected in April to examine the origins of the Russia investigation.\n""The Clinton campaign and Democrats were paying Christopher Steele to do this,"" he said, noting how the Clinton campaign and DNC, through the opposition research firm Fusion GPS, funded the dossier's creation.\n""We were never able to interview Christopher Steele. I think he needs to be interviewed by the Department of Justice and the U.S. Attorney in Connecticut.""\nRegarding Steele's work, host Tucker Carlson pointed out that Nunes sent a letter to the FBI requesting information about a 2016 meeting between the ex-MI6 spy and State Department official Kathleen Kavalec.\nKavalec had met with Steele and documented his political motivations in writing -- particularly that Steele's client was ""keen"" to see his anti-Trump materials ""come to light"" prior to the election. Kavalec forwarded her written notes, in which she also pointed out that some of Steele's claims apparently were false, to a senior FBI executive.\nOn that front, Nunes said he would give the FBI until Friday before he sends a criminal referral to the Justice Department, remarking that the bureau's failure to respond would be ""clearly obstructing our investigation.""\nNunes added that the British government also should reach out to the U.S. to assist in the matter.\n""I think the Brits have a lot of answers to give to the United States,"" he said.\n""The bottom line is, the guy was being paid by the Democrats. To do what? 
To dirty up Donald Trump and make it look like he had ties to Russians. The only one who had ties to Russians would be the Democrats themselves because Christopher Steele was supposedly talking to Russians.""\n"
2,01b0a8ee-8587-4442-b116-83e9b5ef2db2,45295151,424082,english.all.3class.distsim.crf.ser,FBI,ORGANIZATION,156,5,1,"House Intelligence Committee Ranking Member Devin Nunes has called on British ex-spy Christopher Steele to be interviewed to answer key questions on the anti-Trump dossier.\nNunes, R-Calif., said Monday on ""Tucker Carlson Tonight"" that Steele also must be interviewed by U.S. Attorney for Connecticut John Durham, whom Attorney General William Barr selected in April to examine the origins of the Russia investigation.\n""The Clinton campaign and Democrats were paying Christopher Steele to do this,"" he said, noting how the Clinton campaign and DNC, through the opposition research firm Fusion GPS, funded the dossier's creation.\n""We were never able to interview Christopher Steele. I think he needs to be interviewed by the Department of Justice and the U.S. Attorney in Connecticut.""\nRegarding Steele's work, host Tucker Carlson pointed out that Nunes sent a letter to the FBI requesting information about a 2016 meeting between the ex-MI6 spy and State Department official Kathleen Kavalec.\nKavalec had met with Steele and documented his political motivations in writing -- particularly that Steele's client was ""keen"" to see his anti-Trump materials ""come to light"" prior to the election. Kavalec forwarded her written notes, in which she also pointed out that some of Steele's claims apparently were false, to a senior FBI executive.\nOn that front, Nunes said he would give the FBI until Friday before he sends a criminal referral to the Justice Department, remarking that the bureau's failure to respond would be ""clearly obstructing our investigation.""\nNunes added that the British government also should reach out to the U.S. to assist in the matter.\n""I think the Brits have a lot of answers to give to the United States,"" he said.\n""The bottom line is, the guy was being paid by the Democrats. To do what? 
To dirty up Donald Trump and make it look like he had ties to Russians. The only one who had ties to Russians would be the Democrats themselves because Christopher Steele was supposedly talking to Russians.""\n"
3,01b0a8ee-8587-4442-b116-83e9b5ef2db2,45114434,423453,english.all.3class.distsim.crf.ser,FBI,ORGANIZATION,156,3,1,"A top Facebook executive said there's no evidence that Russia influenced the Brexit referendum result, prompting a backlash from some British lawmakers and critics of the tech giant.\nNick Clegg, the former British deputy prime minister who currently serves as the social network's head of global policy and communications, said the company's internal investigations did not find proof that misuse of the social network impacted the Brexit vote -- unlike when the company conducted a similar inquiry for the U.S. presidential election.\n""We ran two full analyses of all the data we have in the run-up to the Brexit referendum, following exactly the same methodology as we did after the FBI notified Facebook of outside interference in the 2016 U.S. presidential election,"" Clegg told BBC News on Monday. ""We've shared all this information with the select committee and Westminster and elsewhere. We have found no evidence of a significant attempt by outside forces.""\nClegg also said the ongoing backlash against Silicon Valley also created ""the risk that we throw the baby out with the bathwater and make it almost impossible for tech to innovate properly ... Technology is not good or bad. Technology down the ages is used by good and bad people for good and bad ends.""\nCarole Cadwalladr, an investigative journalist whose work is credited with exposing the Cambridge Analytica data scandal, dismissed Clegg's statements and retweeted the statements of others who believe that Facebook did play a role in influencing the Brexit referendum.\n""So Facebook spins propaganda. BBC platforms it. And LeaveEU amplifies it. A great morning's work, all. Well done. 
I honestly don't know where to start with this,"" Cadwalladr, who has called for an independent Robert Mueller-style probe into Brexit and written extensively on this topic, wrote on Twitter.\nA Channel 4 investigation in May found that millionnaire Arron Banks spent hundreds of thousands of dollars to fund the ""lavish lifestyle"" of Nigel Farage, a chief advocate of Brexit in the United Kingdom. Researchers and public figures have also said the Menlo Park, Calif., company hasn't been forthcoming enough with its own data, so academics can determine whether Russia played a role in Brexit.\nDavid Lammy, a British lawmaker with the Labour Party, also questioned Clegg's comments.\n""Horse manure,"" Lammy wrote on Twitter. ""What about the disinformation spread by Russian state media, RT and Sputnik, on Facebook?""\nCollins chairs a U.K. parliamentary committee that has called for Facebook to be investigated by the country's privacy and competition regulators.\nFacebook had no further comment when contacted by Fox News on Monday.\n"
4,01b0a8ee-8587-4442-b116-83e9b5ef2db2,45286583,424079,english.all.3class.distsim.crf.ser,FBI,ORGANIZATION,156,6,2,"The federal prosecutor named by Attorney General William Barr to look into the origins of the Russia investigation will come to a substantive conclusion, according to Rep. Mark Meadows, R-N.C.\nConnecticut federal prosecutor John Durham is looking closely at the intelligence community's activity in late 2016 and early 2017, the House Freedom Caucus chairman claimed on ""Hannity.""\n""I can tell you that John Durham and Attorney General Barr are going to get to the bottom of it,"" he said.\n""They are including in part of their surveillance - really looking at the intelligence community to make sure that justice is brought.""\nRep. Jim Jordan, R-Ohio, who joined Meadows on ""Hannity,"" added he agreed with the North Carolina lawmaker and that a previous report by the Justice Department's inspector general was fruitful.\n""The attorney general of the United States and U.S. Attorney John Durham are doing an investigation,"" he said. ""They have told us this is broader than just the FBI - they're going to look at all of this.\n""First, Horowitz is going to come out. His report a year ago was very good,"" Jordan said, referring to inspector general Michael E. Horowitz.\n""Then we will see where Mr. Durham and Mr. Barr where their investigation and what they come back with.""\nAs part of its ongoing ""multifaceted"" and ""broad"" review into potential misconduct by U.S. 
intelligence agencies during the 2016 presidential campaign, the Justice Department revealed Monday it is also investigating the activities of several ""non-governmental organizations and individuals.""\nAdditionally, the DOJ announced that the probe, led by Durham, was looking into the involvement of ""foreign intelligence services.""\nFormer Trump aide George Papadopoulos told Fox News last month that an informant who was likely ""CIA and affiliated with Turkish intel"" had posed as a Cambridge University research assistant in September 2016 and tried to ""seduce him"" to obtain information linking the Trump team to Russia.\nThe information was contained in a letter to House Judiciary Committee Chairman Jerry Nadler, D-N.Y., who had inquired as to the scope of Durham's investigation.\nThe letter could indicate that the DOJ is looking closely at work done during the campaign by Fusion GPS, the firm retained by the Hillary Clinton campaign and Democratic National Committee (DNC) to conduct opposition research against the Trump campaign.\nFox News' Gregg Re contributed to this report.\n"
5,01b0a8ee-8587-4442-b116-83e9b5ef2db2,45233277,423770,english.all.3class.distsim.crf.ser,FBI,ORGANIZATION,156,14,1,"The Dominican public health minister said in a radio interview Monday that the Maryland couple found dead last month in a hotel room at a luxury resort had pre-existing health problems and may have died after mixing prescription drugs and alcohol.\nIn response, the attorney for the couple's families called the minister's claim ""absolutely ludicrous.""\nPublic Health Minister Rafael Sanchez Cardenas noted more than once during the interview that the couple's May 30 deaths were extraordinary, given that they died almost ""simultaneously"" and that there was no sign of violence or foul play.\n""The toxicological test will determine if there's alcohol, and narcotics and the amount,"" Sanchez Cardenas said. ""There's been talk about drug abuse,"" he added, noting it could be lethal if combined with alcohol. He added that ""they had pre-existing health issues.""\nSanchez Cardenas said the bottom line was that there was nothing nefarious about the rash of the U.S. tourist deaths dogging the country since first making headlines in the spring.\n""Each one of these involved pre-existing health issues leading up to their deaths,"" he said. ""Every one of these cases can be explained. The autopsies show what happened.""\nIn a text message sent to Fox News, Steven Bullock, the attorney representing the Maryland couple's families, called the public health minister's remarks about the two tourists ""absolutely ludicrous."" \nEdward Holmes, 63, and Cynthia Day, 49, who were engaged, were found unresponsive in their room at the Grand Bahia Principe La Romana by a resort employee who went to check on them after they failed to check out. 
The couple, who had been at the resort since May 25, had plans to return to the United States the day they turned up dead.\nSeveral medications were found in the room, including an anti-inflammatory drug, an opioid and blood-pressure medicine, Dominican officials said at a news conference last Friday. Autopsies for many of the tourists showed pulmonary edema, an accumulation of fluid in the lungs frequently triggered by heart disease. Among nearly a dozen U.S. tourists who have died in the country in the last 18 months, Dominican investigators said most died of a heart attack.\nAutopsies for Day and Holmes showed they had enlarged hearts, internal bleeding and pulmonary edema. Day was said to have fluid in the brain. On the radio show, the health minister said that Holmes was ""morbidly obese.""\nFBI officials have been conducting toxicological tests in their Virginia research center on blood samples from the couple, as well as from a Pennsylvania woman, Miranda Schaup-Werner, who died at the same resort complex five days before. Schaup-Werner's relatives said she collapsed after she had a drink from the minibar. Her autopsy stated she had a heart attack.\nThe news of Holmes' and Day's deaths, made public by their families when they went to the media with concerns about investigators' preliminary determination that the two died of natural causes, prompted friends and relatives of other U.S. tourists who died in other Dominican resorts to come forward, sharing their suspicions and bewilderment about what killed their loved ones.\nMany of the families described their deceased relatives as having been in generally good health right before traveling to the Dominican Republic. They have expressed outrage over what they saw as a concerted effort by Dominican officials to pin the deaths on the people who died. 
Some families have been arranging for their own autopsies and toxicological tests in the U.S.\nThe FBI told Fox News last week that the bureau sent a team to the Dominican Republic to help investigate the deaths.\nAmong the questions that relatives and some U.S. public health and epidemiological experts have raised: whether at least some of the deaths might have been caused by counterfeit alcohol or by pesticides or insect-killer chemicals that somehow wound up on drinking glasses or utensils.\nAs Sanchez Cardenas was wrapping up the radio interview Monday morning, friends and family of Cynthia Day were gathering at the First Baptist Church of Glenarden in Maryland for her memorial service. Holmes' funeral has been scheduled for Wednesday.\nDominican officials often have emphasized the presence of several prescription medications in the room where Day and Holmes were staying, but only in the last week started saying outright that the meds seemed to have played a role in their deaths.\nIn the morning radio interview, Sanchez Cardenas said the couple had a practical ""pharmacy"" in their room. Last week, Carlos Suero, the spokesman for the Ministry of Public Health, told Fox News in a wide-ranging phone interview that the coverage of the rash of deaths in the popular Caribbean vacation spot as ""mysterious"" was nothing but fake news. 
Suero said that in a competitive industry such as tourism, there were people who would try to undermine a top destination such as the Dominican Republic.\nSuero laid out many of the arguments and scenarios that Sanchez Cardenas underscored in the radio interview, speaking about the health issues that some of the tourists allegedly had, and saying that perhaps it wasn't responsible to travel with such health problems.\nSuero told Fox News that Holmes died first and Cox died afterward, saying that the shock of seeing Holmes dead next to her could have killed her.\nAt the funeral service, Bullock told reporters he did not buy the account by Dominican investigators about the cause and circumstances of the deaths of Day and Holmes.\n""It's a mystery,"" Bullock told WTOP. ""There's reason for us to pause, and we're going to investigate this and get this matter resolved.""\nBullock said he was focused on getting more information from U.S. officials and results of toxicology tests before jumping to conclusions.\n""We need to find out what's going on and what happened,"" said Meshonn Madison, Day's friend.\nSanchez Cardenas also singled out the deaths of Joseph Allen, 55, of New Jersey, and Leyla Cox, 53, of New York. He said Allen had unhealthful habits such as smoking and drinking regularly, and a report on his death referred to him as ""a ticking timebomb,"" adding that ""his organs were practically destroyed, with a biological age of more than 80 years old. He was extremely obese, weighing more than 400 pounds.""\nAllen's family has disputed those conclusions.\nThe health minister said Cox's autopsy showed she previously had suffered several heart attacks. 
However, her family and her former supervisor at the New York hospital where she worked as an MRI technician said Cox had never suffered a heart attack.\nHer son, Will Cox, lashed out at Dominican investigators as untrustworthy and said they repeated had put up roadblocks when he tried to get answers and that they tried to rush him into letting them cremate or embalm her. With the intervention of the U.S. Embassy and congressional lawmakers, Cox succeeded in getting Dominican officials to agree to send a vial of his mother's blood to the U.S., where the hospital where she worked would run toxicology tests.\n""We're not talking about a patient who had no medical conditions,"" the health minister said of Leyla Cox in the radio interview. ""A person with hypertension is vulnerable to a heart attack.""\n"
6,01b0a8ee-8587-4442-b116-83e9b5ef2db2,45159528,423618,english.all.3class.distsim.crf.ser,FBI,ORGANIZATION,156,11,1,"A former University of Illinois doctoral student was convicted Monday of killing a visiting scholar from China after abducting her at a bus stop as she headed to sign an off-campus apartment lease.\nJurors deliberated less than 90 minutes at the federal death-penalty trial in Peoria, Ill.\nThe swift conviction was expected because Brendt Christensen's attorneys acknowledged from the start that he raped and stabbed Yingying Zhang in June 2017.\nProsecutors said he beat her to death with a baseball bat and decapitated her.\nJurors found Christensen guilty of kidnapping resulting in death, which carries a possible death sentence. Prosecutors are expected in the penalty phase to focus on Christensen's brutality, with the defense broaching mental health issues.\nThe judge has said there will be a break of a week or more before the penalty phase, a sort of mini-trial that could last several weeks.\nIllinois no longer has capital punishment, but he could be sentenced to death because he was convicted in federal court.\nThe federal death-penalty case is the first in Illinois since the state struck capital punishment from its books on grounds that death-penalty processes were too error-prone. Some Illinois anti-death penalty activists criticized what they said was the government's imposition of a death-penalty case on a non-death penalty state.\nThe defense began the trial with the rare admission that their client killed Zhang, but said they said they disagreed with prosecutors over how and why. The surprising strategy was a bid to start immediately trying to persuade jurors to spare Christensen's life.\nJurors heard evidence that Christensen boasted he killed 12 others before killing Zhang, starting when the Stevens Point, Wisconsin native was 19 and still living in Wisconsin. 
He began his studies in Champaign at the university's prestigious doctoral program in physics in 2013.\nHis lawyers said he made the claim about being a serial killer when he was drunk and that it was not true, but the FBI did not rule it out.\nChristensen, now 29, lured Zhang into his car posing as an undercover officer when she was running late to sign the apartment lease on June 9, 2017. The muscular Christensen forced the 5-foot-4 Zhang into his apartment in Urbana, Champaign's sister city 140 miles southwest of Chicago, where he raped and killed her.\nZhang was unlucky enough to be in the wrong place at the wrong time, prosecutors said, adding Christensen -- who had fantasized about killing -- determined to kill someone that day and had been cruising in his car looking for a victim. Earlier, he approached a different young woman posing as an officer, but she refused to get in the car.\nHe and his girlfriend, Terra Bullis, attended a vigil for Zhang on June 29, during which Bullis wore an FBI wire recording him detailing how he killed Zhang. As they left at night, she said she'd rather not call a ride-sharing service, telling him: ""My version of safer is walking at night with a serial killer."" He responds: ""Yeah. That's me.""\nChristensen was arrested on June 30, 2017, his birthday.\nChristensen sought help from mental-health counselors at the school for homicidal and suicidal thoughts in the months before Zhang vanished, according to his lawyers, who said his life was spinning out of control. In his first few semesters as a doctoral student, Christensen was making straight As but by late 2016, was getting Fs in all his classes.\nThere are more than 5,000 Chinese among the 45,000 students attending the University of Illinois in Champaign, one of the largest such enrollments in the nation.\nZhang had been in Illinois for just three months -- her only time living outside China. 
The daughter of working-class parents, she aspired to become a professor in crop sciences to help her family financially. Friends and family described her as caring and fun-loving.\nThe Associated Press contributed to this report. \n"
7,01b0a8ee-8587-4442-b116-83e9b5ef2db2,45233261,423770,english.all.3class.distsim.crf.ser,FBI,ORGANIZATION,156,11,1,"The Dominican public health minister said in a radio interview Monday that the Maryland couple found dead last month in a hotel room at a luxury resort had pre-existing health problems and may have died after mixing prescription drugs and alcohol.\nIn response, the attorney for the couple's families called the minister's claim ""absolutely ludicrous.""\nPublic Health Minister Rafael Sanchez Cardenas noted more than once during the interview that the couple's May 30 deaths were extraordinary, given that they died almost ""simultaneously"" and that there was no sign of violence or foul play.\n""The toxicological test will determine if there's alcohol, and narcotics and the amount,"" Sanchez Cardenas said. ""There's been talk about drug abuse,"" he added, noting it could be lethal if combined with alcohol. He added that ""they had pre-existing health issues.""\nSanchez Cardenas said the bottom line was that there was nothing nefarious about the rash of the U.S. tourist deaths dogging the country since first making headlines in the spring.\n""Each one of these involved pre-existing health issues leading up to their deaths,"" he said. ""Every one of these cases can be explained. The autopsies show what happened.""\nIn a text message sent to Fox News, Steven Bullock, the attorney representing the Maryland couple's families, called the public health minister's remarks about the two tourists ""absolutely ludicrous."" \nEdward Holmes, 63, and Cynthia Day, 49, who were engaged, were found unresponsive in their room at the Grand Bahia Principe La Romana by a resort employee who went to check on them after they failed to check out. 
The couple, who had been at the resort since May 25, had plans to return to the United States the day they turned up dead.\nSeveral medications were found in the room, including an anti-inflammatory drug, an opioid and blood-pressure medicine, Dominican officials said at a news conference last Friday. Autopsies for many of the tourists showed pulmonary edema, an accumulation of fluid in the lungs frequently triggered by heart disease. Among nearly a dozen U.S. tourists who have died in the country in the last 18 months, Dominican investigators said most died of a heart attack.\nAutopsies for Day and Holmes showed they had enlarged hearts, internal bleeding and pulmonary edema. Day was said to have fluid in the brain. On the radio show, the health minister said that Holmes was ""morbidly obese.""\nFBI officials have been conducting toxicological tests in their Virginia research center on blood samples from the couple, as well as from a Pennsylvania woman, Miranda Schaup-Werner, who died at the same resort complex five days before. Schaup-Werner's relatives said she collapsed after she had a drink from the minibar. Her autopsy stated she had a heart attack.\nThe news of Holmes' and Day's deaths, made public by their families when they went to the media with concerns about investigators' preliminary determination that the two died of natural causes, prompted friends and relatives of other U.S. tourists who died in other Dominican resorts to come forward, sharing their suspicions and bewilderment about what killed their loved ones.\nMany of the families described their deceased relatives as having been in generally good health right before traveling to the Dominican Republic. They have expressed outrage over what they saw as a concerted effort by Dominican officials to pin the deaths on the people who died. 
Some families have been arranging for their own autopsies and toxicological tests in the U.S.\nThe FBI told Fox News last week that the bureau sent a team to the Dominican Republic to help investigate the deaths.\nAmong the questions that relatives and some U.S. public health and epidemiological experts have raised: whether at least some of the deaths might have been caused by counterfeit alcohol or by pesticides or insect-killer chemicals that somehow wound up on drinking glasses or utensils.\nAs Sanchez Cardenas was wrapping up the radio interview Monday morning, friends and family of Cynthia Day were gathering at the First Baptist Church of Glenarden in Maryland for her memorial service. Holmes' funeral has been scheduled for Wednesday.\nDominican officials often have emphasized the presence of several prescription medications in the room where Day and Holmes were staying, but only in the last week started saying outright that the meds seemed to have played a role in their deaths.\nIn the morning radio interview, Sanchez Cardenas said the couple had a practical ""pharmacy"" in their room. Last week, Carlos Suero, the spokesman for the Ministry of Public Health, told Fox News in a wide-ranging phone interview that the coverage of the rash of deaths in the popular Caribbean vacation spot as ""mysterious"" was nothing but fake news. 
Suero said that in a competitive industry such as tourism, there were people who would try to undermine a top destination such as the Dominican Republic.\nSuero laid out many of the arguments and scenarios that Sanchez Cardenas underscored in the radio interview, speaking about the health issues that some of the tourists allegedly had, and saying that perhaps it wasn't responsible to travel with such health problems.\nSuero told Fox News that Holmes died first and Cox died afterward, saying that the shock of seeing Holmes dead next to her could have killed her.\nAt the funeral service, Bullock told reporters he did not buy the account by Dominican investigators about the cause and circumstances of the deaths of Day and Holmes.\n""It's a mystery,"" Bullock told WTOP. ""There's reason for us to pause, and we're going to investigate this and get this matter resolved.""\nBullock said he was focused on getting more information from U.S. officials and results of toxicology tests before jumping to conclusions.\n""We need to find out what's going on and what happened,"" said Meshonn Madison, Day's friend.\nSanchez Cardenas also singled out the deaths of Joseph Allen, 55, of New Jersey, and Leyla Cox, 53, of New York. He said Allen had unhealthful habits such as smoking and drinking regularly, and a report on his death referred to him as ""a ticking timebomb,"" adding that ""his organs were practically destroyed, with a biological age of more than 80 years old. He was extremely obese, weighing more than 400 pounds.""\nAllen's family has disputed those conclusions.\nThe health minister said Cox's autopsy showed she previously had suffered several heart attacks. 
However, her family and her former supervisor at the New York hospital where she worked as an MRI technician said Cox had never suffered a heart attack.\nHer son, Will Cox, lashed out at Dominican investigators as untrustworthy and said they repeated had put up roadblocks when he tried to get answers and that they tried to rush him into letting them cremate or embalm her. With the intervention of the U.S. Embassy and congressional lawmakers, Cox succeeded in getting Dominican officials to agree to send a vial of his mother's blood to the U.S., where the hospital where she worked would run toxicology tests.\n""We're not talking about a patient who had no medical conditions,"" the health minister said of Leyla Cox in the radio interview. ""A person with hypertension is vulnerable to a heart attack.""\n"
8,01b0a8ee-8587-4442-b116-83e9b5ef2db2,45300590,424086,english.all.3class.distsim.crf.ser,FBI,ORGANIZATION,156,8,2,"President Trump on Monday refused to say whether he had confidence in FBI Director Christopher Wray, while acknowledging the two officials have disagreed on some key issues, including whether the president's campaign was a victim of spying.\nIn an interview, Trump was quizzed on his level of confidence in the FBI boss.\n""Well, we'll see how it turns out,"" he told The Hill, before discussing Wray's previous claim that he would not use the word ""spying"" to describe the bureau's surveillance of figures linked to the Trump campaign in 2016.\n""I mean, I disagree with him on that and I think a lot of people are disagreeing. You may even disagree with him on that.""\nThe comments weren't the first the president has made against Wray, with Trump also taking a swipe at his own FBI director last month.\n""'The FBI has no leadership,'"" Trump quoted Judicial Watch president Tom Fitton as saying. ""'The Director is protecting the same gang that tried to overthrow the President through an illegal coup.' (Recommended by previous DOJ) @TomFitton @JudicialWatch.""\nWhen asked earlier this year if he believed the Trump campaign was spied on in 2016, Wray told lawmakers on the Senate Appropriations Committee: ""That's not the term I would use.\n""Lots of people have different colloquial phrases. I believe that the FBI is engaged in investigative activity, and part of investigative activity includes surveillance activity of different shapes and sizes, and to me the key question is making sure that it's done by the book, consistent with our lawful authorities.""\n"
9,01b0a8ee-8587-4442-b116-83e9b5ef2db2,45300578,424086,english.all.3class.distsim.crf.ser,FBI,ORGANIZATION,156,6,1,"President Trump on Monday refused to say whether he had confidence in FBI Director Christopher Wray, while acknowledging the two officials have disagreed on some key issues, including whether the president's campaign was a victim of spying.\nIn an interview, Trump was quizzed on his level of confidence in the FBI boss.\n""Well, we'll see how it turns out,"" he told The Hill, before discussing Wray's previous claim that he would not use the word ""spying"" to describe the bureau's surveillance of figures linked to the Trump campaign in 2016.\n""I mean, I disagree with him on that and I think a lot of people are disagreeing. You may even disagree with him on that.""\nThe comments weren't the first the president has made against Wray, with Trump also taking a swipe at his own FBI director last month.\n""'The FBI has no leadership,'"" Trump quoted Judicial Watch president Tom Fitton as saying. ""'The Director is protecting the same gang that tried to overthrow the President through an illegal coup.' (Recommended by previous DOJ) @TomFitton @JudicialWatch.""\nWhen asked earlier this year if he believed the Trump campaign was spied on in 2016, Wray told lawmakers on the Senate Appropriations Committee: ""That's not the term I would use.\n""Lots of people have different colloquial phrases. I believe that the FBI is engaged in investigative activity, and part of investigative activity includes surveillance activity of different shapes and sizes, and to me the key question is making sure that it's done by the book, consistent with our lawful authorities.""\n"


# Model 2 - Applying Probable People in preprocessing with Articles

- People 
- Location 
- Organization

probablepeople is a Python library for parsing unstructured Western name strings into components, using conditional random fields.

This parser is even able to handle couples and company names, since they're often mixed with person names in real world datasets.

probablepeople has the following labels for parsing names & companies:

- PrefixMarital
- PrefixOther
- GivenName
- FirstInitial
- MiddleName
- MiddleInitial
- Surname
- LastInitial
- SuffixGenerational
- SuffixOther
- Nickname
- And
- CorporationName
- CorporationNameOrganization
- CorporationLegalType
- CorporationNamePossessiveOf
- ShortForm
- ProxyFor
- AKA

- https://probablepeople.readthedocs.io/en/latest/
- https://datamade.us/blog/parse-name-or-parse-anything-really/

What this can do: Using a probabilistic model, it makes (very educated) guesses in identifying name or corporation components, even in tricky cases where rule-based parsers typically break down.


## Idea 
Breaking our names up into multiple tokens during preprocessing may increase the accuracy of our results when working with the dedupe library.

In [64]:
import probablepeople as pp

In [65]:
# For every name in the list 
# split into first name and last name 

# create firstname and lastname columns in the dataframe 
# add all values into the dataframe 

# Run the new data in dedupe 

In [66]:
def probableParser(inputDataFrame):
    """Tag every name string with probablepeople and collect the results.

    Iterates over ``inputDataFrame`` (the notebook passes the ``name`` column
    of a DataFrame) and calls ``pp.tag`` on each element. Unlike a plain
    token-by-token parse, ``tag`` tries to be a little smarter: it merges
    consecutive components, strips commas and returns string values.

    Returns:
        A list holding one ``pp.tag`` result per input element, in input order.
    """
    return [pp.tag(name) for name in inputDataFrame]
    

## People data

In [79]:
# Parse every person name into probablepeople components.
names = new_entity_people['name']

print(len(names))
split_names = probableParser(names)

# Preview the first few parses. Slicing (rather than indexing 0..9 directly)
# avoids an IndexError when fewer than 10 names were parsed.
for i, parsed in enumerate(split_names[:10]):
    print(i, parsed)

# Show the container type of a single parse result, if there is one.
if split_names:
    print(type(split_names[0]))

995
0 (OrderedDict([('PrefixMarital', 'Mrs'), ('Surname', 'Webster')]), 'Person')
1 (OrderedDict([('PrefixMarital', 'Mrs'), ('Surname', 'Webster')]), 'Person')
2 (OrderedDict([('GivenName', 'Trina'), ('Surname', 'Webster')]), 'Person')
3 (OrderedDict([('GivenName', 'Daniel'), ('Surname', 'Webster')]), 'Person')
4 (OrderedDict([('Surname', 'Trump')]), 'Person')
5 (OrderedDict([('GivenName', 'Melania'), ('Surname', 'Trump')]), 'Person')
6 (OrderedDict([('Surname', 'Trump')]), 'Person')
7 (OrderedDict([('Surname', 'Trump')]), 'Person')
8 (OrderedDict([('Surname', 'Trump')]), 'Person')
9 (OrderedDict([('Surname', 'Trump')]), 'Person')
<class 'tuple'>


In [121]:
# Reload the people results and add empty columns that will receive the
# parsed given name and surname.
new_entity_people = pd.read_csv("/Users/Rong/Documents/USF/EntityResolution/Model1/persons/people_articles_results.csv")
new_entity_people['firstname'] = ''
new_entity_people['lastname'] = ''

# The parsed results are matched to the rows by position, so the frame and
# split_names must have the same length. Fail loudly on a mismatch instead of
# silently printing nothing.
assert len(new_entity_people) == len(split_names), (
    "row count of the reloaded frame does not match the number of parsed names"
)
print("true")

# Spot-check two parses: a bare surname and a given name + surname.
print(split_names[4])
print(split_names[5])

true
(OrderedDict([('Surname', 'Trump')]), 'Person')
(OrderedDict([('GivenName', 'Melania'), ('Surname', 'Trump')]), 'Person')


In [122]:
# Copy the parsed given name and surname of every row into the
# 'firstname' / 'lastname' columns of new_entity_people.
givenName = 'GivenName'
surName = 'Surname'

# Each entry of split_names is indexed as parsed[0] to reach the mapping from
# probablepeople label to matched text. A row whose parse lacks a label gets ''.
parsed_components = [parsed[0] for parsed in split_names]

# Whole-column assignment replaces the manual while-loop with per-cell .at
# writes: pandas raises if the number of values differs from the number of
# rows, so a length mismatch can no longer go unnoticed.
new_entity_people['firstname'] = [components.get(givenName, '') for components in parsed_components]
new_entity_people['lastname'] = [components.get(surName, '') for components in parsed_components]


2
3


In [130]:
# Show the original name next to its parsed parts, then write the enriched
# frame to disk (without the index column).
name_columns = ['name', 'firstname', 'lastname']
print(new_entity_people[name_columns])

new_entity_people.to_csv(
    r'/Users/Rong/Documents/USF/EntityResolution/Model1/persons/persons_probablepeople.csv',
    index=False,
)

                        name      firstname       lastname
0    Mrs Webster                             Webster      
1    Mrs Webster                             Webster      
2    Trina Webster            Trina          Webster      
3    Daniel Webster           Daniel         Webster      
4    Trump                                   Trump        
5    Melania Trump            Melania        Trump        
6    Trump                                   Trump        
7    Trump                                   Trump        
8    Trump                                   Trump        
9    Trump                                   Trump        
10   Trump                                   Trump        
11   Trump                                   Trump        
12   Trump                                   Trump        
13   Trump                                   Trump        
14   Trump                                   Trump        
15   Trump                                   Trump      

## Run model

In [133]:
# Field definitions shown for the "Run model" step: two Name fields
# (abs_firstname, abs_lastname), one Text field (abs_content) and two
# PositiveNumber fields (abs_paragraph, abs_article_id).
# NOTE(review): this list is a bare expression, so the cell only displays it;
# it is not assigned to a variable or passed to anything in this cell.
# NOTE(review): the columns created earlier in this section are named
# 'firstname' / 'lastname'; confirm where the 'abs_' prefix used below is applied.
[
{
"crf": True,
"type": "Name",
"field": "abs_firstname",
"log file": "/tmp/name.csv",
"has_missing": True
},
{
# Both Name fields point at the same "log file" path.
"crf": True,
"type": "Name",
"field": "abs_lastname",
"log file": "/tmp/name.csv",
"has_missing": True
},
{
# Text field; the corpus is given as an empty list here.
"type": "Text",
"field": "abs_content",
"corpus": []
},
{
"type": "PositiveNumber",
"field": "abs_paragraph"
},
{
"type": "PositiveNumber",
"field": "abs_article_id"
}
]

[{'crf': True,
  'field': 'abs_firstname',
  'has_missing': True,
  'log file': '/tmp/name.csv',
  'type': 'Name'},
 {'crf': True,
  'field': 'abs_lastname',
  'has_missing': True,
  'log file': '/tmp/name.csv',
  'type': 'Name'},
 {'corpus': [], 'field': 'abs_content', 'type': 'Text'},
 {'field': 'abs_paragraph', 'type': 'PositiveNumber'},
 {'field': 'abs_article_id', 'type': 'PositiveNumber'}]

In [None]:
# Load a people results CSV into probable_people_deduped.
# NOTE(review): this path has no "persons/" directory, unlike the
# people_articles_results.csv read earlier in this section; confirm it points
# at the intended file.
probable_people_deduped = pd.read_csv(
    "/Users/Rong/Documents/USF/EntityResolution/Model1/people_articles_results.csv"
)




## Location data

In [150]:
# Run the same probablepeople tagging over the location names.
locations = location_original['name']

split_locations = probableParser(locations)

# Preview up to the first 10 parses; slicing avoids an IndexError when fewer
# than 10 locations are present.
for parsed in split_locations[:10]:
    print(parsed)

(OrderedDict([('Surname', 'Broadway')]), 'Person')
(OrderedDict([('Surname', 'Oklahoma')]), 'Person')
(OrderedDict([('CorporationName', 'Square Theater')]), 'Corporation')
(OrderedDict([('CorporationName', 'United States of America')]), 'Corporation')
(OrderedDict([('Surname', 'Broadway')]), 'Person')
(OrderedDict([('Surname', 'Oklahoma')]), 'Person')
(OrderedDict([('Surname', 'London')]), 'Person')
(OrderedDict([('Surname', 'Oklahoma')]), 'Person')
(OrderedDict([('Surname', 'Laurey')]), 'Person')
(OrderedDict([('Surname', 'Oklahoma')]), 'Person')


## Organization data

In [151]:
# Run the same probablepeople tagging over the organization names.
organizations = organization_original['name']

split_organizations = probableParser(organizations)

# Preview up to the first 10 parses; slicing avoids an IndexError when fewer
# than 10 organizations are present.
for parsed in split_organizations[:10]:
    print(parsed)

(OrderedDict([('CorporationNameOrganization', 'Bard College')]), 'Person')
(OrderedDict([('CorporationName', "St. Ann 's Warehouse")]), 'Corporation')
(OrderedDict([('CorporationName', 'Fox News First')]), 'Corporation')
(OrderedDict([('CorporationName', 'Fox News')]), 'Corporation')
(OrderedDict([('CorporationName', 'American Legion')]), 'Corporation')
(OrderedDict([('GivenName', 'Supreme'), ('Surname', 'Court')]), 'Person')
(OrderedDict([('CorporationName', 'New Orleans Pelicans')]), 'Corporation')
(OrderedDict([('Surname', 'Duke')]), 'Person')
(OrderedDict([('ShortForm', 'NBA')]), 'Corporation')
(OrderedDict([('CorporationName', 'White House')]), 'Corporation')
