# Trump
**Uffaz Nathaniel (unathan1)**

----

### Imports

In [116]:
%matplotlib inline

In [117]:
import pandas as pd
import numpy as np
import networkx as nx

import matplotlib.pyplot as plt

### Read Trump World Dataset

In [118]:
G = nx.read_graphml('./data/trumpworld.graphml')
df = pd.read_csv('./data/trumpworld.csv')
df.head()

Unnamed: 0,Entity A Type,Entity A,Entity B Type,Entity B,Connection,Source(s)
0,Organization,4 SHADOW TREE LANE MEMBER CORP.,Organization,4 SHADOW TREE LANE LLC,Ownership,https://www.documentcloud.org/documents/283869...
1,Organization,40 WALL DEVELOPMENT ASSOCIATES LLC,Organization,40 WALL STREET LLC,Ownership,https://www.documentcloud.org/documents/283869...
2,Organization,40 WALL STREET LLC,Organization,40 WALL STREET COMMERCIAL LLC,Ownership,https://www.documentcloud.org/documents/283869...
3,Organization,40 WALL STREET MEMBER CORP.,Organization,40 WALL STREET LLC,Ownership,https://www.documentcloud.org/documents/283869...
4,Organization,401 MEZZ VENTURE LLC,Organization,401 NORTH WABASH VENTURE LLC,Ownership,https://www.documentcloud.org/documents/283869...


In [159]:
for e in r.edges('NORTHERN TRUST'):
    source,target = e
    print(source)
    print(target)
    print('----')

NORTHERN TRUST
40 WALL STREET LLC
----
NORTHERN TRUST
TRUMP TOWER COMMERCIAL LLC
----


In [255]:
def get_node_type(n):
    _, entity_type = get_id(str(n))
    return '' if entity_type is None else entity_type.lower()
    
# Rewire all of target's edges to source
def rewire(G, source, target):
    # Results
    pending_remove = []
    pending_add = []
    pending_nodes = []
    
    # Source
    source_node_type = get_node_type(source)
    if source_node_type != 'person':
        pending_nodes.append(source)
        source_edges = G.edges(source, data=True)
        person_nodes = []
        for e in source_edges:
            s2, t2, d2 = e
            if get_node_type(t2) == 'person':
                person_nodes.append(t2)
        # wire them
        for i in range(len(person_nodes)):
            for j in range(len(person_nodes)):
                if i != j:
                    pending_add.append((person_nodes[i], person_nodes[j]))
    
    # Target
    # Only for nodes that are not of type person
    if get_node_type(target) != 'person':
        # List of edges to add
        pending_nodes.append(target)

        # Iterate through all the edges and rewire 
        # them to source
        target_edges = G.edges(target, data=True)
        for e in target_edges:
            s2, t2, d2 = e
            if e == source:
                continue
            # Delegate otherwise networkx throws 'dictionary changed size during iteration'
            pending_remove.append((s2, t2))
            if source_node_type == 'person' and source != t2:
                pending_add.append((source, t2))
            #G.remove_edge(s2, t2)
            #G.add_edge(source, t2)
    
    return pending_remove, pending_add, pending_nodes


def process_pending_tasks(G, pending_tasks, nodes_to_remove):
    # Remove and add edges
    for task in pending_tasks:
        if task is None:
            continue
        pending_remove, pending_add, pending_nodes = task

        # Remove
        for s2, t2 in pending_remove:
            try:
                G.remove_edge(s2, t2)
            except:
                # Edge already remove
                pass

        # Add
        for u, v in pending_add:
            G.add_edge(u, v)

        for pn in pending_nodes:
            nodes_to_remove.add(pn)


def reduce(G):
    G = G.copy()
    
    # Safety check
    if len(G.nodes) == 0:
        return G
    
    # Mark all the vertices as not visited 
    visited = {}
    for n in G.nodes:
        visited[n] = False
        
    nodes_to_remove = set()
    
    # Run BFS.
    # This loop ensures all nodes are visited where
    # in traditional BFS, we supply a starting node
    for n in G.nodes:
        
        # See if the node is already visited and
        # if so, skip running BFS
        if visited[n]:
            continue
        
        # Create a queue for BFS 
        queue = []

        # Mark the source node as  
        # visited and enqueue it 
        queue.append(n)
        #apply_reduction(G, n) # !!!!! IMPORTANT - iterate over all edges
        pending_tasks = []
        for e in G.edges(n, data=True):
            source, target, data = e
            if visited[target] == False:
                pending_tasks.append(rewire(G, source, target))
        #process_pending_tasks(G, pending_tasks, nodes_to_remove)
        
        visited[n] = True

        while queue: 

            # Dequeue a vertex from queue
            s = queue.pop(0)
            
            pending_tasks = []

            # Get all adjacent vertices of the 
            # dequeued vertex s. If a adjacent 
            # has not been visited, then mark it 
            # visited and enqueue it 
            for e in G.edges(s, data=True):
                source, target, data = e
                if visited[target] == False:
                    queue.append(target)
                    pending_tasks.append(rewire(G, source, target))
                    visited[target] = True
                    
            
            process_pending_tasks(G, pending_tasks, nodes_to_remove)    
                    
    G.remove_nodes_from(list(nx.isolates(G)))
    for pn in nodes_to_remove:
        try:
            G.remove_node(pn)
        except:
            pass
    
    return G


r = reduce(G)
r = reduce(r)
print('nodes =', len(G.nodes))
print('edges =', len(G.edges))
print('----')
print('nodes =', len(r.nodes))
print('edges =', len(r.edges))

nx.write_graphml(r, './reduced.graphml')

nodes = 2669
edges = 3380
----
nodes = 637
edges = 845


In [254]:
nx.diameter(G)

13

In [260]:
r = max(nx.connected_component_subgraphs(r), key=len)

In [259]:
nx.diameter(r)

7

Assign each entity a unique id

In [119]:
print(df['Entity A Type'].unique())
print(df['Entity B Type'].unique())

['Organization' 'Person']
['Organization' 'Person' 'Federal Agency']


In [144]:
df[df['Entity B Type'] == 'Federal Agency']

Unnamed: 0,Entity A Type,Entity A,Entity B Type,Entity B,Connection,Source(s)
153,Person,BEN CARSON,Federal Agency,DEPARTMENT OF HOUSING AND URBAN DEVELOPMENT,Secretary,General knowledge
179,Person,BETSY DEVOS,Federal Agency,DEPARTMENT OF EDUCATION,Secretary,General knowledge
1363,Person,ELAINE CHAO,Federal Agency,DEPARTMENT OF TRANSPORTATION,Secretary,General knowledge
1621,Person,JAMES MATTIS,Federal Agency,DEPARTMENT OF DEFENSE,Secretary,General knowledge
1688,Person,JEFF SESSIONS,Federal Agency,DEPARTMENT OF JUSTICE,Attorney General,General knowledge
1716,Person,JOHN F. KELLY,Federal Agency,DEPARTMENT OF HOMELAND SECURITY,Secretary,General knowledge
1721,Person,JOHN GORE,Federal Agency,DEPARTMENT OF JUSTICE,Deputy assistant AG for Civil Rights,http://www.jonesday.com/jones-day-lawyers-tapp...
1890,Person,LINDA MCMAHON,Federal Agency,SMALL BUSINESS ADMINISTRATION,Administrator,General knowledge
2060,Person,MICHAEL POMPEO,Federal Agency,CENTRAL INTELLIGENCE AGENCY,Director,General knowledge
2403,Person,RICK PERRY,Federal Agency,DEPARTMENT OF ENERGY,Secretary,General knowledge


In [120]:
_id_organizations = {}
_id_persons = {}
_id_federal_agency = {}

id_counter_orgs = -1
id_counter_persons = -1
id_counter_fa = -1

for index, row in df.iterrows():
    entity1       = row['Entity A'].upper()
    entity_type_1 = row['Entity A Type'].upper()
    entity2       = row['Entity B'].upper()
    entity_type_2 = row['Entity B Type'].upper()
    
    if entity_type_1 == 'ORGANIZATION':
        if entity1 not in _id_organizations:
            id_counter_orgs = id_counter_orgs + 1
            _id_organizations[entity1] = id_counter_orgs
    elif entity_type_1 == 'PERSON':
        if entity1 not in _id_persons:
            id_counter_persons = id_counter_persons + 1
            _id_persons[entity1] = id_counter_persons
    elif entity_type_1 == 'FEDERAL AGENCY':
        if entity1 not in _id_federal_agency:
            id_counter_fa = id_counter_fa + 1
            _id_federal_agency[entity1] = id_counter_fa
    else:
        print("Unknown entity 1")
            
    if entity_type_2 == 'ORGANIZATION':
        if entity2 not in _id_organizations:
            id_counter_orgs = id_counter_orgs + 1
            _id_organizations[entity2] = id_counter_orgs
    elif entity_type_2 == 'PERSON':
        if entity2 not in _id_persons:
            id_counter_persons = id_counter_persons + 1
            _id_persons[entity2] = id_counter_persons
    elif entity_type_2 == 'FEDERAL AGENCY':
        if entity2 not in _id_federal_agency:
            id_counter_fa = id_counter_fa + 1
            _id_federal_agency[entity2] = id_counter_fa
    else:
        print("Unknown entity 2")
    

def get_id_by_org(name):
    name = name.upper()
    return _id_organizations.get(name, None)

def get_id_by_person(name):
    name = name.upper()
    return _id_persons.get(name, None)

def get_id_by_fa(name):
    name = name.upper()
    return _id_federal_agency.get(name, None)

def get_id(name):
    name = name.upper()
    id = get_id_by_org(name)
    if id is not None:
        return id, 'ORGANIZATION'
    id = get_id_by_person(name)
    if id is not None:
        return id, 'PERSON'
    if id is not None:
        return id, 'FEDERAL AGENCY'
    return (None, None)
    

print("Number of organizations =", len(_id_organizations))
print("Number of persons =", len(_id_persons))
print("Number of federal agencies =", len(_id_federal_agency))

Number of organizations = 2015
Number of persons = 640
Number of federal agencies = 14


### Representing relationships as adjaceny matricies

- `A` = Person x Person
- `B` = Organization x Organization
- `C` = Federal Agency x Federal Agency
- `D` = Person x Organization
- `E` = Organization x Federal Agency
- `F` - Person x Federal Agency

In [121]:
A = np.zeros(shape=(len(_id_persons), len(_id_persons)))
B = np.zeros(shape=(len(_id_organizations), len(_id_organizations)))
C = np.zeros(shape=(len(_id_federal_agency), len(_id_federal_agency)))
D = np.zeros(shape=(len(_id_persons), len(_id_organizations)))
E = np.zeros(shape=(len(_id_organizations), len(_id_federal_agency)))
F = np.zeros(shape=(len(_id_persons), len(_id_federal_agency)))

# set values to 1
for index, row in df.iterrows():
    entity1       = row['Entity A'].upper()
    entity_type_1 = row['Entity A Type'].upper()
    entity2       = row['Entity B'].upper()
    entity_type_2 = row['Entity B Type'].upper()
    
    id1, _ = get_id(entity1)
    id2, _ = get_id(entity2)
    
    if entity_type_1 == 'PERSON' and entity_type_2 == 'PERSON':
        A[id1][id2] = 1
        A[id2][id1] = 1
    elif entity_type_1 == 'ORGANIZATION' and entity_type_2 == 'ORGANIZATION':
        B[id1][id2] = 1
        B[id2][id1] = 1
    elif entity_type_1 == 'FEDERAL AGENCY' and entity_type_2 == 'FEDERAL AGENCY':
        C[id1][id2] = 1
        C[id2][id1] = 1
    elif (entity_type_1 == 'PERSON' and entity_type_2 == 'ORGANIZATION'):
        D[id1][id2] = 1
    elif (entity_type_1 == 'ORGANIZATION' and entity_type_2 == 'PERSON'):
        D[id2][id1] = 1
    elif (entity_type_1 == 'ORGANIZATION' and entity_type_2 == 'FEDERAL AGENCY'):
        E[id1][id2] = 1
    elif (entity_type_1 == 'FEDERAL AGENCY' and entity_type_2 == 'ORGANIZATION'):
        E[id2][id1] = 1
    elif (entity_type_1 == 'PERSON' and entity_type_2 == 'ORGANIZATION'):
        F[id1][id2] = 1
    elif (entity_type_1 == 'ORGANIZATION' and entity_type_2 == 'PERSON'):
        F[id2][id1] = 1

In [130]:
def fullprint(*args, **kwargs):
    from pprint import pprint
    opt = np.get_printoptions()
    np.set_printoptions(threshold=np.inf)
    pprint(*args, **kwargs)
    np.set_printoptions(**opt)

In [142]:
def matrix_multiply(*arg):
    a = arg[0]
    b = arg[1]
    x = np.matmul(a, b)
    for i in range(2, len(arg)):
        x = np.matmul(x, arg[i])
    return x

Q = matrix_multiply(D, D.transpose())

idx_1, _ = get_id('helen ferre')
idx_2, _ = get_id('jeb bush')
print(idx_1)
print(idx_2)

print(Q[idx_1][idx_2])
print(len(Q))

190
379
1.0
640


In [131]:
G_A = nx.from_numpy_matrix(A)
#nx.draw(G_A)
#nx.write_graphml()
fullprint(A)

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

### Dropping Corrupt

In [269]:
# commented ones not in dataset
corrupt = [
    #'Ivanka Trump',
    #'Eric Trump',
    #'Donald Trump Jr.',
    #'Rick Gates'
    'MICHAEL D. COHEN',
    'Paul Manafort',
    #'Vin Weber',
    #'Tony Podesta',
    #'Sam Patten',
    #'Maria Butina',
    #'Elena Alekseevna Khusyaynova',
    'Michael Flynn',
    #'Bijan Kian',
    #'Kamil Ekim Alptekin'
]


corrupt_remove = r.copy()

for c in corrupt:
    print(c, get_id(c))
    corrupt_remove.remove_node(c.upper())
    

# remove isolated nodes
corrupt_remove.remove_nodes_from(list(nx.isolates(corrupt_remove)))
# Take giant component
corrupt_remove = max(nx.connected_component_subgraphs(corrupt_remove), key=len)

nx.write_graphml(corrupt_remove, './corrupt-removed.graphml')

MICHAEL D. COHEN (226, 'PERSON')
Paul Manafort (237, 'PERSON')
Michael Flynn (227, 'PERSON')


### Q1. Degree seperation between Trump and Vladmir Putin

In [122]:
def is_zero_matrix(M):
    for i in range(len(M)):
        for j in range(len(M[i])):
            if M[i][j] != 0:
                return False
    return True

Test to see if there is a direct connection through shared acquaintances

In [123]:
idx_trump, _ = get_id('donald j. trump')
idx_putin, _ = get_id('vladimir putin')

A_prime = A
for i in range(10):
    if A_prime[idx_trump][idx_putin] > 0 or A_prime[idx_putin][idx_trump]:
        print("Degrees of seperation =", i)
        break
    A_prime = np.matmul(A_prime, A)
    
    # Terminate if zero matrix
    if is_zero_matrix(A_prime):
        break

Degrees of seperation = 1


In President's Trump network, the Kremilin is only 4 degree of seperation away.

In [None]:
We see that there is no relationship between shared

In [24]:
sorted(df[df['Entity A'].unique())

['4 SHADOW TREE LANE MEMBER CORP.',
 '40 WALL DEVELOPMENT ASSOCIATES LLC',
 '40 WALL STREET LLC',
 '40 WALL STREET MEMBER CORP.',
 '401 MEZZ VENTURE LLC',
 '401 NORTH WABASH VENTURE LLC',
 '4C INSIGHTS',
 '809 NORTH CANON MEMBER CORPORATION',
 'AARON SCHOCK',
 'ABDUL WAHID AL ULAMA',
 'ABE WALLACH',
 'ABERDEEN ASSET MANAGEMENT',
 'ACADEMI',
 'ACE ENTERTAINMENT HOLDINGS INC',
 'ADVISORS ASSET MANAGEMENT, INC.',
 'AETOS SA',
 'AJIT PAI',
 'ALABAMA POLICY INSTITUTE',
 'ALAN GARTEN',
 'ALAN HAMMER',
 'ALAN JOHN ROGERS',
 'ALBERT ROSS SR.',
 'ALEX SHNAIDER',
 'ALEXANDER ACOSTA',
 'ALEXANDER MASHKEVICH',
 'ALEXANDER MCMILLAN',
 'ALEXANDER NIX',
 'ALEXANDER TIMOFEEV',
 'ALLEN WEISSELBERG',
 'ALLIANCEBERNSTEIN LP',
 'ALTEGRIS',
 'ALTICOR',
 'AMBOY BANK',
 'AMERICA RISING LLC',
 'AMERICAN INTERNATIONAL GROUP INC.',
 'AMERICAN LEGACY CENTER',
 'AMERICANS FOR PROSPERITY',
 'AMG FUNDS',
 'AMPAL-AMERICAN ISRAEL CORP.',
 'ANAND PATEL',
 'ANAR MAMMADOV',
 'ANBANG INSURANCE GROUP',
 'ANDREA SCHLOSSBER

In [6]:
tokenizer = ToktokTokenizer()
lemmatizer = WordNetLemmatizer() 
stemmer = PorterStemmer()

STOP_WORDS = stopwords.words("english")

## Cleaning Text - strip HTML
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text


## Removing accented characters
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text


## Expanding Contractions
def expand_contractions(text):
    return contractions.fix(text)


## Removing Special Characters
def remove_special_characters(text):
    text = re.sub('[^a-zA-z0-9\s]', '', text)
    return text


## Lemmatizing text
def stem_text(text):
    return [stemmer.stem(a) for a in text]


## Removing Stopwords
def remove_stopwords(word_list):
    return [word for word in word_list if word not in STOP_WORDS]


## Normalize text corpus - tying it all together
def normalize_corpus(corpus, html_stripping=False, contraction_expansion=True,
                     accented_char_removal=True, 
                     stem=False, special_char_removal=True, 
                     stopword_removal=True):
    
    if isinstance(corpus, str):
        corpus = [corpus]
    
    normalized_corpus = []
    
    for doc in corpus:
        
        if doc is None:
            continue
            
        doc = doc.lower()
        
        if html_stripping:
            doc = strip_html_tags(doc)
        
        if accented_char_removal:
            doc = remove_accented_chars(doc)
            
        if contraction_expansion:
            doc = expand_contractions(doc)
            
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # insert spaces between special characters to isolate them    
        special_char_pattern = re.compile(r'([{.(-)!}])')
        doc = special_char_pattern.sub(" \\1 ", doc)
        
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        
        if special_char_removal:
            doc = remove_special_characters(doc) 
            
        doc = tokenizer.tokenize(doc)
        
        if stem:
            doc = stem_text(doc)
        
        if stopword_removal:
            doc = remove_stopwords(doc)
            
        normalized_corpus.append(' '.join(doc))
        
    return normalized_corpus[0] if (len(normalized_corpus) == 1) else normalized_corpus


  text = re.sub('[^a-zA-z0-9\s]', '', text)


### Read and prepare the data

In [19]:
def get_parsed_comments():
    data = None
    with open('twitter.txt', 'r') as myfile:
        data = myfile.read()
    results = []
    comments = data.split('---')
    for comment in comments:
        c = comment.strip().split('\n', 1)
        if len(c) < 2:
            continue
        author = c[0].strip()
        comment = c[1].strip()
        results.append(normalize_corpus(comment).split(' '))
    return results

texts = get_parsed_comments()

print(texts)

[['readonly', 'friday', 'time', 'put', 'feet', 'pour', 'nice', 'dram', 'look', 'members', 'new', 'shiny', 'blog', 'posts', 'feel', 'free', 'submit', 'blog', 'post', 'well', 'nice', 'description', 'thread'], ['decided', 'nat64', 'inside', 'autonomous', 'system', 'would', 'nice', '[', 'set', 'also', 'used', 'bgp', ']', 'httpblog', 'thelifeofkenneth', 'com201902runningnat64inbgpenvironment', 'html', 'pointing', 'static', 'nat64', 'server', 'seem', 'elegant', 'im', 'planning', 'sharing', 'friends', 'im', 'peered'], ['using', 'hypersegmentation', 'improve', 'network', 'security', 'hope', 'enjoy', '[', 'httpswww', '128technology', 'comblogblogusinghypersegmentationimprovenetworksecurity', ']', 'httpswww', '128technology', 'comblogblogusinghypersegmentationimprovenetworksecurity'], ['cloud', 'sessions', 'including', 'voip', 'calls', 'stay', 'active', 'internet', 'circuit', 'drops', '[', 'best', 'sdwan', 'vendors', 'cloud', 'session', 'reassignment', ']', 'httpsyoutu', 'bese25ykprphq'], ['tran

### Topic Modeling

In [20]:
dictionary = corpora.Dictionary(texts)

corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('twitter.mm', corpus)
#print(corpus)

lda = gensim.models.LdaModel(corpus, id2word=dictionary, alpha='auto', num_topics=10)
for i in lda.show_topics():
    print (i)
#convert ot BOW vectors

(0, '0.018*"problem" + 0.017*"solution" + 0.014*"get" + 0.013*"bring" + 0.011*"one" + 0.011*"blog" + 0.011*"nice" + 0.010*"time" + 0.009*"new" + 0.009*"feel"')
(1, '0.020*"site" + 0.016*"one" + 0.010*"im" + 0.010*"get" + 0.009*"network" + 0.009*"sites" + 0.008*"0" + 0.007*"two" + 0.007*"time" + 0.007*"know"')
(2, '0.013*"network" + 0.012*"site" + 0.010*"10" + 0.009*"2" + 0.008*"switches" + 0.007*"use" + 0.007*"would" + 0.007*"ampx200b" + 0.006*"1" + 0.006*"summarization"')
(3, '0.030*"10" + 0.015*"america" + 0.014*"0" + 0.013*"network" + 0.012*"n" + 0.011*"site" + 0.011*"016" + 0.011*"16" + 0.010*"two" + 0.009*"center"')
(4, '0.012*"cannot" + 0.011*"network" + 0.010*"get" + 0.009*"httpswww" + 0.008*"search" + 0.008*"hire" + 0.008*"company" + 0.007*"128technology" + 0.007*"comblogblogusinghypersegmentationimprovenetworksecurity" + 0.007*"far"')
(5, '0.013*"manager" + 0.012*"need" + 0.011*"new" + 0.010*"done" + 0.009*"work" + 0.008*"switch" + 0.008*"management" + 0.008*"get" + 0.008*"sit

In [21]:
topic_vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)

pyLDAvis.display(topic_vis)