## <div class="alert alert-block alert-success"><center>2016 - Data Preparation - Trump Part<center>

In [1]:
%pylab inline
import pandas as pd
df_trump=pd.read_csv('trump2016.csv')

Populating the interactive namespace from numpy and matplotlib


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [2]:
len(df_trump)

27016

In [3]:
df_trump = df_trump[['committee_id', 'report_year', 'disbursement_date','recipient_name',\
                     'recipient_state','recipient_city',\
                     'disbursement_purpose_category','disbursement_description','disbursement_amount']]

In [4]:
df_trump.disbursement_purpose_category.nunique()

9

In [5]:
df_trump.disbursement_purpose_category.unique()

array(['OTHER', 'TRAVEL', 'ADMINISTRATIVE', 'ADVERTISING', 'MATERIALS',
       'REFUNDS', 'FUNDRAISING', 'TRANSFERS', 'CONTRIBUTIONS'],
      dtype=object)

In [6]:
# many uncategorized items under 'other'
df_trump.groupby('disbursement_purpose_category').agg({'disbursement_purpose_category':'count','disbursement_amount':'sum'})

Unnamed: 0_level_0,disbursement_purpose_category,disbursement_amount
disbursement_purpose_category,Unnamed: 1_level_1,Unnamed: 2_level_1
ADMINISTRATIVE,3920,14430600.0
ADVERTISING,545,111369400.0
CONTRIBUTIONS,1,13776.94
FUNDRAISING,60,271777.0
MATERIALS,1091,5732981.0
OTHER,14760,208890100.0
REFUNDS,845,1477020.0
TRANSFERS,1,8023.32
TRAVEL,5793,7865029.0


In [7]:
df_trump.disbursement_description.nunique()

357

In [8]:
df_trump.disbursement_description[1:50]

1                                   AIR 
2                                   AIR 
3                                   AIR 
4                               MILEAGE 
5                                 MEALS 
6                                   AIR 
7                                 MEALS 
8                                 MEALS 
9                                 MEALS 
10                                MEALS 
11                                MEALS 
12                              LODGING 
13                                  AIR 
14                                  AIR 
15                     RENT & UTILITIES 
16                              LODGING 
17    FACILITY RENTAL/CATERING SERVICES 
18                            UTILITIES 
19                      OFFICE SUPPLIES 
20                                  AIR 
21                      OFFICE SUPPLIES 
22                           CAR RENTAL 
23                           CAR RENTAL 
24                    BROADBAND SERVICE 
25              

**<div class="alert alert-block alert-warning">Delete irrelevant rows: nan, refunds**

In [2]:
df_trump.drop(df_trump[df_trump['disbursement_purpose_category'] == 'REFUNDS'].index, inplace = True) 

In [3]:
df_trump.drop(df_trump[df_trump['disbursement_description'].str.contains('refund')].index, inplace = True) 

In [4]:
df_trump = df_trump[df_trump['disbursement_description'].notna()]

In [12]:
df_trump.isnull().sum()

committee_id                     0
report_year                      0
disbursement_date                0
recipient_name                   0
recipient_state                  0
recipient_city                   0
disbursement_purpose_category    0
disbursement_description         0
disbursement_amount              0
dtype: int64

In [13]:
cat_sum = df_trump.groupby('disbursement_description').agg({'disbursement_description':'count','disbursement_amount':'sum'})

In [14]:
cat_sum.rename(columns={'disbursement_description':'count'}, inplace=True)

In [15]:
cat_sum.sort_values('count',ascending=False,na_position='first')[1:20]

Unnamed: 0_level_0,count,disbursement_amount
disbursement_description,Unnamed: 1_level_1,Unnamed: 2_level_1
LODGING,4675,6037775.31
PAYROLL,2469,5854896.42
CAR RENTAL,1681,1069615.07
FIELD CONSULTING,964,7924240.74
AIR -,818,497669.4
LODGING -,762,1340012.67
TRAVEL EXPENSE REIMBURSEMENT: ITEMIZATION BELOW IF REQUIRED,596,775510.44
EVENT CONSULTING,578,3085075.24
OFFICE SUPPLIES,519,434390.86
MERCHANT FEE,497,3635397.73


In [16]:
len(df_trump)

26171

In [5]:
df_trump.disbursement_amount.sum()

348581777.1

## <div class="alert alert-block alert-success"><center>Text Clustering - Trump Part<center>

In [17]:
# Get token list "lda_tokens"
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()

def tokenize(text): 
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace(): #skip space
            continue
        elif token.like_num: # delete numbers
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [18]:
# Get the puntuations set
import string
en_punct = set(string.punctuation)

# Get the stop words set
import nltk
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

# Get the tokens for LDA analysis (function)
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

def get_lemma(word):
    return WordNetLemmatizer().lemmatize(word) 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhanghuishan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhanghuishan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if token not in en_punct] # delete punctuations
    tokens = [token for token in tokens if len(token) > 1] # at least 2 letters consist a word
    tokens = [token for token in tokens if token not in en_stop] # delete stop words
    tokens = [get_lemma(token) for token in tokens] # delete word variants for accurate result
    return tokens

In [20]:
# Import data for LDA analysis and return text_data
import random
text_data = []
for line in df_trump['disbursement_description']:
    tokens = prepare_text_for_lda(line)
    if random.random() > .99:
        #print(tokens)
        text_data.append(tokens)

In [21]:
# Cluster topics
from gensim import corpora 
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

In [22]:
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [41]:
import gensim # package for topic modeling
NUM_TOPICS = 5 # trial number based on understanding of the data file & research aims
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=20)
ldamodel.save('model.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.155*"rental" + 0.155*"catering" + 0.105*"service" + 0.088*"equipment"')
(1, '0.395*"transfer" + 0.092*"staging" + 0.075*"food" + 0.075*"beverage"')
(2, '0.674*"travel" + 0.058*"purchase" + 0.058*"equipment" + 0.049*"gotv"')
(3, '0.166*"supply" + 0.166*"office" + 0.112*"svc" + 0.086*"transportation"')
(4, '0.507*"payroll" + 0.101*"tax" + 0.101*"printing" + 0.064*"consulting"')


In [25]:
import pyLDAvis 

In [26]:
import pyLDAvis.gensim

In [42]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display) 

## <div class="alert alert-block alert-success"><center>Data Preparation - Hillary Part<center>

In [6]:
df_hillary=pd.read_csv('hillary2016.csv')
len(df_hillary)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


147564

In [44]:
df_hillary = df_hillary[['committee_id', 'report_year', 'disbursement_date','recipient_name',\
                     'recipient_state','recipient_city',\
                     'disbursement_purpose_category','disbursement_description','disbursement_amount']]

In [7]:
df_hillary.drop(df_hillary[df_hillary['disbursement_purpose_category'] == 'REFUNDS'].index, inplace = True) 
df_hillary.drop(df_hillary[df_hillary['disbursement_description'].str.contains('refund')].index, inplace = True) 
df_hillary = df_hillary[df_hillary['disbursement_description'].notna()]

In [48]:
df_hillary.isnull().sum()

committee_id                       0
report_year                        0
disbursement_date                  0
recipient_name                     0
recipient_state                  220
recipient_city                    55
disbursement_purpose_category      0
disbursement_description           0
disbursement_amount                0
dtype: int64

In [51]:
df_hillary.groupby('disbursement_purpose_category').agg({'disbursement_purpose_category':'count','disbursement_amount':'sum'})

Unnamed: 0_level_0,disbursement_purpose_category,disbursement_amount
disbursement_purpose_category,Unnamed: 1_level_1,Unnamed: 2_level_1
ADMINISTRATIVE,3350,4876015.0
ADVERTISING,2526,307434800.0
CONTRIBUTIONS,21,325871.2
FUNDRAISING,3599,7175135.0
MATERIALS,2,503.0
OTHER,107911,205722700.0
TRAVEL,29,22446.41


In [9]:
df_hillary.disbursement_amount.sum()

525557426.2

## <div class="alert alert-block alert-success"><center>Text Clustering - Hillary Part<center>

In [49]:
# Import data for LDA analysis and return text_data
import random
text_data = []
for line in df_hillary['disbursement_description']:
    tokens = prepare_text_for_lda(line)
    if random.random() > .99:
        #print(tokens)
        text_data.append(tokens)

In [50]:
from gensim import corpora 
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [78]:
import gensim # package for topic modeling
NUM_TOPICS = 4 # trial number based on understanding of the data file & research aims
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=20)
ldamodel.save('model.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.236*"service" + 0.236*"printing" + 0.055*"audio" + 0.055*"visual"')
(1, '0.332*"supply" + 0.284*"event" + 0.087*"office" + 0.046*"production"')
(2, '0.405*"payroll" + 0.384*"phone" + 0.047*"shipping" + 0.035*"fee"')
(3, '0.746*"travel" + 0.074*"subsistence" + 0.042*"catering" + 0.042*"food"')


In [79]:
import pyLDAvis.gensim 
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display) 