## <div class="alert alert-block alert-success"><center>2020 - Data Preparation - Trump Part</center></div>

In [12]:
%pylab inline
import pandas as pd
df_trump2=pd.read_csv('trump2020.csv')

Populating the interactive namespace from numpy and matplotlib


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [13]:
# Number of rows in the raw Trump file.
df_trump2.shape[0]

28125

In [14]:
# Keep only the columns used in the rest of the notebook.
trump_columns = [
    'committee_id',
    'report_year',
    'disbursement_date',
    'recipient_name',
    'recipient_state',
    'recipient_city',
    'disbursement_purpose_category',
    'disbursement_description',
    'disbursement_amount',
]
df_trump2 = df_trump2[trump_columns]

In [15]:
# Count of distinct (non-null) purpose categories.
df_trump2['disbursement_purpose_category'].nunique()

8

In [16]:
# Distinct purpose categories, including NaN if present.
df_trump2['disbursement_purpose_category'].unique()

array(['OTHER', 'MATERIALS', 'ADMINISTRATIVE', 'TRAVEL', 'ADVERTISING',
       'TRANSFERS', 'FUNDRAISING', 'REFUNDS', nan], dtype=object)

In [17]:
# Row count and total amount per purpose category.
# Many uncategorized items fall under 'OTHER'.
category_stats = {
    'disbursement_purpose_category': 'count',
    'disbursement_amount': 'sum',
}
df_trump2.groupby('disbursement_purpose_category').agg(category_stats)

Unnamed: 0_level_0,disbursement_purpose_category,disbursement_amount
disbursement_purpose_category,Unnamed: 1_level_1,Unnamed: 2_level_1
ADMINISTRATIVE,3021,15316360.0
ADVERTISING,295,216034300.0
FUNDRAISING,50,73203.73
MATERIALS,570,19336710.0
OTHER,21187,187984200.0
REFUNDS,33,90770.17
TRANSFERS,60,15400460.0
TRAVEL,2908,2991812.0


In [18]:
# Count of distinct (non-null) free-text descriptions.
df_trump2['disbursement_description'].nunique()

301

In [19]:
# Peek at descriptions at positions 1..49 (position 0 is not shown).
df_trump2['disbursement_description'].iloc[1:50]

1                    : BAGGAGE FEE
2                    : BAGGAGE FEE
3              : DELIVERY SERVICES
4                            : AIR
5          : GROUND TRANSPORTATION
6                            : AIR
7                    : CREDIT: AIR
8                    : CREDIT: AIR
9                : PARKING EXPENSE
10                           : AIR
11                    : CAR RENTAL
12                    : CAR RENTAL
13                          : FOOD
14            : ADVANCE CONSULTING
15         : GROUND TRANSPORTATION
16                       : MILEAGE
17                    : CAR RENTAL
18               : OFFICE SUPPLIES
19                           : AIR
20                           : AIR
21                    : CAR RENTAL
22             : DELIVERY SERVICES
23             : DELIVERY SERVICES
24             : DELIVERY SERVICES
25                       : LODGING
26                           : FEE
27                           : AIR
28                      : SOFTWARE
29                  

<div class="alert alert-block alert-warning"><b>Delete irrelevant rows: refunds and rows with a missing (NaN) description</b></div>

In [20]:
# Remove rows categorized as REFUNDS. A boolean mask with rebinding is used
# instead of an in-place drop, so the frame produced by the column selection
# is never mutated and the cell can be re-run safely.
df_trump2 = df_trump2[df_trump2['disbursement_purpose_category'] != 'REFUNDS']

In [21]:
# Remove rows whose description mentions a refund.
# case=False: the descriptions are upper-case (e.g. ': BAGGAGE FEE'), so a
#             case-sensitive search for 'refund' would match nothing.
# na=False:   a missing description counts as "no match" instead of producing
#             NaN in the mask, which cannot be used for boolean indexing.
mentions_refund = df_trump2['disbursement_description'].str.contains(
    'refund', case=False, na=False
)
df_trump2 = df_trump2[~mentions_refund]

In [22]:
# Keep only rows that have a description (needed for the text analysis).
df_trump2 = df_trump2.dropna(subset=['disbursement_description'])

In [23]:
# Remaining missing values per column.
df_trump2.isna().sum()

committee_id                     1
report_year                      1
disbursement_date                1
recipient_name                   1
recipient_state                  1
recipient_city                   1
disbursement_purpose_category    1
disbursement_description         0
disbursement_amount              1
dtype: int64

In [24]:
# Per-description row count and total amount.
description_stats = {
    'disbursement_description': 'count',
    'disbursement_amount': 'sum',
}
cat_sum = df_trump2.groupby('disbursement_description').agg(description_stats)

In [25]:
# The aggregated 'disbursement_description' column holds row counts; name it so.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run.
cat_sum = cat_sum.rename(columns={'disbursement_description': 'count'})

In [26]:
# Most frequent descriptions, ranked by row count.
# NOTE(review): the slice starts at position 1, so the top-ranked description
# is not displayed - confirm this is intended.
ranked_descriptions = cat_sum.sort_values('count', ascending=False, na_position='first')
ranked_descriptions.iloc[1:20]

Unnamed: 0_level_0,count,disbursement_amount
disbursement_description,Unnamed: 1_level_1,Unnamed: 2_level_1
: AIR,4291,1981976.72
: GROUND TRANSPORTATION,3359,216625.82
PAYROLL,3034,10304479.1
: LODGING,2214,2358965.05
: CAR RENTAL,1571,606234.37
: OFFICE SUPPLIES,1033,697287.4
MERCHANT FEES,675,11691253.13
: FOOD,652,16537.78
: DELIVERY SERVICES,469,211004.55
: CREDIT: AIR,394,-206213.44


In [27]:
# Number of rows left after cleaning.
df_trump2.shape[0]

28092

In [28]:
# Total disbursed amount after cleaning.
df_trump2['disbursement_amount'].sum()

457137085.62

## <div class="alert alert-block alert-success"><center>Text Clustering - Trump Part</center></div>

In [29]:
# Tokenizer setup for building the LDA token lists.
import spacy
from spacy.lang.en import English

# A blank English pipeline is all that tokenize() needs: it only reads the
# lexical attributes orth_, like_num and lower_. The former `spacy.load('en')`
# call discarded its result, and the 'en' shortcut name no longer resolves in
# spaCy v3, so it has been removed.
parser = English()

def tokenize(text):
    """Split ``text`` with the module-level ``parser`` and return lower-cased tokens.

    Tokens that are pure whitespace or that look like numbers are skipped.
    """
    return [
        tok.lower_
        for tok in parser(text)
        if not (tok.orth_.isspace() or tok.like_num)
    ]

In [30]:
# Get the punctuation set
import string
en_punct = set(string.punctuation)

# Fetch the NLTK resources used below (stop words and WordNet)
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Get the stop words set
en_stop = set(nltk.corpus.stopwords.words('english'))

def get_lemma(word):
    """Return the WordNet lemma of ``word`` using the lemmatizer's default settings."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhanghuishan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhanghuishan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
def prepare_text_for_lda(text):
    """Turn one description string into a list of lemmatized tokens for LDA.

    Tokens from ``tokenize`` are kept only if they are not punctuation, are at
    least two characters long, and are not English stop words. The survivors
    are lemmatized so that word variants collapse to one form.
    """
    kept = [
        tok
        for tok in tokenize(text)
        if tok not in en_punct and len(tok) > 1 and tok not in en_stop
    ]
    return [get_lemma(tok) for tok in kept]

In [32]:
# Build text_data: a ~1% random sample of the descriptions, tokenized for LDA.
import random

# Fixed seed so the sample (and therefore the fitted topics) can be reproduced.
random.seed(42)

text_data = []
for line in df_trump2['disbursement_description']:
    # Decide first, tokenize second: the same rows are selected as before, but
    # the ~99% of rows that are discarded are no longer tokenized.
    if random.random() > .99:
        text_data.append(prepare_text_for_lda(line))

In [33]:
# Cluster topics: map tokens to ids and encode each sampled document as bag-of-words.
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = list(map(dictionary.doc2bow, text_data))

In [34]:
import pickle

# Persist the corpus and dictionary for the visualization cell.
# The `with` block guarantees the file is flushed and closed.
# NOTE: the Biden section later writes to these same file names.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [37]:
import gensim # package for topic modeling
NUM_TOPICS = 5 # trial number based on understanding of the data file & research aims

# random_state pins the model's random initialization so that repeated runs
# on the same corpus produce the same topics.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=20,
    random_state=42,
)
ldamodel.save('model.gensim')

# Show the six highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.301*"fee" + 0.272*"transaction" + 0.115*"office" + 0.108*"supply" + 0.030*"travel" + 0.016*"required"')
(1, '0.501*"air" + 0.082*"food" + 0.061*"credit" + 0.052*"expense" + 0.042*"meeting" + 0.042*"meal"')
(2, '0.457*"payroll" + 0.107*"consulting" + 0.082*"service" + 0.066*"software" + 0.066*"advance" + 0.019*"processing"')
(3, '0.191*"ground" + 0.191*"transportation" + 0.183*"rental" + 0.167*"car" + 0.043*"toll" + 0.035*"fuel"')
(4, '0.235*"lodging" + 0.102*"merchant" + 0.101*"fee" + 0.078*"expense" + 0.047*"service" + 0.047*"delivery"')


In [38]:
import pyLDAvis.gensim

# Reload the saved artifacts and render the interactive topic view.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# 'corpus.pkl' is written by this notebook; never unpickle untrusted files.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

## <div class="alert alert-block alert-success"><center>Data Preparation - Biden Part</center></div>

In [39]:
# Load the raw 2020 Biden disbursement records and show the row count.
df_biden = pd.read_csv('biden2020.csv')
df_biden.shape[0]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


54234

In [40]:
# Keep only the columns used in the rest of the notebook.
biden_columns = [
    'committee_id',
    'report_year',
    'disbursement_date',
    'recipient_name',
    'recipient_state',
    'recipient_city',
    'disbursement_purpose_category',
    'disbursement_description',
    'disbursement_amount',
]
df_biden = df_biden[biden_columns]

In [41]:
# Remove refunds and rows without a description.
is_refund_category = df_biden['disbursement_purpose_category'] == 'REFUNDS'
# case=False: a case-sensitive search for 'refund' misses upper-case text.
# na=False:   a missing description counts as "no match" instead of producing
#             NaN in the mask, which cannot be used for boolean indexing.
mentions_refund = df_biden['disbursement_description'].str.contains(
    'refund', case=False, na=False
)
has_description = df_biden['disbursement_description'].notna()

# One combined mask with rebinding replaces the in-place drops; .copy() gives
# an independent frame for the later cells.
df_biden = df_biden[~is_refund_category & ~mentions_refund & has_description].copy()

In [42]:
# Remaining missing values per column.
df_biden.isna().sum()

committee_id                     0
report_year                      0
disbursement_date                0
recipient_name                   0
recipient_state                  1
recipient_city                   2
disbursement_purpose_category    0
disbursement_description         0
disbursement_amount              0
dtype: int64

In [43]:
# Row count and total amount per purpose category.
category_stats = {
    'disbursement_purpose_category': 'count',
    'disbursement_amount': 'sum',
}
df_biden.groupby('disbursement_purpose_category').agg(category_stats)

Unnamed: 0_level_0,disbursement_purpose_category,disbursement_amount
disbursement_purpose_category,Unnamed: 1_level_1,Unnamed: 2_level_1
ADMINISTRATIVE,3123,4454786.0
ADVERTISING,1061,426798100.0
CONTRIBUTIONS,2,6030.0
EVENTS,37,244211.2
FUNDRAISING,216,8133166.0
MATERIALS,93,19307.77
OTHER,21197,313608000.0
TRANSFERS,1,275000.0
TRAVEL,2198,1123009.0


In [59]:
# Total disbursed amount after cleaning.
df_biden['disbursement_amount'].sum()

754661617.3299999

## <div class="alert alert-block alert-success"><center>Text Clustering - Biden Part</center></div>

In [44]:
# Build text_data: a ~1% random sample of the descriptions, tokenized for LDA.
import random

# Fixed seed so the sample (and therefore the fitted topics) can be reproduced.
random.seed(42)

text_data = []
for line in df_biden['disbursement_description']:
    # Decide first, tokenize second: the same rows are selected as before, but
    # the ~99% of rows that are discarded are no longer tokenized.
    if random.random() > .99:
        text_data.append(prepare_text_for_lda(line))

In [45]:
from gensim import corpora
import pickle

# Map tokens to ids and encode each sampled document as bag-of-words.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both for the visualization cell. The `with` block guarantees the
# file is flushed and closed.
# NOTE: these file names are the ones the Trump section wrote earlier, so its
# artifacts are overwritten here.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [57]:
import gensim # package for topic modeling
NUM_TOPICS = 5 # trial number based on understanding of the data file & research aims

# random_state pins the model's random initialization so that repeated runs
# on the same corpus produce the same topics.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=20,
    random_state=42,
)
ldamodel.save('model.gensim')

# Show the six highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.163*"credit" + 0.117*"consulting" + 0.073*"lodging" + 0.072*"utility" + 0.050*"strategic" + 0.027*"food"')
(1, '0.281*"supply" + 0.246*"office" + 0.084*"fee" + 0.084*"processing" + 0.049*"subscription" + 0.049*"event"')
(2, '0.312*"travel" + 0.168*"lodging" + 0.062*"rental" + 0.047*"medium" + 0.047*"vehicle" + 0.047*"production"')
(3, '0.689*"payroll" + 0.058*"printing" + 0.050*"copying" + 0.018*"license" + 0.018*"internet" + 0.018*"polling"')
(4, '0.139*"service" + 0.105*"parking" + 0.054*"per" + 0.054*"diem" + 0.054*"fee" + 0.037*"event"')


In [58]:
import pyLDAvis.gensim

# Reload the saved artifacts and render the interactive topic view.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# 'corpus.pkl' is written by this notebook; never unpickle untrusted files.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)