## <div class="alert alert-block alert-success"><center>Data Preparation - Obama Part</center></div>

In [1]:
# %pylab inline populates the interactive namespace from numpy and matplotlib
# (see the message printed below this cell).
%pylab inline
import pandas as pd
# Load the Obama 2008 file; the path is relative to the notebook's working directory.
df_obama=pd.read_csv('obama2008.csv')

Populating the interactive namespace from numpy and matplotlib


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [2]:
# Keep only the columns used by the rest of the notebook.
obama_columns = [
    'committee_id', 'report_year', 'disbursement_date', 'recipient_name',
    'recipient_state', 'recipient_city',
    'disbursement_purpose_category', 'disbursement_description', 'disbursement_amount',
]
df_obama = df_obama[obama_columns]

In [3]:
# Row count and total amount per purpose category.
# Many items are uncategorized and end up under 'OTHER'.
category_summary_spec = {
    'disbursement_purpose_category': 'count',
    'disbursement_amount': 'sum',
}
df_obama.groupby('disbursement_purpose_category').agg(category_summary_spec)

Unnamed: 0_level_0,disbursement_purpose_category,disbursement_amount
disbursement_purpose_category,Unnamed: 1_level_1,Unnamed: 2_level_1
ADMINISTRATIVE,4769,22895670.0
ADVERTISING,3722,390354700.0
FUNDRAISING,2206,2542922.0
MATERIALS,28,16048.21
OTHER,59349,231226000.0
REFUNDS,6433,5086160.0
TRAVEL,41342,76877530.0


<div class="alert alert-block alert-warning"><b>Delete irrelevant rows: NaN, refunds</b></div>

In [4]:
# Remove refund rows: anything categorized as REFUNDS or described as a
# contribution refund. A single boolean mask replaces the two in-place,
# label-based drops, so no intermediate frames are built and the frame is
# never mutated in place (which can trigger SettingWithCopyWarning on a
# frame obtained by column selection).
is_refund = (
    (df_obama['disbursement_purpose_category'] == 'REFUNDS')
    | (df_obama['disbursement_description'] == 'CONTRIBUTION REFUND')
)
df_obama = df_obama[~is_refund]

In [5]:
# Discard rows whose description is missing.
df_obama = df_obama.dropna(subset=['disbursement_description'])

In [9]:
# Number of missing values per column.
df_obama.isna().sum()

committee_id                       0
report_year                        0
disbursement_date                  0
recipient_name                     0
recipient_state                  174
recipient_city                   115
disbursement_purpose_category      0
disbursement_description           0
disbursement_amount                0
dtype: int64

In [10]:
# Per-description row count and total disbursed amount.
description_groups = df_obama.groupby('disbursement_description')
obama_cat_sum = description_groups.agg({
    'disbursement_description': 'count',
    'disbursement_amount': 'sum',
})

In [11]:
# The aggregated description column holds row counts, so name it 'count'.
# Reassigning (instead of inplace=True) keeps the step free of in-place mutation.
obama_cat_sum = obama_cat_sum.rename(columns={'disbursement_description': 'count'})

In [12]:
# Lift the row limit so DataFrames are displayed without truncation.
pd.set_option('display.max_rows', None)

In [13]:
# Most frequent descriptions first.
obama_cat_sum.sort_values(by='count', ascending=False, na_position='first')

Unnamed: 0_level_0,count,disbursement_amount
disbursement_description,Unnamed: 1_level_1,Unnamed: 2_level_1
TRAVEL/LODGING,41215,76838300.0
PAYROLL,17920,30880120.0
TRAVEL AGENCY FEES,14734,420297.0
PER DIEM,7823,2785564.0
"STAGING, SOUND, LIGHTING",3406,26683760.0
TELEPHONE,3337,5261824.0
PAYROLL TAXES,2477,3480334.0
ON-LINE ADVERTISING,2334,34407410.0
CATERING/FACILITIES,2046,2486207.0
PRINTING,1862,7923893.0


In [6]:
# Number of rows currently in df_obama.
df_obama.shape[0]

111362

## <div class="alert alert-block alert-success"><center>Text Clustering - Obama Part</center></div>

import sys
!{sys.executable} -m pip install spacy
!{sys.executable} -m spacy download en

In [7]:
# Tokenizer setup for building the "lda_tokens" lists.
import spacy
from spacy.lang.en import English

# No spacy.load(...) call here: `tokenize` only uses the English() pipeline
# created below, so no trained model has to be installed or loaded.
parser = English()

def tokenize(text):
    """Split `text` with the module-level `parser` and return lowercase tokens.

    Tokens that are pure whitespace or that look like numbers (`like_num`)
    are left out; every other token contributes its lowercase form.
    """
    return [
        tok.lower_
        for tok in parser(text)
        if not (tok.orth_.isspace() or tok.like_num)
    ]

In [8]:
# Punctuation characters to remove from token lists
import string
en_punct = set(string.punctuation)

# English stop words; the NLTK 'stopwords' corpus is fetched first if needed
import nltk
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

# WordNet data and the lemmatizer used by get_lemma
import nltk  # repeated import; nltk is already imported above
nltk.download('wordnet')
from nltk.corpus import wordnet as wn  # NOTE(review): `wn` is not used in the visible cells
from nltk.stem.wordnet import WordNetLemmatizer

def get_lemma(word):
    """Return the lemma of `word` as produced by a fresh WordNetLemmatizer."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhanghuishan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhanghuishan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
def prepare_text_for_lda(text):
    """Turn one description string into a list of lemmatized tokens for LDA.

    The text is tokenized with `tokenize`; punctuation tokens, tokens shorter
    than two characters and English stop words are dropped, and each
    surviving token is replaced by its lemma so word variants collapse.
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        if tok not in en_punct      # no punctuation
        and len(tok) > 1            # a word needs at least two characters
        and tok not in en_stop      # no stop words
    ]

In [10]:
# Build text_data for the LDA analysis: a random sample of the Obama
# descriptions, each converted to a token list.
import random

SAMPLE_SEED = 42        # fixed seed so the same rows are sampled on every run
SAMPLE_THRESHOLD = .99  # a row is kept when its random draw exceeds this value

random.seed(SAMPLE_SEED)
# Draw first, tokenize second: only the sampled rows pay the tokenization cost.
text_data = [
    prepare_text_for_lda(line)
    for line in df_obama['disbursement_description']
    if random.random() > SAMPLE_THRESHOLD
]

pip install gensim

In [11]:
# Cluster topics: build the token dictionary, then convert every sampled
# token list with dictionary.doc2bow.
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = list(map(dictionary.doc2bow, text_data))

In [12]:
# Persist the corpus and the dictionary so later cells can reload them.
import pickle
# The context manager closes the file handle once the dump is complete.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [13]:
import gensim # package for topic modeling
NUM_TOPICS = 5 # trial number based on understanding of the data file & research aims
LDA_SEED = 42 # fixed random_state so repeated runs on the same corpus give the same topics
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=20,
    random_state=LDA_SEED,
)
ldamodel.save('model.gensim')
# Print each topic with its six highest-weighted words.
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.692*"payroll" + 0.094*"tax" + 0.044*"catering" + 0.044*"facility" + 0.029*"rent" + 0.029*"occupancy"')
(1, '0.284*"per" + 0.284*"diem" + 0.084*"advertising" + 0.081*"line" + 0.059*"advance" + 0.059*"work"')
(2, '0.230*"fee" + 0.222*"agency" + 0.222*"travel" + 0.073*"lighting" + 0.073*"staging" + 0.073*"sound"')
(3, '0.243*"telephone" + 0.153*"office" + 0.153*"supply" + 0.153*"printing" + 0.050*"delivery" + 0.029*"medium"')
(4, '0.472*"travel" + 0.472*"lodging" + 0.011*"rental" + 0.008*"event" + 0.008*"site" + 0.005*"postage"')


pip install pyldavis 

In [28]:
import pyLDAvis 

pip install jupyter_contrib_nbextensions && jupyter contrib nbextension install --user

In [14]:
import pyLDAvis.gensim

In [29]:
# Reload the saved dictionary, corpus and model, then render the pyLDAvis view.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# corpus.pkl is written earlier in this notebook; do not unpickle files from
# untrusted sources. The context manager closes the handle after loading.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

## <div class="alert alert-block alert-success"><center>Data Preparation - McCain Part</center></div>

In [33]:
# Load the McCain 2008 file and show how many rows it has.
df_mccain = pd.read_csv('mccain2008.csv')
df_mccain.shape[0]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


36905

In [34]:
# Keep only the rows whose report_year is 2008.
df_mccain = df_mccain.loc[df_mccain['report_year'] == 2008]

In [35]:
# Number of rows currently in df_mccain.
df_mccain.shape[0]

36905

In [36]:
# Keep only the columns used by the rest of the notebook.
mccain_columns = [
    'committee_id', 'report_year', 'disbursement_date', 'recipient_name',
    'recipient_state', 'recipient_city',
    'disbursement_purpose_category', 'disbursement_description', 'disbursement_amount',
]
df_mccain = df_mccain[mccain_columns]

In [None]:
# How many distinct purpose categories appear in the McCain data.
df_mccain['disbursement_purpose_category'].nunique()

In [None]:
# The distinct purpose category values themselves.
df_mccain['disbursement_purpose_category'].unique()

In [None]:
# How many distinct disbursement descriptions appear in the McCain data.
df_mccain['disbursement_description'].nunique()

In [37]:
# Remove refund rows (REFUNDS category or 'CONTRIBUTION REFUND' description)
# and rows without a description. Boolean masks replace the in-place,
# label-based drops, so no intermediate frames are built and the frame is
# never mutated in place (which can trigger SettingWithCopyWarning on a
# frame obtained by selection).
is_refund = (
    (df_mccain['disbursement_purpose_category'] == 'REFUNDS')
    | (df_mccain['disbursement_description'] == 'CONTRIBUTION REFUND')
)
has_description = df_mccain['disbursement_description'].notna()
df_mccain = df_mccain[~is_refund & has_description]

In [38]:
# Number of missing values per column.
df_mccain.isna().sum()

committee_id                      0
report_year                       0
disbursement_date                 0
recipient_name                    0
recipient_state                  10
recipient_city                    0
disbursement_purpose_category     0
disbursement_description          0
disbursement_amount               0
dtype: int64

## <div class="alert alert-block alert-success"><center>Text Clustering - McCain Part</center></div>

In [39]:
# Build text_data for the LDA analysis: a random sample of the McCain
# descriptions, each converted to a token list.
import random

SAMPLE_SEED = 42        # fixed seed so the same rows are sampled on every run
SAMPLE_THRESHOLD = .99  # a row is kept when its random draw exceeds this value

random.seed(SAMPLE_SEED)
# Draw first, tokenize second: only the sampled rows pay the tokenization cost.
text_data = [
    prepare_text_for_lda(line)
    for line in df_mccain['disbursement_description']
    if random.random() > SAMPLE_THRESHOLD
]

In [40]:
from gensim import corpora
import pickle

# Build the token dictionary and convert every sampled token list with doc2bow.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# NOTE: these are the same file names the Obama section writes, so running
# this cell replaces those saved artifacts.
# The context manager closes the file handle once the dump is complete.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [43]:
import gensim # package for topic modeling
NUM_TOPICS = 4 # trial number based on understanding of the data file & research aims
LDA_SEED = 42 # fixed random_state so repeated runs on the same corpus give the same topics
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=20,
    random_state=LDA_SEED,
)
# NOTE: 'model.gensim' is also the file name used by the Obama section.
ldamodel.save('model.gensim')
# Print each topic with its four highest-weighted words.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.370*"payroll" + 0.066*"tax" + 0.066*"printing" + 0.053*"credit"')
(1, '0.110*"catering" + 0.089*"equipment" + 0.085*"petty" + 0.085*"cash-"')
(2, '0.517*"travel" + 0.164*"transfer" + 0.030*"beverage" + 0.030*"food"')
(3, '0.148*"consulting" + 0.094*"gotv" + 0.058*"staging" + 0.042*"phone"')


In [44]:
import pyLDAvis
# Reload the saved dictionary, corpus and model, then render the pyLDAvis view.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# corpus.pkl is written earlier in this notebook; do not unpickle files from
# untrusted sources. The context manager closes the handle after loading.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)