# Contributions from out-of-state donors

In [1]:
import pandas as pd
import us

%load_ext jupyternotify

pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 500)

<IPython.core.display.Javascript object>

Import [National Institute on Money in Politics](https://www.followthemoney.org/) API key.

In [2]:
nimp_key = open("nimp_api_key.txt", "r").readline()

## Import and format the data

### Import contribution-level data for gubernatorial, attorney general, secretary of state and state legislative races in 2018

In [None]:
%%notify
contribs_18 = pd.read_csv("https://www.followthemoney.org/aaengine/aafetch.php?dt=1&y=2018&c-exi=1&c-r-oc=Z10,Z70&c-r-ot=G,S,H&gro=s,d-id&APIKey="+nimp_key+"&mode=csv",
                          encoding="ISO-8859-1", error_bad_lines=False)
contribs_18.to_csv("data/contribs_18.csv", index=False)

In [46]:
contribs_18 = pd.read_csv("data/contribs_18.csv", encoding="ISO-8859-1")
contribs_18.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2118572 entries, 0 to 2118571
Data columns (total 49 columns):
request                        object
Election_Jurisdiction:token    object
Election_Jurisdiction:id       object
Election_Jurisdiction          object
Original_Name:token            object
Original_Name:id               int64
Original_Name                  object
Contributor:token              object
Contributor:id                 object
Contributor                    object
Type_of_Contributor:token      object
Type_of_Contributor:id         object
Type_of_Contributor            object
Specific_Business:token        object
Specific_Business:id           object
Specific_Business              object
General_Industry:token         object
General_Industry:id            object
General_Industry               object
Broad_Sector:token             object
Broad_Sector:id                object
Broad_Sector                   object
Amount:token                   object
Amount:id     

Convert the contribution amount column to numeric (float) data type.

In [47]:
contribs_18["Amount"] = pd.to_numeric(contribs_18["Amount"], errors="coerce")
contribs_18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2118572 entries, 0 to 2118571
Data columns (total 49 columns):
request                        object
Election_Jurisdiction:token    object
Election_Jurisdiction:id       object
Election_Jurisdiction          object
Original_Name:token            object
Original_Name:id               int64
Original_Name                  object
Contributor:token              object
Contributor:id                 object
Contributor                    object
Type_of_Contributor:token      object
Type_of_Contributor:id         object
Type_of_Contributor            object
Specific_Business:token        object
Specific_Business:id           object
Specific_Business              object
General_Industry:token         object
General_Industry:id            object
General_Industry               object
Broad_Sector:token             object
Broad_Sector:id                object
Broad_Sector                   object
Amount:token                   object
Amount:id     

Convert the contribution date column to datetime data type.

In [None]:
contribs_18["Date"] = pd.to_datetime(contribs_18["Date"], errors="coerce")
contribs_18.info()

Filter out unitemized donations as it is impossible to determine where those contributions originated. Then filter the data to just the state, contribution amount, contribution date and in-vs.-out-of-state columns.

In [49]:
contribs_18 = contribs_18[contribs_18["Contributor"] != "UNITEMIZED DONATIONS"]
contribs_18 = contribs_18[["Election_Jurisdiction", "Amount", "Date", "In-State"]]
contribs_18.rename(columns={"Election_Jurisdiction": "state", "Amount": "amount", "Date": "date", "In-State": "in_out_state"}, inplace=True)
contribs_18.info()

Unnamed: 0,state,amount,date
0,IL,50000000.0,2016-12-20


Rename the categories in the in-vs.-out-of-state column.

In [None]:
# 0 = out-of-state, 1 = in-state, 2 = unknown
contribs_18["in_out_state"] = contribs_18["in_out_state"].replace({0: "out-of-state", 1: "in-state", 2: "unknown"})
contribs_18.head(1)

Extract the month and year from the contribution date column.

In [54]:
contribs_18["month"] = contribs_18["date"].dt.to_period("M")
contribs_18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2118572 entries, 0 to 2118571
Data columns (total 4 columns):
state         object
amount        float64
date          datetime64[ns]
month_year    object
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 64.7+ MB


Group the contributions by state and month.

In [70]:
grouped_by_month = contribs_18.groupby(["state", "month"])["amount"].sum().reset_index()
contribs_18.drop("month", axis=1) # The month column is no longer needed
grouped_by_month.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1464 entries, 0 to 1463
Data columns (total 3 columns):
state         1464 non-null object
month_year    1464 non-null object
amount        1464 non-null float64
dtypes: float64(1), object(2)
memory usage: 34.4+ KB


Because we eventually want to use each state's month column as the cut-off date for contributions, we need to add a day to the month and the year and then convert the column into datetime data type.

In [71]:
grouped_by_month["month"] = grouped_by_month["month"].astype(str) + "-28" # No month has fewer than 28 days
grouped_by_month["month"] = pd.to_datetime(grouped_by_month["month"], errors="coerce")
grouped_by_month.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1464 entries, 0 to 1463
Data columns (total 3 columns):
state         1464 non-null object
month_year    1464 non-null object
amount        1464 non-null float64
dtypes: float64(1), object(2)
memory usage: 34.4+ KB


In [74]:
grouped_by_month.head()

Unnamed: 0,state,month_year,amount
0,AK,2017-04-28,223.93
1,AK,2017-05-28,1177.04
2,AK,2017-06-28,1750.0
3,AK,2017-07-28,9684.21
4,AK,2017-08-28,63999.19


We know some of the contribution dates are wrong. We know this because some of the dates occur in the future and, unless we've got some time travelling campaign donors, these are data entry errors. To eliminate this noise, we will filter out months after August 2018 or months with fewer than $2,500 in contributions. This latter filter will catch contributions that were miscoded as occuring during or before August 2018.

In [75]:
grouped_by_month = grouped_by_month[(grouped_by_month["month"] <= "2018-08-28") & (grouped_by_month["amount"] >= 2500)]
grouped_by_month.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1173 entries, 3 to 1463
Data columns (total 3 columns):
state         1173 non-null object
month_year    1173 non-null datetime64[ns]
amount        1173 non-null float64
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 36.7+ KB


Return the most recent month with at least $2,500 in contributions for each state.

In [76]:
latest_month = grouped_by_month.groupby("state")["month"].max().reset_index()
latest_month.rename(columns={"month": "latest_month"}, inplace=True)
latest_month

Unnamed: 0,state,month_year
0,AK,2018-07-28
1,AL,2018-07-28
2,AR,2018-03-28
3,AZ,2017-12-28
4,CA,2018-07-28
5,CO,2018-06-28
6,CT,2018-03-28
7,FL,2018-08-28
8,GA,2018-07-28
9,HI,2017-12-28


### Import contribution-level data for gubernatorial, attorney general, secretary of state and state legislative races in 2014

In [None]:
%%notify
contribs_14 = pd.read_csv("https://www.followthemoney.org/aaengine/aafetch.php?dt=1&y=2014&c-exi=1&c-r-oc=Z10,Z70&c-r-ot=G,S,H&gro=s,d-id&APIKey="+nimp_key+"&mode=csv",
                          encoding="ISO-8859-1", error_bad_lines=False)
contribs_14.to_csv("data/contribs_14.csv", index=False)

In [46]:
contribs_14 = pd.read_csv("data/contribs_14.csv", encoding="ISO-8859-1")
contribs_14.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2118572 entries, 0 to 2118571
Data columns (total 49 columns):
request                        object
Election_Jurisdiction:token    object
Election_Jurisdiction:id       object
Election_Jurisdiction          object
Original_Name:token            object
Original_Name:id               int64
Original_Name                  object
Contributor:token              object
Contributor:id                 object
Contributor                    object
Type_of_Contributor:token      object
Type_of_Contributor:id         object
Type_of_Contributor            object
Specific_Business:token        object
Specific_Business:id           object
Specific_Business              object
General_Industry:token         object
General_Industry:id            object
General_Industry               object
Broad_Sector:token             object
Broad_Sector:id                object
Broad_Sector                   object
Amount:token                   object
Amount:id     

Convert contribution amount column to numeric (float) data type.

In [47]:
contribs_14["Amount"] = pd.to_numeric(contribs_14["Amount"], errors="coerce")
contribs_14.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2118572 entries, 0 to 2118571
Data columns (total 49 columns):
request                        object
Election_Jurisdiction:token    object
Election_Jurisdiction:id       object
Election_Jurisdiction          object
Original_Name:token            object
Original_Name:id               int64
Original_Name                  object
Contributor:token              object
Contributor:id                 object
Contributor                    object
Type_of_Contributor:token      object
Type_of_Contributor:id         object
Type_of_Contributor            object
Specific_Business:token        object
Specific_Business:id           object
Specific_Business              object
General_Industry:token         object
General_Industry:id            object
General_Industry               object
Broad_Sector:token             object
Broad_Sector:id                object
Broad_Sector                   object
Amount:token                   object
Amount:id     

Convert the contribution date column to datetime data type.

In [None]:
contribs_14["Date"] = pd.to_datetime(contribs_14["Date"], errors="coerce")
contribs_14.info()

Filter out unitemized donations as it is impossible to determine where those contributions originated. Then filter the data to just the state, contribution amount, contribution date and in-vs.-out-of-state columns.

In [None]:
contribs_14 = contribs_14[contribs_14["Contributor"] != "UNITEMIZED DONATIONS"]
contribs_14 = contribs_14[["Election_Jurisdiction", "Amount", "Date", "In-State"]]
contribs_14.rename(columns={"Election_Jurisdiction": "state", "Amount": "amount", "Date": "date", "In-State": "in_out_state"}, inplace=True)
contribs_14.head(1)

Rename the categories in the in-vs.-out-of-state column.

In [None]:
# 0 = out-of-state, 1 = in-state, 2 = unknown
contribs_14["in_out_state"] = contribs_14["in_out_state"].replace({0: "out-of-state", 1: "in-state", 2: "unknown"})
contribs_14.head(1)

### Import contribution-level data for gubernatorial, attorney general, secretary of state and state legislative races in 2010

In [None]:
%%notify
contribs_10 = pd.read_csv("https://www.followthemoney.org/aaengine/aafetch.php?dt=1&y=2010&c-exi=1&c-r-oc=Z10,Z70&c-r-ot=G,S,H&gro=s,d-id&APIKey="+nimp_key+"&mode=csv",
                          encoding="ISO-8859-1", error_bad_lines=False)
contribs_10.to_csv("data/contribs_10.csv", index=False)

In [46]:
contribs_10 = pd.read_csv("data/contribs_10.csv", encoding="ISO-8859-1")
contribs_10.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2118572 entries, 0 to 2118571
Data columns (total 49 columns):
request                        object
Election_Jurisdiction:token    object
Election_Jurisdiction:id       object
Election_Jurisdiction          object
Original_Name:token            object
Original_Name:id               int64
Original_Name                  object
Contributor:token              object
Contributor:id                 object
Contributor                    object
Type_of_Contributor:token      object
Type_of_Contributor:id         object
Type_of_Contributor            object
Specific_Business:token        object
Specific_Business:id           object
Specific_Business              object
General_Industry:token         object
General_Industry:id            object
General_Industry               object
Broad_Sector:token             object
Broad_Sector:id                object
Broad_Sector                   object
Amount:token                   object
Amount:id     

Convert contribution amount column to numeric (float) data type.

In [47]:
contribs_10["Amount"] = pd.to_numeric(contribs_10["Amount"], errors="coerce")
contribs_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2118572 entries, 0 to 2118571
Data columns (total 49 columns):
request                        object
Election_Jurisdiction:token    object
Election_Jurisdiction:id       object
Election_Jurisdiction          object
Original_Name:token            object
Original_Name:id               int64
Original_Name                  object
Contributor:token              object
Contributor:id                 object
Contributor                    object
Type_of_Contributor:token      object
Type_of_Contributor:id         object
Type_of_Contributor            object
Specific_Business:token        object
Specific_Business:id           object
Specific_Business              object
General_Industry:token         object
General_Industry:id            object
General_Industry               object
Broad_Sector:token             object
Broad_Sector:id                object
Broad_Sector                   object
Amount:token                   object
Amount:id     

Convert the contribution date column to datetime data type.

In [None]:
contribs_10["Date"] = pd.to_datetime(contribs_10["Date"], errors="coerce")
contribs_10.info()

Filter out unitemized donations as it is impossible to determine where those contributions originated. Then filter the data to just the state, contribution amount, contribution date and in-vs.-out-of-state columns.

In [None]:
contribs_10 = contribs_10[contribs_10["Contributor"] != "UNITEMIZED DONATIONS"]
contribs_10 = contribs_10[["Election_Jurisdiction", "Amount", "Date", "In-State"]]
contribs_10.rename(columns={"Election_Jurisdiction": "state", "Amount": "amount", "Date": "date", "In-State": "in_out_state"}, inplace=True)
contribs_10.head(1)

Rename the categories in the in-vs.-out-of-state column.

In [None]:
# 0 = out-of-state, 1 = in-state, 2 = unknown
contribs_10["in_out_state"] = contribs_10["in_out_state"].replace({0: "out-of-state", 1: "in-state", 2: "unknown"})
contribs_10.head(1)

### Import candidate-level data for gubernatorial, attorney general, secretary of state and state legislative races in 2018

In [None]:
%%notify
candidates_18 = pd.read_csv("https://www.followthemoney.org/aaengine/aafetch.php?dt=1&y=2018&c-exi=1&c-r-oc=Z10,Z70&c-r-ot=G,S,H&gro=c-t-id,d-ins&APIKey="+nimp_key+"&mode=csv",
                          encoding="ISO-8859-1", error_bad_lines=False)
candidates_18.to_csv("data/candidates_18.csv", index=False)

In [None]:
candidates_18 = pd.read_csv("data/candidates_18.csv", encoding="ISO-8859-1")
candidates_18.info()

Filter out unitemized donations as it is impossible to determine where those contributions originated. Then filter the data to just the candidate, candidate ID, state, district, in-vs.-out-of-state and total contributions columns.

In [None]:
candidates_18 = candidates_18[candidates_18["Contributor"] != "UNITEMIZED DONATIONS"]
candidates_18 = candidates_18[["Candidate", "Candidate:id", "Election_Jurisdiction", "Office_Sought", "In-State", "Total_$"]]
candidates_18.rename(columns={"Candidate": "candidate", "Candidate:id": "candidate_id", "Election_Jurisdiction": "state", "Office_Sought": "district", "In-State": "in_out_state", "Total_$": "contributions"}, inplace=True)
candidates_18.head(1)

Rename the categories in the in-vs.-out-of-state column.

In [None]:
# 0 = out-of-state, 1 = in-state, 2 = unknown
candidates_18["in_out_state"] = candidates_18["in_out_state"].replace({0: "out-of-state", 1: "in-state", 2: "unknown"})
candidates_18.head(1)

## Join and filter the data

### Join the table of the 2018 cycle's latest contribution months with the 2018, 2014 and 2010 contribution-level data

In [None]:
contribs_18 = contribs_18.merge(latest_month, on="state")
contribs_18.info()

In [None]:
contribs_18.head(1)

In [None]:
contribs_14 = contribs_14.merge(latest_month, on="state")
contribs_14.info()

In [None]:
contribs_10.head(1)

In [None]:
contribs_10 = contribs_10.merge(latest_month, on="state")
contribs_10.info()

In [None]:
contribs_10.head(1)

### Filter the data to eliminate contributions after the 2018 cycle's latest contribution month in each state

In [None]:
contribs_18 = contribs_18[contribs_18["contribution_date"] <= contribs_18["latest_month"]]
contribs_18.info()

In [None]:
contribs_18.head(1)

In [None]:
contribs_14 = contribs_14[contribs_14["contribution_date"] <= contribs_14["latest_month"]]
contribs_14.info()

In [None]:
contribs_14.head(1)

In [None]:
contribs_10 = contribs_10[contribs_10["contribution_date"] <= contribs_10["latest_month"]]
contribs_10.info()

In [None]:
contribs_10.head(1)

## Analyze the data

### Calculate out-of-state contributions by state in 2018

Calculate contributions by in-vs.-out-of-state status and group by state.

In [19]:
contribs_by_state_18 = contribs_18.groupby(["state", "in_out_state"])["contributions_18"].sum().reset_index()
contribs_by_state_18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 3 columns):
state            116 non-null object
in_out_state     116 non-null object
contributions    116 non-null float64
dtypes: float64(1), object(2)
memory usage: 2.8+ KB


In [20]:
contribs_by_state_18.head()

Unnamed: 0,state,in_out_state,contributions
0,AK,in-state,2423533.25
1,AK,out-of-state,110922.49
2,AK,unknown,9285.0
3,AL,in-state,5185371.17
4,AL,out-of-state,532085.12


Pivot dataframe to aggregate each state's data in a single row.

In [21]:
contribs_by_state_18 = pd.pivot_table(contribs_by_state_18, index=["state"], columns=["in_out_state"]).reset_index()
contribs_by_state_18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 4 columns):
(state, )                        41 non-null object
(contributions, in-state)        41 non-null float64
(contributions, out-of-state)    41 non-null float64
(contributions, unknown)         34 non-null float64
dtypes: float64(3), object(1)
memory usage: 1.4+ KB


It appears that some states have no contributions categorized as "unknown." Let's set those values equal to zero to be sure any calculations we run on them are correct.

In [22]:
contribs_by_state_18.fillna(0, inplace=True)
contribs_by_state_18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 4 columns):
(state, )                        41 non-null object
(contributions, in-state)        41 non-null float64
(contributions, out-of-state)    41 non-null float64
(contributions, unknown)         41 non-null float64
dtypes: float64(3), object(1)
memory usage: 1.4+ KB


Flatten the resulting dataframe's multi-index columns.

In [23]:
contribs_by_state_18.columns = ["_".join(column).replace("-","_").strip("_") for column in contribs_by_state_18.columns.values]
contribs_by_state_18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 4 columns):
state                         41 non-null object
contributions_in_state        41 non-null float64
contributions_out_of_state    41 non-null float64
contributions_unknown         41 non-null float64
dtypes: float64(3), object(1)
memory usage: 1.4+ KB


Calculate the proportion of out-of-state contributions.

In [26]:
contribs_by_state_18["pct_18_out_of_state"] = contribs_by_state_18["contributions_18_out_of_state"] / (contribs_by_state_18["contributions_18_in_state"] + contribs_by_state_18["contributions_18_out_of_state"] + contribs_by_state_18["contributions_18_unknown"])
contribs_by_state_18.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41 entries, 0 to 40
Data columns (total 8 columns):
state                            41 non-null object
fips                             41 non-null object
contributions_in_state           41 non-null float64
contributions_out_of_state       41 non-null float64
contributions_unknown            41 non-null float64
population                       41 non-null int64
pct_out_of_state                 41 non-null float64
out_of_state_per_thousand_pop    41 non-null float64
dtypes: float64(5), int64(1), object(2)
memory usage: 2.9+ KB


### Calculate out-of-state contributions by state in 2014

Calculate contributions by in-vs.-out-of-state status and group by state.

In [19]:
contribs_by_state_14 = contribs_14.groupby(["state", "in_out_state"])["contributions_14"].sum().reset_index()
contribs_by_state_14.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 3 columns):
state            116 non-null object
in_out_state     116 non-null object
contributions    116 non-null float64
dtypes: float64(1), object(2)
memory usage: 2.8+ KB


In [20]:
contribs_by_state_14.head()

Unnamed: 0,state,in_out_state,contributions
0,AK,in-state,2423533.25
1,AK,out-of-state,110922.49
2,AK,unknown,9285.0
3,AL,in-state,5185371.17
4,AL,out-of-state,532085.12


Pivot dataframe to aggregate each state's data in a single row.

In [21]:
contribs_by_state_14 = pd.pivot_table(contribs_by_state_14, index=["state"], columns=["in_out_state"]).reset_index()
contribs_by_state_14.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 4 columns):
(state, )                        41 non-null object
(contributions, in-state)        41 non-null float64
(contributions, out-of-state)    41 non-null float64
(contributions, unknown)         34 non-null float64
dtypes: float64(3), object(1)
memory usage: 1.4+ KB


It appears that some states have no contributions categorized as "unknown." Let's set those values equal to zero to be sure any calculations we run on them are correct.

In [22]:
contribs_by_state_14.fillna(0, inplace=True)
contribs_by_state_14.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 4 columns):
(state, )                        41 non-null object
(contributions, in-state)        41 non-null float64
(contributions, out-of-state)    41 non-null float64
(contributions, unknown)         41 non-null float64
dtypes: float64(3), object(1)
memory usage: 1.4+ KB


Flatten the resulting dataframe's multi-index columns.

In [23]:
contribs_by_state_14.columns = ["_".join(column).replace("-","_").strip("_") for column in contribs_by_state_14.columns.values]
contribs_by_state_14.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 4 columns):
state                         41 non-null object
contributions_in_state        41 non-null float64
contributions_out_of_state    41 non-null float64
contributions_unknown         41 non-null float64
dtypes: float64(3), object(1)
memory usage: 1.4+ KB


Calculate the proportion of out-of-state contributions.

In [26]:
contribs_by_state_14["pct_14_out_of_state"] = contribs_by_state_14["contributions_14_out_of_state"] / (contribs_by_state_14["contributions_14_in_state"] + contribs_by_state_14["contributions_14_out_of_state"] + contribs_by_state_14["contributions_14_unknown"])
contribs_by_state_14.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41 entries, 0 to 40
Data columns (total 8 columns):
state                            41 non-null object
fips                             41 non-null object
contributions_in_state           41 non-null float64
contributions_out_of_state       41 non-null float64
contributions_unknown            41 non-null float64
population                       41 non-null int64
pct_out_of_state                 41 non-null float64
out_of_state_per_thousand_pop    41 non-null float64
dtypes: float64(5), int64(1), object(2)
memory usage: 2.9+ KB


### Calculate out-of-state contributions by state in 2010

Calculate contributions by in-vs.-out-of-state status and group by state.

In [19]:
contribs_by_state_10 = contribs_10.groupby(["state", "in_out_state"])["contributions_10"].sum().reset_index()
contribs_by_state_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 3 columns):
state            116 non-null object
in_out_state     116 non-null object
contributions    116 non-null float64
dtypes: float64(1), object(2)
memory usage: 2.8+ KB


In [20]:
contribs_by_state_10.head()

Unnamed: 0,state,in_out_state,contributions
0,AK,in-state,2423533.25
1,AK,out-of-state,110922.49
2,AK,unknown,9285.0
3,AL,in-state,5185371.17
4,AL,out-of-state,532085.12


Pivot dataframe to aggregate each state's data in a single row.

In [21]:
contribs_by_state_10 = pd.pivot_table(contribs_by_state_10, index=["state"], columns=["in_out_state"]).reset_index()
contribs_by_state_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 4 columns):
(state, )                        41 non-null object
(contributions, in-state)        41 non-null float64
(contributions, out-of-state)    41 non-null float64
(contributions, unknown)         34 non-null float64
dtypes: float64(3), object(1)
memory usage: 1.4+ KB


It appears that some states have no contributions categorized as "unknown." Let's set those values equal to zero to be sure any calculations we run on them are correct.

In [22]:
contribs_by_state_10.fillna(0, inplace=True)
contribs_by_state_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 4 columns):
(state, )                        41 non-null object
(contributions, in-state)        41 non-null float64
(contributions, out-of-state)    41 non-null float64
(contributions, unknown)         41 non-null float64
dtypes: float64(3), object(1)
memory usage: 1.4+ KB


Flatten the resulting dataframe's multi-index columns.

In [23]:
contribs_by_state_10.columns = ["_".join(column).replace("-","_").strip("_") for column in contribs_by_state_10.columns.values]
contribs_by_state_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 4 columns):
state                         41 non-null object
contributions_in_state        41 non-null float64
contributions_out_of_state    41 non-null float64
contributions_unknown         41 non-null float64
dtypes: float64(3), object(1)
memory usage: 1.4+ KB


Calculate the proportion of out-of-state contributions.

In [26]:
contribs_by_state_10["pct_10_out_of_state_10"] = contribs_by_state_10["contributions_10_out_of_state"] / (contribs_by_state_10["contributions_10_in_state"] + contribs_by_state_10["contributions_10_out_of_state"] + contribs_by_state_10["contributions_10_unknown"])
contribs_by_state_10.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41 entries, 0 to 40
Data columns (total 8 columns):
state                            41 non-null object
fips                             41 non-null object
contributions_in_state           41 non-null float64
contributions_out_of_state       41 non-null float64
contributions_unknown            41 non-null float64
population                       41 non-null int64
pct_out_of_state                 41 non-null float64
out_of_state_per_thousand_pop    41 non-null float64
dtypes: float64(5), int64(1), object(2)
memory usage: 2.9+ KB


### Join the 2018, 2014 and 2010 contributions by state data and calculate out-of-state figures

In [None]:
list_of_contribs_by_state = [contribs_by_state_18, contribs_by_state_14, contribs_by_state_10]
contribs_by_state = reduce(lambda left, right: pd.merge(left, right, on="state"), list_of_contribs_by_state)
contribs_by_state

Calculate the change in out-of-state spending, in absolute and proportional terms.

In [None]:
contribs_by_state["change_from_14"] = contribs_by_state["contributions_18_out_of_state"] - contribs_by_state["contributions_14_out_of_state"]
contribs_by_state["change_from_10"] = contribs_by_state["contributions_18_out_of_state"] - contribs_by_state["contributions_10_out_of_state"]
contribs_by_state["pct_change_from_14"] = (contribs_by_state["contributions_18_out_of_state"] - contribs_by_state["contributions_14_out_of_state"]) / contribs_by_state["contributions_14_out_of_state"]
contribs_by_state["pct_change_from_10"] = (contribs_by_state["contributions_18_out_of_state"] - contribs_by_state["contributions_14_out_of_state"]) / contribs_by_state["contributions_14_out_of_state"]

### Calculate out-of-state contributions by candidate in 2018

In [None]:
contribs_by_candidate_18 = candidates_18.groupby(["candidate", "in_out_state"])["contributions"].sum().reset_index()
contribs_by_candidate_18.info()

In [30]:
contribs_by_candidate_18.head()

Unnamed: 0,in_out_state,contributions,state_district
0,in-state,135899.42,AK HOUSE DISTRICT 001
1,out-of-state,1350.0,AK HOUSE DISTRICT 001
2,in-state,20618.19,AK HOUSE DISTRICT 002
3,out-of-state,1801.0,AK HOUSE DISTRICT 002
4,in-state,1100.0,AK HOUSE DISTRICT 003
5,in-state,71984.34,AK HOUSE DISTRICT 004
6,out-of-state,3969.0,AK HOUSE DISTRICT 004
7,unknown,55.0,AK HOUSE DISTRICT 004
8,in-state,16708.9,AK HOUSE DISTRICT 005
9,out-of-state,20.0,AK HOUSE DISTRICT 005


Pivot dataframe to aggregate each district's data in a single row.

In [31]:
contribs_by_candidate_18 = pd.pivot_table(contribs_by_candidate_18, index=["candidate"], columns=["in_out_state"]).reset_index()
contribs_by_candidate_18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3746 entries, 0 to 3745
Data columns (total 4 columns):
(state_district, )               3746 non-null object
(contributions, in-state)        3726 non-null float64
(contributions, out-of-state)    3339 non-null float64
(contributions, unknown)         1156 non-null float64
dtypes: float64(3), object(1)
memory usage: 117.1+ KB


Again, some records have no contributions for certain categories. Let's set those values equal to zero to be sure any calculations we run on them are correct.

In [32]:
contribs_by_candidate_18.fillna(0, inplace=True)
contribs_by_candidate_18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3746 entries, 0 to 3745
Data columns (total 4 columns):
(state_district, )               3746 non-null object
(contributions, in-state)        3746 non-null float64
(contributions, out-of-state)    3746 non-null float64
(contributions, unknown)         3746 non-null float64
dtypes: float64(3), object(1)
memory usage: 117.1+ KB


Flatten the resulting dataframe's multi-index columns.

In [33]:
contribs_by_candidate_18.columns = ["_".join(column).replace("-","_").strip("_") for column in contribs_by_candidate_18.columns.values]
contribs_by_candidate_18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3746 entries, 0 to 3745
Data columns (total 4 columns):
state_district                3746 non-null object
contributions_in_state        3746 non-null float64
contributions_out_of_state    3746 non-null float64
contributions_unknown         3746 non-null float64
dtypes: float64(3), object(1)
memory usage: 117.1+ KB


Calculate the proportion of in-vs.-out-of-state contributions.

In [34]:
contribs_by_candidate_18["pct_out_of_state"] = contribs_by_candidate_18["contributions_out_of_state"] / (contribs_by_candidate_18["contributions_in_state"] + contribs_by_candidate_18["contributions_out_of_state"] + contribs_by_candidate_18["contributions_unknown"])
contribs_by_candidate_18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3746 entries, 0 to 3745
Data columns (total 5 columns):
state_district                3746 non-null object
contributions_in_state        3746 non-null float64
contributions_out_of_state    3746 non-null float64
contributions_unknown         3746 non-null float64
pct_out_of_state              3746 non-null float64
dtypes: float64(4), object(1)
memory usage: 146.4+ KB
