# Analyze GiveSendGo data

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import json
import numpy as np
import altair as alt
import os
import glob

In [3]:
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

## Read data

#### Find, read and concatenate the most recent json files

In [4]:
path_to_json = "increments/new"

json_pattern = os.path.join(path_to_json, "*.json")
file_list = glob.glob(json_pattern)

In [5]:
dfs = []
for file in file_list:
    with open(file) as f:
        json_data = json.loads(f.read())
        dfs.append(json_data)

In [6]:
all_data = []

for d in dfs:
    df = pd.DataFrame(d["returnData"]["donations"])
    all_data.append(df)

new_df = pd.concat(all_data)

In [7]:
len(new_df)

38070

#### Get the previously scraped data

In [8]:
with open("increments/data_increment_536789.json") as f:
    data_backup = json.load(f)

In [9]:
all_data_old = []

for d in data_backup:
    df = pd.DataFrame(d["returnData"]["donations"])
    all_data_old.append(df)

old_df = pd.concat(all_data_old)

#### Combine the two dataframes

In [10]:
df = pd.concat([old_df, new_df]).reset_index(drop=True)

#### Lose columns we don't need and clean up the frame

In [11]:
df.drop(
    [
        "donation_conversion_rate",
        "campaign_id",
        "lovecount",
        "likes",
        "donation_anonymous",
    ],
    axis=1,
    inplace=True,
)

In [12]:
df = df.drop_duplicates().copy()
df.donation_amount = df.donation_amount.astype(float)

---

## Toplines

#### Donations total

In [13]:
df.donation_amount.sum()

8214007.5

#### Average donation

In [14]:
df.donation_amount.mean().round(2)

90.67

#### How many donations?

In [15]:
df.donation_name.count()

90593

#### PCT of the goal? 

In [16]:
((df.donation_amount.sum() / 16000000) * 100).round(2)

51.34

---

## What's in the donation memo text?

#### Donations with specific amounts

In [17]:
df["mentions_14"] = df["donation_amount"] == 14

#### Donations comments with coded language or specific keywords?

In [18]:
df["mentions_mandate"] = df["donation_comment"].str.lower().str.contains("mandate")
df["mentions_holdtheline"] = (
    df["donation_comment"].str.lower().str.contains("hold the line")
)
df["mentions_brandon"] = df["donation_comment"].str.lower().str.contains("brandon")
df["mentions_wwg1wga"] = df["donation_comment"].str.lower().str.contains("wwg1wga")
df["mentions_wwg1wga"] = df["donation_comment"].str.lower().str.contains("wwg1wga")
df["mentions_trump"] = df["donation_comment"].str.lower().str.contains("trump")
df["mentions_honk"] = (
    df["donation_comment"].str.lower().str.contains("honkhonk|honk honk")
)
df["mentions_freedom"] = df["donation_comment"].str.lower().str.contains("freedom")
df["mentions_tyranny"] = df["donation_comment"].str.lower().str.contains("tyranny")
df["mentions_biden"] = df["donation_comment"].str.lower().str.contains("biden")
df["mentions_trudeau"] = df["donation_comment"].str.lower().str.contains("trudeau")

#### List of mention categories

In [19]:
boolean = [
    "mentions_mandate",
    "mentions_holdtheline",
    "mentions_brandon",
    "mentions_14",
    "mentions_wwg1wga",
    "mentions_trump",
    "mentions_trudeau",
    "mentions_biden",
    "mentions_honk",
    "mentions_freedom",
    "mentions_tyranny",
]

#### PCT mentioning a keyword

In [20]:
for b in boolean:
    print((df[b].value_counts(normalize=True) * 100).round(2))

False    98.02
True      1.98
Name: mentions_mandate, dtype: float64
False    97.63
True      2.37
Name: mentions_holdtheline, dtype: float64
False    99.68
True      0.32
Name: mentions_brandon, dtype: float64
False    99.95
True      0.05
Name: mentions_14, dtype: float64
False    99.91
True      0.09
Name: mentions_wwg1wga, dtype: float64
False    99.96
True      0.04
Name: mentions_trump, dtype: float64
False    98.44
True      1.56
Name: mentions_trudeau, dtype: float64
False    99.93
True      0.07
Name: mentions_biden, dtype: float64
False    98.79
True      1.21
Name: mentions_honk, dtype: float64
False    80.54
True     19.46
Name: mentions_freedom, dtype: float64
False    97.76
True      2.24
Name: mentions_tyranny, dtype: float64


---

## Dates

#### The dates are vague so categorize them

In [21]:
df.donation_date = (
    df.donation_date.str.replace("6 days ago", "2022-02-02", regex=False)
    .str.replace("5 days ago", "2022-02-03", regex=False)
    .str.replace("4 days ago", "2022-02-04", regex=False)
    .str.replace("3 days ago", "2022-02-05", regex=False)
    .str.replace("2 days ago", "2022-02-06", regex=False)
    .str.replace("1 days ago", "2022-02-07", regex=False)
)

#### The data have also been scraped incrementally, so we have to lump the more recent stuff in a larger bucket

In [22]:
df["donation_date"] = np.where(
    ~df["donation_date"].str.contains("2022", na=False),
    "2022-08-08 or later",
    df["donation_date"],
)

#### Summarize donations by our categories

In [23]:
dates = (
    df.groupby(["donation_date"])
    .agg({"donation_id": "count", "donation_amount": sum})
    .reset_index()
).rename(columns={"donation_id": "count", "donation_amount": "sum"})

In [24]:
dates

Unnamed: 0,donation_date,count,sum
0,2022-02-02,61,3348.0
1,2022-02-03,112,6651.5
2,2022-02-04,115,5424.0
3,2022-02-05,2965,311979.0
4,2022-02-06,17561,1923301.0
5,2022-02-07,32499,2793103.0
6,2022-08-08 or later,37280,3170201.0


#### Export the dates

In [25]:
dates.to_csv("data/processed/dates.csv", index=False)

---

## Distribution

In [26]:
len(df[df["donation_amount"] >= 5000])

40

In [27]:
df[df["donation_amount"] >= 5000].sort_values("donation_amount").head()

Unnamed: 0,donation_id,donation_amount,donation_comment,donation_name,donation_date,mentions_14,mentions_mandate,mentions_holdtheline,mentions_brandon,mentions_wwg1wga,mentions_trump,mentions_honk,mentions_freedom,mentions_tyranny,mentions_biden,mentions_trudeau
6602,470849,5000.0,"Thank you truckers, farmers!!",,2022-02-05,False,False,False,False,False,False,False,False,False,False,False
104800,541321,5000.0,Please hold the line. You are fighting for all of our freedoms.,Kelly Gave,2022-08-08 or later,False,False,True,False,False,False,False,True,False,False,False
103898,561191,5000.0,,Gian DelZotto,2022-08-08 or later,False,False,False,False,False,False,False,False,False,False,False
102610,572270,5000.0,Thank you to all you fighters for giving your time and energy to hold goverments accountable. Keep up the good fight.,swamp donkey oilfield,2022-08-08 or later,False,False,False,False,False,False,False,False,False,False,False
96921,542948,5000.0,"The government won't meet with you because this isn't about science. Criminals don't like to confess crimes, and corrupt government won't ever admit its true intent. Thank you for insisting on your freedom, the entire world stands behind you.",Breez,2022-08-08 or later,False,False,False,False,False,False,False,True,False,False,False


---

In [28]:
df["donation_name"] = df["donation_name"].str.strip().str.lower()

In [29]:
df[df["donation_name"].str.contains("processed but not")].iloc[0]

donation_id                                  488558
donation_amount                            215000.0
donation_comment                                   
donation_name           processed but not recorded.
donation_date                            2022-02-06
mentions_14                                   False
mentions_mandate                              False
mentions_holdtheline                          False
mentions_brandon                              False
mentions_wwg1wga                              False
mentions_trump                                False
mentions_honk                                 False
mentions_freedom                              False
mentions_tyranny                              False
mentions_biden                                False
mentions_trudeau                              False
Name: 24283, dtype: object

### Group to count and sum donations by donor name

In [30]:
df.groupby(["donation_name"]).agg(
    {"donation_amount": [np.size, np.sum]}
).reset_index().sort_values(("donation_amount", "sum"), ascending=False).head(20)

Unnamed: 0_level_0,donation_name,donation_amount,donation_amount
Unnamed: 0_level_1,Unnamed: 1_level_1,size,sum
0,,45832,3784046.5
26264,processed but not recorded.,1,215000.0
32401,thomas m. siebel,1,90000.0
9397,easy kleen pressure systems ltd,1,75000.0
1741,anonymous,535,45606.0
10927,freedom,353,28622.0
13403,holden and carey rhodes,1,25000.0
34990,www the range langley com,1,18000.0
1083,american cryptocurrency compatriot,1,17760.0
11111,freedom lover,89,15576.0


In [31]:
### How does this compare to other campaigns

In [32]:
df.groupby(["donation_name"]).agg(
    {"donation_amount": [np.size, np.sum]}
).reset_index().sort_values(("donation_amount", "size"), ascending=False).head(20)

Unnamed: 0_level_0,donation_name,donation_amount,donation_amount
Unnamed: 0_level_1,Unnamed: 1_level_1,size,sum
0,,45832,3784046.5
1741,anonymous,535,45606.0
10927,freedom,353,28622.0
17465,justin trudeau,119,8421.0
10967,freedom convoy,103,11658.0
11111,freedom lover,89,15576.0
30429,steve,78,4773.0
22954,mike,69,4799.0
7461,david,64,4024.0
16297,john,62,6209.0


In [33]:
# 100 biggest donations
# 100 first donations

In [34]:
df.sort_values("donation_amount", ascending=False).head(100).to_csv(
    "data/processed/100_largest_donations.csv", index=False
)

In [35]:
df.sort_values("donation_date", ascending=True).head(100).to_csv(
    "data/processed/100_earliest_donations.csv", index=False
)

In [36]:
df.sort_values("donation_date", ascending=True).to_csv(
    "data/processed/all_donations.csv", index=False
)

In [37]:
weird_entries = [
    481566,
    474249,
    492432,
    471986,
    507120,
    499386,
    498521,
    484448,
    493519,
    476071,
    521675,
    516745,
]

In [38]:
df[df["donation_id"].isin(weird_entries)]

Unnamed: 0,donation_id,donation_amount,donation_comment,donation_name,donation_date,mentions_14,mentions_mandate,mentions_holdtheline,mentions_brandon,mentions_wwg1wga,mentions_trump,mentions_honk,mentions_freedom,mentions_tyranny,mentions_biden,mentions_trudeau
7725,471986,100.0,Stand strong and hold the line truckers!,💗fromwi,2022-02-05,False,False,True,False,False,False,False,False,False,False,False
10002,474249,50.0,God Speed all you brave and wonderful souls,🚛 californians supporting freedom truckers 🚛,2022-02-06,False,False,False,False,False,False,False,False,False,False,False
11820,476071,50.0,Thanks!!,🇨🇦🇺🇸❤️,2022-02-06,False,False,False,False,False,False,False,False,False,False,False
17305,481566,25.0,May the Lord bless and keep you all safe.,🚛🚛🚛🚛🚛honkhonk,2022-02-06,False,False,False,False,False,False,False,False,False,False,False
20173,484448,49.0,Thank you for waking up the world!!! Keep fighting the good fight and then bring this down to the USA!,🇺🇲❤🇨🇦,2022-02-06,False,False,False,False,False,False,False,False,False,False,False
28169,492432,10.0,,🚛 🇺🇸 🇨🇦 🙏🏽,2022-02-06,False,False,False,False,False,False,False,False,False,False,False
29262,493519,10.0,Freedom will prevail. Don't let up,🇺🇸 🤝 🇨🇦,2022-02-07,False,False,False,False,False,False,False,True,False,False,False
34260,498521,10.0,"Thank you so much,please do not give up the fight.",🇺🇲donald trump🇺🇲,2022-02-07,False,False,False,False,False,False,False,False,False,False,False
35115,499386,50.0,Https://patriots.win\r\nHereistheevidence.com\r\nWarroom.org,🇺🇲trump won biden cheated🇨🇦,2022-02-07,False,False,False,False,False,False,False,False,False,False,False
42861,507120,20.0,GOD BLESS ALL OF YOU WHO STAND AGAINST TYRANNY. \n,🇺🇲🇨🇦,2022-02-07,False,False,False,False,False,False,False,False,True,False,False
