## READ in Files from GitHub

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import io

In [2]:
# Gift data: download the CSV from GitHub and load it into a DataFrame.

gifturl = "https://raw.githubusercontent.com/NicoleWittlin/APRA-DataViz-Challenge/master/giving_data_table.csv"
# A timeout (seconds) keeps the cell from hanging forever on a stalled connection.
response = requests.get(gifturl, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an HTTP error page as CSV.
response.raise_for_status()
download = response.content
giftdata = pd.read_csv(io.StringIO(download.decode("utf-8")))

In [3]:
# Bio data: download the CSV from GitHub and load it into a DataFrame.

biourl = "https://raw.githubusercontent.com/NicoleWittlin/APRA-DataViz-Challenge/master/bio_data_table.csv"
# A timeout (seconds) keeps the cell from hanging forever on a stalled connection.
response = requests.get(biourl, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an HTTP error page as CSV.
response.raise_for_status()
download = response.content
biodata = pd.read_csv(io.StringIO(download.decode("utf-8")))

In [4]:
# Engagement data: download the CSV from GitHub and load it into a DataFrame.

engageurl = "https://raw.githubusercontent.com/NicoleWittlin/APRA-DataViz-Challenge/master/engagement_data_table.csv"
# A timeout (seconds) keeps the cell from hanging forever on a stalled connection.
response = requests.get(engageurl, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an HTTP error page as CSV.
response.raise_for_status()
download = response.content
engagedata = pd.read_csv(io.StringIO(download.decode("utf-8")))

## Spot Check Data

In [5]:
# Preview the first rows of the gift table to confirm the CSV parsed as expected.
giftdata.head()

Unnamed: 0,household ID,id,gift id,credit Type,gift amt,gift date
0,9662153,8494401,2916764,Soft-Credit,385,10/21/2016
1,2484641,5186919,2916801,Hard-Credit,401,10/21/2016
2,2484641,5929757,2916801,Soft-Credit,401,10/21/2016
3,6982155,7005004,2916813,Hard-Credit,420,10/21/2016
4,6982155,3789171,2916813,Soft-Credit,420,10/21/2016


In [6]:
# List the gift table's column names; the later merge joins on the 'id' column.
giftdata.columns

Index(['household ID', 'id', 'gift id', 'credit Type', 'gift amt',
       'gift date'],
      dtype='object')

In [7]:
# Summarize dtypes and non-null counts; check whether 'gift date' parsed as a date or stayed a string.
giftdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378001 entries, 0 to 378000
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   household ID  378001 non-null  int64 
 1   id            378001 non-null  int64 
 2   gift id       378001 non-null  int64 
 3   credit Type   378001 non-null  object
 4   gift amt      378001 non-null  int64 
 5   gift date     378001 non-null  object
dtypes: int64(4), object(2)
memory usage: 17.3+ MB


In [8]:
# Preview the first rows of the bio table.
biodata.head()

Unnamed: 0,id,name,household_id,country,city,birthday,deceased,zip,state,lat,lon,capacity,capacity_source,race
0,1581317,"Patterson, Aeneas",4310723,United States,Agawam,1/1/1900,N,1001.0,MA,42.06,-72.61,>$1k,screening,Non-Hispanic white
1,9952781,"Page, Casie",9248960,United States,Agawam,10/20/1958,N,1001.0,MA,42.06,-72.61,$75k - $100k,screening,Non-Hispanic white
2,6170220,"Vasquez, James",9248960,United States,Agawam,3/10/1958,N,1001.0,MA,42.06,-72.61,$75k - $100k,screening,Non-Hispanic white
3,2012013,"Garcia, Dezmenn",4731003,United States,Amherst,7/20/1940,Y,1002.0,MA,42.37,-72.52,$75k - $100k,,Non-Hispanic white
4,1021063,"Riefstahl, Christopher",6094904,United States,Amherst,7/24/1982,N,1002.0,MA,42.37,-72.52,$75k - $100k,institutional,Non-Hispanic white


In [9]:
# Summarize dtypes and non-null counts to find bio columns with missing values.
biodata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               100000 non-null  int64  
 1   name             100000 non-null  object 
 2   household_id     100000 non-null  int64  
 3   country          100000 non-null  object 
 4   city             100000 non-null  object 
 5   birthday         90000 non-null   object 
 6   deceased         90000 non-null   object 
 7   zip              90000 non-null   float64
 8   state            90000 non-null   object 
 9   lat              90000 non-null   float64
 10  lon              90000 non-null   float64
 11  capacity         90000 non-null   object 
 12  capacity_source  90000 non-null   object 
 13  race             100000 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 10.7+ MB


In [10]:
# Preview the first rows of the engagement table.
engagedata.head()

Unnamed: 0,id,last_contact,numer_of_contacts,gift_officer,event,volunteer,time_on_site,interests
0,1581317,,0.0,,Y,,432.0,"skiing,golf,reading,hunting/fishing"
1,2012013,,8.0,,Y,,,
2,1021063,,0.0,,N,,,hunting/fishing
3,2725629,,,,N,,119.0,
4,1880411,,0.0,,N,,,


In [11]:
# Summarize dtypes and non-null counts to see how sparsely each engagement field is populated.
engagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 100000 non-null  int64  
 1   last_contact       63634 non-null   object 
 2   numer_of_contacts  83801 non-null   float64
 3   gift_officer       3589 non-null    object 
 4   event              90002 non-null   object 
 5   volunteer          8969 non-null    float64
 6   time_on_site       20002 non-null   float64
 7   interests          63801 non-null   object 
dtypes: float64(3), int64(1), object(4)
memory usage: 6.1+ MB


## Data Clean Up

In [12]:
# Deceased: treat every blank (missing) value as 'N'.
biodata.loc[biodata['deceased'].isna(), 'deceased'] = 'N'

In [13]:
# Re-check non-null counts: 'deceased' should no longer have missing values after the fill.
biodata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               100000 non-null  int64  
 1   name             100000 non-null  object 
 2   household_id     100000 non-null  int64  
 3   country          100000 non-null  object 
 4   city             100000 non-null  object 
 5   birthday         90000 non-null   object 
 6   deceased         100000 non-null  object 
 7   zip              90000 non-null   float64
 8   state            90000 non-null   object 
 9   lat              90000 non-null   float64
 10  lon              90000 non-null   float64
 11  capacity         90000 non-null   object 
 12  capacity_source  90000 non-null   object 
 13  race             100000 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 10.7+ MB


## MERGE Data Files: Bio + Engagement

In [None]:
# Left join: keep every bio row and attach engagement fields where the 'id' matches.
bioengagemerge = biodata.merge(engagedata, how="left", on="id")

In [None]:
# Preview the first rows of the merged bio + engagement table.
bioengagemerge.head()

In [None]:
# Preview the last rows of the merged bio + engagement table.
bioengagemerge.tail()

In [None]:
# List the columns of the merged bio + engagement table.
bioengagemerge.columns

In [None]:
# Summarize dtypes and non-null counts after the merge.
bioengagemerge.info()

In [None]:
# Frequency of each 'deceased' value in the merged table.
bioengagemerge['deceased'].value_counts()

In [None]:
# Frequency of each 'country' value in the merged table.
bioengagemerge['country'].value_counts()

In [None]:
# Number of constituents assigned to each gift officer (missing values are excluded by default).
bioengagemerge['gift_officer'].value_counts()

In [None]:
# Frequency of each distinct 'interests' string in the merged table.
bioengagemerge['interests'].value_counts()

In [None]:
# Summary statistics for the merged bio + engagement table.
bioengagemerge.describe()

In [None]:
from pandas_profiling import ProfileReport

In [None]:
# Build a minimal-mode profiling report of the bio + engagement table and write it out as HTML.
profileBioEngage = ProfileReport(
    bioengagemerge,
    minimal=True,
)
profileBioEngage.to_file('BioEngageoutput.html')

## MERGE Data Files: Bio + Engagement + Giving Data

In [None]:
# Normalize the gift table's join key to lowercase 'id' so it matches the bio/engagement table.
# The file already ships with an 'id' column, so the rename must tolerate a missing 'ID'
# (errors="raise" would abort with a KeyError here and block the merge below).
giftdata = giftdata.rename(columns={"ID": "id"})
# Still fail loudly if, after normalizing, there is no key to merge on.
if "id" not in giftdata.columns:
    raise KeyError("giftdata has no 'id' column to merge on")

In [None]:
# Left join: keep every bio + engagement row and attach gift rows where the 'id' matches.
allmerge = bioengagemerge.merge(giftdata, how="left", on="id")

In [None]:
# Preview the first rows of the fully merged table.
allmerge.head()

In [None]:
# Preview the last rows of the fully merged table.
allmerge.tail()

In [None]:
# Summarize dtypes and non-null counts of the fully merged table.
allmerge.info()

In [None]:
# Summary statistics for the fully merged table.
allmerge.describe()

In [None]:
# Build a minimal-mode profiling report of the fully merged table and write it out as HTML.
profileAll = ProfileReport(
    allmerge,
    minimal=True,
)
profileAll.to_file('Alloutput.html')

In [None]:
# Observation: the $385 gift appears to be missing its hard-credit pair.
# Show every merged row for constituent 8494401.
allmerge[allmerge['id'].eq(8494401)]

In [None]:
# Show every merged row for constituent 7252527.
allmerge[allmerge['id'].eq(7252527)]

## Managed Donors with no gift and no contact in 2020

In [None]:
# Managed donors = rows that have a gift officer assigned (non-missing 'gift_officer').
manageddonors = bioengagemerge.loc[bioengagemerge['gift_officer'].notna()]

In [None]:
# Preview the first rows of the managed-donor subset.
manageddonors.head()

In [None]:
# Preview the last rows of the managed-donor subset.
manageddonors.tail()

In [None]:
# Summarize dtypes and non-null counts of the managed-donor subset.
manageddonors.info()

In [None]:
# How many managed donors are flagged deceased vs. not.
manageddonors['deceased'].value_counts()

In [None]:
# Managed donors whose 'deceased' flag is 'Y'.
deceased = manageddonors.loc[manageddonors['deceased'].eq('Y')]

In [None]:
# Preview the first rows of the deceased managed-donor subset.
deceased.head()

In [None]:
# Export the deceased managed donors to CSV (the row index is written as the first column).
deceased.to_csv('deceased.csv', index=True)

In [None]:
# Managed donors whose 'deceased' flag is 'N'.
activemanage = manageddonors.loc[manageddonors['deceased'].eq('N')]

In [None]:
# Preview the first rows of the non-deceased managed-donor subset.
activemanage.head()

In [None]:
# Summarize dtypes and non-null counts of the non-deceased managed-donor subset.
activemanage.info()

In [None]:
# Sanity check: only 'N' should remain in the 'deceased' column after the filter.
activemanage['deceased'].value_counts()

In [None]:
# Number of non-deceased managed donors per gift officer.
activemanage['gift_officer'].value_counts()

In [None]:
# Frequency of each 'last_contact' value among non-deceased managed donors (missing values excluded).
activemanage['last_contact'].value_counts()

In [None]:
# Export the non-deceased managed donors to CSV (the row index is written as the first column).
activemanage.to_csv('activemanage.csv', index=True)