In [1]:
# Add Matplotlib inline magic command
%matplotlib inline

# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Input CSV locations, given relative to the notebook's working directory.
charity_data_to_load = "Resources/CharityML.csv"
donor_data_to_load = "Resources/data_science_for_fundraising_donor_data.csv"

In [3]:
# Read the CharityML CSV into a DataFrame and preview its first ten rows.
charity_df = pd.read_csv(filepath_or_buffer=charity_data_to_load)
charity_df.head(n=10)

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K
5,37,Private,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,40.0,United-States,<=50K
6,49,Private,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0.0,0.0,16.0,Jamaica,<=50K
7,52,Self-emp-not-inc,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,45.0,United-States,>50K
8,31,Private,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Female,14084.0,0.0,50.0,United-States,>50K
9,42,Private,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178.0,0.0,40.0,United-States,>50K


In [4]:
# List the column labels of the loaded frame. The labels mix "_" and "-"
# separators (e.g. education_level vs education-num), so the exact spelling
# shown here is what later column selections must use.
charity_df.columns

Index(['age', 'workclass', 'education_level', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [5]:
# Show the dtype pandas inferred for each column: this separates the numeric
# columns from the string (object) columns that are encoded later in the notebook.
charity_df.dtypes

age                  int64
workclass           object
education_level     object
education-num      float64
marital-status      object
occupation          object
relationship        object
race                object
sex                 object
capital-gain       float64
capital-loss       float64
hours-per-week     float64
native-country      object
income              object
dtype: object

In [6]:
# Report how many missing (null/NaN) entries each column contains.
# The counts for all columns are computed once, then printed one line per column.
null_counts = charity_df.isnull().sum()
for column, null_count in null_counts.items():
    print(f"Column {column} has {null_count} null values")

Column age has 0 nul values
Column workclass has 0 nul values
Column education_level has 0 nul values
Column education-num has 0 nul values
Column marital-status has 0 nul values
Column occupation has 0 nul values
Column relationship has 0 nul values
Column race has 0 nul values
Column sex has 0 nul values
Column capital-gain has 0 nul values
Column capital-loss has 0 nul values
Column hours-per-week has 0 nul values
Column native-country has 0 nul values
Column income has 0 nul values


In [7]:
# Remove the native-country column from the working frame.
# drop() returns a new frame that is rebound to charity_df rather than mutating
# it in place, and errors="ignore" makes the cell safe to re-run: once the
# column is gone, a second execution is a no-op instead of raising KeyError.
charity_df = charity_df.drop(columns=["native-country"], errors="ignore")
charity_df.head()

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,income
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,<=50K
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,<=50K
2,38,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,<=50K
3,53,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,<=50K
4,28,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,<=50K


In [8]:
# Remove the education_level column from the working frame.
# NOTE(review): presumably dropped because education-num carries the same
# information in numeric form — confirm against the data dictionary.
# drop() returns a new frame that is rebound to charity_df rather than mutating
# it in place, and errors="ignore" makes the cell safe to re-run: once the
# column is gone, a second execution is a no-op instead of raising KeyError.
charity_df = charity_df.drop(columns=["education_level"], errors="ignore")
charity_df.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,income
0,39,State-gov,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,<=50K
1,50,Self-emp-not-inc,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,<=50K
2,38,Private,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,<=50K
3,53,Private,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,<=50K
4,28,Private,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,<=50K


In [9]:
# Rescale the capital-gain column to one hundredth of its current values.
# NOTE: the column is overwritten from its own contents, so executing this
# cell a second time divides by 100 again.
capital_gain_scaled = charity_df["capital-gain"] / 100
charity_df["capital-gain"] = capital_gain_scaled
charity_df.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,income
0,39,State-gov,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,21.74,0.0,40.0,<=50K
1,50,Self-emp-not-inc,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,<=50K
2,38,Private,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,<=50K
3,53,Private,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,<=50K
4,28,Private,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,<=50K


In [10]:
# One-hot encode the sex and income columns: each is replaced by one indicator
# column per distinct value, named "<column>_<value>".
# NOTE(review): the resulting "sex_ Female" / "sex_ Male" labels contain a space
# after the underscore, which suggests the raw sex values carry a leading
# space — confirm before referring to these columns by name.
binary_columns = ["sex", "income"]
charity_binary_encoded_df = pd.get_dummies(data=charity_df, columns=binary_columns)
charity_binary_encoded_df.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,hours-per-week,sex_ Female,sex_ Male,income_<=50K,income_>50K
0,39,State-gov,13.0,Never-married,Adm-clerical,Not-in-family,White,21.74,0.0,40.0,0,1,1,0
1,50,Self-emp-not-inc,13.0,Married-civ-spouse,Exec-managerial,Husband,White,0.0,0.0,13.0,0,1,1,0
2,38,Private,9.0,Divorced,Handlers-cleaners,Not-in-family,White,0.0,0.0,40.0,0,1,1,0
3,53,Private,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,0.0,0.0,40.0,0,1,1,0
4,28,Private,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,0.0,0.0,40.0,1,0,1,0


In [11]:
from sklearn.preprocessing import LabelEncoder

# Work on an independent copy so the one-hot encoded frame is left untouched
# by the label-encoding steps applied to df2.
df2 = charity_binary_encoded_df.copy()
# A single encoder instance; every fit_transform call on it refits it from scratch.
le = LabelEncoder()
df2.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,hours-per-week,sex_ Female,sex_ Male,income_<=50K,income_>50K
0,39,State-gov,13.0,Never-married,Adm-clerical,Not-in-family,White,21.74,0.0,40.0,0,1,1,0
1,50,Self-emp-not-inc,13.0,Married-civ-spouse,Exec-managerial,Husband,White,0.0,0.0,13.0,0,1,1,0
2,38,Private,9.0,Divorced,Handlers-cleaners,Not-in-family,White,0.0,0.0,40.0,0,1,1,0
3,53,Private,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,0.0,0.0,40.0,0,1,1,0
4,28,Private,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,0.0,0.0,40.0,1,0,1,0


In [12]:
# Replace the race labels with integer codes.
# fit_transform refits `le`, so afterwards the encoder only holds the
# label-to-code mapping for this column.
race_codes = le.fit_transform(df2["race"])
df2["race"] = race_codes
df2.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,hours-per-week,sex_ Female,sex_ Male,income_<=50K,income_>50K
0,39,State-gov,13.0,Never-married,Adm-clerical,Not-in-family,4,21.74,0.0,40.0,0,1,1,0
1,50,Self-emp-not-inc,13.0,Married-civ-spouse,Exec-managerial,Husband,4,0.0,0.0,13.0,0,1,1,0
2,38,Private,9.0,Divorced,Handlers-cleaners,Not-in-family,4,0.0,0.0,40.0,0,1,1,0
3,53,Private,7.0,Married-civ-spouse,Handlers-cleaners,Husband,2,0.0,0.0,40.0,0,1,1,0
4,28,Private,13.0,Married-civ-spouse,Prof-specialty,Wife,2,0.0,0.0,40.0,1,0,1,0


In [13]:
# Replace the relationship labels with integer codes.
# fit_transform refits `le`, so afterwards the encoder only holds the
# label-to-code mapping for this column.
relationship_codes = le.fit_transform(df2["relationship"])
df2["relationship"] = relationship_codes
df2.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,hours-per-week,sex_ Female,sex_ Male,income_<=50K,income_>50K
0,39,State-gov,13.0,Never-married,Adm-clerical,1,4,21.74,0.0,40.0,0,1,1,0
1,50,Self-emp-not-inc,13.0,Married-civ-spouse,Exec-managerial,0,4,0.0,0.0,13.0,0,1,1,0
2,38,Private,9.0,Divorced,Handlers-cleaners,1,4,0.0,0.0,40.0,0,1,1,0
3,53,Private,7.0,Married-civ-spouse,Handlers-cleaners,0,2,0.0,0.0,40.0,0,1,1,0
4,28,Private,13.0,Married-civ-spouse,Prof-specialty,5,2,0.0,0.0,40.0,1,0,1,0


In [14]:
# Replace the occupation labels with integer codes.
# fit_transform refits `le`, so afterwards the encoder only holds the
# label-to-code mapping for this column.
occupation_codes = le.fit_transform(df2["occupation"])
df2["occupation"] = occupation_codes
df2.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,hours-per-week,sex_ Female,sex_ Male,income_<=50K,income_>50K
0,39,State-gov,13.0,Never-married,0,1,4,21.74,0.0,40.0,0,1,1,0
1,50,Self-emp-not-inc,13.0,Married-civ-spouse,3,0,4,0.0,0.0,13.0,0,1,1,0
2,38,Private,9.0,Divorced,5,1,4,0.0,0.0,40.0,0,1,1,0
3,53,Private,7.0,Married-civ-spouse,5,0,2,0.0,0.0,40.0,0,1,1,0
4,28,Private,13.0,Married-civ-spouse,9,5,2,0.0,0.0,40.0,1,0,1,0


In [15]:
# Replace the marital-status labels with integer codes.
# fit_transform refits `le`, so afterwards the encoder only holds the
# label-to-code mapping for this column.
marital_status_codes = le.fit_transform(df2["marital-status"])
df2["marital-status"] = marital_status_codes
df2.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,hours-per-week,sex_ Female,sex_ Male,income_<=50K,income_>50K
0,39,State-gov,13.0,4,0,1,4,21.74,0.0,40.0,0,1,1,0
1,50,Self-emp-not-inc,13.0,2,3,0,4,0.0,0.0,13.0,0,1,1,0
2,38,Private,9.0,0,5,1,4,0.0,0.0,40.0,0,1,1,0
3,53,Private,7.0,2,5,0,2,0.0,0.0,40.0,0,1,1,0
4,28,Private,13.0,2,9,5,2,0.0,0.0,40.0,1,0,1,0


In [16]:
# Replace the workclass labels with integer codes.
# fit_transform refits `le`, so afterwards the encoder only holds the
# label-to-code mapping for this column.
workclass_codes = le.fit_transform(df2["workclass"])
df2["workclass"] = workclass_codes
df2.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,hours-per-week,sex_ Female,sex_ Male,income_<=50K,income_>50K
0,39,5,13.0,4,0,1,4,21.74,0.0,40.0,0,1,1,0
1,50,4,13.0,2,3,0,4,0.0,0.0,13.0,0,1,1,0
2,38,2,9.0,0,5,1,4,0.0,0.0,40.0,0,1,1,0
3,53,2,7.0,2,5,0,2,0.0,0.0,40.0,0,1,1,0
4,28,2,13.0,2,9,5,2,0.0,0.0,40.0,1,0,1,0
