In [18]:
# Importing modules that will be used in this notebook
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Loading the Data

## Loading the First Dataset

In [19]:
# reading in data from using pandas read_csv function
from pandas import DataFrame


charity_df = pd.read_csv('https://www.irs.gov/pub/irs-soi/eo_ky.csv')

# displaying the top 5 rows of the newly created comparsion_df DataFrame to ensure it was loaded correctly
charity_df.head()

Unnamed: 0,EIN,NAME,ICO,STREET,CITY,STATE,ZIP,GROUP,SUBSECTION,AFFILIATION,...,ASSET_CD,INCOME_CD,FILING_REQ_CD,PF_FILING_REQ_CD,ACCT_PD,ASSET_AMT,INCOME_AMT,REVENUE_AMT,NTEE_CD,SORT_NAME
0,10356732,CARRIAGE MUSEUM OF AMERICA,,4075 IRON WORKS PKWY,LEXINGTON,KY,40511-8483,0,3,3,...,6,3,1,0,12,1476492.0,58555.0,57604.0,A540,AT THE KENTUCKY HORSE PARK
1,10551398,HIGHLANDS BAND ASSOCIATION INC,% STANLEY ROSS,PO BOX 75029,FORT THOMAS,KY,41075-0029,0,3,3,...,0,0,2,0,5,0.0,0.0,0.0,N60,
2,10554843,JEFFERSON COUNTY SEARCH DOG ASSOCIATION,% HOLLY HATFIELD,8004 SMYRNA PKWY,LOUISVILLE,KY,40228-1808,0,3,3,...,0,0,2,0,12,0.0,0.0,0.0,P80,
3,10557144,AUGUSTA ART GUILD,% PRESIDENT,116 MAIN ST,AUGUSTA,KY,41002-1035,0,3,3,...,0,0,2,0,12,0.0,0.0,0.0,A40,
4,10557631,KENTUCKY ALPACA ASSOCIATION INC,%MARTHA SANDERS,134 N LUCAS RD,GLASGOW,KY,42141-8778,0,5,3,...,0,0,2,0,12,0.0,0.0,0.0,K26,KY CLASSIC ALPACA SHOW


## Loading the Second Dataset

In [None]:
income_df = pd.read_csv('')

# Cleaning the Data

In [22]:
# dropping the columns I will not need for this project
charity_df = charity_df.drop(['ICO', 'ORGANIZATION', "SUBSECTION", "CLASSIFICATION", "PF_FILING_REQ_CD","FILING_REQ_CD", "ASSET_CD", "INCOME_CD"], axis=1)

#changing the Ruling Date column to a date
charity_df['RULING'] = pd.to_datetime(charity_df['RULING'])

# looking at the type of each column to make sure they are correct
charity_df.dtypes


EIN                       int64
NAME                     object
STREET                   object
CITY                     object
STATE                    object
ZIP                      object
GROUP                     int64
AFFILIATION               int64
RULING           datetime64[ns]
DEDUCTIBILITY             int64
FOUNDATION                int64
ACTIVITY                  int64
STATUS                    int64
TAX_PERIOD              float64
ACCT_PD                   int64
ASSET_AMT               float64
INCOME_AMT              float64
REVENUE_AMT             float64
NTEE_CD                  object
SORT_NAME                object
dtype: object

In [25]:
# adding a column for NTEE Common Codes using the column of specific NTEE Code
charity_df['NTEE_COMMON_CODE'] = charity_df.NTEE_CD.str[:1]

# Replacing the NTEE codes with the code description

# Making a dictionary of the NTEE Common Codes and their descriptions
Common_Codes = {
    "A" : "Arts & Culture",
    "B" : "Education",
    "C" : "Environment",
    "D" : "Animals",
    "E" : "Health",
    "F" : "Mental Health",
    "G" : "Diseases & Disorders",
    "H" : "Medical Research",
    "I" : "Legal Related",
    "J" : "Job Related",
    "K" : "Food & Nutrition",
    "L" : "Housing & Shelter",
    "M" : "Public Safety & Disaster Relief",
    "N" : "Sports & Leisure",
    "O" : "Youth Development",
    "P" : "Human Services",
    "Q" : "Foreign Affairs",
    "R" : "Civil Rights",
    "S" : "Community Improvement",
    "T" : "Philanthropy & Voluntarism",
    "U" : "Science & Technology Research",
    "V" : "Social Science Research",
    "W" : "Public, Society Benefit",
    "X" : "Religon",
    "Y" : "Memebership Based",
    "Z" : "Unknown"
    }

charity_df['NTEE_COMMON_CODE'].replace(Common_Codes, inplace=True)

Unnamed: 0,EIN,NAME,STREET,CITY,STATE,ZIP,GROUP,AFFILIATION,RULING,DEDUCTIBILITY,...,ACTIVITY,STATUS,TAX_PERIOD,ACCT_PD,ASSET_AMT,INCOME_AMT,REVENUE_AMT,NTEE_CD,SORT_NAME,NTEE_COMMON_CODE
0,10356732,CARRIAGE MUSEUM OF AMERICA,4075 IRON WORKS PKWY,LEXINGTON,KY,40511-8483,0,3,1970-01-01 00:00:00.000197808,1,...,149060000,1,202012.0,12,1476492.0,58555.0,57604.0,A540,AT THE KENTUCKY HORSE PARK,A
1,10551398,HIGHLANDS BAND ASSOCIATION INC,PO BOX 75029,FORT THOMAS,KY,41075-0029,0,3,1970-01-01 00:00:00.000200203,1,...,0,1,201905.0,5,0.0,0.0,0.0,N60,,N
2,10554843,JEFFERSON COUNTY SEARCH DOG ASSOCIATION,8004 SMYRNA PKWY,LOUISVILLE,KY,40228-1808,0,3,1970-01-01 00:00:00.000200204,1,...,0,1,202112.0,12,0.0,0.0,0.0,P80,,P
3,10557144,AUGUSTA ART GUILD,116 MAIN ST,AUGUSTA,KY,41002-1035,0,3,1970-01-01 00:00:00.000201407,1,...,0,1,202112.0,12,0.0,0.0,0.0,A40,,A
4,10557631,KENTUCKY ALPACA ASSOCIATION INC,134 N LUCAS RD,GLASGOW,KY,42141-8778,0,3,1970-01-01 00:00:00.000200204,2,...,0,1,202112.0,12,0.0,0.0,0.0,K26,KY CLASSIC ALPACA SHOW,K


In [34]:
# Splitting the Zip Column into 2 columns - one for the Zip and one for the +4

charity_df[['ZIP', 'ZIP+4']] = charity_df['ZIP'].str.split('-', 1, expand=True)
charity_df.head()

Unnamed: 0,EIN,NAME,STREET,CITY,STATE,ZIP,GROUP,AFFILIATION,RULING,DEDUCTIBILITY,...,STATUS,TAX_PERIOD,ACCT_PD,ASSET_AMT,INCOME_AMT,REVENUE_AMT,NTEE_CD,SORT_NAME,NTEE_COMMON_CODE,ZIP+4
0,10356732,CARRIAGE MUSEUM OF AMERICA,4075 IRON WORKS PKWY,LEXINGTON,KY,40511,0,3,1970-01-01 00:00:00.000197808,1,...,1,202012.0,12,1476492.0,58555.0,57604.0,A540,AT THE KENTUCKY HORSE PARK,Arts & Culture,8483
1,10551398,HIGHLANDS BAND ASSOCIATION INC,PO BOX 75029,FORT THOMAS,KY,41075,0,3,1970-01-01 00:00:00.000200203,1,...,1,201905.0,5,0.0,0.0,0.0,N60,,Sports & Leisure,29
2,10554843,JEFFERSON COUNTY SEARCH DOG ASSOCIATION,8004 SMYRNA PKWY,LOUISVILLE,KY,40228,0,3,1970-01-01 00:00:00.000200204,1,...,1,202112.0,12,0.0,0.0,0.0,P80,,Human Services,1808
3,10557144,AUGUSTA ART GUILD,116 MAIN ST,AUGUSTA,KY,41002,0,3,1970-01-01 00:00:00.000201407,1,...,1,202112.0,12,0.0,0.0,0.0,A40,,Arts & Culture,1035
4,10557631,KENTUCKY ALPACA ASSOCIATION INC,134 N LUCAS RD,GLASGOW,KY,42141,0,3,1970-01-01 00:00:00.000200204,2,...,1,202112.0,12,0.0,0.0,0.0,K26,KY CLASSIC ALPACA SHOW,Food & Nutrition,8778
