In [18]:
import pandas as pd
import json
import os
import numpy as np
from numpy.random import randint
from selenium import webdriver

In [12]:
# Location of the pipe-delimited pub78 download. The default is the path this
# notebook was originally run against; set the PUB78_PATH environment variable
# to point at the file on another machine without editing the notebook.
file_path = os.environ.get(
    "PUB78_PATH",
    "C:\\Users\\kazij\\Downloads\\data-download-pub78\\data-download-pub78.txt",
)
# Column names are supplied explicitly via `names=`, so pandas treats the first
# line of the file as data rather than as a header.
col_names = ['TAX_ID','NAME','CITY','STATE','COUNTRY','EXEMPT_CODE']
# TAX_ID, NAME and CITY are read as strings (TAX_ID keeps its leading zeros);
# STATE, COUNTRY and EXEMPT_CODE are read as categoricals.
dtype = dict(zip(col_names, ['string','string','string','category','category','category']))
df = pd.read_csv(file_path, delimiter="|", names=col_names, dtype=dtype)

In [14]:
# Show the loaded DataFrame as this cell's rich output.
display(df)

Unnamed: 0,TAX_ID,NAME,CITY,STATE,COUNTRY,EXEMPT_CODE
0,000004101,South Lafourche Quarterback Club,Lockport,LA,United States,PF
1,000587764,Iglesia Bethesda Inc.,Lowell,MA,United States,PC
2,000635913,Ministerio Apostolico Jesucristo Es El Senor Inc.,Lawrence,MA,United States,PC
3,000765634,Mercy Chapel International,Mattapan,MA,United States,PC
4,000841363,Agape House of Prayer,Mattapan,MA,United States,PC
...,...,...,...,...,...,...
1231079,996089401,Toyo Sakumoto Charitable Tr,Honolulu,HI,United States,PF
1231080,998010224,Hawaii Foundation for the Blind,Ewa Beach,HI,United States,POF
1231081,999999009,Clipped Wings United Air Line Stewardess Alumn...,Pensacola,FL,United States,PC
1231082,999999010,Association of Fundraising Professionals,Arlington,VA,United States,PC


# EDA (exploratory data analysis)
---
#### Columns: Known / Need-to-Know
- TAX_ID
    - the digits reference the area that processes the tax info
    - is there a relevant pattern when city and state info is provided?
    - format: XX-XXXXXXX
- NAME
    - is it relevant for web scraping search input?
- CITY
    - relevant for similarity mapping?
    - distribution?
- STATE
    - relevant for similarity mapping?
    - distribution?
- COUNTRY
    - is it the same for all values?


In [57]:
# Manual searching EDA
# Pick a few random row positions (with replacement) and print those rows so
# they can be looked up by hand.
sample_amt = 5
rand_arr = randint(0, len(df), size=sample_amt)
sampled_rows = []
for row_pos in rand_arr:
    sampled_rows.append(df.iloc[row_pos, :])
print(sampled_rows)

[TAX_ID                                                 541440344
NAME           Brachytherapy Research and Educational Foundat...
CITY                                                 Springfield
STATE                                                         VA
COUNTRY                                            United States
EXEMPT_CODE                                                   PF
Name: 639427, dtype: object, TAX_ID                            810480612
NAME           Burton K Wheeler Center Inc.
CITY                                Bozeman
STATE                                    MT
COUNTRY                       United States
EXEMPT_CODE                              PC
Name: 807896, dtype: object, TAX_ID                                 473026523
NAME           Southern Juba Relief Organization
CITY                                      Peoria
STATE                                         AZ
COUNTRY                            United States
EXEMPT_CODE                               

# Manual EDA Notes
* every nonprofit in the downloaded data has a public IRS income tax form history available as PDFs
* link webscraping possible?
* Guidestar.com offers a 4800-9000 dollar API built on the scraped IRS page
* example URL: https://apps.irs.gov/app/eos/detailsPage?ein=270829065&name=Movement%20for%20Language%20and%20Culture%20Inc.&city=New%20York&state=NY&countryAbbr=US&dba=&type=CHARITIES,%20DETERMINATIONLETTERS,%20COPYOFRETURNS&orgTags=CHARITIES&orgTags=DETERMINATIONLETTERS&orgTags=COPYOFRETURNS
* example URL 2: https://apps.irs.gov/app/eos/detailsPage?ein=277011084&name=1966%20Charitable%20Tr&city=Potomac&state=MD&countryAbbr=US&dba=&type=CHARITIES,%20DETERMINATIONLETTERS,%20COPYOFRETURNS&orgTags=CHARITIES&orgTags=DETERMINATIONLETTERS&orgTags=COPYOFRETURNS
* two different types of charities to categorize: internal (local community development) and external (international development aid)
---
#### URL regex
* (any spaces in name, city, state are replaced with '%20'):
* https://apps.irs.gov/app/eos/detailsPage?
* ein=TAX_ID value
* &name=NAME
* &city=CITY
* &state=STATE
* &countryAbbr=US
* &dba=&type=CHARITIES,%20DETERMINATIONLETTERS,%20COPYOFRETURNS&orgTags=CHARITIES&orgTags=DETERMINATIONLETTERS&orgTags=COPYOFRETURNS

In [76]:
# country
# Compare how many rows are registered in the United States versus any other
# country, then show the non-US rows.
country_counts = df["COUNTRY"].value_counts()

# Look the US count up by label: `value_counts()[0]` would rely on an integer
# key falling back to positional access on a label index (deprecated in pandas)
# and on "United States" happening to be the most frequent value.
us_count = country_counts["United States"]

# Every category of the COUNTRY column except the US.
other_countries = [c for c in df["COUNTRY"].cat.categories if c != "United States"]
print(other_countries)

# Per-country counts with the US entry dropped by label rather than by position.
other_count = country_counts.drop("United States")
print("US count: " + str(us_count))
print("Other countries count: " + str(other_count.sum()))
df.loc[df["COUNTRY"].isin(other_countries)]

['AFGHANISTAN', 'ANTIGUA & BARBUDA', 'BERMUDA', 'CANADA', 'FRANCE', 'GUATEMALA', 'HAITI', 'IRAN', 'ISRAEL', 'JAPAN', 'MADAGASCAR', 'MALAWI', 'NAMIBIA', 'NETHERLANDS', 'NORWAY', 'ROMANIA', 'THE BAHAMAS', 'UNITED ARAB EMIRATES', 'UNITED KINGDOM', 'ZIMBABWE', 'AKROTIRI', 'AUSTRIA', 'BELIZE', 'EAST AFRICA', 'GEORGIA', 'INDIA', 'MEXICO', 'NICARAGUA', 'PHILIPPINES', 'REPUBLIC OF KOREA', 'SWITZERLAND', 'BARBADOS', 'BRITISH VIRGIN ISLANDS', 'BURKINA FASO', 'CHINA', 'COSTA RICA', 'DENMARK', 'GERMANY', 'HONG KONG', 'INDONESIA', 'KENYA', 'MOZAMBIQUE', 'NAURU', 'PERU', 'PORTUGAL', 'SUDAN', 'SWEDEN', 'THE GAMBIA', 'ALBANIA', 'AUSTRALIA', 'CAMEROON', 'GREECE', 'IRELAND', 'MOLDOVA', 'NEW CALEDONIA', 'RWANDA', 'SAUDI ARABIA', 'SIERRA LEONE', 'SOUTH AFRICA', 'TOGO', 'UGANDA', 'WESTERN SAHARA', 'ARMENIA', 'ITALY', 'LAOS', 'MOROCCO', 'ALBERTA', 'ARGENTINA', 'ASHMORE & CARTIER IS', 'CAPE VERDE', 'GUAM', 'GUINEA-BISSAU', 'JERSEY', 'LEBANON', 'MAURITIUS', 'PARAGUAY', 'POLAND', 'PUERTO RICO', 'SINGAPORE', 'S

Unnamed: 0,TAX_ID,NAME,CITY,STATE,COUNTRY,EXEMPT_CODE
1931,010530629,Portland Nordic,Portland Me,,AFGHANISTAN,FORGN
3508,010674736,Us-Japan Relationship Fund Inc.,Kawasaki Kanagawa,,JAPAN,FORGN
3546,010678163,Forest Hills Neighborhood Alliance Inc.,Washington Dc,,NAMIBIA,FORGN
13280,030356850,Kosa Communications Ltd.,Montreal Quebec,,CANADA,FORGN
14634,030474336,Think Tank Romania Inc.,Bucharest Sector,,ROMANIA,FORGN
...,...,...,...,...,...,...
1230846,990368170,Partners for Seed in Africa Fund,Local,,KENYA,"FORGN,PF"
1230848,990369475,Greece Debt Free Inc.,Athens,,GREECE,FORGN
1230859,990376332,Living Water Arts Foundation,Ab,,CANADA,FORGN
1230879,990383175,Arquetopia Foundation Inc.,Puebla Puebla,,MEXICO,FORGN


In [79]:
# exempt code analysis
# Tally how many rows carry each EXEMPT_CODE value.
exempt_count = df.loc[:, "EXEMPT_CODE"].value_counts()
print(exempt_count)

PC                      1062400
PF                       117393
SOUNK                     16186
EO                        14817
POF                        8074
SO                         5258
GROUP                      2554
EO,LODGE                   2328
FORGN                       691
UNKWN                       506
EO,GROUP,LODGE              367
SONFI                       230
EO,GROUP                    105
GROUP,SOUNK                  68
FORGN,PF                     61
FORGN,SOUNK                  18
GROUP,SO                     10
EO,FORGN                      5
EO,PF                         3
EO,FORGN,LODGE                2
GROUP,PF                      2
FORGN,POF                     2
EO,SOUNK                      1
EO,SO                         1
EO,FORGN,GROUP,LODGE          1
FORGN,SO                      1
Name: EXEMPT_CODE, dtype: int64


In [None]:
# icons images logos
# website, social media
# all of these will increase a number that, in the calculation, increases the money they earn
