# Classification of UK Charities
In this notebook, data from several sources has been used to classify UK charities

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import json
import pandas as pd
import networkx as nx
from numpy.core.numeric import NaN
with open('drive/My Drive/UK_Data/json/publicextract.charity_trustee.json', encoding="utf-8-sig") as f:
  data = json.load(f)
print(len(data))
print(json.dumps(data[0],indent=4))

942096
{
    "date_of_extract": "2022-01-11T00:00:00",
    "organisation_number": 521013,
    "registered_charity_number": 521013,
    "linked_charity_number": 0,
    "trustee_id": 23760,
    "trustee_name": "AUGHTON PARISH COUNCIL",
    "trustee_is_chair": false,
    "individual_or_organisation": "O",
    "trustee_date_of_appointment": null
}


- In the charity_trustee table we have all combinations of trustees and charities.
- Website https://register-of-charities.charitycommission.gov.uk/sector-data/sector-overview


In [3]:
members = {}
orgs = {}

for trustee in data:
    tid = trustee['trustee_id']
    oid = trustee['organisation_number']
    if(tid not in members):
        members[tid] = [trustee]
    else:
        members[tid].append(trustee)
    if(oid not in orgs):
        orgs[oid] = [trustee]
    else:
        orgs[oid].append(trustee)

print("Total Trustees : ",len(members))
print("Total Organizations: ",len(orgs))

Total Trustees :  851555
Total Organizations:  170190


In [4]:
with open('drive/My Drive/UK_Data/json/publicextract.charity.json', encoding="utf-8-sig") as f:
  charity_data = json.load(f)

print(len(charity_data))
print(json.dumps(charity_data[3],indent=4))

376413
{
    "date_of_extract": "2022-01-11T00:00:00",
    "organisation_number": 4,
    "registered_charity_number": 200028,
    "linked_charity_number": 2,
    "charity_name": "TOWN LANDS CHARITY FOR THE CHURCH",
    "charity_type": null,
    "charity_registration_status": "Removed",
    "date_of_registration": "1961-10-19T00:00:00",
    "date_of_removal": "1997-09-17T00:00:00",
    "charity_reporting_status": null,
    "latest_acc_fin_period_start_date": null,
    "latest_acc_fin_period_end_date": null,
    "latest_income": null,
    "latest_expenditure": null,
    "charity_contact_address1": null,
    "charity_contact_address2": null,
    "charity_contact_address3": null,
    "charity_contact_address4": null,
    "charity_contact_address5": null,
    "charity_contact_postcode": null,
    "charity_contact_phone": null,
    "charity_contact_email": null,
    "charity_contact_web": null,
    "charity_company_registration_number": null,
    "charity_insolvent": false,
    "charity_in_a

In [5]:
total_registered_main = 0
total_registered_linked = 0
total_removed = 0
nc = 0
for i in charity_data:
  if(i['charity_registration_status']!="Removed"):
    if(i['linked_charity_number']!=0):
      total_registered_linked+=1
    else:
      total_registered_main+=1
      if(i['charity_activities']==None):
        nc+=1
  else:
    total_removed+=1

print("Total Registered Main : ",total_registered_main)
print("Total Registered Linked: ",total_registered_linked)
print("Total Removed : ",total_removed)
print(nc)

Total Registered Main :  170570
Total Registered Linked:  15166
Total Removed :  190677
6029


In [6]:
for i in charity_data:
  if(i['registered_charity_number']==200027):
    print(i)
# print(charity_data[0])


{'date_of_extract': '2022-01-11T00:00:00', 'organisation_number': 1, 'registered_charity_number': 200027, 'linked_charity_number': 1, 'charity_name': 'POTTERNE MISSION ROOM AND TRUST', 'charity_type': None, 'charity_registration_status': 'Removed', 'date_of_registration': '1962-05-17T00:00:00', 'date_of_removal': '2014-04-16T00:00:00', 'charity_reporting_status': None, 'latest_acc_fin_period_start_date': None, 'latest_acc_fin_period_end_date': None, 'latest_income': None, 'latest_expenditure': None, 'charity_contact_address1': None, 'charity_contact_address2': None, 'charity_contact_address3': None, 'charity_contact_address4': None, 'charity_contact_address5': None, 'charity_contact_postcode': None, 'charity_contact_phone': None, 'charity_contact_email': None, 'charity_contact_web': None, 'charity_company_registration_number': None, 'charity_insolvent': False, 'charity_in_administration': False, 'charity_previously_excepted': None, 'charity_is_cdf_or_cif': None, 'charity_is_cio': None,

*   For charity classification we have charity_activities field in the charity table which has a description(one line summary) of how the charity spends its money, might be used for classification. This field is not None only for the parent organisations, i.e. orgs with linked charity no. = 0. 
*   charity_governing_document table has a charitable_objects field which is a paragraph desscription. Might be used for charity classification.
*   there seems to be a discrepency, in the charity table - charity no. 200027 points to POTTERNE MISSION ROOM AND TRUST but while searched online on the website 200027 points to RURAL MINISTRIES => RESOLVED - the latter seems to be the main organisation and the former is a linked charity which was removed

*   there are many cases where a single charity is classified into various types i.e. it has various "what" descriptions in the charity classification table. same goes for other fields - "WHO" and "HOW". Therefore, the provided classification will create ambiguity if directly used.

# Charities with income >  10 Million

In [7]:
#Lets try figuring out charities with income over 10 Million Pounds
charity10m = []
count10m = 0
for charity in charity_data:
  if(charity['charity_registration_status']!="Removed" and 
     charity['latest_income']!= None and 
     charity['latest_income'] >= 10000000):
    count10m+=1
    charity10m.append(charity)
print(count10m)

1329


In [8]:
#Sort charity10m in descending order of income
n = len(charity10m)
for i in range(n-1):
    for j in range(0, n-i-1):
        if charity10m[j]['latest_income'] < charity10m[j + 1]['latest_income'] :
            charity10m[j], charity10m[j + 1] = charity10m[j + 1], charity10m[j]

# charity10m[:10]
for i in charity10m[0:10]:
  print(i['charity_name'],"|",i['registered_charity_number'],"|",i['organisation_number'])

THE ARTS COUNCIL OF ENGLAND | 1036733 | 1036733
LLOYD'S REGISTER FOUNDATION | 1145988 | 5025687
THE BRITISH COUNCIL | 209131 | 209131
INTERNATIONAL FINANCE FACILITY FOR IMMUNISATION COMPANY | 1115413 | 4016873
THE CHARITIES AID FOUNDATION | 268369 | 268369
SAVE THE CHILDREN INTERNATIONAL | 1076822 | 3961959
NUFFIELD HEALTH | 205533 | 205533
CANCER RESEARCH UK | 1089464 | 3987102
CARDIFF UNIVERSITY | 1136855 | 5010420
UNITED CHURCH SCHOOLS FOUNDATION LTD | 313999 | 313999


In [9]:
print(json.dumps(charity10m[0],indent=3))

{
   "date_of_extract": "2022-01-11T00:00:00",
   "organisation_number": 1036733,
   "registered_charity_number": 1036733,
   "linked_charity_number": 0,
   "charity_name": "THE ARTS COUNCIL OF ENGLAND",
   "charity_type": "Other",
   "charity_registration_status": "Registered",
   "date_of_registration": "1994-04-19T00:00:00",
   "date_of_removal": null,
   "charity_reporting_status": "Submission Received",
   "latest_acc_fin_period_start_date": "2020-04-01T00:00:00",
   "latest_acc_fin_period_end_date": "2021-03-31T00:00:00",
   "latest_income": 1488506017.0,
   "latest_expenditure": 1419616010.0,
   "charity_contact_address1": "21 Bloomsbury Street",
   "charity_contact_address2": "London",
   "charity_contact_address3": null,
   "charity_contact_address4": null,
   "charity_contact_address5": null,
   "charity_contact_postcode": "WC1B 3HF",
   "charity_contact_phone": "08453006200",
   "charity_contact_email": "enquiries@artscouncil.org.uk",
   "charity_contact_web": "www.artscounc

In [10]:
# create a dict for accessing charity information through organisation number
charity = {}
for ch in charity_data:
    charity[ch['organisation_number']] = ch
# charity

In [11]:
with open('drive/My Drive/UK_Data/json/publicextract.charity_governing_document.json', encoding="utf-8-sig") as f:
  charity_governing_document = json.load(f)

print(len(charity_governing_document))

376413


In [12]:
# create a dict for accessing governing document info using organisation number
charity_governing_document_data = {}
for i in charity_governing_document:
  charity_governing_document_data[i['organisation_number']] = i

In [13]:
# check if all charities have data in the charitable object column
count = 0
c2=0
empty_objects = []
for i in charity_governing_document:
  if(i['charitable_objects'] == None):
    empty_objects.append(i)
    # print(i)
    if(i['linked_charity_number']!=0):
      c2+=1
    else:
      count+=1

print(count)
print(c2)

13
27


There are 27 Parent orgs and 13 linked charities in charity_governing_document table which have empty chartiable objects column.

In [14]:
# check how many charities with empty charitable objects are still registered
registered = 0
removed = 0
for i in empty_objects:
  if(charity[i['organisation_number']]['charity_registration_status'] == "Removed"):
    removed+=1
  else:
    registered+=1
    # print(i)

print("Registered = ",registered)
print("Removed = ",removed)

Registered =  24
Removed =  16


Out of 40 (13+27) charities that have empty charitable objects, 24 are still registered and only 16 are removed. However we can still simply ignore these charities since they only contribute to an extremely small fraction of all charities in the dataset.

In [15]:
charity_classification = pd.read_csv("drive/My Drive/UK_Data/text/publicextract.charity_classification.txt",sep="\t")
# refer to the data definition file on the website to understand detailed description of each table and the columns
print(len(charity_classification))
%load_ext google.colab.data_table
charity_classification.head(10)

1602240


Unnamed: 0,date_of_extract,organisation_number,registered_charity_number,linked_charity_number,classification_code,classification_type,classification_description
0,2022-01-11 00:00:00.0000000,200001,200001,0,101,What,General Charitable Purposes
1,2022-01-11 00:00:00.0000000,200001,200001,0,102,What,Education/training
2,2022-01-11 00:00:00.0000000,200001,200001,0,109,What,Arts/culture/heritage/science
3,2022-01-11 00:00:00.0000000,200001,200001,0,112,What,Environment/conservation/heritage
4,2022-01-11 00:00:00.0000000,200001,200001,0,201,Who,Children/young People
5,2022-01-11 00:00:00.0000000,200001,200001,0,301,How,Makes Grants To Individuals
6,2022-01-11 00:00:00.0000000,200001,200001,0,302,How,Makes Grants To Organisations
7,2022-01-11 00:00:00.0000000,200002,200002,0,102,What,Education/training
8,2022-01-11 00:00:00.0000000,200002,200002,0,105,What,The Prevention Or Relief Of Poverty
9,2022-01-11 00:00:00.0000000,200002,200002,0,201,Who,Children/young People


In [16]:
from collections import Counter
Counter(charity_classification['classification_description'])

Counter({'Accommodation/housing': 12403,
         'Acts As An Umbrella Or Resource Body': 22142,
         'Amateur Sport': 38753,
         'Animals': 5927,
         'Armed Forces/emergency Service Efficiency': 1280,
         'Arts/culture/heritage/science': 42296,
         'Children/young People': 137840,
         'Disability': 37201,
         'Economic/community Development/employment': 32546,
         'Education/training': 127657,
         'Elderly/old People': 71150,
         'Environment/conservation/heritage': 26619,
         'General Charitable Purposes': 79955,
         'Human Rights/religious Or Racial Harmony/equality Or Diversity': 7261,
         'Makes Grants To Individuals': 50204,
         'Makes Grants To Organisations': 69375,
         'Other Charitable Activities': 32804,
         'Other Charitable Purposes': 16954,
         'Other Charities Or Voluntary Bodies': 57079,
         'Other Defined Groups': 41642,
         'Overseas Aid/famine Relief': 15070,
         'Peopl

In [17]:
# we are only concerned with the descriptions corresponding to 'What' classification type
filtered = charity_classification[charity_classification['classification_type'] == 'What']
print(filtered.shape)
filtered.head(10)

(599055, 7)


Unnamed: 0,date_of_extract,organisation_number,registered_charity_number,linked_charity_number,classification_code,classification_type,classification_description
0,2022-01-11 00:00:00.0000000,200001,200001,0,101,What,General Charitable Purposes
1,2022-01-11 00:00:00.0000000,200001,200001,0,102,What,Education/training
2,2022-01-11 00:00:00.0000000,200001,200001,0,109,What,Arts/culture/heritage/science
3,2022-01-11 00:00:00.0000000,200001,200001,0,112,What,Environment/conservation/heritage
7,2022-01-11 00:00:00.0000000,200002,200002,0,102,What,Education/training
8,2022-01-11 00:00:00.0000000,200002,200002,0,105,What,The Prevention Or Relief Of Poverty
17,2022-01-11 00:00:00.0000000,200009,200009,0,101,What,General Charitable Purposes
18,2022-01-11 00:00:00.0000000,200009,200009,0,102,What,Education/training
19,2022-01-11 00:00:00.0000000,200009,200009,0,103,What,The Advancement Of Health Or Saving Of Lives
22,2022-01-11 00:00:00.0000000,200012,200012,0,103,What,The Advancement Of Health Or Saving Of Lives


In [18]:
Counter(filtered['classification_description'])

Counter({'Accommodation/housing': 12403,
         'Amateur Sport': 38753,
         'Animals': 5927,
         'Armed Forces/emergency Service Efficiency': 1280,
         'Arts/culture/heritage/science': 42296,
         'Disability': 37201,
         'Economic/community Development/employment': 32546,
         'Education/training': 127657,
         'Environment/conservation/heritage': 26619,
         'General Charitable Purposes': 79955,
         'Human Rights/religious Or Racial Harmony/equality Or Diversity': 7261,
         'Other Charitable Purposes': 16954,
         'Overseas Aid/famine Relief': 15070,
         'Recreation': 18213,
         'Religious Activities': 45985,
         'The Advancement Of Health Or Saving Of Lives': 42537,
         'The Prevention Or Relief Of Poverty': 48398})

In [19]:
# count the number of classifications for each organisation
classification_count = Counter(filtered['organisation_number'])
pc=0
for i in classification_count.items():
  print(i)
  pc+=1
  if pc==10:
    break

(200001, 4)
(200002, 2)
(200009, 3)
(200012, 3)
(200014, 1)
(200017, 1)
(200023, 3)
(200024, 2)
(200027, 1)
(200032, 2)


In [20]:
# we only want to work with charities that are still registered
ar = []
for i in charity_data:
  if(i['charity_registration_status']!="Removed"):
    ar.append(i['organisation_number'])
    
len(set(ar))

185736

In [21]:
# check the number of charities from registered charities that have a classification provided in the classification table
final = set(ar).intersection(set(classification_count.keys()))
len(final)

170390

In [22]:
# counting the number of charities having n classifications
count_num_type = {}
for i in classification_count:
  if(i in final):
    idx = classification_count[i]
    if(idx in count_num_type):
      count_num_type[idx]+=1
    else:
      count_num_type[idx] = 1
  
print(sum(count_num_type.values()))
count_num_type

170390


{1: 71916,
 2: 37278,
 3: 23460,
 4: 14493,
 5: 9097,
 6: 5523,
 7: 3483,
 8: 2119,
 9: 1237,
 10: 776,
 11: 433,
 12: 288,
 13: 150,
 14: 71,
 15: 29,
 16: 21,
 17: 16}

- a lot of charities (71916) have just 1 classification
- around 100,000 charities are classified into more than 1 category

In [23]:
# lets work with charities that have a unique classification - classified only into a single category
unique_category = []
for i in classification_count:
  if(classification_count[i]==1 and (i in final) ):
    unique_category.append(i)

unique_category[:10]

[200017,
 200027,
 200040,
 200044,
 200048,
 200049,
 200055,
 200076,
 200079,
 200097]

In [24]:
# filtering data for charities with unique category
filtered_single = filtered[filtered['organisation_number'].isin(unique_category) ]
filtered_single.shape

(71916, 7)

In [25]:
# view classification of charities, it will not include overlapping results since each charity is only assigned a single category
Counter(filtered_single['classification_description'])

Counter({'Accommodation/housing': 1692,
         'Amateur Sport': 3178,
         'Animals': 1183,
         'Armed Forces/emergency Service Efficiency': 96,
         'Arts/culture/heritage/science': 3589,
         'Disability': 2616,
         'Economic/community Development/employment': 562,
         'Education/training': 21003,
         'Environment/conservation/heritage': 2376,
         'General Charitable Purposes': 12207,
         'Human Rights/religious Or Racial Harmony/equality Or Diversity': 198,
         'Other Charitable Purposes': 2284,
         'Overseas Aid/famine Relief': 88,
         'Recreation': 1088,
         'Religious Activities': 12871,
         'The Advancement Of Health Or Saving Of Lives': 3457,
         'The Prevention Or Relief Of Poverty': 3428})

# Non Profit Classifier
- Mapping UK charities to US charity classification
- https://github.com/ma-ji/npo_classifier
- based on NTEE classification - https://en.wikipedia.org/wiki/National_Taxonomy_of_Exempt_Entities#:~:text=The%20National%20Taxonomy%20of%20Exempt%20Entities%20%28NTEE%29%20is,case%20when%20the%20organization%20is%20recognized%20as%20tax-exempt
- NTEE code structure - https://nccs.urban.org/project/national-taxonomy-exempt-entities-ntee-codes#overview


In [26]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 45.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 35.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 61.9 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created whe

In [27]:
import requests
exec(requests.get('https://raw.githubusercontent.com/ma-ji/npo_classifier/master/API/npoclass.py').text)

In [28]:
!pip3 install pickle5
import pickle5 as pickle

Collecting pickle5
  Downloading pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (256 kB)
[?25l[K     |█▎                              | 10 kB 15.7 MB/s eta 0:00:01[K     |██▋                             | 20 kB 5.2 MB/s eta 0:00:01[K     |███▉                            | 30 kB 4.4 MB/s eta 0:00:01[K     |█████▏                          | 40 kB 4.3 MB/s eta 0:00:01[K     |██████▍                         | 51 kB 3.6 MB/s eta 0:00:01[K     |███████▊                        | 61 kB 4.3 MB/s eta 0:00:01[K     |█████████                       | 71 kB 4.4 MB/s eta 0:00:01[K     |██████████▎                     | 81 kB 5.0 MB/s eta 0:00:01[K     |███████████▌                    | 92 kB 4.9 MB/s eta 0:00:01[K     |████████████▉                   | 102 kB 4.8 MB/s eta 0:00:01[K     |██████████████                  | 112 kB 4.8 MB/s eta 0:00:01[K     |███████████████▍                | 122 kB 4.8 MB/s eta 0:00:01[K     |████████████████▋            

- refer to the documentation on github website to learn more about using the ML classifier API
- API => npoclass(inputs, gpu_core=True, model_path=None, ntee_type='bc', n_jobs=4, backend='multiprocessing', batch_size_dl=64, verbose=1)

In [29]:
# testing out the classifier
# make sure you have uploaded the model data to your drive so that it can be accessed by the API
out = npoclass("helping poor individuals",
               True,
               "drive/My Drive/UK_Data/npoclass_model_bc/",
               'bc')
print(out)

No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:02<00:00,  2.24s/it]

[{'recommended': 'V', 'confidence': 'high (>=.99)', 'probabilities': {'I': 0.09410640597343445, 'II': 0.3920310139656067, 'III': 0.024979978799819946, 'IV': 0.8642263412475586, 'IX': 0.04580545425415039, 'V': 0.9937044978141785, 'VI': 0.3575180470943451, 'VII': 0.5676623582839966, 'VIII': 0.5914807319641113}}]





It takes more time than expected to classify a single charity by the API, so it might take a huge amount of time if we want to classify all 170k charities this way.

In [30]:
category_map = {"I": "Arts, Culture, and Humanities",
                "II": "Education",
                "III": "Environment and Animals",
                "IV": "Health",
                "V": "Human Services",
                "VI": "International, Foreign Affairs",
                "VII": "Public, Societal Benefit",
                "VIII": "Religion Related",
                "IX": "Mutual/Membership Benefit",
                "X": "Unknown, Unclassified"}

In [31]:
# create a new dataframe to store the classification results from the ML classifier
# more useful columns can be added to this dataframe if required
classification_us_data = pd.DataFrame({'organisation_number': pd.Series(dtype='int'),
                                       'registered_charity_number': pd.Series(dtype='int'),
                                       'charity_name':pd.Series(dtype='str'),
                                       'classification_code_us': pd.Series(dtype='str'),
                                       'classification_description_us':pd.Series(dtype='str'),
                                       'confidence':pd.Series(dtype='float')})

In [32]:
# we will be using this 'final' list which contains the organisaiton numbers of all registered charities
# we will be using the description from the charity activities column to be given as input to the classifier
parent_list = list(final)
cc = 0
compression_opts = dict(method='zip',
                        archive_name='classification_us_data.csv')

# a list of charity descriptions can be given as input to the API, which will save time in the overall classification due to API response time
# we basically want to iterate over all the charities in the parent_list and sent theit activity decriptions as inputs to the API
# in the code chunk below classification has been done in groups of 20 to save time and to ensure that no data is lost due to network issue
# if for some reason the notebook disconnects or the code produces some error then the results fetched till that point will all be save in the output file
# in case if the connection is lost and for loop is broken then it is best to first download the output file from the notebook environment
# and then read in the dataframe from the output file and then start the loop again from the point where it was broken
# make sure to modify the start and end range and the step size of the for loop according the the requirement

CHUNK_SIZE = 20 # define the chunk size - no. of charities to be processed together

for i in range (630,650,CHUNK_SIZE):
  # separating out 20 ids
  chunk_list = parent_list[i:i+CHUNK_SIZE]
  print(chunk_list)
  #  input list will contains descriptions for the corresponding 20 organisations
  input_list = []
  for j in chunk_list:
    if(charity[j]['charity_activities']!=None):
      input_list.append(charity[j]['charity_activities'])
  # get the prediction from the API
  pred = npoclass(input_list,
              True,
              "drive/My Drive/UK_Data/npoclass_model_bc/",
              'bc',
              batch_size_dl=128)
  # if the charity activities column is empty for a charity then simply add its classification as unknown/unclassified
  for jj in range(len(chunk_list)):
    if(charity[chunk_list[jj]]['charity_activities']==None):
      pred.insert(jj,{"recommended":"X"})
  
  # insert the data for all charities in the chunk list to the main dataframe
  for k in range (len(pred)):
    org_num = chunk_list[k]
    element = pred[k]
    classification_us_data = classification_us_data.append({'organisation_number': int(org_num), 
                                  'registered_charity_number': int(charity[org_num]['registered_charity_number']),
                                  'charity_name':charity[org_num]['charity_name'],
                                  'classification_code_us':element['recommended'],
                                  'classification_description_us':category_map[element['recommended']],
                                  'confidence':0 if element['recommended']=='X' else round(element['probabilities'][element['recommended']],2)},
                                  ignore_index=True)
  
  # save the dataframe at every step so that no progress is lost, note that the code will simply keep overwriting the output file
  classification_us_data.to_csv('classification_us_data.zip', index=False,
          compression=compression_opts)




[524965, 1049252, 524967, 1049253, 1049257, 1049254, 524971, 1049259, 524973, 1049261, 524975, 1049265, 524978, 524979, 524980, 1049266, 5002040, 524984, 524985, 524986]
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...


[Parallel(n_jobs=4)]: Using backend MultiprocessingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    0.3s finished


Predicting categories ...


100%|██████████| 1/1 [00:42<00:00, 42.35s/it]


In [33]:
classification_us_data[:10]

Unnamed: 0,organisation_number,registered_charity_number,charity_name,classification_code_us,classification_description_us,confidence
0,524965,524965,ABERGAVENNY DIVISION GUIDE ASSOCIATION,V,Human Services,1.0
1,1049252,1049252,CHARTERHOUSE CLUB,II,Education,0.97
2,524967,524967,ABERGAVENNY SCOUT GROUP,V,Human Services,1.0
3,1049253,1049253,SOUTH WOOTTON GRANT MAINTAINED JUNIOR SCHOOL A...,VII,"Public, Societal Benefit",0.99
4,1049257,1049257,ST GABRIELS MEDICAL CENTRE CHARITABLE TRUST,IV,Health,0.99
5,1049254,1049254,U3A - BANSTEAD AREA,VII,"Public, Societal Benefit",0.91
6,524971,524971,1ST CEFN FFOREST SCOUT GROUP,V,Human Services,1.0
7,1049259,1049259,NK THEATRE ARTS,I,"Arts, Culture, and Humanities",1.0
8,524973,524973,1ST CAERLEON SCOUT GROUP,V,Human Services,1.0
9,1049261,1049261,THE JOHN DARLOW MEDICAL EDUCATION FUND,V,Human Services,0.99


In [34]:
# predict the classification using charitable object instead of charity activities
# this can be integrated in the block above as well
classification_us_data1 = classification_us_data
activities = []
cobjects = []
codes = []
descp = []
confd = []
ct = 0
for i in classification_us_data1["organisation_number"]:
  print(ct)
  obj = charity_governing_document_data[i]["charitable_objects"]
  cobjects.append(obj)
  activities.append(charity[i]["charity_activities"])
  if(obj==None):
    codes.append("X")
    descp.append("Unknown, Unclassified")
    confd.append(0)
  else:
    pred = npoclass(obj,
              True,
              "drive/My Drive/UK_Data/npoclass_model_bc/",
              'bc',
              batch_size_dl=128)[0]
    codes.append(pred['recommended'])
    descp.append(category_map[pred['recommended']])
    confd.append(round(pred['probabilities'][pred['recommended']],2))
  ct+=1



0
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.90s/it]


1
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.89s/it]


2
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.89s/it]


3
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.89s/it]


4
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.90s/it]


5
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.94s/it]


6
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.95s/it]


7
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.91s/it]


8
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.94s/it]


9
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.90s/it]


10
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.86s/it]


11
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.91s/it]


12
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.95s/it]


13
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.91s/it]


14
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:02<00:00,  2.07s/it]


15
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:02<00:00,  2.03s/it]


16
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.91s/it]


17
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.91s/it]


18
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.92s/it]


19
No GPU acceleration available or gpu_core=False, using CPU.
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:01<00:00,  1.91s/it]


In [35]:
# rename a few columns so that the data from previous data frame inclusing classification using charity activities can be incorporated
classification_us_data1 = classification_us_data1.rename(columns={"classification_code_us":'classification_code1',
                                        "classification_description_us":'classification_description1',
                                        "confidence":"confidence1"})
classification_us_data1['charity_activities'] = activities
classification_us_data1['charitable_objects'] = cobjects
classification_us_data1['classification_code2'] = codes
classification_us_data1['classification_description2'] = descp
classification_us_data1['confidence2'] = confd
classification_us_data1.head()

Unnamed: 0,organisation_number,registered_charity_number,charity_name,classification_code1,classification_description1,confidence1,charity_activities,charitable_objects,classification_code2,classification_description2,confidence2
0,524965,524965,ABERGAVENNY DIVISION GUIDE ASSOCIATION,V,Human Services,1.0,Charitable objects of the Guide Association as...,EDUCATING GIRLS AND YOUNG WOMEN TO HELP THEM D...,V,Human Services,0.99
1,1049252,1049252,CHARTERHOUSE CLUB,II,Education,0.97,To provide an educational facility for pupils ...,TO PROVIDE OR ASSIST IN THE PROVISION OF FACIL...,II,Education,1.0
2,524967,524967,ABERGAVENNY SCOUT GROUP,V,Human Services,1.0,Our charity is under the umbrella of the Scout...,ABERGAVENNY SCOUT GROUP,V,Human Services,1.0
3,1049253,1049253,SOUTH WOOTTON GRANT MAINTAINED JUNIOR SCHOOL A...,VII,"Public, Societal Benefit",0.99,fundraising,TO ADVANCE THE EDUCATION OF THE PUPILS IN THE ...,II,Education,1.0
4,1049257,1049257,ST GABRIELS MEDICAL CENTRE CHARITABLE TRUST,IV,Health,0.99,"General practice, providing services to the co...",THE RELIEF OF SICKNESS AND THE PRESERVATION AN...,IV,Health,1.0


In [36]:
# save the data frame to csv
classification_us_data1.to_csv('classification_us_data_WHOLE.zip', 
                                       index=False,
                                       compression=compression_opts)

In [37]:
# count how many times different category predicitons were given for charity activity and charitable object based classification
dc=0 # difference count
for index, row in classification_us_data1.iterrows():
  if(row['classification_code1'] !=row['classification_code2']):
    dc+=1
print(dc)


7


This might not be an appropriate classification for us since it is not really producing same output based on charity activity and charitable object. Moreover, it does take some time to get the classification from the API so it might take a lot of time to classify all the charities. This is something that can be explored more in future but for now the focus is being shifted to a new classification found below.

### Classification by charityclassification.org.uk - UKCAT - Rules Based Classification
- src - https://charityclassification.org.uk
- refer to the information provided on classification on the mentioned website
- Classification data is already provided, simply download the CSV file from the website and upload it to drive for accessibility.

In [73]:
classification = pd.read_csv('drive/My Drive/UK_Data/Classification/charities_active-ukcat.csv')
classification.head()

Unnamed: 0,org_id,ukcat_code
0,GB-CHC-1000000,ED
1,GB-CHC-1000000,ED102
2,GB-CHC-1000001,AR
3,GB-CHC-1000001,AR104
4,GB-CHC-1000001,AR201


In [74]:
# splitting the first columns into Island, Country and org number

# the first part of an org identifier represents the Island

# second part represents the country :
# CHC - charities in England and Wales
# NIC - charities in northern ireland
# SC - charities in Scotland

# third part is the unique org no.

classification[['Island', 'Country', 'organisation_number']] = classification['org_id'].str.split('-', expand=True)
classification.head()

Unnamed: 0,org_id,ukcat_code,Island,Country,organisation_number
0,GB-CHC-1000000,ED,GB,CHC,1000000
1,GB-CHC-1000000,ED102,GB,CHC,1000000
2,GB-CHC-1000001,AR,GB,CHC,1000001
3,GB-CHC-1000001,AR104,GB,CHC,1000001
4,GB-CHC-1000001,AR201,GB,CHC,1000001


In [75]:
classification.drop("org_id", axis=1, inplace=True)
new_cols = ['Island','Country','organisation_number','ukcat_code']
classification=classification.reindex(columns=new_cols)
print(classification.shape)
print(classification.head())


(780301, 4)
  Island Country organisation_number ukcat_code
0     GB     CHC             1000000         ED
1     GB     CHC             1000000      ED102
2     GB     CHC             1000001         AR
3     GB     CHC             1000001      AR104
4     GB     CHC             1000001      AR201


In [76]:
Counter(classification['Island'])
# all orgs are inside Great Britain

Counter({'GB': 780301})

In [77]:
Counter(classification['Country'])
# CHC - England and Wales
# NIC - Northern Ireland
# SC - Scotland

Counter({'CHC': 632116, 'NIC': 22850, 'SC': 125335})

In [78]:
# focus on charities in England and Wales
classification_england = classification[classification['Country']=="CHC"]
classification_england.shape

(632116, 4)

In [79]:
classification_england = classification_england.rename(columns={'ukcat_code':'Code'})
classification_england.head()

Unnamed: 0,Island,Country,organisation_number,Code
0,GB,CHC,1000000,ED
1,GB,CHC,1000000,ED102
2,GB,CHC,1000001,AR
3,GB,CHC,1000001,AR104
4,GB,CHC,1000001,AR201


In [80]:
# charity classification website also provides detailed description for all the classification codes
codes = pd.read_csv('drive/My Drive/UK_Data/Classification/ukcat-codes.csv')
codes.head()

Unnamed: 0,Code,tag,Category,Subcategory,Level,Notes,Related ICNPTSO code,Regular expression,Exclude regular expression
0,AN,Animals,Animals,,1,Includes animals without their own category. I...,E21;E20,\b(animals?|cats?|rabbits?|cows?|sheep|pigs?|c...,
1,AN101,Cats,Animals,,3,,,\b(cats?|felines?)\b,
2,AN102,Dogs,Animals,,3,,,\b(dogs?|canines?|kennels?|Rhodesian Ridgeback)\b,
3,AN103,Donkeys,Animals,,3,,E21;E29;E20,\b(donkey|donkeys|mule(s)?)\b,
4,AN104,Horses,Animals,,3,,E21;E29;E20,\b(horses?|equine|dressage|(pon(y|ies)+)|horse...,


In [81]:
# merge the previous data frame with code description dataframe
classification_england = classification_england.merge(codes,on="Code",how="left")
classification_england.head()

Unnamed: 0,Island,Country,organisation_number,Code,tag,Category,Subcategory,Level,Notes,Related ICNPTSO code,Regular expression,Exclude regular expression
0,GB,CHC,1000000,ED,Education,Education,,1,,B90,\b(education(al)?)\b,
1,GB,CHC,1000000,ED102,Further education,Education,,3,,B21,\b(colleges?|further education)\b,
2,GB,CHC,1000001,AR,Arts,Arts,,1,Includes crafts,A10,\b(arts?)\b,
3,GB,CHC,1000001,AR104,Visual arts,Arts,,3,Consider relationship with arts,A11,\b(painting|sculpture|sculpting|crafts|craftin...,
4,GB,CHC,1000001,AR201,Film,Arts,Media and publishing,3,,A11,\b(films?|cinema)\b,


In [82]:
# dropping the columns that won't be required
classification_england.drop(['Regular expression','Exclude regular expression'], axis=1, inplace=True)
classification_england.head()


Unnamed: 0,Island,Country,organisation_number,Code,tag,Category,Subcategory,Level,Notes,Related ICNPTSO code
0,GB,CHC,1000000,ED,Education,Education,,1,,B90
1,GB,CHC,1000000,ED102,Further education,Education,,3,,B21
2,GB,CHC,1000001,AR,Arts,Arts,,1,Includes crafts,A10
3,GB,CHC,1000001,AR104,Visual arts,Arts,,3,Consider relationship with arts,A11
4,GB,CHC,1000001,AR201,Film,Arts,Media and publishing,3,,A11


In [83]:
Counter(classification_england['Category'])

Counter({'Animals': 4965,
         'Armed forces': 2695,
         'Arts': 36935,
         'Associations': 47471,
         'Beneficiary group': 99636,
         'Charitable activities': 34723,
         'Charity and VCS support': 31201,
         'Childcare': 10855,
         'Crime and Justice': 1792,
         'Economic and community development': 10726,
         'Education': 114570,
         'Environment': 2616,
         'Facilities': 18876,
         'Health': 30711,
         'Heritage': 11542,
         'Housing': 13328,
         'Leisure': 26808,
         'Professions': 1138,
         'Religion': 78009,
         'Research': 8795,
         'Saving of lives': 2827,
         'Social care': 3391,
         'Social welfare': 35715,
         'Society': 2791})

In [84]:
# add some more data to the classification dataframe such as no. of trustees, 
# latest income, expenditure and registration status
n_trustees = []
latest_income = []
latest_expenditure = []
registration_status = []
ctr = 0
for i,row in classification_england.iterrows():
  if(ctr%20000==0):
    print(ctr,end=" ")
  id = int(row['organisation_number'])
  try:
    n_trustees.append(len(orgs[id]))
  except:
    n_trustees.append(None)
  try:
    latest_income.append(charity[id]['latest_income'])
    latest_expenditure.append(charity[id]['latest_expenditure'])
    registration_status.append(charity[id]['charity_registration_status'])
  except:
    latest_income.append(0)
    latest_expenditure.append(0)
    registration_status.append("Unknown")
  ctr+=1

0 20000 40000 60000 80000 100000 120000 140000 160000 180000 200000 220000 240000 260000 280000 300000 320000 340000 360000 380000 400000 420000 440000 460000 480000 500000 520000 540000 560000 580000 600000 620000 

In [85]:
# update the dataframe
classification_england['num_trustees'] = n_trustees
classification_england['latest_income'] = latest_income
classification_england['latest_expenditure'] = latest_expenditure
classification_england['registration_status'] = registration_status
classification_england.head()

Unnamed: 0,Island,Country,organisation_number,Code,tag,Category,Subcategory,Level,Notes,Related ICNPTSO code,num_trustees,latest_income,latest_expenditure,registration_status
0,GB,CHC,1000000,ED,Education,Education,,1,,B90,9.0,101992.0,349277.0,Registered
1,GB,CHC,1000000,ED102,Further education,Education,,3,,B21,9.0,101992.0,349277.0,Registered
2,GB,CHC,1000001,AR,Arts,Arts,,1,Includes crafts,A10,11.0,91129.0,131934.0,Registered
3,GB,CHC,1000001,AR104,Visual arts,Arts,,3,Consider relationship with arts,A11,11.0,91129.0,131934.0,Registered
4,GB,CHC,1000001,AR201,Film,Arts,Media and publishing,3,,A11,11.0,91129.0,131934.0,Registered


In [86]:
Counter(classification_england['registration_status'])

Counter({'Registered': 242395, 'Removed': 20, 'Unknown': 389701})

In [87]:
#filtering out charities with "Removed" or "Unknown" registration status from the dataframe
classification_england_filtered = classification_england[classification_england['registration_status'] == 'Registered']
print(classification_england_filtered.shape)

(242395, 14)


In [88]:
#adding a column with each entry 1 so that the nubmber of charities in each category can be counted at the time of aggregation using groupby
rws = classification_england_filtered.shape[0]
nums = [1 for i in range(rws)]
classification_england_filtered['total_organisations'] = nums
classification_england_filtered.head()

Unnamed: 0,Island,Country,organisation_number,Code,tag,Category,Subcategory,Level,Notes,Related ICNPTSO code,num_trustees,latest_income,latest_expenditure,registration_status,total_organisations
0,GB,CHC,1000000,ED,Education,Education,,1,,B90,9.0,101992.0,349277.0,Registered,1
1,GB,CHC,1000000,ED102,Further education,Education,,3,,B21,9.0,101992.0,349277.0,Registered,1
2,GB,CHC,1000001,AR,Arts,Arts,,1,Includes crafts,A10,11.0,91129.0,131934.0,Registered,1
3,GB,CHC,1000001,AR104,Visual arts,Arts,,3,Consider relationship with arts,A11,11.0,91129.0,131934.0,Registered,1
4,GB,CHC,1000001,AR201,Film,Arts,Media and publishing,3,,A11,11.0,91129.0,131934.0,Registered,1


In [89]:
# aggregate all the data into tags
groupedbyTag = classification_england_filtered.groupby('tag').agg({'total_organisations':'sum',
                                                                   'num_trustees':'sum',
                                                                   'latest_income':'sum',
                                                                   'latest_expenditure':'sum'})

groupedbyTag.head()

Unnamed: 0_level_0,total_organisations,num_trustees,latest_income,latest_expenditure
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Abuse,131,768.0,140871700.0,130211600.0
Accommodation,2677,15644.0,4641294000.0,4635481000.0
Addiction and dependency,148,970.0,596759400.0,597226900.0
Adult day care,313,1913.0,630637900.0,590020600.0
Adult education,98,705.0,173887000.0,163925000.0


In [90]:
# aggregate all data into categories
groupedbyCategory = classification_england_filtered.groupby('Category').agg({'total_organisations':'sum',
                                                                             'num_trustees':'sum',
                                                                             'latest_income':'sum',
                                                                             'latest_expenditure':'sum'})

groupedbyCategory.head()

Unnamed: 0_level_0,total_organisations,num_trustees,latest_income,latest_expenditure
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Animals,1543,9295.0,1719326000.0,1798192000.0
Armed forces,1158,6978.0,1196735000.0,1289832000.0
Arts,14595,91011.0,8888893000.0,8863788000.0
Associations,25878,131342.0,3094910000.0,3020831000.0
Beneficiary group,36824,194147.0,26710140000.0,26320540000.0


In [56]:
# save the dataframes as CSVs
compression_opts1 = dict(method='zip',archive_name='gropuedbyTag.csv')
compression_opts2 = dict(method='zip',archive_name='gropuedbyCategory.csv')
groupedbyTag.to_csv('groupedbyTag.zip', index=True, compression=compression_opts1)
groupedbyCategory.to_csv('groupedbyCategory.zip', index=True, compression=compression_opts2)

In [57]:
compression_opts = dict(method='zip',
                        archive_name='classification_england_filtered.csv')
classification_england_filtered.to_csv('classification_england_filtered.zip', index=False, compression=compression_opts)

In [58]:
compression_opts = dict(method='zip',
                        archive_name='classification_england_whole.csv')
classification_england.to_csv('classification_england_whole.zip', index=False, compression=compression_opts)

In [91]:
#REMOVING CATEGORY DUPLICATES FOR AN ORGANISATION
# there are rows in the main dataframe where the org no. and category is same but the the tags or sub categories are different
# we need to account for them while groupong by categories as it will produce false results if not taken into account
classification_england_filtered_category = classification_england_filtered.drop_duplicates(subset =["organisation_number","Category"],
                                                                                  keep = "first", inplace = False)
print(classification_england_filtered_category.shape)
classification_england_filtered_category.head()

(173125, 15)


Unnamed: 0,Island,Country,organisation_number,Code,tag,Category,Subcategory,Level,Notes,Related ICNPTSO code,num_trustees,latest_income,latest_expenditure,registration_status,total_organisations
0,GB,CHC,1000000,ED,Education,Education,,1,,B90,9.0,101992.0,349277.0,Registered,1
2,GB,CHC,1000001,AR,Arts,Arts,,1,Includes crafts,A10,11.0,91129.0,131934.0,Registered,1
7,GB,CHC,1000002,BE200,People with disabilities,Beneficiary group,People with disabilities,2,"Includes 'disability', see also separate categ...",D13;G16,2.0,0.0,0.0,Registered,1
8,GB,CHC,1000002,SW105,Individual poverty,Social welfare,,3,Includes financial disadvantage,G11;D19,2.0,0.0,0.0,Registered,1
9,GB,CHC,1000003,AR,Arts,Arts,,1,Includes crafts,A10,4.0,16443.0,25269.0,Registered,1


In [60]:
classification_england_filtered_category.to_csv('classification_england_filtered_RemovedCategoryDuplicates.zip', index=False, compression=compression_opts)

In [92]:
#Grouping by category again after removing the duplicates
groupedbyCategory_filtered = classification_england_filtered_category.groupby('Category').agg({'total_organisations':'sum',
                                                                             'num_trustees':'sum',
                                                                             'latest_income':'sum',
                                                                             'latest_expenditure':'sum'})

groupedbyCategory_filtered.head()

Unnamed: 0_level_0,total_organisations,num_trustees,latest_income,latest_expenditure
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Animals,1098,6773.0,1239507000.0,1271909000.0
Armed forces,845,5014.0,938681400.0,991947200.0
Arts,9058,54968.0,6286894000.0,6269809000.0
Associations,20680,105866.0,2813041000.0,2751299000.0
Beneficiary group,25262,132958.0,15801360000.0,15652300000.0


In [62]:
compression_opts111 = dict(method='zip',archive_name='gropuedbyCategory_filtered.csv')
groupedbyCategory_filtered.to_csv('groupedbyCategory_filtered.zip', index=True, compression=compression_opts111)

In [93]:
#Arranged in decreasing number of total organisations
groupedbyCategory_filtered.sort_values(by=['total_organisations'],ascending=False)


Unnamed: 0_level_0,total_organisations,num_trustees,latest_income,latest_expenditure
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Beneficiary group,25262,132958.0,15801360000.0,15652300000.0
Education,24937,133785.0,18634560000.0,18193860000.0
Associations,20680,105866.0,2813041000.0,2751299000.0
Charity and VCS support,11794,54377.0,4006932000.0,4007274000.0
Religion,11426,58041.0,5765834000.0,5555539000.0
Charitable activities,11046,58894.0,7683247000.0,8632737000.0
Facilities,10499,57487.0,779332800.0,737069300.0
Social welfare,9857,48476.0,4755827000.0,4814662000.0
Arts,9058,54968.0,6286894000.0,6269809000.0
Leisure,9046,48137.0,2001227000.0,2001660000.0


We have 24 different categories with highest number of organisations in the 'Beneficiary Group' category followed by Education, Associations, Charity and VCS support etc.

### Classification by charityclassification.org.uk - ICNPTSO - ML Classifier

This is something that can be explored in future if required

### Creating Clean Dataset in Graph Object Form

In [94]:
import networkx as nx

In [95]:
classification_england_filtered_category.reset_index(drop=True,inplace=True)
print(classification_england_filtered_category.shape)
classification_england_filtered_category.head(5)

(173125, 15)


Unnamed: 0,Island,Country,organisation_number,Code,tag,Category,Subcategory,Level,Notes,Related ICNPTSO code,num_trustees,latest_income,latest_expenditure,registration_status,total_organisations
0,GB,CHC,1000000,ED,Education,Education,,1,,B90,9.0,101992.0,349277.0,Registered,1
1,GB,CHC,1000001,AR,Arts,Arts,,1,Includes crafts,A10,11.0,91129.0,131934.0,Registered,1
2,GB,CHC,1000002,BE200,People with disabilities,Beneficiary group,People with disabilities,2,"Includes 'disability', see also separate categ...",D13;G16,2.0,0.0,0.0,Registered,1
3,GB,CHC,1000002,SW105,Individual poverty,Social welfare,,3,Includes financial disadvantage,G11;D19,2.0,0.0,0.0,Registered,1
4,GB,CHC,1000003,AR,Arts,Arts,,1,Includes crafts,A10,4.0,16443.0,25269.0,Registered,1


In [96]:
#categorywise classification, no category duplicates
# `classification_england_filtered_category` includes the individual organisation data classified into unique categories
# we are not concerned with tags or sub categories right now
# it is still possible that a single organiastion is classified into multiple categories and we want to all of those in the final classification
final_classification = {}
for i,row in classification_england_filtered_category.iterrows():
  nn = row['organisation_number']
  if(nn not in final_classification.keys()):
    final_classification[nn] = [row['Category']]
  else:
    final_classification[nn].append(row['Category']) # store all the categories inside an array

# the classification categories for an organisation will be accessible by the org number


In [97]:
fc=0
for i in final_classification.items():
  print(i)
  fc+=1
  if fc==10:
    break

('1000000', ['Education'])
('1000001', ['Arts'])
('1000002', ['Beneficiary group', 'Social welfare'])
('1000003', ['Arts', 'Charitable activities', 'Education', 'Heritage', 'Social welfare'])
('1000005', ['Facilities', 'Religion'])
('1000007', ['Associations', 'Charitable activities', 'Education'])
('1000008', ['Associations'])
('1000009', ['Charity and VCS support'])
('1000013', ['Beneficiary group', 'Charity and VCS support', 'Health'])
('1000019', ['Associations', 'Facilities', 'Leisure'])


In [106]:
# fetch all metadata of a particular charity if its registration status is - 'registered'
# they keys in the below dictionary are registered charity numbers and not the organisation number
final_charity = {}
for i in charity.keys():
  if(charity[i]['charity_registration_status']=="Registered"):
    # id = charity[i]['registered_charity_number']
    final_charity[i] = charity[i]
print(len(final_charity.keys()))
print(json.dumps(list(final_charity.items())[0],indent=3))

185736
[
   2,
   {
      "date_of_extract": "2022-01-11T00:00:00",
      "organisation_number": 2,
      "registered_charity_number": 200027,
      "linked_charity_number": 2,
      "charity_name": "HITCHAM FREE CHURCH",
      "charity_type": null,
      "charity_registration_status": "Registered",
      "date_of_registration": "1962-05-17T00:00:00",
      "date_of_removal": null,
      "charity_reporting_status": null,
      "latest_acc_fin_period_start_date": null,
      "latest_acc_fin_period_end_date": null,
      "latest_income": null,
      "latest_expenditure": null,
      "charity_contact_address1": null,
      "charity_contact_address2": null,
      "charity_contact_address3": null,
      "charity_contact_address4": null,
      "charity_contact_address5": null,
      "charity_contact_postcode": null,
      "charity_contact_phone": null,
      "charity_contact_email": null,
      "charity_contact_web": null,
      "charity_company_registration_number": null,
      "charity_ins

In [107]:
# check if classification of all the charities is available in the dataset from charityclassification.org
ne = []
for i in final_charity.keys():
  try:
    temp = final_classification[str(i)]
  except:
    ne.append(i)

print(len(final_charity.keys()))
print(len(ne))

185736
117681


- 117681 organisation seem to have no classification data from charityclassification.org
- 185736 - 117681 = 68055 charities have classifications
- need to figure it out, why are the classifications missing? 

In [110]:
# we need no. of trustees and trustee list for every organisation
# trustees are added in a list for corresponsing charity
orgs_trustees = {}
for i in orgs:
  for j in orgs[i]:
    if(i not in orgs_trustees.keys()):
      orgs_trustees[i] = [(j['trustee_id'],j['trustee_name'])]
    else:
      orgs_trustees[i].append((j['trustee_id'],j['trustee_name']))

list(orgs_trustees.items())[0]

(521013, [(23760, 'AUGHTON PARISH COUNCIL')])

In [117]:
# we need number of orgs and org list for every trustee
# orgs are added into a list for every trustee
trustee_orgs = {}
for i in members:
  for j in members[i]:
    id = j['organisation_number']
    if i not in trustee_orgs.keys():
      trustee_orgs[i] = [(id,charity[id]['charity_name'])]
    else:
      trustee_orgs[i].append((id,charity[id]['charity_name']))

list(trustee_orgs.items())[0]

(23760, [(521013, 'RACHEL TAYLOR FOR A PLAYING FIELD')])

In [111]:
trusteeList = members.keys()
orgList = orgs.keys()
print(len(trusteeList),len(set(trusteeList)))
print(len(orgList),len(set(orgList)))

851555 851555
170190 170190


In [112]:
common = set(trusteeList).intersection(set(orgList))
print(len(common))

1073


This might create conflict since we are adding nodes to the network by their ids and if trustees and orgs share some common ids then it will create conflicts and data will be lost.

In [130]:
#createing clean graph object dataset
G = nx.MultiGraph()
count = 0 
for i in orgs:
  count+=1
  G.add_node(i)
  # copy metadata
  attribute_dict = final_charity[i].copy()
  try:
    attribute_dict['Classification'] = final_classification[str(i)]
  except:
    # add an empty list if no classification is found
    attribute_dict['Classification'] = []
  
  # add no. of trustees
  attribute_dict['num_trustee'] = len(orgs[i])
  attribute_dict['type'] = 0 #denoting the node type as organisation

  # add trustee list as attribute to the org node
  attribute_dict['trustees'] = orgs_trustees[i]

  # node id is the org number
  # setting attributes for org i
  attrs = {i:attribute_dict}
  nx.set_node_attributes(G, attrs)

  # add trustee data
  for j in orgs_trustees[i]:
    #if its a common id
    if(j[0] in common):
      # add T at the end of the id so that it can be added as a separate node
      node_id = str(j[0])+'T'
    else:
      node_id = j[0]
    # add node to the network
    G.add_node(node_id)
    nattrs = {node_id:{"Name":j[1],"type":1,"organisations":trustee_orgs[j[0]]}} #1 denotes node type as trustee
    nx.set_node_attributes(G, nattrs)
  if(count%10000==0):
    print(count,end=" ")

10000 20000 30000 40000 50000 60000 70000 80000 90000 100000 110000 120000 130000 140000 150000 160000 170000 

In [131]:
G.number_of_nodes()

1021745

In [132]:
#Adding edges
for p in members.keys():
  # modify the node id if its a common id
  if p in common:
    node_id = str(p)+'T'
  else:
    node_id = p
  if(node_id in G.nodes(0)):
    boards = []
    for j in members[p]:
      if(j['organisation_number'] in G.nodes()):
        boards.append(j['organisation_number'])
    res = [ (node_id,sub) for sub in boards ]
    G.add_edges_from(res)

In [135]:
G.nodes[23760]

{'Name': 'AUGHTON PARISH COUNCIL',
 'organisations': [(521013, 'RACHEL TAYLOR FOR A PLAYING FIELD')],
 'type': 1}

In [136]:
G.nodes[3123875]

{'Classification': [],
 'charity_activities': 'Making of grants.',
 'charity_company_registration_number': None,
 'charity_contact_address1': 'RUSHMOOR BOROUGH COUNCIL',
 'charity_contact_address2': 'COUNCIL OFFICES',
 'charity_contact_address3': 'FARNBOROUGH ROAD',
 'charity_contact_address4': 'FARNBOROUGH',
 'charity_contact_address5': None,
 'charity_contact_email': 'legal@rushmoor.gov.uk',
 'charity_contact_phone': '01252398600',
 'charity_contact_postcode': 'GU14 7JU',
 'charity_contact_web': None,
 'charity_gift_aid': False,
 'charity_has_land': False,
 'charity_in_administration': False,
 'charity_insolvent': False,
 'charity_is_cdf_or_cif': None,
 'charity_is_cio': False,
 'charity_name': 'ALFRED HENRY GOODE WILL TRUST',
 'charity_previously_excepted': False,
 'charity_registration_status': 'Registered',
 'charity_reporting_status': 'Submission Overdue',
 'charity_type': 'Other',
 'cio_is_dissolved': False,
 'date_cio_dissolution_notice': None,
 'date_of_extract': '2022-01-11T0

In [137]:
import pickle
# export the graph data to a pickle file
with open("dataset.pickle", 'wb') as f:
    pickle.dump(G, f)


In [138]:
import pickle
# this is how data can be loaded from the saved pickle file for future use
# note that the data will be loaded as a networkx multigraph since that was the format it was initially exported in
data = pickle.load(open("dataset.pickle", "rb"))

In [142]:
type(data)

networkx.classes.multigraph.MultiGraph