-*- coding: utf-8 -*-

**Created on Thu Aug 23, 2020**

**@author:** Rohitashwa

**@Credits:** loren ( https://github.com/jjeffus/gofundme)

# Initialise

In [1]:
import numpy as np
import pandas as pd
import time
import datetime

In [2]:
%store -z

In [3]:
%run ../Utilities/ScrapeUtils.ipynb

Stored 'headers' (list)


## Scrape HomePage to identify all fundraiser wepage URLs

**NOTE:**
The following categories' webpage URLs do not follow the format specifies in the code:
- Animals: 
    Url: ~animals-fundraiser~ -> animal-fundraiser
    
- NonProfits:
    Url: ~nonprofits-fundraiser~ -> charity-fundraiser
    
**RESCRAPE:**
The following categories need to be rescraped due to insufficient (<1,000) fundraiser count:

In [4]:
all_cats = fetch_category_URLs()

all_cats.remove('Nonprofit')
all_cats.remove('Animals')
all_cats.extend(['Charity','Animal'])
# all_cats

In [5]:
#make url_categories
categories_urls = list(map(lambda x:{x:'https://www.gofundme.com/discover/{}-fundraiser'.format(x.lower())}, all_cats))
categories_urls

[{'Medical': 'https://www.gofundme.com/discover/medical-fundraiser'},
 {'Memorial': 'https://www.gofundme.com/discover/memorial-fundraiser'},
 {'Emergency': 'https://www.gofundme.com/discover/emergency-fundraiser'},
 {'Education': 'https://www.gofundme.com/discover/education-fundraiser'},
 {'Environment': 'https://www.gofundme.com/discover/environment-fundraiser'},
 {'Business': 'https://www.gofundme.com/discover/business-fundraiser'},
 {'Community': 'https://www.gofundme.com/discover/community-fundraiser'},
 {'Competition': 'https://www.gofundme.com/discover/competition-fundraiser'},
 {'Creative': 'https://www.gofundme.com/discover/creative-fundraiser'},
 {'Event': 'https://www.gofundme.com/discover/event-fundraiser'},
 {'Faith': 'https://www.gofundme.com/discover/faith-fundraiser'},
 {'Family': 'https://www.gofundme.com/discover/family-fundraiser'},
 {'Sports': 'https://www.gofundme.com/discover/sports-fundraiser'},
 {'Travel': 'https://www.gofundme.com/discover/travel-fundraiser'},


## Fetch All Fundraiser URLs from category homepage

In [6]:
def list_urls(params):
    print(params)
    categoryURLs = params[0]
    moreGFMclicks = params[1]
    GFM_urls = []
    for url_pair in categoryURLs:
        category = url_pair
        url = categoryURLs[url_pair]
        print("Currently Scraping: "+url)
        t_init = time.time()
        GFM_urls.append([extract_urls_from_categories(url, MoreGFMclicks = moreGFMclicks), category])#get category from categories_urls
        t_fin = time.time() - t_init
    print("All done for "+category+" category in "+str(t_fin)+" m-sec!\n")
    return(GFM_urls)

## Sequential Mode

In [7]:
# #set number of click limit (max 90)
# clickLimit = 0
# return_value = [(list_urls((item,clickLimit))) for item in categories_urls]

# # flatten list

# GFM_Urls_long = []
# for k in return_value:
#         GFM_Urls_long.extend(k)

## Multithreaded Mode

In [8]:
import concurrent.futures

#set number of click limit (max 90)
clickLimit = 90
GFM_Urls_long = []

t0= time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    futures = [executor.submit(list_urls, (item,clickLimit)) for item in categories_urls]
    return_value = [f.result() for f in futures]

    rowCount = 1
    for k in return_value:
        GFM_Urls_long.extend(k)
#         rowCount = rowCount + batch_size

t1= time.time() -t0
print("\nTime-Elapsed total: "+str(t1))
print(len(GFM_Urls_long))
# GFM_Urls_long

({'Medical': 'https://www.gofundme.com/discover/medical-fundraiser'}, 90)({'Memorial': 'https://www.gofundme.com/discover/memorial-fundraiser'}, 90)

Currently Scraping: https://www.gofundme.com/discover/memorial-fundraiserCurrently Scraping: https://www.gofundme.com/discover/medical-fundraiser

memorial-fundraiser: Succesful click 1
medical-fundraiser: Succesful click 1
memorial-fundraiser: Succesful click 2
medical-fundraiser: Succesful click 2
memorial-fundraiser: Succesful click 3
medical-fundraiser: Succesful click 3
memorial-fundraiser: Succesful click 4
medical-fundraiser: Succesful click 4
memorial-fundraiser: Succesful click 5
medical-fundraiser: Succesful click 5
memorial-fundraiser: Succesful click 6
medical-fundraiser: Succesful click 6
memorial-fundraiser: Succesful click 7
medical-fundraiser: Succesful click 7
memorial-fundraiser: Succesful click 8
medical-fundraiser: Succesful click 8
memorial-fundraiser: Succesful click 9
medical-fundraiser: Succesful click 9
memorial-f

education-fundraiser: Succesful click 11
emergency-fundraiser: Succesful click 12
education-fundraiser: Succesful click 12
emergency-fundraiser: Succesful click 13
education-fundraiser: Succesful click 13
emergency-fundraiser: Succesful click 14
education-fundraiser: Succesful click 14
emergency-fundraiser: Succesful click 15
education-fundraiser: Succesful click 15
emergency-fundraiser: Succesful click 16
education-fundraiser: Succesful click 16
emergency-fundraiser: Succesful click 17
education-fundraiser: Succesful click 17
emergency-fundraiser: Succesful click 18
education-fundraiser: Succesful click 18
emergency-fundraiser: Succesful click 19
education-fundraiser: Succesful click 19
emergency-fundraiser: Succesful click 20
education-fundraiser: Succesful click 20
emergency-fundraiser: Succesful click 21
education-fundraiser: Succesful click 21
emergency-fundraiser: Succesful click 22
education-fundraiser: Succesful click 22
emergency-fundraiser: Succesful click 23
education-fundra

business-fundraiser: Succesful click 17
environment-fundraiser: Succesful click 27
business-fundraiser: Succesful click 18
environment-fundraiser: Succesful click 28
business-fundraiser: Succesful click 19
environment-fundraiser: Succesful click 29
business-fundraiser: Succesful click 20
environment-fundraiser: Succesful click 30
business-fundraiser: Succesful click 21
environment-fundraiser: Succesful click 31
business-fundraiser: Succesful click 22
environment-fundraiser: Succesful click 32
business-fundraiser: Succesful click 23
environment-fundraiser: Succesful click 33
business-fundraiser: Succesful click 24
business-fundraiser: Succesful click 25
environment-fundraiser: Succesful click 34
business-fundraiser: Succesful click 26
environment-fundraiser: Succesful click 35
business-fundraiser: Succesful click 27
environment-fundraiser: Succesful click 36
business-fundraiser: Succesful click 28
environment-fundraiser: Succesful click 37
business-fundraiser: Succesful click 29
environ

community-fundraiser: Succesful click 43
competition-fundraiser: Succesful click 20
community-fundraiser: Succesful click 44
competition-fundraiser: Succesful click 21
community-fundraiser: Succesful click 45
competition-fundraiser: Succesful click 22
competition-fundraiser: Succesful click 23
community-fundraiser: Succesful click 46
competition-fundraiser: Succesful click 24
competition-fundraiser: Succesful click 25
community-fundraiser: Succesful click 47
competition-fundraiser: Succesful click 26
community-fundraiser: Succesful click 48
competition-fundraiser: Succesful click 27
competition-fundraiser: Succesful click 28
community-fundraiser: Succesful click 49
competition-fundraiser: Succesful click 29
competition-fundraiser: Succesful click 30
community-fundraiser: Succesful click 50
competition-fundraiser: Succesful click 31
community-fundraiser: Succesful click 51
competition-fundraiser: Succesful click 32
competition-fundraiser: Succesful click 33
community-fundraiser: Succesf

creative-fundraiser: Succesful click 53
event-fundraiser: Succesful click 33
creative-fundraiser: Succesful click 54
event-fundraiser: Succesful click 34
event-fundraiser: Succesful click 35
creative-fundraiser: Succesful click 55
event-fundraiser: Succesful click 36
event-fundraiser: Succesful click 37
creative-fundraiser: Succesful click 56
event-fundraiser: Succesful click 38
creative-fundraiser: Succesful click 57
event-fundraiser: Succesful click 39
creative-fundraiser: Succesful click 58
event-fundraiser: Succesful click 40
event-fundraiser: Succesful click 41
creative-fundraiser: Succesful click 59
event-fundraiser: Succesful click 42
event-fundraiser: Succesful click 43
creative-fundraiser: Succesful click 60
event-fundraiser: Succesful click 44
creative-fundraiser: Succesful click 61
event-fundraiser: Succesful click 45
creative-fundraiser: Succesful click 62
event-fundraiser: Succesful click 46
creative-fundraiser: Succesful click 63
event-fundraiser: Succesful click 47
event

family-fundraiser: Succesful click 53
family-fundraiser: Succesful click 54
faith-fundraiser: Succesful click 72
family-fundraiser: Succesful click 55
faith-fundraiser: Succesful click 73
family-fundraiser: Succesful click 56
faith-fundraiser: Succesful click 74
family-fundraiser: Succesful click 57
faith-fundraiser: Succesful click 75
family-fundraiser: Succesful click 58
faith-fundraiser: Succesful click 76
family-fundraiser: Succesful click 59
faith-fundraiser: Succesful click 77
family-fundraiser: Succesful click 60
faith-fundraiser: Succesful click 78
family-fundraiser: Succesful click 61
family-fundraiser: Succesful click 62
faith-fundraiser: Succesful click 79
family-fundraiser: Succesful click 63
faith-fundraiser: Succesful click 80
family-fundraiser: Succesful click 64
faith-fundraiser: Succesful click 81
family-fundraiser: Succesful click 65
faith-fundraiser: Succesful click 82
family-fundraiser: Succesful click 66
faith-fundraiser: Succesful click 83
family-fundraiser: Succe

volunteer-fundraiser: Succesful click 5
volunteer-fundraiser: Succesful click 6
travel-fundraiser: Succesful click 67
volunteer-fundraiser: Succesful click 7
volunteer-fundraiser: Succesful click 8
volunteer-fundraiser: Succesful click 9
travel-fundraiser: Succesful click 68
volunteer-fundraiser: Succesful click 10
volunteer-fundraiser: Succesful click 11
travel-fundraiser: Succesful click 69
volunteer-fundraiser: Succesful click 12
volunteer-fundraiser: Succesful click 13
travel-fundraiser: Succesful click 70
volunteer-fundraiser: Succesful click 14
volunteer-fundraiser: Succesful click 15
volunteer-fundraiser: Succesful click 16
travel-fundraiser: Succesful click 71
volunteer-fundraiser: Succesful click 17
volunteer-fundraiser: Succesful click 18
volunteer-fundraiser: Succesful click 19
travel-fundraiser: Succesful click 72
volunteer-fundraiser: Succesful click 20
volunteer-fundraiser: Succesful click 21
volunteer-fundraiser: Succesful click 22
travel-fundraiser: Succesful click 73
v

charity-fundraiser: Succesful click 28
wishes-fundraiser: Succesful click 74
charity-fundraiser: Succesful click 29
charity-fundraiser: Succesful click 30
wishes-fundraiser: Succesful click 75
charity-fundraiser: Succesful click 31
charity-fundraiser: Succesful click 32
charity-fundraiser: Succesful click 33
wishes-fundraiser: Succesful click 76
charity-fundraiser: Succesful click 34
charity-fundraiser: Succesful click 35
wishes-fundraiser: Succesful click 77
charity-fundraiser: Succesful click 36
charity-fundraiser: Succesful click 37
charity-fundraiser: Succesful click 38
wishes-fundraiser: Succesful click 78
charity-fundraiser: Succesful click 39
charity-fundraiser: Succesful click 40
wishes-fundraiser: Succesful click 79
charity-fundraiser: Succesful click 41
charity-fundraiser: Succesful click 42
wishes-fundraiser: Succesful click 80
charity-fundraiser: Succesful click 43
charity-fundraiser: Succesful click 44
charity-fundraiser: Succesful click 45
wishes-fundraiser: Succesful cli

## Flatten Result List

In [9]:
# GFM_Urls_long

In [10]:
mydf = pd.DataFrame(columns = ["Url", "Category","Position"])
for cat in GFM_Urls_long:
    
    temp_val = np.array(list(cat[0].values()))
    temp_key = np.array(list(cat[0].keys()))
    temp_category = np.repeat(cat[1], len(cat[0]))
    
    temp_df = pd.DataFrame(columns = ["Url", "Category", "Position"])
    temp_df["Position"] = temp_val
    temp_df["Category"] = temp_category
    temp_df["Url"] = temp_key
    
    mydf = mydf.append(temp_df, ignore_index = True)
mydf.Category.value_counts()

Emergency      1000
Family         1000
Charity        1000
Business       1000
Medical        1000
Animal         1000
Faith          1000
Memorial       1000
Community      1000
Travel         1000
Volunteer      1000
Wishes         1000
Environment    1000
Competition    1000
Sports         1000
Event          1000
Creative       1000
Education      1000
Name: Category, dtype: int64

## Save to Mongo

In [11]:
from pymongo import MongoClient
import json

databaseName = "GFM_url_list_{0}".format(datetime.datetime.now().strftime("%x").replace("/","_"))

client = MongoClient('mongodb://localhost:27017')
table = client.GFM[databaseName]

%store databaseName
databaseName

Stored 'databaseName' (str)


'GFM_url_list_12_27_20'

In [12]:
for index,record in mydf.iterrows():
    table.insert_one(record.to_dict())

## Save to file

In [13]:
fileName = 'data/{0}.csv'.format(databaseName)
mydf.to_csv(fileName, sep='\t', index = False, header = False, mode = 'a')

In [14]:
mydf = pd.read_csv(fileName, sep='\t')
mydf

Unnamed: 0,https://www.gofundme.com/f/28b1bckz6o,Medical,0
0,https://www.gofundme.com/f/face-and-acne-treat...,Medical,0
1,https://www.gofundme.com/f/donation-for-uterus...,Medical,0
2,https://www.gofundme.com/f/money-to-buy-hearin...,Medical,1
3,https://www.gofundme.com/f/ht4vqf-eye-surgery,Medical,1
4,https://www.gofundme.com/f/tqtuw-self-employment,Medical,1
...,...,...,...
17994,https://www.gofundme.com/f/oliverslittletinyba...,Animal,331
17995,https://www.gofundme.com/f/wk5vm-el-hogar-halo...,Animal,332
17996,https://www.gofundme.com/f/help-support-lucys-...,Animal,332
17997,https://www.gofundme.com/f/help-rebuild-dark-h...,Animal,332


**NOTE:**
The data is split over the following files:
- data/GFM_url_7k.csv(Scraped; uploaded)
- data/GFM_url2_3k.csv
- data/GFM_url3_2k.csv

## Combining all CSVs 

In [None]:
# import pandas as pd
# import numpy as np

In [None]:
# fileList = [
#     "data/GFM_url_business_1k.csv",
#     "data/GFM_url_emergency_1k.csv",
#     "data/GFM_url_medical_1k.csv",
#     "data/GFM_url_memorialEducation_1k.csv",
#     'data/GFM_url_volunteer_1k.csv',
#     "data/GFM_url_competition_1k.csv"
# ]

In [None]:
# final = pd.DataFrame()
# for item in fileList:
#     x = pd.read_csv(item,sep = '\t')
#     print(len(x))
#     final = pd.concat([final,x], ignore_index = True)
    
# final.drop(['Unnamed: 0'],axis = 1, inplace = True)
# final.tail()

In [None]:
# final.to_csv('data/GFM_url_compiled.csv', sep='\t', index = False)

In [None]:
# pd.read_csv('data/GFM_url_compiled.csv', sep='\t')