-*- coding: utf-8 -*-

**Created on Thu Aug 23, 2020**

**@author:** Rohitashwa

**@Credits:** loren ( https://github.com/jjeffus/gofundme)

# Initialise

In [13]:
import numpy as np
import pandas as pd
import time
import datetime

In [5]:
# Clear every variable previously persisted with %store, so values saved by an
# earlier run (e.g. databaseName, stored further down) are not picked up by accident.
%store -z

no stored variable or alias databaseName


In [14]:
# Execute the shared scraping-utilities notebook inside this kernel.
# NOTE(review): fetch_category_URLs and extract_urls_from_categories are called
# below but never defined in this notebook -- presumably they come from this
# utilities notebook; confirm before moving or renaming it.
%run ./Utilities/ScrapeUtils.ipynb

Stored 'headers' (list)


## Scrape HomePage to identify all fundraiser webpage URLs

**NOTE:**
The following categories' webpage URLs do not follow the format specified in the code:
- Animals: 
    Url: ~animals-fundraiser~ -> animal-fundraiser
    
- NonProfits:
    Url: ~nonprofits-fundraiser~ -> charity-fundraiser
    
**RESCRAPE:**
The following categories need to be rescraped due to insufficient (<1,000) fundraiser count:

In [15]:
# Start from the category names reported by the scraping helper.
all_cats = fetch_category_URLs()

# These two names do not match the slug used in their discover-page URL,
# so drop them and add the names whose lower-cased form is the real slug.
for listed_name in ('Nonprofit', 'Animals'):
    all_cats.remove(listed_name)
all_cats.extend(('Charity', 'Animal'))

In [16]:
# Build one single-entry {category name: discover-page URL} dict per category;
# the URL slug is the lower-cased category name.
categories_urls = [
    {cat_name: 'https://www.gofundme.com/discover/{}-fundraiser'.format(cat_name.lower())}
    for cat_name in all_cats
]
categories_urls

[{'Medical': 'https://www.gofundme.com/discover/medical-fundraiser'},
 {'Memorial': 'https://www.gofundme.com/discover/memorial-fundraiser'},
 {'Emergency': 'https://www.gofundme.com/discover/emergency-fundraiser'},
 {'Education': 'https://www.gofundme.com/discover/education-fundraiser'},
 {'Environment': 'https://www.gofundme.com/discover/environment-fundraiser'},
 {'Business': 'https://www.gofundme.com/discover/business-fundraiser'},
 {'Community': 'https://www.gofundme.com/discover/community-fundraiser'},
 {'Competition': 'https://www.gofundme.com/discover/competition-fundraiser'},
 {'Creative': 'https://www.gofundme.com/discover/creative-fundraiser'},
 {'Event': 'https://www.gofundme.com/discover/event-fundraiser'},
 {'Faith': 'https://www.gofundme.com/discover/faith-fundraiser'},
 {'Family': 'https://www.gofundme.com/discover/family-fundraiser'},
 {'Sports': 'https://www.gofundme.com/discover/sports-fundraiser'},
 {'Travel': 'https://www.gofundme.com/discover/travel-fundraiser'},


## Fetch All Fundraiser URLs from category homepage

In [17]:
def list_urls(params):
    """Scrape the fundraiser URLs for every category page in one work item.

    Args:
        params: two-item sequence ``(categoryURLs, moreGFMclicks)``.
            ``categoryURLs`` maps a category name to its discover-page URL.
            ``moreGFMclicks`` is forwarded unchanged to
            ``extract_urls_from_categories`` as ``MoreGFMclicks``.

    Returns:
        list: one ``[scrape_result, category]`` pair per entry of
        ``categoryURLs``, in iteration order; an empty list when the
        mapping is empty.
    """
    print(params)
    categoryURLs, moreGFMclicks = params[0], params[1]
    GFM_urls = []
    for category, url in categoryURLs.items():
        print("Currently Scraping: "+url)
        t_init = time.time()
        GFM_urls.append([extract_urls_from_categories(url, MoreGFMclicks = moreGFMclicks), category])
        t_fin = time.time() - t_init
        # Report per category inside the loop: outside it, an empty mapping left
        # `category`/`t_fin` unbound and a multi-entry mapping only reported the
        # last one. time.time() differences are seconds, not milliseconds.
        print("All done for "+category+" category in "+str(t_fin)+" sec!\n")
    return GFM_urls

## Sequential Mode

In [18]:
# #set number of click limit (max 90)
# clickLimit = 0
# return_value = [(list_urls((item,clickLimit))) for item in categories_urls]

# # flatten list

# GFM_Urls_long = []
# for k in return_value:
#         GFM_Urls_long.extend(k)

## Multithreaded Mode

In [19]:
import concurrent.futures

# Click limit handed to list_urls for every category (max 90)
clickLimit = 2

t0 = time.time()
# One task per single-entry category dict; up to 20 categories are scraped at once.
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    futures = [executor.submit(list_urls, (item, clickLimit)) for item in categories_urls]
    # Collect in submission order so results follow the order of categories_urls;
    # .result() re-raises any exception raised inside a worker.
    return_value = [f.result() for f in futures]

# Each task returns a list of [scrape_result, category] pairs -- flatten them
# into one list.
GFM_Urls_long = [pair for batch in return_value for pair in batch]

t1 = time.time() - t0
print("\nTime-Elapsed total: "+str(t1))
print(len(GFM_Urls_long))

({'Medical': 'https://www.gofundme.com/discover/medical-fundraiser'}, 2)
Currently Scraping: https://www.gofundme.com/discover/medical-fundraiser
({'Memorial': 'https://www.gofundme.com/discover/memorial-fundraiser'}, 2)
Currently Scraping: https://www.gofundme.com/discover/memorial-fundraiser
({'Emergency': 'https://www.gofundme.com/discover/emergency-fundraiser'}, 2)
({'Education': 'https://www.gofundme.com/discover/education-fundraiser'}, 2)
Currently Scraping: https://www.gofundme.com/discover/education-fundraiser
Currently Scraping: https://www.gofundme.com/discover/emergency-fundraiser({'Environment': 'https://www.gofundme.com/discover/environment-fundraiser'}, 2)
Currently Scraping: https://www.gofundme.com/discover/environment-fundraiser

({'Business': 'https://www.gofundme.com/discover/business-fundraiser'}, 2)
Currently Scraping: https://www.gofundme.com/discover/business-fundraiser
({'Community': 'https://www.gofundme.com/discover/community-fundraiser'}, 2)
Currently Scrapin

## Flatten Result List

In [20]:
# GFM_Urls_long

In [21]:
# Turn every [ {url: position}, category ] pair into a small frame, then
# concatenate once. DataFrame.append was removed in pandas 2.0, and growing a
# frame inside a loop copies all previous rows on every iteration.
category_frames = [
    pd.DataFrame({
        "Url": list(url_positions.keys()),
        "Category": [category] * len(url_positions),
        "Position": list(url_positions.values()),
    }, columns=["Url", "Category", "Position"])
    for url_positions, category in GFM_Urls_long
]

if category_frames:
    mydf = pd.concat(category_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep the empty-but-typed frame instead
    mydf = pd.DataFrame(columns=["Url", "Category", "Position"])

# Number of scraped URLs per category
mydf.Category.value_counts()

Creative       36
Community      24
Memorial       24
Animal         24
Education      24
Event          12
Sports         12
Business       12
Wishes         12
Medical        12
Faith          12
Environment    12
Family         12
Charity        12
Volunteer      12
Emergency      12
Competition    12
Name: Category, dtype: int64

## Save to Mongo

In [25]:
from pymongo import MongoClient
import json

databaseName = "GFM_url_list_{0}".format(datetime.datetime.now().strftime("%x").replace("/","_"))

client = MongoClient('mongodb://localhost:27017')
table = client.GFM[databaseName]

%store databaseName

Stored 'databaseName' (str)


In [26]:
# Convert each row to a plain dict (same per-row conversion as before) and send
# them in one batch instead of one insert_one round trip per row.
records = [record.to_dict() for _, record in mydf.iterrows()]
if records:  # insert_many rejects an empty document list
    table.insert_many(records)

Inserting: 0
Inserting: 1
Inserting: 2
Inserting: 3
Inserting: 4
Inserting: 5
Inserting: 6
Inserting: 7
Inserting: 8
Inserting: 9
Inserting: 10
Inserting: 11
Inserting: 12
Inserting: 13
Inserting: 14
Inserting: 15
Inserting: 16
Inserting: 17
Inserting: 18
Inserting: 19
Inserting: 20
Inserting: 21
Inserting: 22
Inserting: 23
Inserting: 24
Inserting: 25
Inserting: 26
Inserting: 27
Inserting: 28
Inserting: 29
Inserting: 30
Inserting: 31
Inserting: 32
Inserting: 33
Inserting: 34
Inserting: 35
Inserting: 36
Inserting: 37
Inserting: 38
Inserting: 39
Inserting: 40
Inserting: 41
Inserting: 42
Inserting: 43
Inserting: 44
Inserting: 45
Inserting: 46
Inserting: 47
Inserting: 48
Inserting: 49
Inserting: 50
Inserting: 51
Inserting: 52
Inserting: 53
Inserting: 54
Inserting: 55
Inserting: 56
Inserting: 57
Inserting: 58
Inserting: 59
Inserting: 60
Inserting: 61
Inserting: 62
Inserting: 63
Inserting: 64
Inserting: 65
Inserting: 66
Inserting: 67
Inserting: 68
Inserting: 69
Inserting: 70
Inserting: 71
In

## Save to file

In [None]:
import os

fileName = 'data/{0}.csv'.format(databaseName)

# to_csv does not create missing parent folders, so make sure data/ exists first.
os.makedirs(os.path.dirname(fileName), exist_ok=True)

# Tab-separated, no header row and no index column. Append mode: re-running this
# cell for the same databaseName adds the rows again to the same file.
mydf.to_csv(fileName, sep='\t', index = False, header = False, mode = 'a')

In [None]:
# The file is written without a header row (header=False above), so the column
# names must be supplied here; with the default header inference the first
# record was silently consumed as the column names.
mydf = pd.read_csv(fileName, sep='\t', header=None, names=["Url", "Category", "Position"])
mydf

**NOTE:**
The data is split over the following files:
- data/GFM_url_7k.csv (Scraped; uploaded)
- data/GFM_url2_3k.csv
- data/GFM_url3_2k.csv

## Combining all CSVs 

In [None]:
# import pandas as pd
# import numpy as np

In [None]:
# fileList = [
#     "data/GFM_url_business_1k.csv",
#     "data/GFM_url_emergency_1k.csv",
#     "data/GFM_url_medical_1k.csv",
#     "data/GFM_url_memorialEducation_1k.csv",
#     'data/GFM_url_volunteer_1k.csv',
#     "data/GFM_url_competition_1k.csv"
# ]

In [None]:
# final = pd.DataFrame()
# for item in fileList:
#     x = pd.read_csv(item,sep = '\t')
#     print(len(x))
#     final = pd.concat([final,x], ignore_index = True)
    
# final.drop(['Unnamed: 0'],axis = 1, inplace = True)
# final.tail()

In [None]:
# final.to_csv('data/GFM_url_compiled.csv', sep='\t', index = False)

In [None]:
# pd.read_csv('data/GFM_url_compiled.csv', sep='\t')