In [1]:
import logging
import os
import re
import time

import pandas as pd
import scrapy
from crochet import setup, wait_for
from crochet import TimeoutError as CrochetTimeoutError
from googlesearch import search
from scrapy.crawler import CrawlerProcess
from scrapy.crawler import CrawlerRunner
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor

# Stop records logged under the 'scrapy' logger from being handed on to the
# handlers of its ancestor loggers (e.g. the root logger).
# NOTE(review): presumably done to keep crawl log noise out of the notebook output - confirm.
logging.getLogger('scrapy').propagate = False

In [2]:
# Load data/burlingame_accounts.csv and convert its 'searchterm' column to a plain list
burlingame_accounts = pd.read_csv('data/burlingame_accounts.csv')['searchterm'].tolist()

In [3]:
def get_urls(tag, n, language):
    """Return at most ``n`` search-result URLs for the query ``tag``.

    Args:
        tag: Query string handed to ``search``.
        n: Maximum number of results; passed as ``stop`` and also used to
            slice the collected results.
        language: Language code handed to ``search`` as ``lang``.

    Returns:
        A list holding the first ``n`` items yielded by ``search``.
    """
    results = search(query=tag, stop=n, lang=language)
    # Materialise the results first, then cap the length explicitly.
    return list(results)[:n]

In [4]:
#get_urls('beer san francisco', 5 , 'en')

In [5]:
class MailSpider(scrapy.Spider):
    """Spider that harvests e-mail addresses from the start pages and the pages they link to.

    The spider reads these attributes, which the caller supplies as keyword
    arguments when starting the crawl:

    * ``start_urls`` - pages to start from.
    * ``path`` - CSV file that matches are appended to (no header is written here).
    * ``reject`` - iterable of substrings; a page whose URL contains any of
      them is skipped. Defaults to an empty tuple (nothing rejected).
    """

    name = 'email'

    # Default so that a crawl started without ``reject`` does not fail with AttributeError.
    reject = ()

    # Raw string: the original non-raw literal relied on the invalid escape ``\.``,
    # which Python reports with a warning. The pattern itself is unchanged.
    EMAIL_PATTERN = re.compile(r'[a-zA-Z0-9_.+-]{3,}@[a-zA-Z0-9-]{3,}\.[a-zA-Z0-9-.]{2,}')

    def parse(self, response):
        """Schedule ``parse_link`` for every link on the page and for the page itself."""
        links = LxmlLinkExtractor(allow=()).extract_links(response)
        links = [str(link.url) for link in links]
        # The start page is scanned for addresses as well, not only its links.
        links.append(str(response.url))

        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_link)

    def parse_link(self, response):
        """Append every e-mail address found in ``response`` to the CSV at ``self.path``.

        Rows are written with the DataFrame index as first column and without a
        header, as ``email`` and ``link`` (the page URL) columns.
        """
        url = str(response.url)
        if any(word in url for word in self.reject):
            return

        mail_list = self.EMAIL_PATTERN.findall(str(response.text))
        if not mail_list:
            # Nothing to record; avoid opening the output file for an empty frame.
            return

        df = pd.DataFrame({'email': mail_list, 'link': url})
        # Written exactly once (the original wrote every row twice).
        df.to_csv(self.path, mode='a', header=False)

In [6]:
def ask_user(question):
    """Ask a yes/no question on the console.

    Args:
        question: Text shown to the user; ' y/n' and a newline are appended.

    Returns:
        True only when the reply is exactly 'y'; False for anything else.
    """
    answer = input(question + ' y/n\n')
    return answer == 'y'

def create_file(path):
    """Create an empty file at ``path``, asking before replacing an existing one.

    If ``path`` already exists the user is asked for confirmation via
    ``ask_user``; on refusal nothing is touched. Returns None in every case.
    """
    if os.path.exists(path) and not ask_user('File already exists, replace?'):
        return

    # Opening in 'wb' mode creates the file or truncates it to zero bytes;
    # the context manager closes it again immediately.
    with open(path, 'wb'):
        pass

def append_file(path):
    """Make sure a file exists at ``path``, asking before reusing an existing one.

    If ``path`` already exists the user is asked for confirmation via
    ``ask_user``; on refusal nothing is touched. Returns None in every case.
    """
    if os.path.exists(path) and not ask_user('File already exists, append?'):
        return

    # Opening in 'ab' mode creates the file when missing and leaves existing
    # content untouched; the context manager closes it again immediately.
    with open(path, 'ab'):
        pass

In [7]:
# crochet runs the Twisted reactor in a background thread so that crawls can be
# started repeatedly from notebook cells.
setup()

def get_info(tag, n, language, path, reject=None, timeout=300):
    """Search for ``tag``, crawl the result pages for e-mail addresses and clean the CSV at ``path``.

    Args:
        tag: Search query forwarded to ``get_urls``.
        n: Number of search results to crawl.
        language: Language code forwarded to ``get_urls``.
        path: CSV file collecting the results. Existing content is kept; new
            matches are appended and the whole file is de-duplicated and
            rewritten at the end.
        reject: Optional iterable of substrings; pages whose URL contains any
            of them are skipped by ``MailSpider``. ``None`` rejects nothing.
        timeout: Seconds to wait for the crawl to finish. When exceeded, a
            message is printed and whatever was collected so far is cleaned.

    Returns:
        DataFrame with the columns 'email' and 'link', one row per distinct
        e-mail address currently stored in ``path``.
    """
    # Avoid sharing one mutable default list between calls.
    reject = [] if reject is None else list(reject)

    # Write the header only for a new/empty file. Appending it on every call
    # (as before) put header lines and an all-NaN placeholder row into the data.
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        pd.DataFrame(columns=['email', 'link']).to_csv(path, mode='w', header=True)

    print('Collecting Google urls...')
    google_urls = get_urls(tag, n, language)

    print('Searching for emails...')

    @wait_for(timeout)
    def _crawl():
        # Runs in the reactor thread; returning the Deferred makes wait_for
        # block the caller until the crawl has finished.
        runner = CrawlerRunner({'USER_AGENT': 'Mozilla/5.0'})
        return runner.crawl(MailSpider, start_urls=google_urls, path=path, reject=reject)

    try:
        _crawl()
    except CrochetTimeoutError:
        # Best effort: keep going with the rows written before the timeout.
        print('Crawl timed out after {} seconds; cleaning what was collected so far.'.format(timeout))

    print('Cleaning emails...')
    df = pd.read_csv(path, index_col=0)
    df.columns = ['email', 'link']
    df = (
        df.dropna(subset=['email'])  # placeholder rows left by earlier runs carry no address
        .drop_duplicates(subset='email')
        .reset_index(drop=True)
    )
    df.to_csv(path, mode='w', header=True)

    return df



In [8]:
# URL substrings used as the `reject` list: MailSpider.parse_link skips every
# page whose URL contains one of these strings.
# NOTE(review): matching is by plain substring, so short entries such as
# 'time', 'rss' or 'bing' also reject unrelated URLs that merely contain those
# letters, and 'burlingame' rejects every URL containing the city name - confirm
# that this is intended.
bad_words = [
    'instagram', 'youtube', 'twitter', 'wiki', 'doordash',
    'sentry', 'toasttab', 'yelp', 'restaurantji', 'ssf',
    '7-eleven', 'order.online', 'bevmo', 'shell', 'wixsite',
    'ihg', '1-2-1marketing', 'fairviewevents', 'grubhub', 'smcmvcd',
    'communityciviccampus', 'jcplatform', 'bluemagnetinteractive', 'hilton', 'kayak',
    'web.archive', 'pubmed', 'sciencedirect', 'exploretock', 'bbb',
    'garten', 'tripadvisor', 'getspoonfed', 'rss', 'jotformpro',
    'instacart', 'ezcater', 'membersfirst', 'cvs', 'cbayresort',
    'bing', 'andolasoft', 'buzzfeednews', 'buzzfeed', 'time',
    'washingtonpost', 'eonline', 'warnermedia', 'nbcbayarea', 'bostonglobe',
    'foxnews', 'tvguide', 'burlingame', 'speckmediainc', 'cityofpacifica',
    'primetimeathleticclub', 'foursquare', 'userway', 'wiley', 'cnn',
    'google', 'vitasta', 'worldcat', 'webstop', 'ktvu',
    'parksconservancy', 'mystore411', 'creativecommons', 'spothopperapp', 'ethicspoint',
    'deadline', 'variety', 'forbes', 'pedropointbrewing', 'sirivisacreative',
    'tastecatering', 'spoton', 'socialsharksmarketing', 'yola', 'zendesk',
]
# removed 'facebook'

In [9]:
#df = get_info('AMERICAN BULL BAR & GRILL THE', 2, 'pt', 'test.csv', reject=bad_words)

In [10]:

# Run the e-mail search for every account, collecting everything in one CSV.
# After the loop `df` holds the frame returned by the last call.
# NOTE(review): 'pt' looks like the Portuguese language code - confirm it is intended here.
for account in burlingame_accounts:
    df = get_info(
        tag=account,
        n=1,
        language='pt',
        path='burlingame_emails.csv',
        reject=bad_words,
    )
    #time.sleep(2)

#df.head()