<div class="alert alert-block alert-success">
    
This notebook shows how to create or use an email spider, also known as an email scraper or email extractor. It is 
    **important to note**
    that using email spiders to scrape or extract email addresses from websites or databases without consent is often considered unethical and may be illegal in some jurisdictions. It is always recommended to obtain permission from the website or database owner before collecting email addresses. Additionally, sending unsolicited emails or spam is generally prohibited and can result in penalties and legal consequences.
    
</div>

In [None]:
import logging
import os
import pandas as pd
import re
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from googlesearch import search
from requests_html import HTMLSession

# Stop records from the 'scrapy' logger from propagating to ancestor loggers, so the
# notebook output is not flooded with Scrapy's logs and warnings.
logging.getLogger('scrapy').propagate = False

In [None]:
def get_urls(tag, n, language):
    """Return at most ``n`` search-result URLs for the query ``tag``.

    ``n`` is forwarded to ``search`` as ``stop`` and ``language`` as ``lang``;
    whatever ``search`` yields is then truncated to the first ``n`` items.
    """
    results = list(search(tag, stop=n, lang=language))
    return results[:n]

In [None]:
class MailSpider(scrapy.Spider):
    """Spider that collects email addresses from the start pages and the pages they link to.

    Besides ``start_urls``, the spider reads two attributes that are expected to be
    supplied as spider arguments (``get_info`` passes them through ``process.crawl``):

    * ``path``   -- CSV file that the found addresses are appended to.
    * ``reject`` -- optional iterable of substrings; a page whose URL contains any
      of them is skipped.
    """

    name = 'email'

    # Raw string so that ``\w`` is not treated as an (invalid) string escape.
    # Dots, '+' and '-' are allowed in the local part and multi-label domains are
    # matched in full, so 'john.doe@mail.example.com' is not cut down to
    # 'doe@mail.example'.
    email_pattern = re.compile(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+')

    def parse(self, response):
        """Schedule the start page and every page it links to for email extraction."""
        # Contact details are usually not on the landing page itself but on a
        # contact page or similar, so the link extractor collects every URL found
        # in the response.
        links = LxmlLinkExtractor(allow=()).extract_links(response)
        links = [str(link.url) for link in links]
        links.append(str(response.url))

        # The callback argument tells Scrapy which method receives the response
        # of each follow-up request.
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_link)

    def parse_link(self, response):
        """Append the email addresses found in ``response`` to the CSV at ``self.path``."""
        url = str(response.url)

        # ``reject`` is optional: a spider started without it rejects nothing.
        rejected_words = getattr(self, 'reject', None) or ()
        if any(word in url for word in rejected_words):
            return

        mail_list = self.email_pattern.findall(str(response.text))
        if not mail_list:
            # Nothing to record for this page.
            return

        df = pd.DataFrame({'email': mail_list, 'link': url})

        # Written once per page; the file already has its header, so only rows
        # are appended.
        df.to_csv(self.path, mode='a', header=False)

In [None]:
# Helpers for preparing the CSV file that the collected emails are saved to
def ask_user(question):
    """Ask a yes/no ``question`` on stdin and return True only for the reply 'y'.

    The prompt shown is ``question`` followed by ' y/n' and a newline; any reply
    other than exactly 'y' yields False.
    """
    prompt = question + ' y/n' + '\n'
    return input(prompt) == 'y'
def create_file(path):
    """Create an empty file at ``path``, asking before replacing an existing one.

    Args:
        path: Location of the file to create.

    Returns:
        bool: True when the file was created or truncated; False when it already
        existed and the user declined to replace it (the file is left untouched).
    """
    if os.path.exists(path) and not ask_user('File already exists, replace?'):
        return False

    # Opening in write mode creates or truncates the file; the context manager
    # closes it, so no explicit close() is needed.
    with open(path, 'wb'):
        pass
    return True

In [None]:
def get_info(tag, n, language, path, reject=None):
    """Search Google for ``tag``, crawl the results for emails and save them to ``path``.

    Args:
        tag: Search query handed to ``get_urls``.
        n: Number of search results to crawl.
        language: Language code handed to ``get_urls``.
        path: CSV file the emails are written to.
        reject: Optional iterable of substrings; pages whose URL contains any of
            them are skipped by the spider. Defaults to rejecting nothing.

    Returns:
        pandas.DataFrame: One row per distinct email with the columns
        ``email`` and ``link``; the same data is written to ``path``.
    """
    # NOTE(review): a declined replace prompt in create_file does not stop this
    # function -- the header write below opens ``path`` with mode='w' and
    # overwrites the file regardless.
    create_file(path)

    # Write the header only. A placeholder row (index=[0]) would be stored as an
    # all-NaN record and show up as the first row of the final result.
    pd.DataFrame(columns=['email', 'link']).to_csv(path, mode='w', header=True)

    print('Collecting Google urls...')
    google_urls = get_urls(tag, n, language)

    print('Searching for emails...')
    process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
    process.crawl(
        MailSpider,
        start_urls=google_urls,
        path=path,
        # A fresh list per call instead of a shared mutable default argument
        reject=list(reject) if reject else [],
    )
    process.start()

    print('Cleaning emails...')
    # The header written above already names the columns 'email' and 'link'.
    df = pd.read_csv(path, index_col=0)
    df = df.drop_duplicates(subset='email')
    df = df.reset_index(drop=True)
    df.to_csv(path, mode='w', header=True)

    return df

In [None]:
# URL substrings that can be passed to get_info as `reject` to skip matching pages
bad_words = ['facebook', 'instagram', 'youtube', 'twitter', 'wiki']
# Search query typed in by the user; later cells pass it to get_info as the search term
SearchWords=input("Please Enter Words You need to search for email or links: ")

In [None]:
# get_info performs the Google search itself, so a separate get_urls call here
# would only repeat the search and discard its result.
# Pass reject=bad_words to skip pages whose URL contains one of those words.
df = get_info(SearchWords, 2, 'en', 'k.csv')