In [1]:
!pwd

/home/niranjan/MY-WORLD/DataScience/Code-Repository/Company-Email-Crawler


In [17]:
!ls

CompanyEmail-Crawler-v1.ipynb  niki-Email.csv  README.md  tesla-Email.csv


In [3]:
import logging
import os
import pandas as pd
import re
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from googlesearch import search
logging.getLogger('scrapy').propagate = False

In [4]:
def get_urls(tag, n, language):
    # This functions gets the search string urls from the google
    urls = [url for url in search(tag, stop=n, lang=language)][:n]
    return urls

In [5]:
class MailSpider(scrapy.Spider):
    
    name = 'email'
    
    #Inheriting the scrapy.Scider properties into MailSpider
    #Spiders are custom classes written by Scrapy users to parse responses and extract items (aka scraped items)
    #from them or additional URLs (requests) to follow. Each spider is able to handle a specific domain
    #(or group of domains)
    
    def parse(self, response):
        
        links = LxmlLinkExtractor(allow=()).extract_links(response)
        links = [str(link.url) for link in links]
        links.append(str(response.url))
        
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_link) 
            
    def parse_link(self, response):
        
        for word in self.reject:
            if word in str(response.url):
                return
            
        html_text = str(response.text)
        mail_list = re.findall(r'\w+@\w+\.{1}\w+', html_text)
    

        dic = {'email': mail_list, 'link': str(response.url)}
        df = pd.DataFrame(dic)
        
        df.to_csv(self.path, mode='a', header=False)
        df.to_csv(self.path, mode='a', header=False)


In [6]:
def ask_user(question):
    response = input(question + ' y/n' + '\n')
    if response == 'y':
        return True
    else:
        return False
def create_file(path):
    response = False
    if os.path.exists(path):
        response = ask_user('File already exists, replace?')
        if response == False: return 
    
    with open(path, 'wb') as file: 
        file.close()

In [7]:
def get_info(tag, n, language, path, reject=[]):
    
    create_file(path)
    df = pd.DataFrame(columns=['email', 'link'], index=[0])
    df.to_csv(path, mode='w', header=True)
    
    print('Collecting Google urls...')
    google_urls = get_urls(tag, n, language)
    
    print('Searching for emails...')
    process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
    process.crawl(MailSpider, start_urls=google_urls, path=path, reject=reject)
    process.start()
    
    print('Cleaning emails...')
    df = pd.read_csv(path, index_col=0)
    df.columns = ['email', 'link']
    df = df.drop_duplicates(subset='email')
    df = df.reset_index(drop=True)
    df.to_csv(path, mode='w', header=True)
    
    return df

In [8]:
cname = str(input('Enter companey name:'))
get_urls(cname,1,'en')


Enter companey name:tesla


['https://electrek.co/2020/01/23/tesla-official-loopholes-stupid-laws-to-sale-michigan/?sa=X&ved=2ahUKEwi2iJOppJvnAhUXxzgGHZJmCKkQqOcBMAB6BAgBEAI']

In [9]:
bad_words = ['facebook', 'instagram', 'youtube', 'twitter', 'wiki']
df = get_info(cname, 1, 'pt', cname+'-Email.csv', reject=bad_words)

Collecting Google urls...
Searching for emails...
Cleaning emails...


In [10]:
df.head()

Unnamed: 0,email,link
0,,
1,portrait@2.jpg,https://www.tesla.com/roadster
2,hero@2.jpg,https://www.tesla.com/roadster
3,mobile@2.jpg,https://www.tesla.com/roadster
4,model3_top_view@2x.jpg,https://www.tesla.com/model3


In [12]:
c_df = df[df['email'].str.contains(cname, na=False)]['email']
c_df

13                                 forums@tesla.com
18                                  press@tesla.com
19                                  Press@tesla.com
20                                eupress@tesla.com
21                                EUPress@tesla.com
22                              apacpress@tesla.com
23                              APACPress@tesla.com
24                                    DPO@tesla.com
25                                privacy@tesla.com
26                          ServiceHelpNA@tesla.com
27                   charginginstallation@tesla.com
28                            resolutions@tesla.com
29                                  legal@tesla.com
30    c2ba986ebc5a4649aa605dd8b7e942c2@errlog.tesla
39                             southkorea@tesla.com
41                   accommodationrequest@tesla.com
55                           tesla_community@2x.jpg
Name: email, dtype: object

In [13]:
c_df.to_json(orient='split')

'{"name":"email","index":[13,18,19,20,21,22,23,24,25,26,27,28,29,30,39,41,55],"data":["forums@tesla.com","press@tesla.com","Press@tesla.com","eupress@tesla.com","EUPress@tesla.com","apacpress@tesla.com","APACPress@tesla.com","DPO@tesla.com","privacy@tesla.com","ServiceHelpNA@tesla.com","charginginstallation@tesla.com","resolutions@tesla.com","legal@tesla.com","c2ba986ebc5a4649aa605dd8b7e942c2@errlog.tesla","southkorea@tesla.com","accommodationrequest@tesla.com","tesla_community@2x.jpg"]}'

In [14]:
from difflib import SequenceMatcher
def similar(lcemail, memail):
    #similar function gives the ratio between two strings
    return SequenceMatcher(None, lcemail, memail).ratio()

In [15]:
elist = c_df.tolist()
len(elist)
elist

['forums@tesla.com',
 'press@tesla.com',
 'Press@tesla.com',
 'eupress@tesla.com',
 'EUPress@tesla.com',
 'apacpress@tesla.com',
 'APACPress@tesla.com',
 'DPO@tesla.com',
 'privacy@tesla.com',
 'ServiceHelpNA@tesla.com',
 'charginginstallation@tesla.com',
 'resolutions@tesla.com',
 'legal@tesla.com',
 'c2ba986ebc5a4649aa605dd8b7e942c2@errlog.tesla',
 'southkorea@tesla.com',
 'accommodationrequest@tesla.com',
 'tesla_community@2x.jpg']

In [16]:
#Ebucket(Email bucket) has the all companies email format keys words so that we can get better accuracy 
Ebucket = ['info','support','sales','press','customercare','emeasales','media','contacts']

#Dbucket (Domain bucket) has the all the domain name extentions
Dbucket = ['.in','.com','.co.in','.org','.io','.ai','co','.app','.net','.int','.edu','.gov','.mil']
cemail=[]

for em in range(len(elist)):
    for eb in range(len(Ebucket)):
        for db in range(len(Dbucket)):
            if similar(elist[em],Ebucket[eb]+'@'+cname+Dbucket[db]) > .89:
                cemail=elist[em]
        
cemail


#The input is 'company name'- output: emailid of the company- this CompanyEmail-Crawler works for almost
#all companies.


'eupress@tesla.com'