# The Problem

Our client wants to extract the name, address, phone number, email, and website of every supplier listed at https://www.pfonline.com/suppliers

### Loading required libraries, defining headers and functions

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

# Request headers sent with every page fetch (passed to requests.get in get_page).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip",
    "DNT": "1",  # Do Not Track request header
    "Connection": "close",
}

def get_page(url, timeout=30):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on a stalled connection, which
        would freeze the whole scraping loop.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``'html.parser'``.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is
        never parsed as if it were real content.
    requests.exceptions.RequestException
        For connection problems and timeouts.
    """
    page = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of silently scraping an error page.
    page.raise_for_status()
    return BeautifulSoup(page.text, 'html.parser')

### Getting the links for all supplier types

In [2]:
# Fetch the supplier directory landing page.
soup = get_page('https://www.pfonline.com/suppliers')

# Every anchor with an href inside the first "row" div of the
# element whose id is "directoryBrowse".
a_tags = soup.body.main.find(id='directoryBrowse').find('div', {'class': 'row'}).find_all('a', href=True)

# Absolute URL of each supplier segment; any link containing
# the word "category" is left out.
links = []
for anchor in a_tags:
    link = 'https://www.pfonline.com' + anchor['href']
    if 'category' not in link:
        links.append(link)

In [3]:
# Peek at the first ten segment links.
links[:10]

['https://www.pfonline.com/suppliers/product/755',
 'https://www.pfonline.com/suppliers/product/1949',
 'https://www.pfonline.com/suppliers/product/4313',
 'https://www.pfonline.com/suppliers/product/4302',
 'https://www.pfonline.com/suppliers/product/3387',
 'https://www.pfonline.com/suppliers/product/4432',
 'https://www.pfonline.com/suppliers/product/2286',
 'https://www.pfonline.com/suppliers/product/5005',
 'https://www.pfonline.com/suppliers/product/3186',
 'https://www.pfonline.com/suppliers/product/2514']

### For each supplier type, get the link for each supplier

In [4]:
# Supplier page URLs in first-seen order. The companion set makes the
# duplicate check O(1) instead of rescanning the growing list every time.
final_links = []
seen_links = set()

# Visit every segment page collected in `links`.
for segment_url in links:

    # Download and parse the segment page.
    soup_product = get_page(segment_url)

    # Every anchor with an href inside the first div carrying class "col-xs-12".
    a_tags = soup_product.body.main.find('div', {'class': 'col-xs-12'}).find_all('a', href=True)

    for anchor in a_tags:
        href = anchor['href']

        # Only hrefs mentioning "suppliers" are kept.
        if 'suppliers' not in href:
            continue

        link = 'https://www.pfonline.com' + href

        # Record each supplier URL once, even if several segments list it.
        if link not in seen_links:
            seen_links.add(link)
            final_links.append(link)

    # Pause one second between segment pages to avoid hammering the site.
    time.sleep(1)

In [5]:
# Peek at the first ten supplier links.
final_links[:10]

['https://www.pfonline.com/suppliers/reliant-finishing-systems',
 'https://www.pfonline.com/suppliers/venjakob-maschinenbau-co-kg',
 'https://www.pfonline.com/suppliers/a-e-aubin-company',
 'https://www.pfonline.com/suppliers/advanced-finishing-technologies',
 'https://www.pfonline.com/suppliers/bel-air-finishing-supply',
 'https://www.pfonline.com/suppliers/best-technology',
 'https://www.pfonline.com/suppliers/blastec',
 'https://www.pfonline.com/suppliers/clemco-industries',
 'https://www.pfonline.com/suppliers/comco-inc',
 'https://www.pfonline.com/suppliers/cool-clean-technologies']

### Getting all requested data from each supplier

In [6]:
def _first_words(tag, count):
    """Return the first ``count`` whitespace-separated words of ``tag``'s text.

    Returns 'NA' when ``tag`` is None, so a page without the expected
    paragraph yields a placeholder instead of an AttributeError that would
    abort the whole scraping run.
    """
    if tag is None:
        return 'NA'
    return " ".join(tag.text.split()[:count])


def _parse_supplier(page_soup):
    """Extract one supplier record from a parsed supplier page.

    Parameters
    ----------
    page_soup : bs4.BeautifulSoup
        Parsed HTML of a single supplier page.

    Returns
    -------
    dict
        Keys 'supplier', 'address', 'phone', 'email' and 'site'. Any field
        whose HTML element is missing is set to the string 'NA'.
    """
    # Supplier name: the <h1> with class "txt-center".
    name_tag = page_soup.find('h1', {'class': 'txt-center'})
    supplier = name_tag.text if name_tag is not None else 'NA'

    # Address: taken from <p class="font-bold"> when it has more than 7 words,
    # otherwise from <p class="font-weight-bold">.
    # NOTE(review): the word counts 6 / 8 / 11 are carried over unchanged;
    # presumably they were tuned to the site's layout - confirm.
    primary = page_soup.find('p', {'class': 'font-bold'})
    if primary is not None and len(primary.text.split()) > 7:
        address = _first_words(primary, 6)
    else:
        fallback = page_soup.find('p', {'class': 'font-weight-bold'})
        address = _first_words(fallback, 8 if primary is not None else 11)

    # Phone: collapse every run of whitespace (line breaks and indentation)
    # to a single space rather than stripping two hard-coded runs of spaces,
    # which left stray padding whenever the page's indentation differed.
    phone_tag = page_soup.find('span', {'class': 'd-block company-contact-phone'})
    phone = " ".join(phone_tag.text.split()) if phone_tag is not None else 'NA'

    # Email: drop line breaks and surrounding blanks.
    email_tag = page_soup.find('span', {'class': 'd-block company-contact-email'})
    email = email_tag.text.replace('\n', '').strip() if email_tag is not None else 'NA'

    # Website: href of the first matching anchor; 'NA' if the anchor or its
    # href attribute is absent.
    site_tag = page_soup.find('a', {'class': 'font-weight-bold lg word-break-all'})
    site = site_tag.get('href', 'NA') if site_tag is not None else 'NA'

    return {'supplier': supplier,
            'address': address,
            'phone': phone,
            'email': email,
            'site': site}


# Collect one record per supplier and build the table once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, so growing the frame row by row no longer works and was quadratic.
records = []

# Go through each supplier link.
for fl in final_links:

    # Only supplier pages under the directory are scraped.
    if 'https://www.pfonline.com/suppliers' in fl:
        # Download and parse the supplier page.
        final_soup = get_page(fl)
        records.append(_parse_supplier(final_soup))

# Table holding all the final data, with a fixed column order.
data_base = pd.DataFrame(records, columns=['supplier', 'address', 'phone', 'email', 'site'])

In [7]:
# Display the first rows of the scraped table (head() defaults to five rows).
data_base.head()

Unnamed: 0,supplier,address,phone,email,site
0,Reliant Finishing Systems,"2541 Hwy. 67 S Somerville, AL",888-770-0021 | 256-355-9000,info@reliantfinishingsystems.com,http://www.reliantfinishingsystems.com
1,Venjakob Maschinenbau GmbH Co. KG,"Augsburger Strasse 2-6 Rheda-Wiedenbrück, 3337...",49 5242 9603 0,info@venjakob.de,http://www.venjakob-nutro.com
2,A. E. Aubin Company,"PO Box 899 New Milford, CT 06776 US",800-423-0697 | 860-350-3377 ...,sales@aeaubin.com,http://www.aeaubin.com
3,Advanced Finishing Technologies,"835 W River Ctr. Grand Rapids, MI 49321",866-478-1338 | 616-785-0400 ...,sales@advancedfinish.com,http://www.advancedfinish.com
4,Bel Air Finishing Supply,101 Circuit Dr. Quonset Industrial Park North ...,401-667-7902,info@belairfinishing.com,http://www.belairfinishing.com


In [9]:
# (number of rows, number of columns) of the scraped table.
data_base.shape

(681, 5)

In [None]:
# Save the scraped table as suppliers.csv, relative to the current working directory.
# NOTE(review): with default arguments pandas also writes the row index as an
# unnamed first column - confirm whether the client file should include it
# (pass index=False to omit it).
data_base.to_csv('suppliers.csv')