In [None]:
from bs4 import BeautifulSoup
from tqdm import tqdm
from random import choice
import requests 
import time
import pandas as pd 
import numpy as np
import missingno as msno

In [None]:
def crawl_proxies() -> list:
    '''
    
    This function scrapes the clarketm proxy-list page on GitHub for proxy addresses.
    
    The function returns a list of the non-empty, whitespace-stripped cell texts
    found on the page (one entry per line of the rendered proxy file).
    
    Raises requests.RequestException if the page cannot be fetched within the
    timeout or the server answers with an HTTP error status.
    
    '''
    
    #  scraping https://github.com/clarketm/proxy-list/blob/master/proxy-list-raw.txt
    proxy_url = 'https://github.com/clarketm/proxy-list/blob/master/proxy-list-raw.txt'
    
    #  a timeout keeps a stalled connection from blocking the whole crawl forever
    proxy_html = requests.get(proxy_url, timeout=10)
    
    #  fail loudly on 4xx/5xx instead of parsing an error page into an empty list
    proxy_html.raise_for_status()
    proxy_soup = BeautifulSoup(proxy_html.text, 'lxml')
    
    #  NOTE(review): this selector depends on GitHub's rendered blob markup;
    #  confirm it still matches, an empty result means the page layout changed
    unclean_proxies = proxy_soup('td', class_ = 'blob-code blob-code-inner js-file-line')
    
    #  strip each cell and drop blank ones so callers never pick an empty proxy
    stripped = (cell.text.strip() for cell in unclean_proxies)
    proxies = [address for address in stripped if address]
    
    return proxies
    

In [None]:
def get_randomproxy() -> dict:
    '''
    
    This function picks one random proxy address from crawl_proxies().
    
    The function returns a proxies mapping suitable for the `proxies` argument
    of requests.get, with the same proxy registered for both http and https.
    
    '''
    
    address = choice(crawl_proxies())
    
    #  requests expects proxy URLs to carry a scheme; keep one if already present
    if '://' not in address:
        address = f'http://{address}'
    
    #  requests selects the proxy by the scheme of the target URL, so an entry
    #  under 'http' alone is never used for the https pages being scraped
    return {'http': address, 'https': address}

In [None]:
def job_crawler(city=None, start=0, end=0, snooze=10) -> pd.DataFrame:
    '''
    
    This function scrapes the Indeed PH result pages of one city, page by page.
    
    The function accepts:
        city   - one of 'manila', 'makati', 'taguig', 'quezon', 'pasig',
                 'mandaluyong', 'san juan'
        start  - first value of the `start` query parameter to request
        end    - last value of the `start` query parameter; offsets are taken
                 from range(start, end + 10, 10)
        snooze - number of seconds to wait after each scraped page
    
    The function returns a dataframe with every scraped page concatenated.
    Scraping stops at the first page that cannot be retrieved; whatever was
    collected before that is still returned. If the city is unknown or no page
    could be scraped, an empty dataframe with the expected columns is returned.
    
    '''
    
    #  columns of the (possibly empty) result
    result_columns = ['listing', 'location', 'salary', 'company']
    
    #  per-city query parameters (rbl, jlid, vjk) for the seven cities being scraped
    #  just a side note, Indeed's URL is difficult to work with smh
    city_params = {'manila': ('Manila', '2d385e0e7a50644e', '2e7fe3a30378027c'),
                   'makati': ('Makati', 'e42eba2843e635b1', '380b3f4742a8dcb1'),
                   'taguig': ('Taguig', '3e4c70dafd758056', 'a367e6c4c1cb7b8b'),
                   'quezon': ('Quezon%20City', 'fb06069655d21c80', '4179e8c746ff8463'),
                   'pasig': ('Pasig', 'b249f65c60df5fc2', '112e13939fc4dc04'),
                   'mandaluyong': ('Mandaluyong', 'd1f10b8ea5946746', '46535244932a7347'),
                   'san juan': ('San%20Juan', 'aa25f5461d6d7365', '24760d0b2c8fbbdc'),
                   }
    
    #  validate the city once, before any request is made
    if city not in city_params:
        print('Please specify a city')
        return pd.DataFrame(columns=result_columns)
    rbl, jlid, vjk = city_params[city]
    
    #  empty list to store dataframes to be concatenated
    scraped_df_list = []
    
    for page_num in tqdm(range(start, end + 10, 10), desc='Scraping pages'):
        #  building the url of this specific results page
        specific_url = ('https://ph.indeed.com/jobs?q&l=National%20Capital%20Region'
                        f'&rbl={rbl}&jlid={jlid}&sort=date&start={page_num}&vjk={vjk}')
        
        #  retrieving the html of the specific url; a network failure ends the
        #  crawl the same way a bad status code does, keeping earlier pages
        try:
            specific_html = requests.get(specific_url,
                                         proxies=get_randomproxy(),
                                         timeout=30)
        except requests.RequestException as error:
            print(f'Retrieval of {specific_url} failed with error {error}')
            break
        
        #  status code of page: to determine if the page is accessible
        status_code = specific_html.status_code
        if status_code != 200:
            print(f'Retrieval of {specific_url} failed with status code {status_code}')
            break
        print(f'Retrieving: {specific_url} --- Success!')
        
        #  extracting the job cards of this page into a dataframe
        scraped_df_list.append(get_joblisting(specific_html))
        
        #  giving the spider a little break
        time.sleep(snooze)
    
    #  nothing scraped (empty page range or first page failed)
    if not scraped_df_list:
        return pd.DataFrame(columns=result_columns)
    
    return concat_dataframelist(scraped_df_list)

In [None]:
def get_joblisting(specific_html=None) -> pd.DataFrame: 
    '''
    
    This function extracts the job cards from one retrieved results page.
    
    The function accepts the retrieved response of a page; its markup is read
    from the `.text` attribute.
    
    The function returns a dataframe (built by dataframe_fromlists) with one row
    per job card and the columns listing, location, salary and company. Any
    field whose element is missing from a card is stored as NaN instead of
    aborting the whole page.
    
    '''
    
    def _hop(node, hops):
        #  follow .next_element `hops` times, tolerating a missing node
        for _ in range(hops):
            node = getattr(node, 'next_element', None)
        return node
    
    def _clean_text(node):
        #  stripped text of a node, or NaN when the node was not found
        try:
            return node.text.strip()
        except AttributeError:
            return np.nan
    
    #  empty lists to store data retrieved from page
    listing_list = []
    location_list = []
    salary_list = []
    company_list = []
    
    #  soupify the html passed
    joblist_soup = BeautifulSoup(specific_html.text, 'lxml')
    
    #  data of each job card is concentrated in the td tag with the class: resultContent
    jobinfo_group = joblist_soup.find_all('td', class_ = 'resultContent')
    
    #  looping through each job card from jobinfo_group
    for job in jobinfo_group:
        
        #  listing name: two elements after the first span of the card
        listing_list.append(_clean_text(_hop(job.find('span'), 2)))
        
        #  location
        location_list.append(_clean_text(job.find('div', class_ = 'companyLocation')))
        
        #  salary: the element right after the salary-snippet div (often absent)
        salary_list.append(_clean_text(_hop(job.find('div', class_ = 'salary-snippet'), 1)))
        
        #  company
        company_list.append(_clean_text(job.find('span', class_ = 'companyName')))
    
    #  list of lists, in the column order dataframe_fromlists() expects
    features_list = [listing_list,
                     location_list,
                     salary_list,
                     company_list,
                     ]
    
    #  returning a dataframe
    return dataframe_fromlists(features_list)
    

In [None]:
def dataframe_fromlists(features_list) -> pd.DataFrame:
    '''
    
    This function turns the collected data lists into a dataframe.

    The function accepts a list of lists; the first four entries are used, in
    order, as the listing, location, salary and company columns.
    
    The function returns the resulting dataframe.
    
    '''
    
    #  column names, in the same order as the lists inside features_list
    column_names = ('listing', 'location', 'salary', 'company')
    
    #  pairing each column name with the list stored at the same position
    column_data = {name: features_list[position]
                   for position, name in enumerate(column_names)}
    
    return pd.DataFrame(data = column_data)

In [None]:
def concat_dataframelist(scraped_df_list) -> pd.DataFrame:
    '''
    
    This function concatenates a list of dataframes row-wise.

    The function accepts a list of dataframes to be concatenated.
    
    The function returns a single dataframe with a fresh 0..n-1 index. An empty
    list yields an empty dataframe instead of raising an error.
    
    '''
    
    frames = list(scraped_df_list)
    
    #  pd.concat refuses an empty sequence, so handle "nothing scraped" here
    if not frames:
        return pd.DataFrame()
    
    #  one concat call copies the data once, unlike concatenating pair by pair
    return pd.concat(frames, ignore_index=True)