In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import date
import asyncio
import aiohttp

In [None]:
# Scrape configuration: the page numbers to request (1 through 20) and the
# address of the proxy-list page they are requested from.
pages = list(range(1, 21))
url = 'https://proxyhub.me/en/ir-free-proxy-list.html'

# Wall-clock timestamp taken when this cell is executed.
start = time.time()

##########################
async def get_proxies(session, url, page):
    """Fetch one page of the proxy list and return the HTML table found on it.

    Args:
        session: An open ``aiohttp.ClientSession`` used to issue the request.
        url: Address of the proxy-list page.
        page: Page number; it is sent to the server as the ``page`` cookie.

    Returns:
        A list containing the first ``<table>`` element of the response, or an
        empty list when the response contains no table.
    """
    # The requested page and anonymity filter travel as cookies. aiohttp builds
    # the Cookie header from this mapping, so no hand-written 'cookie' header
    # is needed alongside it.
    cookies = {
        'page': str(page),
        'anonymity': 'all',
    }

    headers = {
        'authority': 'proxyhub.me',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'accept-language': 'en-US,en;q=0.7',
        'cache-control': 'max-age=0',
        'referer': 'https://proxyhub.me/en/ir-free-proxy-list.html/',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'sec-gpc': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
    }

    async with session.get(url, cookies=cookies, headers=headers) as response:
        # ``response.content`` is a stream reader and cannot be awaited;
        # ``read()`` is the coroutine that returns the whole body as bytes.
        # It must be awaited while the response is still open.
        body = await response.read()

    soup = BeautifulSoup(body, 'html.parser')
    table = soup.find('table')
    # ``find`` returns None when the page has no table; skip it so callers
    # never receive a None entry.
    tables = [] if table is None else [table]

    # NOTE(review): presumably a politeness delay. Pages are fetched
    # concurrently, so this does not space the requests apart -- confirm intent.
    await asyncio.sleep(2)
    print("Fetch Successful")
    return tables

##########################

async def get_tasks(session):
    """Schedule one ``get_proxies`` task per entry of ``pages`` and await them all.

    Args:
        session: The client session handed on to every ``get_proxies`` call.

    Returns:
        The list produced by ``asyncio.gather``: one ``get_proxies`` result per
        page, in the same order as ``pages``.
    """
    pending = [
        asyncio.create_task(get_proxies(session, url, page_number))
        for page_number in pages
    ]
    results = await asyncio.gather(*pending)
    print("Task Successful")
    return results


##########################

async def main():
    """Open a client session, run ``get_tasks`` with it and return its result.

    The session is closed when the ``async with`` block exits, i.e. after all
    page results have been collected.
    """
    async with aiohttp.ClientSession() as session:
        return await get_tasks(session)
    

##########################

st = time.time()
tables = await main()
et = time.time()

print("Elapsed Time: ", et-st)
print("\n")
print("Number of Tables: "len(tables))

In [None]:
# Column names are read from the header cells of the first scraped table.
# ``soup`` only exists as a local inside get_proxies(), so it cannot be used
# here; ``tables`` (one list per page, as returned by get_proxies) is the data
# this cell actually has access to.
first_table = next(
    (table for page_tables in tables for table in page_tables if table is not None),
    None,
)
if first_table is None:
    raise ValueError("No proxy table was found on any scraped page")

# get_text() is used instead of .string because .string is None whenever a
# header cell contains nested markup.
column_name = [header.get_text(strip=True) for header in first_table.find_all('th')]

In [None]:
# Build one record per table row across every scraped page.
# ``tables`` holds one list per page (the return value of get_proxies), so it
# is walked two levels deep. The records accumulate in a single list that is
# created once, before the loops, so no page overwrites another.
rows = []
for page_tables in tables:
    for my_table in page_tables:
        if my_table is None:
            # A page on which no <table> was found.
            continue
        for row in my_table.find_all('tr'):
            cells = [cell.get_text(strip=True) for cell in row.find_all('td')]
            # Header rows have no <td>. Rows whose width does not match the
            # header are skipped so they cannot shift values into the wrong
            # columns.
            if cells and len(cells) == len(column_name):
                rows.append(cells)

data = pd.DataFrame(rows, columns=column_name)
data

In [None]:
# Normalise the frame and add the proxy URL.
# - Country is overwritten with the constant 'Iran' for every row.
# - URL is the element-wise string 'http://<IP>:<Port>'. It is assigned as a
#   Series directly: wrapping it in dict() is unnecessary and makes the result
#   depend on how pandas interprets a dict assigned to a column.
# - City is dropped with errors='ignore' and without inplace=True, so the cell
#   can be re-run without failing on the already-removed column.
data = (
    data
    .assign(
        Country='Iran',
        URL=lambda frame: 'http://' + frame['IP'].astype(str) + ':' + frame['Port'].astype(str),
    )
    .drop(columns='City', errors='ignore')
)
data

In [None]:
# Create a pool of unique proxy URLs.
proxies = set(data['URL'])

# Target used to probe each proxy. It has its own name so the scrape address
# stored in ``url`` (read by get_tasks) is not overwritten by this cell.
check_url = 'https://www.google.com/'

# Probe every proxy; the ones that fail are collected and removed in one pass
# afterwards instead of re-filtering the frame inside the loop.
dead_proxies = []
for proxy in proxies:
    try:
        response = requests.get(
            check_url, proxies={"http": proxy, "https": proxy}, timeout=60)

        # Any HTTP response counts as "alive"; the status code is only reported.
        print("Status OK with {}, Output:".format(proxy), response.status_code)

    except OSError as e:
        # requests' exceptions derive from OSError, so connection errors and
        # timeouts land here.
        dead_proxies.append(proxy)
        print(e)

data = data[~data['URL'].isin(dead_proxies)]
        



In [None]:
# Write the frame to a CSV file named after today's date (proxies_YYYY-MM-DD.csv).
data.to_csv(f"proxies_{date.today():%Y-%m-%d}.csv")