# Web Scraping with Python BeautifulSoup
---

### Local HTML Scraping

In [None]:
from bs4 import BeautifulSoup

# The page declares <meta charset="utf-8">, so decode it as UTF-8 explicitly
# instead of relying on the platform's default text encoding.
with open('home.html', 'r', encoding='utf-8') as html_file:
    content = html_file.read()
    print(content)

<!doctype html>
<html lang="en">
   <head>
      <meta charset="utf-8">
      <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
      <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css" integrity="sha384-JcKb8q3iqJ61gNV9KGb8thSsNjpSL0n8PARn9HuZOnIxN0hoP+VmmDGMN5t9UJ0Z" crossorigin="anonymous">
      <title>My Courses</title>
   </head>
   <body>
      <h1>Hello, Start Learning!</h1>
      <div class="card" id="card-python-for-beginners">
         <div class="card-header">
            Python
         </div>
         <div class="card-body">
            <h5 class="card-title">Python for beginners</h5>
            <p class="card-text">If you are new to Python, this is the course that you should buy!</p>
            <a href="#" class="btn btn-primary">Start for 20$</a>
         </div>
      </div>
      <div class="card" id="card-python-web-development">
         <div class="card-header">
            Pyt

In [None]:
# Parse the local page and report the title and price of every course card.
soup = BeautifulSoup(content, 'lxml')
courses = soup.find_all('div', class_='card')
for course in courses:
    heading = course.h5
    button = course.a
    title = heading.text
    # The price is the last whitespace-separated token of the button label.
    price = button.text.rsplit(maxsplit=1)[-1]
    print(f"Title: {title}\nPrice: {price}")

Title: Python for beginners
Price: 20$
Title: Python Web Development
Price: 50$
Title: Python Machine Learning
Price: 100$


### Website Scraping

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.timesjobs.com/candidate/job-search.html?searchType=personalizedSearch&from=submit&txtKeywords=python&txtLocation="
# Bound the request and fail loudly on an HTTP error instead of parsing an
# error page as if it were the search results.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_text = response.text
soup = BeautifulSoup(html_text, 'lxml')
jobs = soup.find_all('li', class_='clearfix job-bx wht-shd-bx')

job_list = []
for job in jobs:
    job_title = job.h2.text.strip()
    company_name = job.h3.text.strip()
    # The location is taken as the last whitespace-separated token of the
    # details list, so a multi-word place keeps only its last word.
    place = job.find('ul', class_='top-jd-dtl clearfix').text.strip().split()[-1]
    # When that token is 'location_on' no place was listed: the token is
    # presumably the label of the location icon (TODO confirm against the page
    # markup). Record the place as missing rather than storing the icon label.
    if place == 'location_on':
        place = None
    skills = job.find('span', class_='srp-skills').text.strip().replace('  ,  ', ', ')
    more_info = job.h2.a['href']
    job_list.append((job_title, company_name, place, skills, more_info))

display(pd.DataFrame(job_list, columns=['job_title', 'company_name', 'place', 'skills', 'more_info']))

Unnamed: 0,job_title,company_name,place,skills,more_info
0,Python,Surya Informatics Solutions Pvt. Ltd.,Chennai,"python, web technologies, linux, mobile, mysql...",https://www.timesjobs.com/job-detail/python-su...
1,Python Developer,Ivan Infotech Pvt. Ltd.,Kolkata,"rest, python, security, debugging",https://www.timesjobs.com/job-detail/python-de...
2,Python Engineer,east india securities ltd.,Kolkata,"python, hadoop, machine learning",https://www.timesjobs.com/job-detail/python-en...
3,Python Developer,TECHNOPARK TRIVANDRUM,Thiruvananthapuram,"rest, python, devops, shell scripting",https://www.timesjobs.com/job-detail/python-de...
4,Python Developer,art technology and software india pvt ltd,Ernakulam,"rest, python, database, django, api",https://www.timesjobs.com/job-detail/python-de...
5,Python intern,sjain ventures,Raipur,"python, web developer, web services",https://www.timesjobs.com/job-detail/python-in...
6,Python developer,TandA HR Solutions,Chandigarh,"Python, Django, Flask, unit testing",https://www.timesjobs.com/job-detail/python-de...
7,Python Developer,TandA HR Solutions,Chandigarh,"python, git, django, GIT Hub",https://www.timesjobs.com/job-detail/python-de...
8,Python developer,TandA HR Solutions,Chandigarh,"Python, Django, Flask, GIT Hub",https://www.timesjobs.com/job-detail/python-de...
9,Python Engineer,brickred ( 3pillar global ),location_on,"python, django, web development",https://www.timesjobs.com/job-detail/python-en...


# Web Scraping with Python Selenium
---
Web Scraping bots, Browser Automation, Testing

In [None]:
import os
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import pandas as pd
import time

# Make the ChromeDriver folder discoverable through PATH. The entry has to be
# joined with os.pathsep, otherwise it is glued onto the last existing entry;
# it is added only once so re-running the cell does not keep growing PATH.
chromedriver_dir = r"C:/Program Files/ChromeDriver"
if chromedriver_dir not in os.environ['PATH'].split(os.pathsep):
    os.environ['PATH'] += os.pathsep + chromedriver_dir
url = "https://www.adamchoi.co.uk/overs/detailed"

data = []       # full text of every table row, reused by the regex cell below
date = []
home_team = []
score = []
away_team = []

driver = webdriver.Chrome()
try:
    driver.get(url)
    time.sleep(5)  # fixed pause before the page elements are looked up

    # button
    allmatches_button = driver.find_element(By.XPATH, '//label[@analytics-event="All matches"]')
    allmatches_button.click()
    # dropdown
    country_dropdown = Select(driver.find_element(By.ID, 'country'))
    country_dropdown.select_by_visible_text('Japan')
    season_dropdown = Select(driver.find_element(By.ID, 'season'))
    season_dropdown.select_by_visible_text('2021')

    matches = driver.find_elements(By.TAG_NAME, 'tr')
    for match in matches:
        data.append(match.text)
        # './td[n]' is relative to the current row; '//tr/td[n]' would search
        # the whole document and always return the first row's cell.
        date.append(match.find_element(By.XPATH, './td[1]').text)
        home_team.append(match.find_element(By.XPATH, './td[2]').text)
        score.append(match.find_element(By.XPATH, './td[3]').text)
        away_team.append(match.find_element(By.XPATH, './td[4]').text)
finally:
    # Always close the browser, even when a lookup above raises.
    driver.quit()

df = pd.DataFrame({'date': date, 'home_team': home_team, 'score': score, 'away_team': away_team})
df

Unnamed: 0,date,home_team,score,away_team
0,28-02-2021,Avispa Fukuoka,1 - 2,Nagoya Grampus
1,06-03-2021,Shimizu S-Pulse,2 - 2,Avispa Fukuoka
2,10-03-2021,Avispa Fukuoka,1 - 3,Yokohama F-Marinos
3,13-03-2021,Tokushima Vortis,1 - 2,Avispa Fukuoka
4,17-03-2021,Avispa Fukuoka,1 - 0,Kashima Antlers
...,...,...,...,...
755,03-11-2021,Yokohama FC,0 - 0,Sagan Tosu
756,07-11-2021,Avispa Fukuoka,1 - 1,Yokohama FC
757,20-11-2021,Yokohama FC,0 - 2,Vissel Kobe
758,27-11-2021,Oita,2 - 0,Yokohama FC


#### Extract each line of data into 4 columns using regex

In [None]:
dd = pd.DataFrame(data, columns=['data'])

# Each row's text is expected to read "<dd-mm-yyyy> <home team> <n - n> <away team>".
# \d+ (rather than a single \d) also accepts scores of 10 or more goals;
# lines that do not match the pattern come back as a row of NaN.
regex = r"^(\d{2}-\d{2}-\d{4})\s+(.+)\s+(\d+ - \d+)\s+(.+)$"
dd = dd['data'].str.extract(regex, expand=True)

dd.columns = ['date', 'home_team', 'score', 'away_team']

dd

Unnamed: 0,date,home_team,score,away_team
0,28-02-2021,Avispa Fukuoka,1 - 2,Nagoya Grampus
1,06-03-2021,Shimizu S-Pulse,2 - 2,Avispa Fukuoka
2,10-03-2021,Avispa Fukuoka,1 - 3,Yokohama F-Marinos
3,13-03-2021,Tokushima Vortis,1 - 2,Avispa Fukuoka
4,17-03-2021,Avispa Fukuoka,1 - 0,Kashima Antlers
...,...,...,...,...
755,03-11-2021,Yokohama FC,0 - 0,Sagan Tosu
756,07-11-2021,Avispa Fukuoka,1 - 1,Yokohama FC
757,20-11-2021,Yokohama FC,0 - 2,Vissel Kobe
758,27-11-2021,Oita,2 - 0,Yokohama FC


# Web Scraping with Python Pandas
---

In [None]:
import pickle
from pathlib import Path
from string import ascii_uppercase as alphabet

import pandas as pd

In [None]:
# Show the uppercase letters imported as `alphabet` from the string module.
print(alphabet)

ABCDEFGHIJKLMNOPQRSTUVWXYZ


In [None]:
tables = pd.read_html('https://en.wikipedia.org/wiki/2022_FIFA_World_Cup')

# Positions of the group tables in the list returned by read_html: the first
# one is at index 9 and the following ones come every 7 tables, 8 in total.
# These numbers depend on the layout of the Wikipedia page and have to be
# re-checked if the page changes.
FIRST_GROUP_TABLE = 9
TABLES_PER_GROUP = 7
N_GROUPS = 8
group_indices = range(
    FIRST_GROUP_TABLE,
    FIRST_GROUP_TABLE + N_GROUPS * TABLES_PER_GROUP,
    TABLES_PER_GROUP,
)

dict_table = {}
for letter, i in zip(alphabet, group_indices):
    raw_table = tables[i]
    # Build a new frame instead of editing the one held in `tables`, so the
    # raw tables stay intact: the second column is renamed to 'Team' and the
    # 'Qualification' column is dropped.
    df = (
        raw_table
        .rename(columns={raw_table.columns[1]: 'Team'})
        .drop(columns='Qualification')
    )
    dict_table[f'Group {letter}'] = df

In [None]:
# Display the table stored under the 'Group A' key.
dict_table['Group A']

Unnamed: 0,Pos,Team,Pld,W,D,L,GF,GA,GD,Pts
0,1,Netherlands,3,2,1,0,5,1,+4,7
1,2,Senegal,3,2,0,1,5,4,+1,6
2,3,Ecuador,3,1,1,1,4,3,+1,4
3,4,Qatar (H),3,0,0,3,1,7,−6,0


In [None]:
# Display the table stored under the 'Group B' key.
dict_table['Group B']

Unnamed: 0,Pos,Team,Pld,W,D,L,GF,GA,GD,Pts
0,1,England,3,2,1,0,9,2,+7,7
1,2,United States,3,1,2,0,2,1,+1,5
2,3,Iran,3,1,0,2,4,7,−3,3
3,4,Wales,3,0,1,2,1,6,−5,1


In [None]:
# Display the table stored under the 'Group C' key.
dict_table['Group C']

Unnamed: 0,Pos,Team,Pld,W,D,L,GF,GA,GD,Pts
0,1,Argentina,3,2,0,1,5,2,+3,6
1,2,Poland,3,1,1,1,2,2,0,4
2,3,Mexico,3,1,1,1,2,3,−1,4
3,4,Saudi Arabia,3,1,0,2,3,5,−2,3


In [None]:
# Persist the group tables with pickle. The target folder is created first so
# the cell also works when data/ does not exist yet.
output_path = Path('data/dict_table')
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'wb') as output:
    pickle.dump(dict_table, output)

### Scrape the FIFA World Cup Data from Wikipedia

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Every fourth year from 1930 to 2018, leaving out 1942 and 1946.
years = [year for year in range(1930, 2019, 4) if year not in (1942, 1946)]

def get_matches(year, timeout=30):
    """Scrape the match results from one World Cup edition's Wikipedia page.

    Args:
        year: Edition year; it is interpolated into the page URL and stored
            in the ``year`` column of every returned row.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A DataFrame with the columns ``year``, ``home``, ``score`` and
        ``away``, holding one row per ``div.footballbox`` element on the page.

    Raises:
        requests.HTTPError: If the page request returns an error status.
            Without this check an error page would be parsed and silently
            produce an empty frame.
    """
    url = f'https://en.wikipedia.org/wiki/{year}_FIFA_World_Cup'
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # One record per match box, so the three fields of a match always stay
    # together instead of living in three parallel lists.
    records = [
        {
            'year': year,
            'home': match.find('th', class_='fhome').get_text(),
            'score': match.find('th', class_='fscore').get_text(),
            'away': match.find('th', class_='faway').get_text(),
        }
        for match in soup.find_all('div', class_='footballbox')
    ]
    # Passing the columns explicitly keeps the layout stable even when the
    # page contains no match boxes.
    return pd.DataFrame(records, columns=['year', 'home', 'score', 'away'])

# Scrape every edition, then stack the per-year frames under one fresh index.
fifa = list(map(get_matches, years))
df_fifa = pd.concat(fifa, ignore_index=True)
df_fifa

Unnamed: 0,year,home,score,away
0,1930,France,4–1,Mexico
1,1930,Argentina,1–0,France
2,1930,Chile,3–0,Mexico
3,1930,Chile,1–0,France
4,1930,Argentina,6–3,Mexico
...,...,...,...,...
896,2018,Russia,2–2 (a.e.t.),Croatia
897,2018,France,1–0,Belgium
898,2018,Croatia,2–1 (a.e.t.),England
899,2018,Belgium,2–0,England
