# Web Scraping with Python Selenium
---
Web Scraping bots, Browser Automation, Testing

In [40]:
import os
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import pandas as pd
import time

# Directory that holds the chromedriver executable; it is added to PATH below.
chromedriver_dir = r"C:/Program Files/ChromeDriver"

# PATH entries must be separated by os.pathsep. Appending the directory without
# a separator would glue it onto the last existing entry instead of adding a new one.
# The membership check keeps PATH from growing each time this cell is re-run.
current_path = os.environ.get('PATH', '')
if chromedriver_dir not in current_path.split(os.pathsep):
    if current_path:
        os.environ['PATH'] = current_path + os.pathsep + chromedriver_dir
    else:
        os.environ['PATH'] = chromedriver_dir

# Page that is scraped by the cell below.
url = "https://www.adamchoi.co.uk/overs/detailed"

# Result containers, filled in lockstep: one entry per <tr> element found on the page.
# `data` keeps the full row text (reused by the regex cell further down);
# the other four lists hold the text of the row's first four <td> cells.
data = []
date = []
home_team = []
score = []
away_team = []

driver = webdriver.Chrome()
try:
    driver.get(url)
    # Fixed pause after navigation — presumably gives the page time to render
    # before elements are looked up (TODO confirm; an explicit wait would be sturdier).
    time.sleep(5)

    # Click the "All matches" toggle.
    allmatches_button = driver.find_element(By.XPATH, '//label[@analytics-event="All matches"]')
    allmatches_button.click()

    # Choose country and season through the two <select> dropdowns.
    country_dropdown = Select(driver.find_element(By.ID, 'country'))
    country_dropdown.select_by_visible_text('Japan')
    season_dropdown = Select(driver.find_element(By.ID, 'season'))
    season_dropdown.select_by_visible_text('2021')

    matches = driver.find_elements(By.TAG_NAME, 'tr')
    for match in matches:
        data.append(match.text)
        # The leading './' makes each XPath relative to the current row;
        # an absolute '//tr/td[1]' would search the whole document instead.
        date.append(match.find_element(By.XPATH, './td[1]').text)
        home_team.append(match.find_element(By.XPATH, './td[2]').text)
        score.append(match.find_element(By.XPATH, './td[3]').text)
        away_team.append(match.find_element(By.XPATH, './td[4]').text)
finally:
    # Always end the browser session, even when a lookup/click above raises;
    # otherwise the browser and driver processes stay alive after a failed run.
    driver.quit()

df = pd.DataFrame({'date': date, 'home_team': home_team, 'score': score, 'away_team': away_team})
df

Unnamed: 0,date,home_team,score,away_team
0,28-02-2021,Avispa Fukuoka,1 - 2,Nagoya Grampus
1,06-03-2021,Shimizu S-Pulse,2 - 2,Avispa Fukuoka
2,10-03-2021,Avispa Fukuoka,1 - 3,Yokohama F-Marinos
3,13-03-2021,Tokushima Vortis,1 - 2,Avispa Fukuoka
4,17-03-2021,Avispa Fukuoka,1 - 0,Kashima Antlers
...,...,...,...,...
755,03-11-2021,Yokohama FC,0 - 0,Sagan Tosu
756,07-11-2021,Avispa Fukuoka,1 - 1,Yokohama FC
757,20-11-2021,Yokohama FC,0 - 2,Vissel Kobe
758,27-11-2021,Oita,2 - 0,Yokohama FC


#### Extract each line of the raw row text into 4 columns using regex

In [47]:
# One-column frame holding the raw row text collected in `data` by the scraping cell.
rows_raw = pd.DataFrame(data, columns=['data'])

# Four capture groups: date (dd-mm-yyyy), home team, score, away team.
# The score group uses \d+ so that double-digit scores (e.g. "10 - 0") still match;
# with a single \d such rows would fail the pattern and come back as all-NaN.
regex = r"^(\d{2}-\d{2}-\d{4})\s+(.+)\s+(\d+ - \d+)\s+(.+)$"

# str.extract returns one column per capture group; rows that do not match yield NaN.
dd = rows_raw['data'].str.extract(regex, expand=True)

dd.columns = ['date', 'home_team', 'score', 'away_team']

dd

Unnamed: 0,date,home_team,score,away_team
0,28-02-2021,Avispa Fukuoka,1 - 2,Nagoya Grampus
1,06-03-2021,Shimizu S-Pulse,2 - 2,Avispa Fukuoka
2,10-03-2021,Avispa Fukuoka,1 - 3,Yokohama F-Marinos
3,13-03-2021,Tokushima Vortis,1 - 2,Avispa Fukuoka
4,17-03-2021,Avispa Fukuoka,1 - 0,Kashima Antlers
...,...,...,...,...
755,03-11-2021,Yokohama FC,0 - 0,Sagan Tosu
756,07-11-2021,Avispa Fukuoka,1 - 1,Yokohama FC
757,20-11-2021,Yokohama FC,0 - 2,Vissel Kobe
758,27-11-2021,Oita,2 - 0,Yokohama FC
