In [2]:
import time
import pandas as pd
import concurrent.futures
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import html
import re


In [3]:
# Load the match table and collect every non-missing Match Report link.
df = pd.read_csv("match_info.csv")
match_links = list(df["Match_Report_Link"].dropna())

def _select_text(soup, selector, default="N/A"):
    """Return the stripped text of the first element matching ``selector``.

    ``default`` is returned when nothing matches.
    """
    node = soup.select_one(selector)
    return node.text.strip() if node is not None else default


def _manager_name(soup, team_column):
    """Return the manager text from scorebox column ``team_column`` (1 = home, 2 = away)."""
    node = soup.select_one(
        f"#content > div.scorebox > div:nth-child({team_column}) > div:nth-child(5)"
    )
    if node is None:
        return "N/A"
    # Keep only what follows the last colon, then decode any HTML entities.
    return html.unescape(node.text.split(":")[-1].strip())


def _formation(soup, lineup_id):
    """Return the first parenthesised group in the header of lineup table ``lineup_id``.

    Returns "N/A" when the header is missing or contains no parentheses.
    """
    node = soup.select_one(f"#{lineup_id} > table > tbody > tr:nth-child(1) > th")
    if node is None:
        return "N/A"
    groups = re.findall(r'\((.*?)\)', node.text)
    return groups[0] if groups else "N/A"


def _extra_stats(soup):
    """Return the short (<= 20 chars) texts of the class-less divs in ``#team_stats_extra``.

    Returns an empty list when the section is absent.
    """
    section = soup.find("div", {"id": "team_stats_extra"})
    if section is None:
        return []
    texts = (div.text.strip() for div in section.find_all("div", class_=False))
    return [text for text in texts if len(text) <= 20]


def _team_stat(soup, row, column):
    """Return the ``<strong>`` text in ``#team_stats`` at table ``row`` / ``column``."""
    return _select_text(
        soup,
        f"#team_stats > table > tbody > tr:nth-child({row}) > td:nth-child({column})"
        " > div > div:nth-child(1) > strong",
    )


def _count_cards(soup, column, card_class):
    """Count elements with ``card_class`` in row 11 of ``#team_stats`` for ``column``."""
    return len(soup.select(
        f"#team_stats > table > tbody > tr:nth-child(11) > td:nth-child({column}) .{card_class}"
    ))


def fetch_match_details(link):
    """Scrape the detailed statistics from one match report page.

    Args:
        link: Match report URL. Values that do not start with "http" are
            treated as paths relative to https://fbref.com.

    Returns:
        A dict with manager, lineup, extra-stat, possession, shots-on-target
        and card fields plus the absolute ``Match_Report_Link``. Text fields
        that cannot be found on the page are set to "N/A". Returns ``None``
        (after printing the error) when the page cannot be loaded or parsed.
    """
    try:
        if not link.startswith("http"):
            link = "https://fbref.com" + link

        detail_driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        try:
            detail_driver.get(link)
            time.sleep(2)
            page_source = detail_driver.page_source
        finally:
            # Always close the browser, even when loading the page fails,
            # so a failed match does not leave a Chrome process behind.
            detail_driver.quit()

        soup = BeautifulSoup(page_source, "html.parser")

        # Extra stats (fouls, corners, crosses, touches, tackles) come as a flat
        # list read at fixed positions: home at i, away at i + 2.
        # NOTE(review): the skipped middle entry is presumably the stat label — confirm.
        extra = _extra_stats(soup)

        def extra_at(index):
            # A missing or short section yields "N/A" instead of an IndexError
            # that would discard the whole match.
            return extra[index] if index < len(extra) else "N/A"

        # NOTE(review): saved notebook output shows Shots_on_Target values such as
        # "36%", so row 7 may hold a percentage rather than a count — confirm.
        match_detail = {
            "Manager_Home": _manager_name(soup, 1),
            "Manager_Away": _manager_name(soup, 2),
            "Lineup_Home": _formation(soup, "a"),
            "Lineup_Away": _formation(soup, "b"),
            "Fouls_Home": extra_at(0), "Fouls_Away": extra_at(2),
            "Corners_Home": extra_at(3), "Corners_Away": extra_at(5),
            "Crosses_Home": extra_at(6), "Crosses_Away": extra_at(8),
            "Touches_Home": extra_at(9), "Touches_Away": extra_at(11),
            "Tackles_Home": extra_at(12), "Tackles_Away": extra_at(14),
            "Possession_Home": _team_stat(soup, 3, 1),
            "Possession_Away": _team_stat(soup, 3, 2),
            "Shots_on_Target_Home": _team_stat(soup, 7, 1),
            "Shots_on_Target_Away": _team_stat(soup, 7, 2),
            "Yellow_Cards_Home": _count_cards(soup, 1, "yellow_card"),
            "Yellow_Cards_Away": _count_cards(soup, 2, "yellow_card"),
            "Red_Cards_Home": _count_cards(soup, 1, "red_card"),
            "Red_Cards_Away": _count_cards(soup, 2, "red_card"),
            "Match_Report_Link": link
        }

        return match_detail
    except Exception as e:
        print(f"Lỗi khi cào dữ liệu trận đấu {link}: {e}")
        return None

# Scrape the match reports one at a time, pausing after every request.
match_details = []

for link in match_links:
    result = fetch_match_details(link)
    # Failed scrapes come back as None and are left out of the table.
    if result:
        match_details += [result]
    print(f"⏸ Nghỉ 10 giây sau khi cào trận {link}...")
    time.sleep(10)  # wait 10 seconds before the next match

# Turn the collected rows into a DataFrame and write the final result to disk.
df_details = pd.DataFrame(match_details)
df_details.to_csv("match_detail.csv", index=False)

print("✅ Đã lưu match_detail.csv với dữ liệu chi tiết!")



⏸ Nghỉ 10 giây sau khi cào trận /en/matches/cc5b4244/Manchester-United-Fulham-August-16-2024-Premier-League...
⏸ Nghỉ 10 giây sau khi cào trận /en/matches/a1d0d529/Ipswich-Town-Liverpool-August-17-2024-Premier-League...
⏸ Nghỉ 10 giây sau khi cào trận /en/matches/34557647/Newcastle-United-Southampton-August-17-2024-Premier-League...
⏸ Nghỉ 10 giây sau khi cào trận /en/matches/4efc72e4/Nottingham-Forest-Bournemouth-August-17-2024-Premier-League...
⏸ Nghỉ 10 giây sau khi cào trận /en/matches/71618ace/Everton-Brighton-and-Hove-Albion-August-17-2024-Premier-League...
⏸ Nghỉ 10 giây sau khi cào trận /en/matches/c0e3342a/Arsenal-Wolverhampton-Wanderers-August-17-2024-Premier-League...
⏸ Nghỉ 10 giây sau khi cào trận /en/matches/eac7c00b/West-Ham-United-Aston-Villa-August-17-2024-Premier-League...
⏸ Nghỉ 10 giây sau khi cào trận /en/matches/b63822b9/Brentford-Crystal-Palace-August-18-2024-Premier-League...
⏸ Nghỉ 10 giây sau khi cào trận /en/matches/67a0c715/Chelsea-Manchester-City-August-18-

In [16]:
# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Reload the scraped details and strip the site prefix from the report links.
df_details = pd.read_csv('match_detail.csv').assign(
    Match_Report_Link=lambda frame: frame['Match_Report_Link'].str.replace(
        'https://fbref.com', '', regex=False
    )
)

In [17]:
# Preview up to the first 50 rows of the scraped match details.
df_details.head(50)


Unnamed: 0,Manager_Home,Manager_Away,Lineup_Home,Lineup_Away,Fouls_Home,Fouls_Away,Corners_Home,Corners_Away,Crosses_Home,Crosses_Away,Touches_Home,Touches_Away,Tackles_Home,Tackles_Away,Possession_Home,Possession_Away,Shots_on_Target_Home,Shots_on_Target_Away,Yellow_Cards_Home,Yellow_Cards_Away,Red_Cards_Home,Red_Cards_Away,Match_Report_Link
0,Erik ten Hag,Marco Silva,4-2-3-1,4-2-3-1,12.0,10.0,7.0,8.0,18.0,21.0,640.0,540.0,21.0,24.0,55%,45%,36%,20%,2,3,0,0,/en/matches/cc5b4244/Manchester-United-Fulham-...
1,Kieran McKenna,Arne Slot,4-2-3-1,4-2-3-1,9.0,18.0,2.0,10.0,8.0,23.0,497.0,720.0,20.0,9.0,38%,62%,29%,28%,3,1,0,0,/en/matches/a1d0d529/Ipswich-Town-Liverpool-Au...
2,Eddie Howe,Russell Martin,4-3-3,3-5-2,15.0,16.0,3.0,12.0,12.0,30.0,317.0,788.0,16.0,13.0,23%,77%,33%,16%,2,4,1,0,/en/matches/34557647/Newcastle-United-Southamp...
3,,,,,,,,,,,,,,,,,,,0,0,0,0,/en/matches/4efc72e4/Nottingham-Forest-Bournem...
4,Sean Dyche,Fabian Hürzeler,4-2-3-1,4-2-3-1,8.0,8.0,1.0,5.0,17.0,14.0,499.0,711.0,23.0,21.0,40%,60%,11%,50%,1,1,1,0,/en/matches/71618ace/Everton-Brighton-and-Hove...
5,Mikel Arteta,Gary O'Neil,4-3-3,4-2-3-1,17.0,14.0,8.0,2.0,15.0,14.0,569.0,493.0,18.0,19.0,53%,47%,33%,33%,2,2,0,0,/en/matches/c0e3342a/Arsenal-Wolverhampton-Wan...
6,Lopetegui,Unai Emery,4-1-4-1,4-2-3-1,18.0,11.0,5.0,3.0,25.0,15.0,571.0,561.0,14.0,24.0,52%,48%,15%,20%,1,2,0,0,/en/matches/eac7c00b/West-Ham-United-Aston-Vil...
7,Thomas Frank,Oliver Glasner,4-3-3,3-4-3,6.0,15.0,4.0,7.0,17.0,14.0,542.0,640.0,19.0,22.0,46%,54%,56%,43%,1,5,0,0,/en/matches/b63822b9/Brentford-Crystal-Palace-...
8,Enzo Maresca,Pep Guardiola,4-2-3-1,3-2-4-1,12.0,9.0,4.0,3.0,18.0,11.0,629.0,664.0,20.0,16.0,48%,52%,30%,45%,1,1,0,0,/en/matches/67a0c715/Chelsea-Manchester-City-A...
9,,,,,,,,,,,,,,,,,,,0,0,0,0,/en/matches/62eea1d6/Leicester-City-Tottenham-...


In [9]:
# Summarise column dtypes and non-null counts of the scraped match details.
# NOTE(review): this cell's execution count (9) is lower than the cells above it
# (16, 17), so its saved output may be stale — re-run to confirm.
df_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Manager_Home          50 non-null     object
 1   Manager_Away          50 non-null     object
 2   Lineup_Home           50 non-null     object
 3   Lineup_Away           50 non-null     object
 4   Fouls_Home            50 non-null     object
 5   Fouls_Away            50 non-null     object
 6   Corners_Home          50 non-null     object
 7   Corners_Away          50 non-null     object
 8   Crosses_Home          50 non-null     object
 9   Crosses_Away          50 non-null     object
 10  Touches_Home          50 non-null     object
 11  Touches_Away          50 non-null     object
 12  Tackles_Home          50 non-null     object
 13  Tackles_Away          50 non-null     object
 14  Possession_Home       50 non-null     object
 15  Possession_Away       50 non-null     obje

In [None]:
# Join the match table with the scraped details on the report link (inner join),
# then preview the first rows of the result.
info = pd.read_csv('match_info.csv')
merge = info.merge(df_details, how='inner', on='Match_Report_Link')
merge.head()


Unnamed: 0,Day,Date,Time,Home,xG_Home,Score,xG_Away,Away,Attendance,Venue,Referee,Match_Report_Link,Manager_Home,Manager_Away,Lineup_Home,Lineup_Away,Fouls_Home,Fouls_Away,Corners_Home,Corners_Away,Crosses_Home,Crosses_Away,Touches_Home,Touches_Away,Tackles_Home,Tackles_Away,Possession_Home,Possession_Away,Shots_on_Target_Home,Shots_on_Target_Away,Yellow_Cards_Home,Yellow_Cards_Away,Red_Cards_Home,Red_Cards_Away
0,Fri,2024-08-16,20:00 (02:00),Manchester Utd,2.4,1–0,0.4,Fulham,73297,Old Trafford,Robert Jones,/en/matches/cc5b4244/Manchester-United-Fulham-...,Erik ten Hag,Marco Silva,4-2-3-1,4-2-3-1,12.0,10.0,7.0,8.0,18.0,21.0,640.0,540.0,21.0,24.0,55%,45%,36%,20%,2,3,0,0
1,Sat,2024-08-17,12:30 (18:30),Ipswich Town,0.5,0–2,2.6,Liverpool,30014,Portman Road Stadium,Tim Robinson,/en/matches/a1d0d529/Ipswich-Town-Liverpool-Au...,Kieran McKenna,Arne Slot,4-2-3-1,4-2-3-1,9.0,18.0,2.0,10.0,8.0,23.0,497.0,720.0,20.0,9.0,38%,62%,29%,28%,3,1,0,0
2,Sat,2024-08-17,15:00 (21:00),Newcastle Utd,0.3,1–0,1.8,Southampton,52196,St James' Park,Craig Pawson,/en/matches/34557647/Newcastle-United-Southamp...,Eddie Howe,Russell Martin,4-3-3,3-5-2,15.0,16.0,3.0,12.0,12.0,30.0,317.0,788.0,16.0,13.0,23%,77%,33%,16%,2,4,1,0
3,Sat,2024-08-17,15:00 (21:00),Nott'ham Forest,1.3,1–1,1.2,Bournemouth,29763,The City Ground,Michael Oliver,/en/matches/4efc72e4/Nottingham-Forest-Bournem...,,,,,,,,,,,,,,,,,,,0,0,0,0
4,Sat,2024-08-17,15:00 (21:00),Everton,0.5,0–3,1.4,Brighton,39217,Goodison Park,Simon Hooper,/en/matches/71618ace/Everton-Brighton-and-Hove...,Sean Dyche,Fabian Hürzeler,4-2-3-1,4-2-3-1,8.0,8.0,1.0,5.0,17.0,14.0,499.0,711.0,23.0,21.0,40%,60%,11%,50%,1,1,1,0


In [20]:

# Persist the merged table. index=False keeps the positional row index out of the
# file (it would otherwise be written as an unnamed first column), matching how
# match_detail.csv is written.
merge.to_csv('full_match_info.csv', index=False)

In [22]:
# Summarise column dtypes and non-null counts of the merged table.
merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Day                   380 non-null    object 
 1   Date                  380 non-null    object 
 2   Time                  380 non-null    object 
 3   Home                  380 non-null    object 
 4   xG_Home               259 non-null    object 
 5   Score                 259 non-null    object 
 6   xG_Away               259 non-null    object 
 7   Away                  380 non-null    object 
 8   Attendance            257 non-null    object 
 9   Venue                 380 non-null    object 
 10  Referee               259 non-null    object 
 11  Match_Report_Link     380 non-null    object 
 12  Manager_Home          378 non-null    object 
 13  Manager_Away          378 non-null    object 
 14  Lineup_Home           257 non-null    object 
 15  Lineup_Away           2

In [24]:
# Same summary after dropping every row that has a missing value in any column.
# The filtered frame is only inspected here; it is not stored.
merge.dropna().info()

<class 'pandas.core.frame.DataFrame'>
Index: 255 entries, 0 to 258
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Day                   255 non-null    object 
 1   Date                  255 non-null    object 
 2   Time                  255 non-null    object 
 3   Home                  255 non-null    object 
 4   xG_Home               255 non-null    object 
 5   Score                 255 non-null    object 
 6   xG_Away               255 non-null    object 
 7   Away                  255 non-null    object 
 8   Attendance            255 non-null    object 
 9   Venue                 255 non-null    object 
 10  Referee               255 non-null    object 
 11  Match_Report_Link     255 non-null    object 
 12  Manager_Home          255 non-null    object 
 13  Manager_Away          255 non-null    object 
 14  Lineup_Home           255 non-null    object 
 15  Lineup_Away           255 no