In [126]:
import pandas as pd 
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import time
import unicodedata
import re
from pymongo import MongoClient
import gspread
from dotenv import load_dotenv
import os
from itertools import chain

Team name constants

In [71]:
# Team tag -> lower-case substrings looked for in a player/team label
# (consumed by detect_team_in_name through its team_reference_dict argument).
team_tags_dico = dict(
    SCL=["scl", "scald"],
    IWG=["iwg"],
)

Instantiate the Selenium web browser

In [72]:
# Build a Firefox WebDriver, passing the "--headless" argument through its options.
# `driver` is a module-level object: scraping_draft reads it as a global.
# NOTE(review): no visible cell calls driver.quit(); confirm the browser is closed elsewhere.
driver_options = webdriver.FirefoxOptions()
driver_options.add_argument("--headless")
driver = webdriver.Firefox(options=driver_options)

In [73]:
def getBlue_picks_bans(soup,name):
    """Collect the blue side's picks and bans from a parsed draft page.

    Parameters
    ----------
    soup
        Parsed page exposing ``find(attrs=...)``; the matched elements must
        provide ``get_text`` and ``find_all``.
    name
        Team label, returned unchanged as the third element.

    Returns
    -------
    tuple
        ``(picks, bans, name)`` where ``picks`` is the text of the
        "roomPickColumn blue" element split on "|" and ``bans`` is the list of
        ``alt`` attributes of the images inside the "roomBanRow blue" element.
    """
    pick_column = soup.find(attrs={"class": "roomPickColumn blue"})
    # Join the text fragments with "|" and split on it again, one entry per fragment.
    champion_picks = pick_column.get_text("|").split("|")

    ban_row = soup.find(attrs={"class": "roomBanRow blue"})
    champion_bans = []
    for image in ban_row.find_all('img'):
        champion_bans.append(image['alt'])

    return champion_picks, champion_bans, name

In [74]:
def getRed_picks_bans(soup,name):
    """Collect the red side's picks and bans from a parsed draft page.

    Parameters
    ----------
    soup
        Parsed page exposing ``find(attrs=...)``; the matched elements must
        provide ``get_text`` and ``find_all``.
    name
        Team label, returned unchanged as the third element.

    Returns
    -------
    tuple
        ``(picks, bans, name)`` where ``picks`` is the text of the
        "roomPickColumn red" element split on "|" and ``bans`` is the list of
        ``alt`` attributes of the images inside the "roomBanRow red" element.
    """
    pick_column = soup.find(attrs={"class": "roomPickColumn red"})
    # Join the text fragments with "|" and split on it again, one entry per fragment.
    champion_picks = pick_column.get_text("|").split("|")

    ban_row = soup.find(attrs={"class": "roomBanRow red"})
    champion_bans = []
    for image in ban_row.find_all('img'):
        champion_bans.append(image['alt'])

    return champion_picks, champion_bans, name

In [75]:
def detect_team_in_name(name,team_reference_dict, side) :
    """Map a free-text team/player label to a known team tag.

    Parameters
    ----------
    name : str
        Label to inspect.
    team_reference_dict : dict
        Maps a team tag to an iterable of keyword strings.
    side
        Unused by the lookup; kept so existing callers keep working.

    Returns
    -------
    str
        The first tag (in dict order) that has a keyword occurring in ``name``,
        compared case-insensitively; ``name`` itself when no keyword matches.
    """
    # Lower-case the label once instead of once per keyword.
    lowered_name = name.lower()

    for tag, keywords in team_reference_dict.items():
        # The keyword is lower-cased as well: the label is always compared in
        # lower case, so a keyword containing an upper-case letter would
        # otherwise never match.
        if any(word.lower() in lowered_name for word in keywords):
            return tag
    return name

In [76]:
def extract_date(name):
    """Return the tail of ``name`` that starts at its first digit.

    Parameters
    ----------
    name : str
        Label to scan.

    Returns
    -------
    str
        ``name`` sliced from the first digit to the end, or the string "NaT"
        when ``name`` contains no digit. The slice is returned as-is; it is not
        parsed or validated.
    """
    first_digit = re.search(r'\d', name)
    if first_digit is None:
        return "NaT"
    return name[first_digit.start():]

In [77]:
def get_side_by_tag(soup,team_reference_dict):
    """Identify the blue and red teams of a draft page and the game date.

    For each side, the text of the node immediately preceding the
    "roomReadyBackground" element is read and NFKD-normalised, then passed to
    detect_team_in_name.

    Parameters
    ----------
    soup
        Parsed draft page.
    team_reference_dict : dict
        Tag -> keywords mapping forwarded to detect_team_in_name.

    Returns
    -------
    tuple
        ``(blue_team, red_team, game_date)``. ``game_date`` is what extract_date
        returns for the label of whichever side resolved to "SCL" (blue is
        checked first), or "NaT" when neither side did.
    """
    def _label_before(ready_class):
        # Text of the sibling that precedes the side's "ready" background element.
        ready_marker = soup.find(attrs={"class" : ready_class})
        return unicodedata.normalize("NFKD", ready_marker.previous_sibling.get_text())

    blue_text = _label_before("roomReadyBackground roomReadyBackgroundblue")
    red_text = _label_before("roomReadyBackground roomReadyBackgroundred")

    blue_team = detect_team_in_name(blue_text, team_reference_dict, "Blue")
    red_team = detect_team_in_name(red_text, team_reference_dict, "Red")

    # Only the "SCL" side's label is used as the source of the date.
    game_date = "NaT"
    if blue_team == "SCL" :
        game_date = extract_date(blue_text)
    elif red_team == "SCL" :
        game_date = extract_date(red_text)

    return blue_team, red_team, game_date

In [102]:
def scraping_draft(draft_url,team_reference_dict, render_wait_seconds=5):
    """Load a draft page in the shared browser and return its picks and bans.

    Parameters
    ----------
    draft_url : str
        Address of the draft page; also stored under "link" in the result.
    team_reference_dict : dict
        Tag -> keywords mapping forwarded to get_side_by_tag.
    render_wait_seconds : float, optional
        Pause between requesting the page and reading its source. Defaults to
        5, the value that used to be hard-coded, so existing calls behave the
        same.

    Returns
    -------
    dict
        ``{"link", "date", "blue": {"picks", "bans", "team"},
        "red": {"picks", "bans", "team"}}``.
    """
    # Uses the module-level `driver` created earlier in the notebook.
    driver.get(draft_url)
    # Fixed pause before the page source is read.
    # NOTE(review): presumably this gives client-side rendering time to finish; confirm.
    time.sleep(render_wait_seconds)

    soup = BeautifulSoup(driver.page_source, "html.parser")

    blue_team, red_team, game_date = get_side_by_tag(soup, team_reference_dict)

    blue_picks, blue_bans, blue_name = getBlue_picks_bans(soup, blue_team)
    red_picks, red_bans, red_name = getRed_picks_bans(soup, red_team)

    return {
        "link" : draft_url,
        "date" : game_date,
        "blue" : {
            "picks" : blue_picks,
            "bans"  : blue_bans,
            "team"  : blue_name,
        },
        "red" : {
            "picks" : red_picks,
            "bans"  : red_bans,
            "team"  : red_name,
        },
    }

In [122]:
def document_exist(draft_url):
    """Tell whether a draft with this link is already stored in MongoDB.

    Looks in the ``drafts`` collection of the ``lol_match_database`` database,
    using a client created with the default connection settings.

    Parameters
    ----------
    draft_url : str
        Value matched against the "link" field of the stored documents.

    Returns
    -------
    bool
        True when a document with that link exists, False otherwise.
    """
    # A client is opened for every call; the context manager closes it on exit
    # so repeated calls do not accumulate open clients.
    with MongoClient() as client:
        draft_collection = client['lol_match_database']['drafts']
        return draft_collection.find_one({"link" : draft_url}) is not None

## Main
Get the draft URLs from Google Sheets and check whether each document already exists in the database

In [None]:
# Load the .env file so GOOGLE_CREDENTIALS_PATH and SPREADSHEET_KEY can be read
# from the environment.
load_dotenv()
gc = gspread.service_account(filename=os.getenv("GOOGLE_CREDENTIALS_PATH"))

sh = gc.open_by_key(os.getenv("SPREADSHEET_KEY"))

# Flatten the rows of range K2:M of the "Historique de Scrim" worksheet into one
# list. Blank cells are dropped so that empty strings are never treated as draft
# URLs by the cells below.
list_draft_url = [
    cell
    for cell in chain.from_iterable(sh.worksheet("Historique de Scrim").get("K2:M"))
    if str(cell).strip()
]

In [None]:
# Check every URL against the database first, then pair each URL with its
# result and scrape only the drafts that are not stored yet.
for url, exists in zip(list_draft_url, [document_exist(link) for link in list_draft_url]) :
    if exists :
        continue
    print(scraping_draft(draft_url=url,team_reference_dict=team_tags_dico))

In [None]:
#TODO push into the database