In [3]:
#Installing packages onto the device
#!{sys.executable} -m pip install -U selenium

import sys

#Define the folder with the python scripts for web scraping in order to import these scripts
sys.path.insert(0, 'C:\\Users\\AU451FE\\OneDrive - EY\\Desktop\\Python\\HSreplay_scraper\\Scripts')
from Analyzer import DeckAnalyzer as DA
from Selector import DeckSelector as DS
from Extractor import ArchetypeExtractor as AE

#External browser Selenium
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


#Other useful packages
import sys
from bs4 import BeautifulSoup
import requests
import time
from datetime import date
import datetime
import pandas as pd
import numpy as np
import re #String search
import warnings

#Silence the deprecation warning when minimizing the external drivers
warnings.filterwarnings('ignore', category=DeprecationWarning)


In [4]:
#Location of the chromedriver executable; it is handed to UltimateAnalyzer, which passes it on to webdriver.Chrome
driver_path = 'C:/Users/AU451FE/OneDrive - EY/Desktop/Python/HSreplay_scraper/chromedriver'

In [90]:
class UltimateAnalyzer:
    '''Return data on all or some decks from the hsreplay website as data frames

    A Chrome browser is driven through Selenium: the deck listing is opened, filtered by
    class and archetype, and for every listed deck the mulligan guide table and the
    overview tab are read into pandas data frames.

    :args:
    - driver_path = path to the chromedriver executable
    - minimized = if True, Chrome is started headless (without a visible window)
    - output_dir = folder the Excel files are written to; None keeps the default folder
    '''
    #Folder the Excel files are written to unless output_dir is passed to the constructor
    DEFAULT_OUTPUT_DIR = 'C:/Users/AU451FE/OneDrive - EY/Desktop/Python/HSreplay_Scraper/Data Frames'

    #Column layouts of the generated data frames
    CARD_COLUMNS = ['Mana Cost', 'Card Name', 'Card Count']
    DETAIL_COLUMNS = ['Mulligan WR', 'Kept', 'Drawn WR', 'Played WR', 'Turns Held', 'Turn Played']
    OVERVIEW_COLUMNS = ['Deck Code', 'Match Duration', 'Turns', 'Turn Duration', 'Overall Winrate',
                        'vs. Demon Hunter', 'vs. Druid', 'vs. Hunter',
                        'vs. Mage', 'vs. Paladin', 'vs. Priest', 'vs. Rogue',
                        'vs. Shaman', 'vs. Warlock', 'vs. Warrior', 'Sample Size']

    #Locator matching the link of every deck in the deck listing
    DECK_LINKS_XPATH = '//*[@id="decks-container"]/main/div[3]/section/ul/li/a'

    def __init__(self, driver_path, minimized = True, output_dir = None):
        self.driver_path = driver_path
        self.minimized = minimized
        self.output_dir = self.DEFAULT_OUTPUT_DIR if output_dir is None else output_dir

    def open_driver(self):
        '''Open an empty driver with the specified driver path

        The driver is stored in self.driver; nothing is returned.
        '''
        if self.minimized:
            options = webdriver.ChromeOptions()
            #The command line switch replaces the deprecated options.set_headless(True)
            options.add_argument('--headless')
            self.driver = webdriver.Chrome(self.driver_path, options=options)
        else:
            self.driver = webdriver.Chrome(self.driver_path)

        return None

    def open_website(self):
        '''Start the browser, open the hsreplay deck listing and dismiss the privacy window

        :raises:
        - Exception if the privacy window button does not show up within 10 seconds;
          the browser is closed before the exception is raised
        '''
        self.open_driver()
        self.driver.get('https://hsreplay.net/decks')

        self.driver.maximize_window()

        try:
            privacy_button = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'css-flk0bs')))
        except TimeoutException as err:
            #Do not leave an orphaned browser process behind
            self.driver.quit()
            raise Exception('The privacy window has not shown up; try running the script again') from err

        privacy_button.click()

        return None

    @staticmethod
    def _strip_arrows(text):
        '''Remove the up/down arrow characters that may accompany a value'''
        return text.replace('▼', '').replace('▲', '')

    @staticmethod
    def _to_float(text):
        '''Convert a cell text to float; NaN is returned when the text is not a number'''
        try:
            return float(text)
        except ValueError:
            return float('nan')

    def get_card_info(self):
        '''Analyze the mulligan guide page of the deck and store this information in a data frame

        The method assumes you already have the mulligan guide window open

        :returns:
        - df - one row per card with the columns CARD_COLUMNS followed by DETAIL_COLUMNS;
          'Turns Held' and 'Turn Played' are floats (NaN when the website shows no number),
          the remaining detail columns keep the text shown on the page

        :raises:
        - Exception if a card header does not consist of two or three lines
        '''
        #Generating the card names data
        cards = []
        for header in self.driver.find_elements(By.CLASS_NAME, 'table-row-header'):
            parts = header.text.split('\n')
            if len(parts) == 3:
                #mana cost / number of copies / name; a '★' in the count position stands for one copy
                mana_cost, count_text, card_name = parts
                card_count = int(count_text.replace('★', '1'))
            elif len(parts) == 2:
                #No count shown: the card is counted once
                mana_cost, card_name = parts
                card_count = 1
            else:
                raise Exception('Error - the scraper is not reading the card information properly')

            cards.append([int(mana_cost), card_name, card_count])

        #Generating the card details data; every card owns as many consecutive cells as there are detail columns
        cell_texts = [cell.text for cell in self.driver.find_elements(By.CLASS_NAME, 'table-cell')]
        per_card = len(self.DETAIL_COLUMNS)
        further_info = []
        missing_data = False
        for d in range(len(cell_texts) // per_card):
            mull_wr, per_kept, drawn_wr, played_wr, held, played = cell_texts[per_card * d:per_card * (d + 1)]
            turns_held = self._to_float(held)
            turns_played = self._to_float(played)
            if pd.isna(turns_held) or pd.isna(turns_played):
                missing_data = True

            #A non-numeric turn value only blanks that value instead of discarding the whole row
            further_info.append([self._strip_arrows(mull_wr), per_kept, self._strip_arrows(drawn_wr),
                                 self._strip_arrows(played_wr), turns_held, turns_played])

        if missing_data:
            print('Some cards in this deck contain missing data')

        #Concatenating the two data frames together
        df_card = pd.DataFrame(cards, columns = self.CARD_COLUMNS)
        df_further = pd.DataFrame(further_info, columns = self.DETAIL_COLUMNS)

        df = pd.concat([df_card, df_further], axis = 1)

        return df

    def get_overview(self):
        '''Analyze the overview page of the deck and store this information in a one-row data frame

        The method assumes you already have the overview window open

        :returns:
        - df - a single row with the columns OVERVIEW_COLUMNS

        :raises:
        - ValueError if the deck code cannot be read from the url or if the overview table
          does not contain the expected number of values
        '''
        cells = self.driver.find_elements(By.XPATH, "//tr/td[2]")
        url = self.driver.current_url

        code_match = re.search('decks/(.+?)/#tab', url)
        if code_match is None:
            raise ValueError(f'Could not read the deck code from the url {url}')

        #The deck code and the sample size are added around the values read from the table
        expected_cells = len(self.OVERVIEW_COLUMNS) - 2
        if len(cells) != expected_cells:
            raise ValueError(f'Expected {expected_cells} overview values but found {len(cells)}')

        overview = [code_match.group(1)]
        overview.extend(self._strip_arrows(cell.text) for cell in cells)

        #Add sample size manually
        sample_text = self.driver.find_element(
            By.XPATH, "//*[@id='deck-container']/div/aside/section/ul/li[1]/span").text
        overview.append(int(sample_text.replace(' games', '').replace(',', '')))

        df = pd.DataFrame([overview], columns = self.OVERVIEW_COLUMNS)

        return df

    def _scrape_decks(self, archetype_name):
        '''Visit every deck of the currently displayed deck listing and collect its data

        The method assumes the deck listing is already filtered to the wanted archetype.

        :args:
        - archetype_name = name of the archetype, only used in the progress messages

        :returns:
        - data_frames - a list starting with the overview data frame (one row per deck)
          followed by one card data frame per deck
        '''
        wait = WebDriverWait(self.driver, 8)
        deck_amount = len(self.driver.find_elements(By.XPATH, self.DECK_LINKS_XPATH))

        card_frames = []
        overviews = []
        for deck_position in range(1, deck_amount + 1):
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, "deck-tile")))

            #Deck links start at the second <li>; the first one is skipped (presumably a header row)
            xpath_deck = f'//*[@id="decks-container"]/main/div[3]/section/ul/li[{deck_position + 1}]/a'
            self.driver.find_element(By.XPATH, xpath_deck).click()

            wait.until(EC.presence_of_element_located((By.CLASS_NAME, "sort-header__title")))

            card_info = self.get_card_info()
            card_frames.append(card_info)

            #Switch to overview
            self.driver.find_element(By.ID, 'tab-overview').click()

            wait.until(EC.presence_of_element_located((By.CLASS_NAME, "mana-curve")))

            overviews.append(self.get_overview())

            print(f'Generated data for {deck_position}/{deck_amount} decks of archetype {archetype_name}')
            print(card_info)
            self.driver.back()

        #DataFrame.append returns a new frame (and no longer exists in pandas 2), so the rows are
        #collected in a list and concatenated once
        if overviews:
            overviews_df = pd.concat(overviews, ignore_index = True)
        else:
            overviews_df = pd.DataFrame(columns = self.OVERVIEW_COLUMNS)

        return [overviews_df] + card_frames

    def _write_excel(self, data_frames, arch_name):
        '''Write every data frame of the list to its own sheet of "<arch_name> <month-day>.xlsx" in output_dir'''
        today = date.today().strftime("%m-%d")
        with pd.ExcelWriter(f'{self.output_dir}/{arch_name} {today}.xlsx') as writer:
            for i, frame in enumerate(data_frames):
                frame.to_excel(writer, sheet_name = f'{i}', index = False)

    def get_archetype_data(self, class_name, archetype_name):
        '''Specify the name for the archetype and return the data from the hsreplay website for the given archetype

        :args:
        - class_name = name of the class (use format: 'Demon Hunter', 'Druid', etc.)
        - archetype_name = name of the archetype (use format: 'Midrange Hunter', 'Face Hunter', etc.)

        :returns:
        - data_frames - a list of data frames for the given archetype: the overview of all
          decks first, followed by the card data frame of every deck

        :raises:
        - Exception if class_name is not one of the known classes

        :usage:
            UltimateAnalyzer(driver_path).get_archetype_data('Mage', 'No Minion Mage')
        '''
        #Pre-processing and identifying the data
        class_codes = {'Demon Hunter' : 1, 'Druid' : 2, 'Hunter' : 3, 'Mage' : 4, 'Paladin' : 5,
                       'Priest' : 6, 'Rogue' : 7 , 'Shaman' : 8, 'Warlock' : 9, 'Warrior' : 10}

        class_index = class_codes.get(class_name)

        if class_index is None:
            raise Exception('The class name is not correctly specified (e.g. Demon Hunter, Warlock, etc.)')

        #The actual process
        self.open_website()

        try:
            #Open the page for the specified archetype
            xpath_class = f'//*[@id="player-class-filter"]/div/div[1]/span[{class_index}]/div/img'
            self.driver.find_element(By.XPATH, xpath_class).click()

            xpath_archetype = f'//*[@id="player-class-filter"]/div/div[2]/div/ul/li/span[text() = "{archetype_name}"]'
            self.driver.find_element(By.XPATH, xpath_archetype).click()

            #Generate the card info for each of the decks of a given archetype
            data_frames = self._scrape_decks(archetype_name)
        finally:
            #Close the browser even when the scraping fails half-way
            self.driver.quit()

        return data_frames

    def archetype_to_excel(self, class_name, arch_name):
        '''Scrape the given archetype and write its data frames into an Excel file

        :args:
        - class_name = name of the class (use format: 'Demon Hunter', 'Druid', etc.)
        - arch_name = name of the archetype (use format: 'Midrange Hunter', 'Face Hunter', etc.)

        :returns:
        - df - the list of data frames returned by get_archetype_data
        '''
        df = self.get_archetype_data(class_name, arch_name)

        #Write these data frames into excel
        self._write_excel(df, arch_name)

        return df

    def get_all_data(self):
        '''Scrape every archetype of every class and write one Excel file per archetype

        :returns:
        - data_frames - the list of data frames of the last scraped archetype (an empty list
          when no archetype was found); the data of all archetypes is only kept in the Excel files
        '''
        self.open_website()

        data_frames = []
        try:
            #Get the number of classes shown in the class filter
            classes_len = len(self.driver.find_elements(
                By.XPATH, '//*[@id="player-class-filter"]/div/div[1]/span/div/img'))
            for class_position in range(1, classes_len + 1):
                xpath_class = f'//*[@id="player-class-filter"]/div/div[1]/span[{class_position}]/div/img'
                self.driver.find_element(By.XPATH, xpath_class).click()   #Go to the website of the class

                archetype_length = len(self.driver.find_elements(
                    By.XPATH, '//*[@id="player-class-filter"]/div/div[2]/div/ul/li/span'))
                for arch_position in range(1, archetype_length + 1):
                    xpath_arch = f'//*[@id="player-class-filter"]/div/div[2]/div/ul/li[{arch_position}]/span'
                    arch_button = self.driver.find_element(By.XPATH, xpath_arch)
                    #Read the name before clicking so a re-rendered filter cannot invalidate the element
                    arch_name = arch_button.text
                    arch_button.click()

                    #Generate the card info for each of the decks of a given archetype
                    data_frames = self._scrape_decks(arch_name)

                    wait = WebDriverWait(self.driver, 8)
                    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "deck-tile")))

                    #Click the archetype a second time (presumably this switches the filter off again)
                    self.driver.find_element(By.XPATH, xpath_arch).click()

                    #Write these data frames into excel
                    self._write_excel(data_frames, arch_name)
        finally:
            #Close the browser even when the scraping fails half-way
            self.driver.quit()

        return data_frames

        
        

In [89]:
#Scrape the decks of the 'No Minion Mage' archetype in a visible (non-headless) browser and write them to Excel
U = UltimateAnalyzer(driver_path, minimized = False)
data = U.archetype_to_excel('Mage', 'No Minion Mage')

Generated data for 1/18 decks of archetype No Minion Mage
    Mana Cost                Card Name  Card Count Mulligan WR   Kept  \
0           1             Brain Freeze           2       54.6%  37.5%   
1           1       Devolving Missiles           2       53.8%  30.8%   
2           1            Font of Power           2       59.6%  81.9%   
3           1       Primordial Studies           2       56.5%  77.0%   
4           1            Shooting Star           1       54.1%  31.0%   
5           2             Cram Session           2       53.1%  18.1%   
6           2          Incanter's Flow           2       66.6%  99.7%   
7           2                Runed Orb           2       54.3%  61.6%   
8           3         Arcane Intellect           2       53.2%  28.5%   
9           3               Combustion           2       53.7%  28.7%   
10          4                 Fireball           2       52.1%   3.0%   
11          4                Ring Toss           2       53.5%   7

Some cards in this deck contain missing data
Generated data for 5/18 decks of archetype No Minion Mage
    Mana Cost                Card Name  Card Count Mulligan WR   Kept  \
0           1             Brain Freeze           2       49.8%  43.0%   
1           1       Devolving Missiles           2       49.9%  41.0%   
2           1            Font of Power           2       54.0%  79.9%   
3           1       Primordial Studies           2       50.0%  75.8%   
4           2             Cram Session           2       51.9%  20.7%   
5           2          Incanter's Flow           2       62.9%  99.6%   
6           2                Runed Orb           2       51.4%  66.2%   
7           3         Arcane Intellect           2       51.1%  33.4%   
8           3               Combustion           2       49.3%  38.7%   
9           4                 Fireball           2       51.0%   3.0%   
10          4                Ring Toss           2       50.3%   7.9%   
11          5        

Generated data for 9/18 decks of archetype No Minion Mage
    Mana Cost                Card Name  Card Count Mulligan WR   Kept  \
0           1             Brain Freeze           2       56.9%  36.2%   
1           1       Devolving Missiles           2       50.1%  36.1%   
2           1            Font of Power           2       59.1%  84.0%   
3           1       Primordial Studies           2       54.2%  77.7%   
4           2             Cram Session           2       51.6%  22.6%   
5           2          Incanter's Flow           2       66.1%  99.8%   
6           2                Runed Orb           2       53.9%  67.3%   
7           3         Arcane Intellect           2       55.4%  40.4%   
8           3               Combustion           2       50.7%  31.4%   
9           3             Cone of Cold           1       49.8%  22.0%   
10          4                 Fireball           2       48.9%   3.0%   
11          4                Ring Toss           2       54.6%   8

Generated data for 13/18 decks of archetype No Minion Mage
    Mana Cost                Card Name  Card Count Mulligan WR   Kept  \
0           1             Brain Freeze           2       54.3%  56.4%   
1           1       Devolving Missiles           2       52.7%  44.4%   
2           1            Font of Power           2       56.6%  88.0%   
3           1       Primordial Studies           2       55.0%  83.4%   
4           2             Cram Session           2       50.0%  29.8%   
5           2          Incanter's Flow           2       62.3%  94.3%   
6           2                Runed Orb           2       52.2%  78.6%   
7           3         Arcane Intellect           2       57.1%  35.1%   
8           3        Netherwind Portal           2       55.6%  37.0%   
9           4           Deck of Lunacy           1       58.7%  74.9%   
10          4                 Fireball           2       49.5%   9.6%   
11          4                Ring Toss           2       51.8%  1

Generated data for 17/18 decks of archetype No Minion Mage
    Mana Cost                Card Name  Card Count Mulligan WR   Kept  \
0           1             Brain Freeze           2       50.5%  36.5%   
1           1       Devolving Missiles           2       55.4%  36.2%   
2           1            Font of Power           2       53.8%  78.2%   
3           1       Primordial Studies           2       57.1%  74.7%   
4           2             Cram Session           2       54.9%  19.7%   
5           2          Incanter's Flow           2       64.6%  99.7%   
6           2                Runed Orb           2       51.1%  53.3%   
7           3         Arcane Intellect           2       53.2%  31.9%   
8           3               Combustion           2       53.8%  33.1%   
9           3              Ice Barrier           1       48.5%  13.5%   
10          4                 Fireball           2       44.5%   4.8%   
11          4                Ring Toss           2       50.0%   

In [36]:

#Debugging aid: list the attributes of the driver held by U to see which Selenium methods it offers
dir(U.driver)


['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_file_detector',
 '_is_remote',
 '_mobile',
 '_switch_to',
 '_unwrap_value',
 '_web_element_cls',
 '_wrap_value',
 'add_cookie',
 'application_cache',
 'back',
 'capabilities',
 'close',
 'command_executor',
 'create_options',
 'create_web_element',
 'current_url',
 'current_window_handle',
 'delete_all_cookies',
 'delete_cookie',
 'desired_capabilities',
 'error_handler',
 'execute',
 'execute_async_script',
 'execute_cdp_cmd',
 'execute_script',
 'file_detector',
 'file_detector_context',
 'find_element',
 'find_element_by_class_name',
 'find_element_by_css_selector',
 'find_element_by_id',
 

In [51]:
#Close the browser session held by U by hand (e.g. after a run that was interrupted before it quit the driver itself)
U.driver.quit()

In [12]:
#Open the hsreplay deck listing in a visible browser window for interactive inspection.
#UltimateExtractor is not defined or imported anywhere in this notebook; the class defined
#above with this constructor signature and an open_website method is UltimateAnalyzer.
U = UltimateAnalyzer(driver_path, minimized = False)
U.open_website()