In [None]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import bs4 as bs
from urllib.request import Request, urlopen
import pandas as pd
import time
import os
import requests
from IPython import embed
import re
import urllib.request

from datetime import datetime

# Base URL of the Smogon usage-statistics index. The trailing slash matters:
# the scraper builds sub-URLs by plain string concatenation onto this value.
website = 'https://www.smogon.com/stats/'

The purpose of the following two functions is to either download the HTML of a web page, or to open the page using Selenium. Selenium is required for websites whose content is generated dynamically without the URL changing (for example, via JavaScript). It is not required for Smogon, since every page change there corresponds to a URL change.

NOTE: The path to "geckodriver" must point to the location of that file within the submission folder, otherwise the Selenium-based scraper will not work.

In [None]:
def fetch(page, addition=''):
    """Download a page and return it parsed as a BeautifulSoup tree.

    Args:
        page: Base URL to request.
        addition: Optional suffix concatenated directly onto ``page``.

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for
            non-success status codes).
    """
    # A browser-like User-Agent is sent instead of urllib's default one.
    request = Request(
        page + addition,
        headers={'User-Agent': 'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) '
                               'Presto/2.12.388 Version/12.16'},
    )
    # Close the HTTP response deterministically rather than leaving the
    # socket open until garbage collection.
    with urlopen(request) as response:
        return bs.BeautifulSoup(response.read(), 'lxml')
def fetch_sel(page, headless=True, exe_path=None):
    """Open a page in Firefox through Selenium and return its parsed HTML.

    Args:
        page: URL to load.
        headless: When true, start Firefox with the ``-headless`` argument.
        exe_path: Path to the ``geckodriver`` executable. When omitted, the
            machine-specific path this function has always used is kept as
            the fallback; pass an explicit path on any other machine.

    Returns:
        A ``(soup, driver)`` tuple: the page source parsed by BeautifulSoup
        with the ``lxml`` parser, and the live WebDriver. The caller owns
        the driver and must call ``driver.quit()`` when finished.
    """
    options = webdriver.FirefoxOptions()
    if headless:
        options.add_argument('-headless')

    if exe_path is None:
        # A path derived from ``__file__`` used to be computed here and then
        # immediately overwritten; it was dead code and ``__file__`` is not
        # defined inside a notebook, so only the effective value is kept.
        exe_path = '/Users/ajkea/Documents/geckodriver'

    # NOTE(review): ``executable_path`` is deprecated in Selenium 4 in favour
    # of a Service object -- confirm against the installed Selenium version.
    driver = webdriver.Firefox(executable_path=exe_path, options=options)
    try:
        driver.get(page)
        html = driver.page_source
    except Exception:
        # Do not leave an orphaned browser process behind on failure.
        driver.quit()
        raise
    return bs.BeautifulSoup(html, 'lxml'), driver

The following function is the main scraping function. Here are the steps to get the data.

1. Create empty dataframes, and list of formats (used later)
2. The main scraping loop.
    1. Loop through every date/year where the information is available. We can then add it to the url and fetch the new url. For example, the base url is: https://www.smogon.com/stats/ and the url with dates is https://www.smogon.com/stats/2014-11/.
    2. For each of the months, we then loop through every available file and find the latest one available for each of the leagues. This is done quite inefficiently, since we need to loop through the list of available files many times to check whether there is more than one file for a certain league.
    3. Once we know which files to gather, we can use pandas to read the csv (txt file) with pd.read_csv(https://www.smogon.com/stats/2014-11/randombattle-1500.txt) for example.
    4. The information is then added to the full pandas dataframe.

In [None]:
def _latest_generation(file_names):
    """Return the highest generation digit found in ``genN...`` file names (0 if none)."""
    latest = 0
    for name in file_names:
        # Only the single character right after "gen" is inspected, so a name
        # such as "gen81v1-0.txt" is read as generation 8, not 81.
        if name[:3] == 'gen' and name[3:4].isdecimal():
            latest = max(latest, int(name[3]))
    return latest


def _select_format_files(format_, all_txt_files, last_gen_val, date_href):
    """Choose which stats files of one month to load for one format.

    Returns the list of file names to download; the list is empty when fewer
    than four files match the format (a warning is printed in that case).
    """
    gen_files = [x for x in all_txt_files if format_ in x]

    if len(gen_files) == 4:
        # Exactly one set of files exists for this format: use it as is.
        return gen_files

    if len(gen_files) < 4:
        print(f"1-Problem with {format_}, yr_dt:{date_href}, len:{len(gen_files)} \n\t\tFiles: {gen_files}")
        # Nothing reliable to load. Returning an empty list prevents the
        # previous format's files from being loaded a second time.
        return []

    # More than four matches: narrow down to the newest generation.
    gen_files2 = [x for x in gen_files if f'gen{last_gen_val}{format_}' in x]
    if format_ == 'vgc20' and len(gen_files2) != 4:
        gen_files2 = gen_files[-4:]
    if f"{format_}0.txt" in gen_files and len(gen_files2) != 4:
        # Un-prefixed files (e.g. "ou-0.txt") exist: keep the names that
        # start with the format's first two characters.
        gen_files2 = [x for x in gen_files if x[:2] == format_[:2]]
    if len(gen_files2) != 4:
        print(f"2-Problem with {format_}, yr_dt:{date_href}, len:{len(gen_files)} \n\t\tFiles: {gen_files}\n\t\tFiles: {gen_files2}\n\t\tgen{last_gen_val}{format_}")
        gen_files2 = gen_files[-4:]
    return gen_files2


def _parse_stats_text(text):
    """Parse one usage-stats text file into ``(tot_battles, avg_weight_team, table)``."""
    from io import StringIO

    header = text.splitlines()[:2]
    # The first two lines have the form "<label>: <value>".
    tot_battles = int(float(header[0].split(':')[-1].strip()))
    avg_weight_team = float(header[1].split(':')[-1].strip())

    # Rows 0-2 and 4 are the two header lines and the table borders; row 3
    # holds the column titles.
    table = pd.read_csv(StringIO(text), skiprows=[0, 1, 2, 4], delimiter='|')
    # The last parsed row is dropped (the closing border of the table), as are
    # the first two columns and the last column.
    table = table.drop(table.index[-1])
    remove_cols = list(table.columns[:2]) + list(table.columns[-1:])
    table = table.drop(remove_cols, axis=1)
    table.columns = ['pokemon', 'usage_perc', 'raw', 'perc', 'real2', 'perc2']
    return tot_battles, avg_weight_team, table


def get_smogon_stats(website):
    """Scrape the monthly Smogon usage-stats text files into two CSV files.

    For every ``YYYY-MM`` link on the index page, the files belonging to each
    tracked format are selected, downloaded once, and parsed. Per-file
    metadata is written to ``metadata.csv`` and the per-Pokemon rows to
    ``pokemon_data.csv`` (both relative to the current working directory).
    ``file_index`` in the Pokemon data is the row index of the originating
    file in the metadata table.

    Args:
        website: Base URL of the stats index, ending with a slash.

    Returns:
        None. The results are only written to disk.

    Raises:
        requests.HTTPError: If a stats file cannot be downloaded.
    """
    metadata_columns = ['txt_file', 'tot_battles', 'avg_weight_team']
    poke_columns = ['file_index', 'pokemon', 'usage_perc', 'raw', 'perc', 'real2', 'perc2']
    formats = ["ubers-", "ou-", "uu-", "ru-", "pu-", "vgc20"]

    # Rows and frames are collected in lists and combined once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    metadata_rows = []
    poke_frames = []

    result_page = fetch(f'{website}')
    yr_dates = result_page.find('pre')
    for yr_dt in yr_dates.find_all('a'):
        date_href = yr_dt['href']
        print(date_href)
        # Only links containing exactly one '-' (e.g. "2014-11/") are months.
        if len(date_href.split('-')) != 2:
            continue

        result_page_date = fetch(f'{website}{date_href}').find('pre')
        all_txt_files = [x['href'] for x in result_page_date.find_all('a')]
        last_gen_val = _latest_generation(all_txt_files)

        for format_ in formats:
            selected = _select_format_files(format_, all_txt_files, last_gen_val, date_href)
            for txt_file_name in selected:
                filename = f"{website}{date_href}{txt_file_name}"
                response = requests.get(filename, timeout=60)
                response.raise_for_status()
                # Decode as UTF-8, which is what pandas assumed when it read
                # the URL directly.
                response.encoding = 'utf-8'
                tot_battles, avg_weight_team, df = _parse_stats_text(response.text)

                # Index the metadata row about to be appended.
                df['file_index'] = len(metadata_rows)
                metadata_rows.append({'txt_file': filename,
                                      'tot_battles': tot_battles,
                                      'avg_weight_team': avg_weight_team})
                poke_frames.append(df)

    df_metadata = pd.DataFrame(metadata_rows, columns=metadata_columns)
    if poke_frames:
        df_poke_data = pd.concat(poke_frames, ignore_index=True)[poke_columns]
    else:
        # pd.concat raises on an empty list; keep the empty-but-typed frame.
        df_poke_data = pd.DataFrame(columns=poke_columns)

    # Convert percentage strings such as "12.3%" into fractions (0.123).
    perc_cols = [x for x in df_poke_data.columns if 'perc' in x]
    for col in perc_cols:
        df_poke_data[col] = df_poke_data[col].apply(lambda x: float(x.replace('%', '')) / 100)

    df_metadata.to_csv('metadata.csv')
    df_poke_data.to_csv('pokemon_data.csv')

In [None]:
# Run the full scrape; the results are written to metadata.csv and
# pokemon_data.csv in the current working directory.
get_smogon_stats(website)