In [1]:
# importing necessary packages
import pandas
import numpy
import re
import sys
!{sys.executable} -m pip install bs4
!{sys.executable} -m pip install lxml
from bs4 import BeautifulSoup
from datetime import datetime



In [2]:
# Creating an array with names of all the files that need to be scraped. I will loop through these to scrape from them.
file_names = ['Arsenal', 'Aston Villa', 'BHA', 'Bournemouth', 'Burnley', 'Chelsea', \
              'Crystal Palace', 'Everton', 'Leicester', 'Liverpool','MCFC','MUFC', \
             'Newcastle', 'Norwich', 'SheffUtd', 'Southampton', 'Tottenham', 'Watford', \
             'West Ham', 'Wolves']
files= []
for x in file_names:
    file = (x + '.html')
    files.append(file)

In [3]:
# Looping through each file and using BeautifulSoup to scrape player team, name, height, weight, position, and birthdate.
# I add those scraped parameters to their respective arrays

names = []
positions = []
heights = []
weights = []
birthdates = []
teams = []

for file in files:
    with open(file, encoding = 'utf-8') as file_reader:
        soup = BeautifulSoup(file_reader, "lxml")
        players1 = soup.find_all('tr', class_ = 'shsRow0Row')
        players2 = soup.find_all(class_ = 'shsRow1Row')
        players = players1 + players2
        
        for player in players:
            name = player.find(class_ = "shsNamD").getText()
            height = player.find(class_ = "shsPlayerFeet").getText()
            weight = player.find(class_ = "shsPlayerPounds").getText()
            position = player.find(class_ = "shsNamD shsRostPos shsHideCol").getText()
            birthdate = player.find_all(class_ = "shsNumD")[3].getText()
            team = file[:-5]
            
            names.append(name)
            teams.append(team)
            positions.append(position)
            heights.append(height)
            weights.append(weight)
            birthdates.append(birthdate)

In [4]:
#Building the dataframe with the arrays of player data.
EPLPlayerInfo = pandas.DataFrame({ 'Team' : teams,
                                  'Name' : names,
                                  'Position' : positions,
                                  'Height' : heights,
                                  'Weight' : weights,
                                  'Birthdate' : birthdates})

In [5]:
# stripping whitespace off of birthdate value in order to convert to datetime object and calculate age
for x in range(0, len(EPLPlayerInfo['Birthdate'])):
    EPLPlayerInfo['Birthdate'][x] = EPLPlayerInfo['Birthdate'][x].strip()

In [6]:
# replacing string dates with datetime object dates, dropping rows with blank birthdate cell
for x in range(0, len(EPLPlayerInfo['Birthdate'])):
    try:
        EPLPlayerInfo['Birthdate'][x] = datetime.strptime(EPLPlayerInfo['Birthdate'][x], "%m/%d/%Y")
    except:
        EPLPlayerInfo = EPLPlayerInfo.drop(index = x)

In [7]:
EPLPlayerInfo.reset_index(drop=True, inplace=True)

In [8]:
# calculating age from birthdate and adding to age array
current_date = datetime(2020, 5, 4)
ages = []
for x in range(0,len(EPLPlayerInfo['Birthdate'])):
    birthday =  EPLPlayerInfo['Birthdate'][x]
    age = (current_date.year - birthday.year)
    if(current_date.month < birthday.month): age = age - 1
    ages.append(age)

In [9]:
# adding age column to dataframe, dropping birthdate colum
EPLPlayerInfo['Age'] = ages
EPLPlayerInfo.drop(axis = 1, labels = 'Birthdate', inplace = True)

In [10]:
# dropping all rows which have empty cells, resetting index
EPLPlayerInfo.replace('', numpy.nan, inplace = True)
EPLPlayerInfo.dropna(how = 'any', inplace = True)
EPLPlayerInfo.reset_index(drop=True, inplace=True)

In [11]:
EPLPlayerInfo

Unnamed: 0,Team,Name,Position,Height,Weight,Age
0,Arsenal,Bernd Leno,Goalkeeper,6-3,183 lbs,28
1,Arsenal,Emiliano Martínez,Goalkeeper,6-5,194 lbs,27
2,Arsenal,Héctor Bellerín,Defender,5-10,163 lbs,25
3,Arsenal,Calum Chambers,Defender,6-0,146 lbs,25
4,Arsenal,Rob Holding,Defender,6-0,165 lbs,24
...,...,...,...,...,...,...
518,Wolves,Romain Saïss,Midfielder,6-3,168 lbs,30
519,Wolves,Adama Traoré,Midfielder,5-10,168 lbs,24
520,Wolves,Leonardo Campana,Forward,5-11,154 lbs,19
521,Wolves,Raúl Jiménez,Forward,6-2,174 lbs,29
