## Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import plotly.offline as pyo
from IPython.display import Image
import requests, six
import lxml.html as lh
import re
from itertools import cycle, islice
from matplotlib import colors
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# MLB 2020 Stats Web Scraping

In [2]:
def web_scrap(url, timeout=30):
    """Scrape the table rows found at ``url`` into a dict of columns.

    The first ``<tr>`` element of the page is used as the header row. Each
    header label is cut to the first half of its text (presumably because the
    page renders every label twice -- TODO confirm against the live markup),
    and the tenth header is forced to ``'RBI'``.

    Every following ``<tr>`` with the same number of cells as the header is
    read as a data row; reading stops at the first row whose width differs.
    The first cell of a row is kept as text, the remaining cells are converted
    to ``int`` when possible and otherwise kept as text.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        Maps each header label to the list of values read for that column.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no ``<tr>`` elements at all.
    """
    # Download the page; fail loudly on HTTP errors instead of parsing an error page.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Parse the HTML and collect every table row on the page.
    doc = lh.fromstring(response.content)
    rows = doc.xpath('//tr')
    if not rows:
        raise ValueError('No <tr> elements found at {0!r}'.format(url))

    header = rows[0]
    expected_width = len(header)

    # One [label, values] pair per header cell.
    columns = []
    for cell in header:
        label = cell.text_content()
        columns.append([label[:len(label) // 2], []])

    # The halved text of the tenth header is overridden by hand
    # (presumably it does not halve cleanly -- verify against the page).
    if len(columns) > 9:
        columns[9][0] = 'RBI'

    # The first row is the header, so data starts at the second row.
    for row in rows[1:]:
        # A row with a different width does not belong to this table: stop reading.
        if len(row) != expected_width:
            break

        for index, cell in enumerate(row.iterchildren()):
            value = cell.text_content()
            # The first column is left as text; later columns become ints when they parse.
            if index > 0:
                try:
                    value = int(value)
                except ValueError:
                    pass
            columns[index][1].append(value)

    return {title: values for title, values in columns}

In [3]:
# Scrape the first results page and turn the column dictionary into a DataFrame.
url = 'https://www.mlb.com/stats/2020'
df = pd.DataFrame(data=web_scrap(url))
df

Unnamed: 0,PLAYER,TEAM,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,AVG,OBP,SLG,OPS
0,1JoseJ AbreuAbreu1B1‌‌‌,CWS,60,240,43,76,15,0,19,60,18,59,0,0,0.317,0.37,0.617,0.987
1,2MarcellM OzunaOzunaDH2‌‌‌,ATL,60,228,38,77,14,0,18,56,38,60,0,0,0.338,0.431,0.636,1.067
2,3FreddieF FreemanFreeman1B3‌‌‌,ATL,60,214,51,73,23,1,13,53,45,37,2,0,0.341,0.462,0.64,1.102
3,4LukeL VoitVoit1B4‌‌‌,NYY,56,213,41,59,5,0,22,52,17,54,0,0,0.277,0.338,0.61,0.948
4,5MannyM MachadoMachado3B5‌‌‌,SD,60,224,44,68,12,1,16,47,26,37,6,3,0.304,0.37,0.58,0.95
5,6JoseJ RamírezRamirez3B6‌‌‌,CLE,58,219,45,64,16,1,17,46,31,43,10,3,0.292,0.386,0.607,0.993
6,6MikeM TroutTroutCF6‌‌‌,LAA,53,199,41,56,9,2,17,46,35,56,1,1,0.281,0.39,0.603,0.993
7,8FernandoF Tatis Jr.TatisSS8‌‌‌,SD,59,224,50,62,11,2,17,45,27,61,11,3,0.277,0.366,0.571,0.937
8,9RafaelR DeversDevers3B9‌‌‌,BOS,57,232,32,61,16,1,11,43,13,67,0,0,0.263,0.31,0.483,0.793
9,10CharlieC BlackmonBlackmonRF10‌‌‌,COL,59,221,31,67,12,1,6,42,19,44,2,1,0.303,0.356,0.448,0.804


In [None]:
# Scrape result pages 2..LAST_PAGE and append them to the first page.
# The frames are collected in a list and concatenated once, instead of
# re-copying the growing DataFrame on every iteration.
# NOTE: this cell starts from the current `df`, so running it twice appends
# the same pages a second time.
LAST_PAGE = 25
frames = [df]
for i in range(2, LAST_PAGE + 1):
    url2 = 'https://www.mlb.com/stats/2020?page=' + str(i)
    frames.append(pd.DataFrame(web_scrap(url2)))
df = pd.concat(frames)

In [None]:
# Display the player frame after the additional result pages were appended.
df

In [None]:
# Give the concatenated pages one continuous 0..n-1 index.
df = df.reset_index(drop=True)
# Remove the invisible trailing characters from the scraped labels. The strip
# set is spelled with escapes so it does not depend on the notebook's file
# encoding: U+200C (zero-width non-joiner, as seen at the end of the displayed
# labels) plus the three characters that U+200C turns into when its UTF-8
# bytes are mis-decoded as Latin-1.
df['PLAYER'] = df['PLAYER'].str.rstrip('\u200c\u00e2\u0080\u008c')
df.head()

In [None]:
#Get the positions of the players from the 'PLAYER' column
def unique(sequence):
    """Return the items of ``sequence`` without repeats, keeping first-seen order."""
    # dict keys are unique and remember insertion order, so the first
    # occurrence of every item survives and later duplicates are dropped.
    return list(dict.fromkeys(sequence))
# Build one position code per player from the capital-led tokens of the label.
posit = []
for label in df['PLAYER']:
    tokens = unique(re.findall('[A-Z][^A-Z]*', label))
    tail = tokens[-1]
    # Single-letter codes are recognised from the last token alone, tried in
    # the order X, C, P.
    single = next((code for code in ('X', 'C', 'P') if code in tail), None)
    if single is not None:
        posit.append(single)
    else:
        # Two-character code: last character of the previous token followed by
        # the first character of the last token (e.g. '1' + 'B').
        posit.append(tokens[-2][-1] + tail[0])
df.loc[:, 'POSITION'] = posit
df

In [None]:
from string import printable


def _clean_player_name(label):
    """Reduce a scraped player label to ``'<first token> <third token>'``.

    Only the alphabetic characters of ``label`` are kept and split into
    capital-led tokens (duplicates removed). The result joins the first token
    and the third one; when the third token contains characters outside
    ``string.printable`` it is discarded and the token after it is used
    instead. Labels that do not yield three tokens are returned unchanged, so
    running the cell again on already-cleaned names leaves them as they are.
    """
    letters = ''.join(filter(str.isalpha, label))
    tokens = unique(re.findall('[A-Z][^A-Z]*', letters))
    if len(tokens) > 2 and set(tokens[2]).difference(printable):
        tokens.pop(2)
    if len(tokens) < 3:
        return label
    return tokens[0] + " " + tokens[2]


# Assign the whole column at once: element-wise `df['PLAYER'][i] = ...` is
# chained indexing and is not guaranteed to write back into `df`.
df['PLAYER'] = df['PLAYER'].map(_clean_player_name)
df

In [None]:
# Tally how many players were assigned each position code.
df.POSITION.value_counts()

In [None]:
# Summarise column dtypes and non-null counts of the player frame.
df.info()

In [None]:
# Convert the rate statistics to numbers; values that cannot be parsed become
# NaN. The conversion runs once per column (the default axis) rather than once
# per row.
columns = ['AVG','OBP','SLG','OPS']
df[columns] = df[columns].apply(pd.to_numeric, errors='coerce')
df.info()

In [None]:
# Write the player-level table to 'mlb_players_data.xlsx' (xlsx format),
# leaving the DataFrame index out of the file.
df.to_excel('mlb_players_data.xlsx',index=False)

## Stats Per Team

In [None]:
# Start the team-level table from the player table without the 'PLAYER' and 'G' columns.
stats = df.drop(['PLAYER', 'G'], axis=1)
stats

In [None]:
# Summarise column dtypes and non-null counts of the stats frame.
stats.info()

In [None]:
# 'LEN' is a constant 1 per player, so its per-team sum is the player count.
stats['LEN'] = 1
# Sum only the numeric columns. Without numeric_only=True, recent pandas
# versions also "sum" text columns such as POSITION by concatenating strings,
# which adds an unwanted column to the team table.
stats = stats.groupby(['TEAM']).sum(numeric_only=True)
stats.head()

In [None]:
# List the column labels of the aggregated stats frame.
stats.columns

In [None]:
# Turn the summed rate statistics into per-player means: divide each team's
# sum by its player count ('LEN') and round to three decimals.
rate_cols = ['AVG', 'OBP', 'SLG', 'OPS']
rate_means = stats[rate_cols].div(stats['LEN'], axis=0)
stats[rate_cols] = rate_means.round(3)

In [None]:
# Drop the helper count column and move the first index level back into a regular column.
stats = stats.drop(columns=['LEN']).reset_index(level=0)
stats

## Plotting Data

### Runs

In [None]:
# Bar chart of the 'R' column per team. The figure is built but not shown; the
# cell displays the PNG file of the same title instead (presumably an earlier
# export of this figure).
fig = (
    px.bar(stats, x='TEAM', y='R')
    .update_layout(title_text='Runs Per Team in Season 2020', title_x=0.5)
    .update_xaxes(title='Teams', tickangle=-45)
    .update_yaxes(title='Runs')
    .update_traces(marker_color='green')
)
#fig.show()
Image(filename='Runs Per Team in Season 2020.png')

### Hits

In [None]:
# Bar chart of the 'H' column per team. The figure is built but not shown; the
# cell displays the PNG file of the same title instead.
fig = (
    px.bar(stats, x='TEAM', y='H')
    .update_layout(title_text='Hits Per Team in Season 2020', title_x=0.5)
    .update_xaxes(title='Teams', tickangle=-45)
    .update_yaxes(title='Hits')
    .update_traces(marker_color='goldenrod')
)
#fig.show()
Image(filename='Hits Per Team in Season 2020.png')

### Doubles

In [None]:
# Bar chart of the '2B' column per team. The figure is built but not shown; the
# cell displays the PNG file of the same title instead.
fig = (
    px.bar(stats, x='TEAM', y='2B')
    .update_layout(title_text='Doubles Per Team in Season 2020', title_x=0.5)
    .update_xaxes(title='Teams', tickangle=-45)
    .update_yaxes(title='Doubles')
    .update_traces(marker_color='darkseagreen')
)
#fig.show()
Image(filename='Doubles Per Team in Season 2020.png')

### Triples

In [None]:
# Bar chart of the '3B' column per team. The figure is built but not shown; the
# cell displays the PNG file of the same title instead.
fig = (
    px.bar(stats, x='TEAM', y='3B')
    .update_layout(title_text='Triples Per Team in Season 2020', title_x=0.5)
    .update_xaxes(title='Teams', tickangle=-45)
    .update_yaxes(title='Triples')
    .update_traces(marker_color='rosybrown')
)
#fig.show()
Image(filename='Triples Per Team in Season 2020.png')

### Home Runs

In [None]:
# Bar chart of home runs per team. The y column is 'HR'; the previous version
# plotted 'H' (hits) under the "Home Runs" title and axis label.
# The figure is built but not shown; the cell displays the PNG file of the
# same title instead.
fig = px.bar(stats, x='TEAM', y='HR')
fig.update_layout(title_text='Home Runs Per Team in Season 2020', title_x=0.5)
fig.update_xaxes(title='Teams',tickangle=-45)
fig.update_yaxes(title='Home Runs')
fig.update_traces(marker_color='peru')
#fig.show()
Image(filename='Home Runs Per Team in Season 2020.png')

### Runs Batted In

In [None]:
# Bar chart of the 'RBI' column per team. The figure is built but not shown;
# the cell displays the PNG file of the same title instead.
fig = (
    px.bar(stats, x='TEAM', y='RBI')
    .update_layout(title_text='Runs Batted In Per Team in Season 2020', title_x=0.5)
    .update_xaxes(title='Teams', tickangle=-45)
    .update_yaxes(title='Runs Batted In')
    .update_traces(marker_color='royalblue')
)
#fig.show()
Image(filename='Runs Batted In Per Team in Season 2020.png')

In [None]:
# Bar chart of the 'BB' column (labelled "Walks") per team. The figure is built
# but not shown; the cell displays the PNG file of the same title instead.
fig = (
    px.bar(stats, x='TEAM', y='BB')
    .update_layout(title_text='Walks In Per Team in Season 2020', title_x=0.5)
    .update_xaxes(title='Teams', tickangle=-45)
    .update_yaxes(title='Walks')
    .update_traces(marker_color='limegreen')
)
#fig.show()
Image(filename='Walks In Per Team in Season 2020.png')

### Strikeouts

In [None]:
# Bar chart of the 'SO' column per team on the dark template. The figure is
# built but not shown; the cell displays the PNG file of the same title instead.
fig = (
    px.bar(stats, x='TEAM', y='SO')
    .update_layout(
        template='plotly_dark',
        title_text='Strikeouts In Per Team in Season 2020',
        title_x=0.5,
    )
    .update_xaxes(title='Teams', tickangle=-45)
    .update_yaxes(title='Strikeouts')
)
#fig.show()
Image(filename='Strikeouts In Per Team in Season 2020.png')

### Radar Chart for percentage statistics

In [None]:
# Convert the stats frame to plain row lists, then map each row's first value
# (the team) to that row's values from the third position onwards.
list_stats = stats.values.tolist()
statistics = {row[0]: row[2:] for row in list_stats}

In [None]:
# Select the four rate statistics by name instead of by position, so the radar
# chart keeps working if the column layout of `stats` changes.
rate_cols = ['AVG', 'OBP', 'SLG', 'OPS']

# Repeat the first category at the end so each polygon closes on itself.
categories = np.array(rate_cols + rate_cols[:1])

# One closed loop of values per team, keyed by the team label as text.
metrics = {}
for team, values in zip(stats['TEAM'], stats[rate_cols].to_numpy(dtype=float)):
    metrics['{0}'.format(team)] = np.concatenate((values, values[:1]))

In [None]:
# Display the per-team value arrays prepared for the radar chart.
metrics

In [None]:
#ax.set_ylim(0.9, 1)
# One polar trace per entry of `metrics`, all sharing the same category axis.
data = [
    go.Scatterpolar(r=values, theta=categories, name=team)
    for team, values in metrics.items()
]

radar_layout = go.Layout(
    template='plotly_dark',
    title=go.layout.Title(text='AVG-OBP-SLG-OPS per Team'),
    polar=dict(radialaxis=dict(visible=True)),
    showlegend=True,
)
fig = go.Figure(data=data, layout=radar_layout)

# The figure is built but not rendered; the cell displays the PNG file of the
# same title instead.
#pyo.plot(fig)
Image(filename='AVG-OBP-SLG-OPS per Team.png')

In [None]:
# Write the team-level table to 'mlb_teams_data.xlsx' (xlsx format),
# leaving the DataFrame index out of the file.
stats.to_excel('mlb_teams_data.xlsx',index=False)