In [1]:
# import all necessary packages for webscraping a list of Bundesliga first devision teams

import time
import requests
from bs4 import BeautifulSoup
import re # support regular expressions
import pandas as pd
import numpy as np

In [2]:
# act as a browser to avoid the hp suspecting any scraping

headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

In [14]:
# get content of the page that lists all current Bundeliga teams on tm.de

page = "https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1"
page_tree = requests.get(page, headers=headers)
html_soup = BeautifulSoup(page_tree.content, 'html.parser')

In [4]:
# extracting table from the soup object
club_table = html_soup.find(id='yw1')

In [5]:
# finding all elements that contain the club name
club_name_elements = club_table.find_all('td', {'class': 'hauptlink no-border-links'})
club_names = [club.text.replace(u'\xa0', u'').strip() for club in club_name_elements]
club_names

['FC Bayern München',
 'Borussia Dortmund',
 'RasenBallsport Leipzig',
 'Bayer 04 Leverkusen',
 'Eintracht Frankfurt',
 'Borussia Mönchengladbach',
 'VfL Wolfsburg',
 'TSG 1899 Hoffenheim',
 'SC Freiburg',
 'VfB Stuttgart',
 '1.FC Union Berlin',
 '1.FSV Mainz 05',
 'FC Augsburg',
 'Hertha BSC',
 '1.FC Köln',
 'SV Werder Bremen',
 'FC Schalke 04',
 'VfL Bochum']

In [6]:
# finding all elements with team sizes and store values in list

team_size_elements = club_table.find_all('td', {'class': 'zentriert'})
team_sizes = [size.text for size in team_size_elements][4::4]
team_sizes

['27',
 '33',
 '27',
 '28',
 '30',
 '27',
 '25',
 '31',
 '26',
 '29',
 '27',
 '27',
 '32',
 '28',
 '35',
 '26',
 '32',
 '28']

In [7]:
# finding all elements that contain the average age of a team and put it
avg_team_age_elements = club_table.find_all('td', {'class': 'zentriert'})
avg_team_ages = [age.text for age in avg_team_age_elements][5::4]
avg_team_ages


['26,2',
 '24,5',
 '25,3',
 '25,0',
 '25,1',
 '25,6',
 '24,4',
 '25,5',
 '26,0',
 '23,6',
 '27,3',
 '24,9',
 '25,1',
 '26,1',
 '24,3',
 '25,1',
 '27,1',
 '27,9']

In [8]:
# finding a teams average value and store in a list

avg_team_value_elements = club_table.find_all('td', {'class': 'rechts'})
avg_team_values = [float(value.text.split()[0].replace(',' , '.')) for value in avg_team_value_elements][2::2]
avg_team_values

[36.88,
 15.84,
 18.27,
 14.87,
 8.93,
 8.63,
 7.53,
 5.51,
 6.22,
 4.25,
 4.37,
 4.19,
 3.47,
 3.47,
 2.74,
 2.81,
 1.89,
 1.67]

In [9]:
# finding each teams total value and put in a list
avg_team_value_elements = club_table.find_all('td', {'class': 'rechts'})
team_values = [float(value.text.split()[0].replace(',' , '.')) for value in avg_team_value_elements][3::2]
team_values

[995.7,
 522.7,
 493.3,
 416.35,
 267.95,
 233.0,
 188.2,
 170.95,
 161.63,
 123.35,
 117.9,
 113.2,
 111.15,
 97.2,
 96.03,
 73.05,
 60.45,
 46.63]

In [10]:
# creating a pandas data frame of the tm.de table

# df = pd.DataFrame({'clubs': 'club_names', 'squad': 'team_sizes', 'avg_age': 'avg_team_ages', 'avg_market_value': 'avg_team_values', 'market_value': 'team_values'}, index=range(len(club_names)))
df = pd.DataFrame(list(zip(club_names, team_sizes)),columns=['clubs', 'squad'])
df

Unnamed: 0,clubs,squad
0,FC Bayern München,27
1,Borussia Dortmund,33
2,RasenBallsport Leipzig,27
3,Bayer 04 Leverkusen,28
4,Eintracht Frankfurt,30
5,Borussia Mönchengladbach,27
6,VfL Wolfsburg,25
7,TSG 1899 Hoffenheim,31
8,SC Freiburg,26
9,VfB Stuttgart,29


In [11]:
# add the remaining lists as columns

df['avg_age'] = pd.Series(avg_team_ages)
df

Unnamed: 0,clubs,squad,avg_age
0,FC Bayern München,27,262
1,Borussia Dortmund,33,245
2,RasenBallsport Leipzig,27,253
3,Bayer 04 Leverkusen,28,250
4,Eintracht Frankfurt,30,251
5,Borussia Mönchengladbach,27,256
6,VfL Wolfsburg,25,244
7,TSG 1899 Hoffenheim,31,255
8,SC Freiburg,26,260
9,VfB Stuttgart,29,236


In [12]:
df['avg_market_value'] = pd.Series(avg_team_values)
df['market_value'] = pd.Series(team_values)
df


Unnamed: 0,clubs,squad,avg_age,avg_market_value,market_value
0,FC Bayern München,27,262,36.88,995.7
1,Borussia Dortmund,33,245,15.84,522.7
2,RasenBallsport Leipzig,27,253,18.27,493.3
3,Bayer 04 Leverkusen,28,250,14.87,416.35
4,Eintracht Frankfurt,30,251,8.93,267.95
5,Borussia Mönchengladbach,27,256,8.63,233.0
6,VfL Wolfsburg,25,244,7.53,188.2
7,TSG 1899 Hoffenheim,31,255,5.51,170.95
8,SC Freiburg,26,260,6.22,161.63
9,VfB Stuttgart,29,236,4.25,123.35


In [16]:
# adding country column and popuating country code 'GER'

df['country'] = pd.Series('GER' for club in club_names)
df

Unnamed: 0,clubs,squad,avg_age,avg_market_value,market_value,country
0,FC Bayern München,27,262,36.88,995.7,GER
1,Borussia Dortmund,33,245,15.84,522.7,GER
2,RasenBallsport Leipzig,27,253,18.27,493.3,GER
3,Bayer 04 Leverkusen,28,250,14.87,416.35,GER
4,Eintracht Frankfurt,30,251,8.93,267.95,GER
5,Borussia Mönchengladbach,27,256,8.63,233.0,GER
6,VfL Wolfsburg,25,244,7.53,188.2,GER
7,TSG 1899 Hoffenheim,31,255,5.51,170.95,GER
8,SC Freiburg,26,260,6.22,161.63,GER
9,VfB Stuttgart,29,236,4.25,123.35,GER
