In [1]:
from requests import get
from bs4 import BeautifulSoup
from html.parser import HTMLParser
import pandas as pd
import numpy as np
import pickle 


In [2]:
# Download the FiveThirtyEight global club soccer rankings page.
# 'lxml' is a parser name, not a request option: the second positional argument
# of requests.get is `params` (the query string), so it is not passed here. The
# parser is chosen in the BeautifulSoup constructor below.
response = get('https://projects.fivethirtyeight.com/global-club-soccer-rankings/', timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page into empty lists.
response.raise_for_status()

# Parse the page; the team names, scores, leagues and countries are pulled from it below.
club_power_index = BeautifulSoup(response.text, 'html.parser')

# Text of every <div class="name"> element (used later as the club names).
team_list = [p.text for p in club_power_index.find_all('div', attrs={'class': 'name'})]
# Only 'num' cells whose text is longer than 3 characters are kept
# (presumably this singles out the score column, e.g. '94.1' - TODO confirm
# that it never drops a short score and misaligns the columns).
score_list = [p.text for p in club_power_index.find_all('td', attrs={'class': 'num'}) if len(p.text) > 3]
league_list = [p.text.strip() for p in club_power_index.find_all('td', attrs={'class': 'league drop-5'})]
country_list = [p.text.strip() for p in club_power_index.find_all('td', attrs={'class': 'country drop-1'})]


In [3]:
# Cap each scraped column at its first 600 entries.
team_list, score_list, league_list, country_list = (
    column[:600]
    for column in (team_list, score_list, league_list, country_list)
)


In [4]:
# Assemble the table: each row is a (score, league, country) triple and the
# index is taken from team_list.
club_score_df = pd.DataFrame(
    data=[*zip(score_list, league_list, country_list)],
    index=team_list,
    columns=["score", "league", "country"],
)
club_score_df.head(30)

Unnamed: 0,score,league,country
Liverpool,94.1,Premier League,England
Man. City,93.9,Premier League,England
Bayern Munich,93.4,Bundesliga,Germany
PSG,90.5,Ligue 1,France
Real Madrid,90.1,La Liga,Spain
Barcelona,88.8,La Liga,Spain
Atlético Madrid,86.7,La Liga,Spain
RB Leipzig,86.3,Bundesliga,Germany
Chelsea,85.0,Premier League,England
Juventus,84.5,Serie A,Italy


In [5]:
# Keep only teams in 7 major European leagues.
# League names alone are ambiguous: Austria's top league is also called
# "Bundesliga" and Russia's is also called "Premier League". To avoid that
# ambiguity, the country is used as a second selection criterion.
mask1 = club_score_df['league'].isin(
    ["Premier League", "Serie A", "La Liga", "Bundesliga", "Ligue 1", "Primeira Liga", "Eredivisie"])

club_score_df = club_score_df[mask1]

# mask2 is computed on the league-filtered frame so it aligns with it row for row.
mask2 = club_score_df['country'].isin(
    ['England', 'Germany', 'France', 'Spain', 'Italy', 'Netherlands', 'Portugal'])

# .copy() makes the result an independent frame: later cells modify it
# (index reset, column assignment), which on a filtered selection can raise
# SettingWithCopyWarning.
club_score_df = club_score_df[mask2].copy()



In [6]:
# Move the index values into a regular column and call that column 'club'.
club_score_df = (
    club_score_df
    .reset_index(level=0)
    .rename(columns={'index': 'club'})
)

In [7]:
# Display the table after the league/country filter and the index reset.
club_score_df

Unnamed: 0,club,score,league,country
0,Liverpool,94.1,Premier League,England
1,Man. City,93.9,Premier League,England
2,Bayern Munich,93.4,Bundesliga,Germany
3,PSG,90.5,Ligue 1,France
4,Real Madrid,90.1,La Liga,Spain
...,...,...,...,...
129,Twente,34.0,Eredivisie,Netherlands
130,Emmen,32.0,Eredivisie,Netherlands
131,RKC,30.8,Eredivisie,Netherlands
132,Fortuna Sittard,28.2,Eredivisie,Netherlands


In [8]:
# Short club name -> long club name. Spellings are reproduced exactly as given;
# 'Brighton & Hovelbion' looks truncated but presumably mirrors the spelling
# used by the other name list (compare 'West Bromwichlbion' there) - verify
# before "correcting" it.
# 'Hoffenheim' and 'Leverkusen' are deliberately left unmapped (their mappings
# to 'TSG 1899 Hoffenheim' / 'Bayer 04 Leverkusen' were disabled).
_CLUB_NAME_ALIASES = {
    'Man. City': 'Manchester City',
    'Man. United': 'Manchester United',
    'Tottenham': 'Tottenham Hotspur',
    'West Ham': 'West Ham United',
    'Leicester': 'Leicester City',
    'Newcastle': 'Newcastle United',
    'PSG': 'Paris Saint-Germain',
    'Dortmund': 'Borussia Dortmund',
    'Eintracht': 'Eintracht Frankfurt',
    'Norwich': 'Norwich City',
    'Sheffield Utd': 'Sheffield United',
    'Brighton': 'Brighton & Hovelbion',
    'Bordeaux': 'Girondins Bordeaux',
    'St Étienne': 'Saint-Étienne',
    'Twente': 'Twente Enschede',
}


def team_name_unify(team):
    """Return the long-form name for a short club name.

    Parameters
    ----------
    team : str
        Club name to translate.

    Returns
    -------
    str
        The mapped long-form name, or ``team`` unchanged when it has no entry
        in the alias table.
    """
    return _CLUB_NAME_ALIASES.get(team, team)

# Replace each club name with its long form via team_name_unify.
club_score_df['club'] = club_score_df['club'].apply(team_name_unify)



In [9]:
# Convert the score column to a numeric dtype.
club_score_df['score'] = club_score_df['score'].pipe(pd.to_numeric)

In [10]:
# Persist only the club and score columns.
# The context manager guarantees the file is flushed and closed; the bare
# open(...) used before left the handle open.
# NOTE(review): a later cell writes the full frame to this same path and
# overwrites this two-column pickle - confirm which one is intended.
with open('./temporary_pkl/club_score_df.pkl', 'wb') as pkl_out:
    pickle.dump(club_score_df[['club', 'score']], pkl_out)

In [12]:
# For each league: the lowest club score minus 10, rounded to 3 decimals,
# as a {league: value} dict.
# NOTE(review): despite the "avrg" in the name this is based on the minimum,
# not the mean - confirm that is intended.
avrg_sc_by_league_dict = (
    club_score_df
    .groupby('league')['score']
    .min()
    .sub(10)
    .round(3)
    .to_dict()
)
avrg_sc_by_league_dict

{'Bundesliga': 46.4,
 'Eredivisie': 16.7,
 'La Liga': 49.6,
 'Ligue 1': 39.3,
 'Premier League': 50.1,
 'Primeira Liga': 27.8,
 'Serie A': 40.5}

In [13]:
# Persist the per-league dict. The context manager closes the file handle,
# which the bare open(...) used before never did.
with open('./temporary_pkl/avrg_sc_by_league_dict.pkl', 'wb') as pkl_out:
    pickle.dump(avrg_sc_by_league_dict, pkl_out)

In [14]:
# Club names from the table as a plain Python list.
list_short = club_score_df['club'].tolist()

In [15]:
# Load the previously pickled list of club names (presumably written by
# another notebook - verify its origin).
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files this project
# produced itself.
with open('./temporary_pkl/list.pkl', 'rb') as pkl_file:
    list_long = pickle.load(pkl_file)

In [16]:
# Persist the whole club_score_df frame. The context manager closes the file
# handle, which the bare open(...) used before never did.
# NOTE(review): this overwrites the pickle written earlier to the same path
# with only the 'club' and 'score' columns - confirm which version downstream
# code expects.
with open('./temporary_pkl/club_score_df.pkl', 'wb') as pkl_out:
    pickle.dump(club_score_df, pkl_out)

In [276]:
# Display the club-name list loaded from ./temporary_pkl/list.pkl.
list_long

['Chelsea',
 'Manchester City',
 'Arsenal',
 'Manchester United',
 'Tottenham Hotspur',
 'Liverpool',
 'Southampton',
 'Swansea City',
 'Stoke City',
 'Crystal Palace',
 'Everton',
 'West Ham United',
 'West Bromwichlbion',
 'Leicester City',
 'Newcastle United',
 'Sunderland',
 'Aston Villa',
 'Bournemouth',
 'Watford',
 'Norwich City',
 'Juventus',
 'Roma',
 'Lazio',
 'Fiorentina',
 'Napoli',
 'Genoa C',
 'Sampdoria',
 'Inter Milan',
 'Torino',
 'AC Milan',
 'Palermo',
 'Sassuolo',
 'Hellas Verona',
 'Chievo Verona',
 'Empoli',
 'Udinese',
 'Atalanta',
 'Bologna',
 'Carpi',
 'Frosinone',
 'Paris Saint-Germain',
 'Lyon',
 'Monaco',
 'Marseille',
 'Saint-Étienne',
 'Girondins Bordeaux',
 'Montpellier',
 'Lille',
 'Stade Rennais',
 'EGuingamp',
 'Nice',
 'Bastia',
 'SM Caen',
 'Nantes',
 'Stade Reims',
 'Lorient',
 'Toulouse',
 'Ongers',
 'ES TroyesC',
 'Gjaccio',
 'Benfica',
 'Porto',
 'Sporting CP',
 'Braga',
 'Vitória Guimarães',
 'Belenenses SAD',
 'Nacional',
 'Paços de Ferreira',


In [277]:
# Display the current contents of club_score_df.
club_score_df

Unnamed: 0,club,score
0,Manchester City,94.5
1,Liverpool,93.8
2,Bayern Munich,93.4
3,Paris Saint-Germain,90.5
4,Real Madrid,89.7
...,...,...
129,Twente Enschede,34.0
130,Emmen,32.0
131,RKC,30.8
132,Fortuna Sittard,28.2
