In [2]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
import seaborn as sns

plt.rcParams['figure.figsize'] = 8, 7
plt.rcParams["font.weight"] = "bold"
plt.rcParams["font.family"] = "normal"
plt.rcParams["font.size"] = 25
plt.rcParams["axes.labelweight"] = "bold"
plt.rcParams['xtick.labelsize']=18
plt.rcParams['ytick.labelsize']=18
plt.rcParams['axes.labelsize']=18
plt.rcParams['legend.fontsize']: 16
    
sns.set()

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
### Setting Up DB Connection ###
db = 'database.sqlite'
conn = sqlite3.connect(db)

### Extracting All Available Tables From DB ###
db_tables = pd.read_sql("""SELECT * 
                            FROM sqlite_master
                            WHERE type='table';""", conn)
print(db_tables)

    type               name           tbl_name  rootpage                                                sql
0  table    sqlite_sequence    sqlite_sequence         4             CREATE TABLE sqlite_sequence(name,seq)
1  table  Player_Attributes  Player_Attributes        11  CREATE TABLE "Player_Attributes" (\n\t`id`\tIN...
2  table             Player             Player        14  CREATE TABLE `Player` (\n\t`id`\tINTEGER PRIMA...
3  table              Match              Match        18  CREATE TABLE `Match` (\n\t`id`\tINTEGER PRIMAR...
4  table             League             League        24  CREATE TABLE `League` (\n\t`id`\tINTEGER PRIMA...
5  table            Country            Country        26  CREATE TABLE `Country` (\n\t`id`\tINTEGER PRIM...
6  table               Team               Team        29  CREATE TABLE "Team" (\n\t`id`\tINTEGER PRIMARY...
7  table    Team_Attributes    Team_Attributes         2  CREATE TABLE `Team_Attributes` (\n\t`id`\tINTE...


In [4]:
# Leagues in each country # 
league_country = pd.read_sql("""SELECT c.name AS Country, l.name AS League
                            FROM League AS l
                            INNER JOIN Country AS c
                            ON l.country_id = c.id""", conn)
print(league_country)

        Country                    League
0       Belgium    Belgium Jupiler League
1       England    England Premier League
2        France            France Ligue 1
3       Germany     Germany 1. Bundesliga
4         Italy             Italy Serie A
5   Netherlands    Netherlands Eredivisie
6        Poland        Poland Ekstraklasa
7      Portugal  Portugal Liga ZON Sagres
8      Scotland   Scotland Premier League
9         Spain           Spain LIGA BBVA
10  Switzerland  Switzerland Super League


# From Here, We'll Focus on the Top 4 Leagues in the World (England, Germany, Italy, Spain)

In [15]:
# Average Number of Matches Played in Each League per Season (08/09 - 15/16)# 
avg_games = pd.read_sql("""SELECT c.name AS Country, l.name AS League, (COUNT(c.name) / COUNT(DISTINCT m.season)) AS Avg_Games_Season
                                FROM Match as m
                                INNER JOIN Country as c
                                ON m.country_id = c.id
                                INNER JOIN League as l
                                ON m.league_id = l.id
                                WHERE c.name IN ('England', 'Germany', 'Italy', 'Spain')
                                GROUP BY c.name
                                ORDER BY (COUNT(c.name) / COUNT(DISTINCT m.season)) DESC""", conn)
print(avg_games)

   Country                  League  Avg_Games_Season
0  England  England Premier League               380
1    Spain         Spain LIGA BBVA               380
2    Italy           Italy Serie A               377
3  Germany   Germany 1. Bundesliga               306


In [6]:
### Number of Teams Per League Per Season ###
team_count = pd.read_sql("""SELECT m.season, c.name AS Country, l.name AS League, COUNT (DISTINCT t.team_long_name) AS num_teams
                                FROM Match as m
                                INNER JOIN Country as c
                                ON m.country_id = c.id
                                INNER JOIN League as l
                                ON m.league_id = l.id
                                INNER JOIN Team as t
                                ON m.home_team_api_id = t.team_api_id
                                WHERE c.name IN ('England', 'Spain', 'Italy', 'Germany')
                                GROUP BY c.name, m.season
                                """, conn)
print(team_count.groupby('Country')['num_teams'].mean())

Country
England    20
Germany    18
Italy      20
Spain      20
Name: num_teams, dtype: int64


# Goal Scoring in Each League - For the Fans Who Love Goal Scoring 

In [13]:
### Average League Statistics ###
match_stats = pd.read_sql("""SELECT c.name AS Country, l.name as League, AVG(m.home_team_goal) AS home_goals, AVG(m.away_team_goal) AS away_goals, AVG(m.home_team_goal + m.away_team_goal) AS total_goals, 
                                ABS(AVG(m.home_team_goal - m.away_team_goal)) AS goal_diff
                                FROM Match as m
                                INNER JOIN Country as c
                                ON m.country_id = c.id
                                INNER JOIN League as l
                                ON m.league_id = l.id
                                WHERE c.name IN ('England', 'Germany', 'Italy', 'Spain')
                                GROUP BY c.name, l.name
                                ORDER BY AVG(m.home_team_goal + m.away_team_goal) DESC""", conn)
print(match_stats)

   Country                  League  home_goals  away_goals  total_goals  goal_diff
0  Germany   Germany 1. Bundesliga    1.626634    1.274918     2.901552   0.351716
1    Spain         Spain LIGA BBVA    1.631250    1.135855     2.767105   0.495395
2  England  England Premier League    1.550987    1.159539     2.710526   0.391447
3    Italy           Italy Serie A    1.500829    1.116009     2.616838   0.384819


**The German Bundesliga averaged the most goals per game over the seasons. The games were also the closest with the smallest difference in the average amount of goals scored by the home and away team.**

**Home field advantage was most prevalent in the Spanish league.**

In [14]:
### Which Season was the most exciting overall for the English, German, Spanish, and Italian Leagues? ###

season_stats = pd.read_sql("""SELECT m.season, SUM(m.home_team_goal + m.away_team_goal) AS goals_scored, AVG(m.home_team_goal + m.away_team_goal) AS goals_per_game
                                FROM Match as m
                                INNER JOIN Country as c 
                                ON m.country_id = c.id
                                WHERE c.name IN ('England', 'Germany', 'Italy', 'Spain')
                                GROUP BY m.season
                                ORDER BY SUM(m.home_team_goal + m.away_team_goal) DESC""", conn)
print(season_stats)

      season  goals_scored  goals_per_game
0  2013/2014          4099        2.834716
1  2012/2013          4055        2.804288
2  2010/2011          3954        2.734440
3  2009/2010          3942        2.726141
4  2008/2009          3925        2.714385
5  2011/2012          3916        2.750000
6  2015/2016          3914        2.706777
7  2014/2015          3845        2.660900


**The 2013/2014 season was the most exciting (in terms of goal scoring) overall for the English, German, Italian, and Spanish leagues**

# Which Teams Were the Most Dominate During this Period of Time in Each League? 

In [9]:
### Obtaining Matches From Each Season in England, Germany, Italy, and Spain ###
matches = pd.read_sql("""SELECT m.season, l.name, m.home_team_api_id, m.away_team_api_id, m.home_team_goal, m.away_team_goal
                            FROM Match as m
                            INNER JOIN League as l
                            ON m.league_id = l.id
                            WHERE l.name IN ('England Premier League', 'Germany 1. Bundesliga', 'Italy Serie A', 'Spain LIGA BBVA')""", conn)
print(matches.head(5))

# add win/loss/tie points for both home and away #
matches['home_points'] = ''
matches['away_points'] = ''
for idx, row in matches.iterrows():
    if row['home_team_goal'] > row['away_team_goal']:
        matches.loc[idx, 'home_points'] = 3
        matches.loc[idx, 'away_points'] = 0
    elif row['home_team_goal'] < row['away_team_goal']:
        matches.loc[idx, 'home_points'] = 0
        matches.loc[idx, 'away_points'] = 3
    else:
        matches.loc[idx, 'home_points'] = 1
        matches.loc[idx, 'away_points'] = 1

print('\nAdding points awarded to each team after game', '\n')
print(matches[['home_team_goal', 'away_team_goal', 'home_points', 'away_points']].head(5))

      season                    name  home_team_api_id  away_team_api_id  home_team_goal  away_team_goal
0  2008/2009  England Premier League             10260             10261               1               1
1  2008/2009  England Premier League              9825              8659               1               0
2  2008/2009  England Premier League              8472              8650               0               1
3  2008/2009  England Premier League              8654              8528               2               1
4  2008/2009  England Premier League             10252              8456               4               2

Adding points awarded to each team after game 

   home_team_goal  away_team_goal  home_points  away_points
0               1               1            1            1
1               1               0            3            0
2               0               1            0            3
3               2               1            3            0
4               4    

In [10]:
### Assigning Team Names based on api_id ###

teams = pd.read_sql("""SELECT team_api_id, team_long_name
                        FROM Team""", conn)

for idx, row in matches.iterrows():
    for idx2, row2 in teams.iterrows():
        if row['home_team_api_id'] == row2['team_api_id']:
            matches.loc[idx, 'home_team_api_id'] = row2['team_long_name']
        elif row['away_team_api_id'] == row2['team_api_id']:
            matches.loc[idx, 'away_team_api_id'] = row2['team_long_name']

matches.columns = ['season', 'league', 'home_team', 'away_team', 'home_team_goal', 'away_team_goal', 'home_points', 'away_points']
print(matches.head(3))

      season                  league          home_team             away_team  home_team_goal  away_team_goal  home_points  away_points
0  2008/2009  England Premier League  Manchester United      Newcastle United               1               1            1            1
1  2008/2009  England Premier League            Arsenal  West Bromwich Albion               1               0            3            0
2  2008/2009  England Premier League         Sunderland             Liverpool               0               1            0            3


In [11]:
# Home Points #
home = matches.groupby(['season', 'league', 'home_team'])['home_points'].sum()
home = home.reset_index()
home.columns = ['season', 'league', 'team', 'home_points']

# Away Point #
away = matches.groupby(['season', 'league', 'away_team'])['away_points'].sum()
away = away.reset_index()
away.columns = ['season', 'league', 'team', 'away_points']

### Total Points ###
total = pd.merge(home, away, on=['season', 'league', 'team'], how='inner')
total['total_points'] = total['home_points'] + total['away_points']
print(total)

        season                  league              team  home_points  away_points  total_points
0    2008/2009  England Premier League           Arsenal           38           34            72
1    2008/2009  England Premier League       Aston Villa           30           32            62
2    2008/2009  England Premier League  Blackburn Rovers           25           16            41
3    2008/2009  England Premier League  Bolton Wanderers           26           15            41
4    2008/2009  England Premier League           Chelsea           39           44            83
..         ...                     ...               ...          ...          ...           ...
619  2015/2016         Spain LIGA BBVA          SD Eibar           29           14            43
620  2015/2016         Spain LIGA BBVA        Sevilla FC           43            9            52
621  2015/2016         Spain LIGA BBVA     UD Las Palmas           29           15            44
622  2015/2016         Spain L

In [12]:
### Which Teams Won the Most League Titles?? ###
for league in list(total.league.unique()): 
    print('\nLeague: ', league)
    champions = []
    for season in list(total.season.unique()):
        result = total.loc[(total['league'] == league) & (total['season'] == season)]
        winner = result.loc[result['total_points'] == max(result['total_points'])]['team']
        champions.append(winner.unique()[0])
        most_frequent = max(set(champions), key=champions.count)
    print(most_frequent + ': ', str(champions.count(most_frequent)) + ' league titles')


League:  England Premier League
Manchester United:  3 league titles

League:  Germany 1. Bundesliga
FC Bayern Munich:  5 league titles

League:  Italy Serie A
Juventus:  5 league titles

League:  Spain LIGA BBVA
FC Barcelona:  6 league titles


# For Analytics For Each Respective League Check Out the Other Notebooks in This Repository!!