In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
# Open the SQLite database and keep the connection/cursor available for the
# rest of the notebook.
conn = sqlite3.connect('database.sqlite')
c = conn.cursor()

# Country data from the COUNTRY table
COUNTRY_df = pd.read_sql_query("select * from COUNTRY;", conn)
# League data from the LEAGUE table
LEAGUE_df = pd.read_sql_query("select * from LEAGUE;", conn)

# Match data from the MATCH table: only the columns used below, and only the
# 2015/2016 season for now. Drop the WHERE clause to load every season, or
# switch to "select *" to load every column.
#
# The column names are written in the lower-case form the later cells index
# by (e.g. MATCH_df["home_team_goal"]). SQLite does not guarantee how a result
# column is named when there is no AS alias, so the query should not depend on
# the driver normalising the case for us.
MATCH_df = pd.read_sql_query(
    "SELECT date, league_id, home_team_api_id, away_team_api_id, "
    "home_team_goal, away_team_goal "
    "FROM MATCH WHERE season = '2015/2016';",
    conn,
)

### We calculate the "differential" for both the home team and away team

In [2]:
# Goal differential of every match, seen from each side of the fixture.
# Both columns are added to MATCH_df itself.
home_goals = MATCH_df["home_team_goal"]
away_goals = MATCH_df["away_team_goal"]
MATCH_df["HOME_DIFF"] = home_goals - away_goals
MATCH_df["AWAY_DIFF"] = away_goals - home_goals

# One narrow frame per side: date, that side's team id, its differential,
# and the league.
home_columns = ["date", "home_team_api_id", "HOME_DIFF", "league_id"]
away_columns = ["date", "away_team_api_id", "AWAY_DIFF", "league_id"]
MATCH_home_df = MATCH_df.loc[:, home_columns]
MATCH_away_df = MATCH_df.loc[:, away_columns]



### We combine the home team data with the away team data to create one list of games, each with its differential

In [3]:
# Give the home and away frames identical column names so they can be stacked.
MATCH_home_df = MATCH_home_df.rename(
    columns={"home_team_api_id": "team_api_id", "HOME_DIFF": "DIFF"}
)
MATCH_away_df = MATCH_away_df.rename(
    columns={"away_team_api_id": "team_api_id", "AWAY_DIFF": "DIFF"}
)

# Stack both sides, then order the games chronologically within each team.
# reset_index() keeps the pre-sort row labels as an "index" column.
frames = [MATCH_home_df, MATCH_away_df]
all_games_unsorted = pd.concat(frames)
all_games = all_games_unsorted.sort_values(by=["team_api_id", "date"]).reset_index()
all_games.head()

Unnamed: 0,index,date,team_api_id,DIFF,league_id
0,1995,2015-07-18 00:00:00,1601,-2,15722
1,2084,2015-07-24 00:00:00,1601,2,15722
2,2173,2015-08-03 00:00:00,1601,1,15722
3,2188,2015-08-10 00:00:00,1601,0,15722
4,2197,2015-08-16 00:00:00,1601,-1,15722


In [None]:
### We want to figure out what the differential was for the next game as well.
### We will make a copy of the table, and add "ng_" as a suffix to the columns for "next game"

In [5]:
# Names for the "next game" copy: each data column gets an "ng_" prefix.
# The "index" column is not in the mapping, so it keeps its name.
ng_column_names = {
    "DIFF": "ng_DIFF",
    "team_api_id": "ng_team_api_id2",
    "league_id": "ng_league_id",
    "date": "ng_date",
}
next_game = all_games.copy().rename(columns=ng_column_names)

next_game.head()

Unnamed: 0,index,ng_date,ng_team_api_id2,ng_DIFF,ng_league_id
0,1995,2015-07-18 00:00:00,1601,-2,15722
1,2084,2015-07-24 00:00:00,1601,2,15722
2,2173,2015-08-03 00:00:00,1601,1,15722
3,2188,2015-08-10 00:00:00,1601,0,15722
4,2197,2015-08-16 00:00:00,1601,-1,15722


### We will merge the original data to the copy, but shift the copy data by one row, so it will line up with the next game

In [16]:
# Move the "next game" rows up by one so that row i holds the data of row i+1.
# The final row has no successor; its missing values are filled with 0.
shifted_next_game = next_game.shift(-1).fillna(0)

# Pair every game with the shifted row that shares its row label. Both frames
# have an "index" column, which the merge disambiguates as index_x / index_y.
df_complete = all_games.merge(shifted_next_game, left_index=True, right_index=True)
df_complete.head()

Unnamed: 0,index_x,date,team_api_id,DIFF,league_id,index_y,ng_date,ng_team_api_id2,ng_DIFF,ng_league_id
0,1995,2015-07-18 00:00:00,1601,-2,15722,2084.0,2015-07-24 00:00:00,1601.0,2.0,15722.0
1,2084,2015-07-24 00:00:00,1601,2,15722,2173.0,2015-08-03 00:00:00,1601.0,1.0,15722.0
2,2173,2015-08-03 00:00:00,1601,1,15722,2188.0,2015-08-10 00:00:00,1601.0,0.0,15722.0
3,2188,2015-08-10 00:00:00,1601,0,15722,2197.0,2015-08-16 00:00:00,1601.0,-1.0,15722.0
4,2197,2015-08-16 00:00:00,1601,-1,15722,2204.0,2015-08-22 00:00:00,1601.0,3.0,15722.0


### Oops!  For the last game of the season (where we don't have a next game), checking the next game would have returned the next team's first game.  We should just zero that out.      

In [10]:
# df_complete was built from all_games, which is already sorted by
# (team_api_id, date), so no re-sort is needed here.
#
# When the shifted row belongs to a different team, the current row has no
# real "next game" (it is the last one we have for that team), so its
# next-game differential is set to 0.
is_last_game_of_team = df_complete["team_api_id"] != df_complete["ng_team_api_id2"]
df_complete.loc[is_last_game_of_team, "ng_DIFF"] = 0
df_complete.head(2)

Unnamed: 0,index_x,date,team_api_id,DIFF,league_id,index_y,ng_date,ng_team_api_id2,ng_DIFF,ng_league_id
0,1995,2015-07-18 00:00:00,1601,-2,15722,2084.0,2015-07-24 00:00:00,1601.0,2.0,15722.0
1,2084,2015-07-24 00:00:00,1601,2,15722,2173.0,2015-08-03 00:00:00,1601.0,1.0,15722.0


### Let's delete all the extra columns we created

In [20]:
# Keep only the columns the rest of the analysis uses; the helper columns
# (index_x, index_y, ng_date, ng_team_api_id2, ng_league_id) are discarded.
# Selecting the columns to keep, rather than dropping in place, means this
# cell can be re-run without raising KeyError for already-removed columns.
columns_to_keep = ["date", "team_api_id", "DIFF", "league_id", "ng_DIFF"]
df_complete = df_complete.loc[:, columns_to_keep].copy()
df_complete.head()

Unnamed: 0,date,team_api_id,DIFF,league_id,ng_DIFF
0,2015-07-18 00:00:00,1601,-2,15722,2.0
1,2015-07-24 00:00:00,1601,2,15722,1.0
2,2015-08-03 00:00:00,1601,1,15722,0.0
3,2015-08-10 00:00:00,1601,0,15722,-1.0
4,2015-08-16 00:00:00,1601,-1,15722,3.0


### Now, let's find the most dominant team.  We will do that by grouping the games by league and team and summing the differentials


In [21]:
# Total differential (and total next-game differential) per team within each
# league. Only the two numeric columns are summed: "date" holds strings, and
# summing the whole group would depend on pandas silently discarding it,
# which newer pandas versions no longer do.
df_teams = (
    df_complete
    .groupby(["league_id", "team_api_id"], as_index=False)[["DIFF", "ng_DIFF"]]
    .sum()
)
df_teams.head()


Unnamed: 0,league_id,team_api_id,DIFF,ng_DIFF
0,1,1773,-11,-10.0
1,1,8203,-2,-1.0
2,1,8342,34,33.0
3,1,8475,-17,-15.0
4,1,8571,-4,-3.0


### For each league, we will append the team that has the maximum Differential.  These are our dominant teams

In [27]:
# For each league (in order of first appearance in df_teams), pick the
# team_api_id of the row with the largest summed differential.
leagues = df_teams.league_id.unique()
x = [
    league_rows["team_api_id"].loc[league_rows["DIFF"].idxmax()]
    for league_rows in (
        df_teams[df_teams.league_id == league_id] for league_id in leagues
    )
]
print(x)

[8342, 8586, 9847, 9823, 9885, 8593, 8673, 9772, 9925, 8634, 9931]
