In [68]:
import pandas as pd
import numpy as np
import psycopg2
from scipy import stats

In [69]:
# Connect to the PostgreSQL database `football_db`; the DSN names only the database,
# so every other connection setting is left to the driver's defaults.
conn = psycopg2.connect('dbname=football_db')

In [70]:
# Cursor on `conn`; the query cells execute their SQL through it.
cur = conn.cursor()

In [71]:
# Fetch every row of Match with its goals and a derived label: 'Win' when the home
# side scored more than the away side, 'NoWin' otherwise (draws and home losses alike).
query = """
SELECT match_api_id, home_team_goal, away_team_goal, 
CASE WHEN home_team_goal > away_team_goal THEN 'Win' 
         ELSE 'NoWin' END as Result
FROM Match
"""
cur.execute(query)
data = cur.fetchall()

# Labels for the four selected fields, in SELECT order.
columns = ['id', 'home_goal', 'away_goal', 'result']

In [72]:
# Turn the fetched row tuples into a DataFrame labelled with `columns`.
df = pd.DataFrame(columns=columns, data=data)

In [73]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,id,home_goal,away_goal,result
0,492473,1,1,NoWin
1,492474,0,0,NoWin
2,492475,0,3,NoWin
3,492476,5,0,Win
4,492477,1,3,NoWin


In [74]:
# Number of matches labelled 'Win': sum the boolean mask rather than
# measuring the length of a filtered copy of the frame.
is_home_win = df['result'] == 'Win'
total_wins = int(is_home_win.sum())
total_wins

11917

In [28]:
# Total number of matches, i.e. the row count of the frame.
total_games = df.shape[0]
total_games

25979

In [29]:
# Share of all loaded matches labelled 'Win'; the t-test further down uses it as the reference mean.
mu = total_wins / total_games

In [30]:
# Display the overall 'Win' share.
mu

0.45871665576042187

In [39]:
# Bootstrap the 'Win' rate: draw `sample_size` resamples, each made of
# `number_of_games` matches picked with replacement, and record each resample's win rate.
sample_size = 2000       # number of resamples (length of `samples`)
number_of_games = 2000   # matches drawn per resample

# A dedicated, seeded generator makes the draw reproducible on Restart & Run All
# without touching NumPy's global random state.
rng = np.random.RandomState(0)

# Win flags as a plain boolean array, so each resample is an integer lookup
# instead of a freshly built DataFrame slice.
is_win = (df['result'] == 'Win').values

# One row of match positions per resample; the mean of the flags along a row
# is that resample's win rate (wins / number_of_games).
drawn = rng.randint(low=0, high=len(df), size=(sample_size, number_of_games))
samples = is_win[drawn].mean(axis=1)


In [40]:
# Mean of the resampled win rates.
sample_mean = np.mean(samples)
sample_mean

0.458741

In [41]:
# Sample standard deviation of the resampled win rates (ddof=1 divides by n - 1).
std = samples.std(ddof=1)
std

0.011369082945581479

In [42]:
# One-sample t statistic: how far the resample mean sits from mu,
# measured in standard errors of that mean.
standard_error = std / np.sqrt(sample_size)
t = (sample_mean - mu) / standard_error
t

0.09576036135513998

In [43]:
df = sample_size - 1

In [44]:
t_crit = np.round(stats.t.ppf(1 - 0.05, df), 3)
t_crit

1.646

In [61]:
ALPHA = 0.05  # significance level (the same 5% used for the critical t value)

# One-sample t-test of the resampled win rates against the overall win share `mu`.
results = stats.ttest_1samp(a=samples, popmean=mu)
print(results)

# ttest_1samp reports a two-sided p-value, whereas the critical value in this notebook
# is one-sided (upper tail). Comparing the statistic to a one-sided cut-off AND the
# two-sided p-value to the same alpha is neither test, so convert the p-value to the
# upper-tail one and decide on that alone.
# NOTE(review): assumes the intended alternative is "mean > mu" — confirm.
t_stat, p_two_sided = results[0], results[1]
p_one_sided = p_two_sided / 2 if t_stat > 0 else 1 - p_two_sided / 2

if p_one_sided < ALPHA:
    print("Null hypothesis rejected. Results are statistically significant with t-value =",
          round(t_stat, 2), "and p-value =", np.round(p_one_sided, 4))
else:
    # "Accepted" here means only that the test failed to reject the null hypothesis.
    print("Null hypothesis is Accepted")

Ttest_1sampResult(statistic=0.09576036135513998, pvalue=0.9237204980841935)
Null hypothesis is Accepted


In [None]:
# Per-team tallies of 'Win' vs 'No' results, computed once over the matches a team
# hosted (Category 'Home') and once over the matches it visited (Category 'Away'),
# then joined back to Team to attach the short name.
query = """
SELECT A.team_short_name as Team, B.* FROM
(SELECT T.team_api_id, 'Home' as Category,
CASE WHEN M1.home_team_goal - M1.away_team_goal > 0 THEN 'Win' ELSE 'No' END as Result,
COUNT(*) as Count
FROM Team T
JOIN Match M1 ON T.team_api_id = M1.home_team_api_id
GROUP BY T.team_api_id, Result
UNION SELECT T.team_api_id, 'Away' as Category,
CASE WHEN M1.home_team_goal - M1.away_team_goal < 0 THEN 'Win' ELSE 'No' END as Result,
COUNT(*) as Count
FROM Team T
JOIN Match M1 ON T.team_api_id = M1.away_team_api_id
GROUP BY T.team_api_id, Result) as B
JOIN Team A ON B.team_api_id = A.team_api_id
"""
cur.execute(query)
data = cur.fetchall()

# Labels for the five returned fields, in SELECT order.
columns = ['team_short_name', 'team_api_id', 'Category', 'Result', 'Count']

In [None]:
# Turn the fetched per-team tallies into a DataFrame labelled with `columns`.
df = pd.DataFrame(columns=columns, data=data)

In [None]:
# Preview the first five rows (five is head()'s default).
df.head()

In [None]:
columns = ['team_short_name', 'team_api_id']

# Build the SELECT list from `columns` so the query text and the
# DataFrame labels derived from it stay in step.
select_list = ', '.join(columns)
query = "\nSELECT " + select_list + "\nFROM Team;\n"

In [None]:
# Run the current `query` on the shared cursor and pull all result rows into memory.
cur.execute(query)
data = cur.fetchall()

In [None]:
# Turn the fetched Team rows into a DataFrame labelled with `columns`, then preview five rows.
team_df = pd.DataFrame(columns=columns, data=data)
team_df.head(n=5)

In [None]:
# Seed NumPy's global generator so the same 100 row positions are drawn on every run.
np.random.seed(0)

# Draw 100 row positions with replacement, then select those rows (all columns) from `df`.
row_positions = np.random.randint(low=0, high=len(df), size=100)
samples = df.iloc[row_positions, :]

In [None]:
# Ratio of the non-null counts in the rows where Home_Result == "Win" to the non-null
# counts in the whole frame. DataFrame.count() returns one value per column, so `mu`
# here is a per-column Series rather than a single number.
# NOTE(review): the most recent `df` built above is labelled
# ['team_short_name', 'team_api_id', 'Category', 'Result', 'Count'] and has no
# 'Home_Result' column, so with the cells in this order the lookup raises KeyError.
# Presumably this expects a per-match frame carrying Home_Result — confirm which frame was intended.
mu = df[df['Home_Result'] == "Win"].count() / df.count()
mu

In [None]:
# Same ratio as for the full frame, computed on the sampled rows: non-null counts
# where Home_Result == "Win" over non-null counts in `samples`. count() works per
# column, so `x_bar` is a per-column Series.
# NOTE(review): `samples` is drawn from `df`; if that frame has no 'Home_Result'
# column this lookup raises KeyError — confirm which frame was intended.
x_bar = samples[samples['Home_Result'] == "Win"].count() / samples.count()
x_bar

In [None]:
# Count matches per home-side outcome: 'Win' when the home team scored more,
# 'Draw' on equal goals, 'Lose' otherwise. The inner query also selects both
# teams' short names through two joins on Team, but the outer query only
# groups on and returns Home_Result with its count.
# NOTE(review): these are inner joins, so a match whose home or away id has no
# row in Team would be left out of the counts — confirm that is intended.
columns = ['Home_Result', 'Count']

query = f"""
SELECT Home_Result, COUNT(*) as Count FROM
(SELECT B.team_short_name as Home,  C.team_short_name as Away,  
(CASE WHEN M.home_team_goal - M.away_team_goal > 0 THEN 'Win' 
      WHEN  M.home_team_goal - M.away_team_goal = 0 THEN 'Draw'
      ELSE 'Lose' END) as Home_Result 
FROM Match M
JOIN Team B ON M.home_team_api_id = B.team_api_id
JOIN Team C ON M.away_team_api_id = C.team_api_id) AS T
GROUP BY Home_Result
"""

cur.execute(query)