In [53]:
import ast
import os

import matplotlib.pyplot as plt
import pandas as pd


def _match_points(goals_for, goals_against):
    """Return the league points (3 win, 1 draw, 0 loss) for the side that scored ``goals_for``."""
    if goals_for > goals_against:
        return 3
    if goals_for == goals_against:
        return 1
    return 0


def _load_league_matches(league):
    """Read ``data_leagues/<league>.csv`` and derive the per-match columns.

    The CSV must provide a ``year`` column plus three columns holding
    Python-literal dict strings: ``forecast`` (keys ``w``/``d``/``l``),
    ``goals`` (keys ``h``/``a``) and ``xG`` (keys ``h``/``a``).
    """
    data = pd.read_csv(os.path.join('data_leagues', league + '.csv'))

    # ast.literal_eval only accepts literals, so text in the CSV can never be
    # executed as code (which plain eval() would allow). Each column is parsed once.
    forecast = data['forecast'].apply(ast.literal_eval)
    goals = data['goals'].apply(ast.literal_eval)
    xg = data['xG'].apply(ast.literal_eval)

    return pd.DataFrame({
        'league': league,
        'year': data['year'],
        # Actual points earned by the home / away side.
        'results': goals.apply(lambda g: _match_points(float(g['h']), float(g['a']))),
        'results_away': goals.apply(lambda g: _match_points(float(g['a']), float(g['h']))),
        # Expected points: 3 * P(win) + 1 * P(draw). The forecast is expressed from
        # the home side's view, so the away side's win probability is 'l'.
        'xP_home': forecast.apply(lambda f: 3 * float(f['w']) + float(f['d'])),
        'xP_away': forecast.apply(lambda f: 3 * float(f['l']) + float(f['d'])),
        'xG_home': xg.apply(lambda x: float(x['h'])),
        'xG_away': xg.apply(lambda x: float(x['a'])),
    })


def diff_points(leagues):
    """Compute home-vs-away point differences per league and season.

    Parameters
    ----------
    leagues : iterable of str
        League identifiers; each one names a CSV file ``data_leagues/<league>.csv``.

    Returns
    -------
    df_leagues : pandas.DataFrame
        Indexed by ``(league, year)`` with columns ``diff_points`` (sum of home
        points minus sum of away points) and ``diff_points_xp`` (the same
        difference computed on expected points).
    df_results : pandas.DataFrame
        One row per match with columns ``league``, ``year``, ``results``,
        ``results_away``, ``xP_home``, ``xP_away``, ``xG_home``, ``xG_away``.

    Both frames are empty (but carry the columns / index levels above) when
    ``leagues`` is empty.
    """
    match_columns = ['league', 'year', 'results', 'results_away',
                     'xP_home', 'xP_away', 'xG_home', 'xG_away']

    # Collect per-league frames and concatenate once at the end instead of
    # re-copying a growing DataFrame on every iteration.
    match_frames = []
    season_frames = []
    for league in leagues:
        matches = _load_league_matches(league)
        match_frames.append(matches[match_columns])

        totals = matches.groupby('year')[['results', 'results_away', 'xP_home', 'xP_away']].sum()
        season = pd.DataFrame({
            'diff_points': totals['results'] - totals['results_away'],
            'diff_points_xp': totals['xP_home'] - totals['xP_away'],
        })
        season['league'] = league
        season['year'] = season.index
        season_frames.append(season.reset_index(drop=True))

    if not match_frames:
        # Nothing to load: return correctly shaped empty frames rather than failing.
        empty_summary = pd.DataFrame(columns=['league', 'year', 'diff_points', 'diff_points_xp'])
        return empty_summary.set_index(['league', 'year']), pd.DataFrame(columns=match_columns)

    df_leagues = pd.concat(season_frames, ignore_index=True).set_index(['league', 'year'])
    df_results = pd.concat(match_frames, ignore_index=True)
    return df_leagues, df_results

def style_formatter(leagues):
    """Build a styled table of home-vs-away point differences.

    Calls ``diff_points(leagues)`` and keeps only its first result (the
    per-league/per-season summary). The returned pandas ``Styler``:

    * shows numbers with no decimals, using "." as the thousands separator
      and "," as the decimal separator;
    * upper-cases the column labels;
    * draws in-cell bars for ``diff_points`` and ``diff_points_xp``, aligned
      at zero. Both columns share one scale, taken from the min/max of
      ``diff_points`` only.
    """
    season_summary, _ = diff_points(leagues)

    bar_columns = ["diff_points", "diff_points_xp"]
    scale_low = season_summary["diff_points"].min()
    scale_high = season_summary["diff_points"].max()

    styled = season_summary.style.format(precision=0, thousands=".", decimal=",")
    styled = styled.format_index(str.upper, axis=1)
    return styled.bar(
        subset=bar_columns,
        # Two colours: pandas uses the first for negative and the second for positive values.
        color=['#b0351a', '#069215'],
        align=0,
        vmin=scale_low,
        vmax=scale_high,
        height=60,
        width=80,
        props="width: 200px; border-right: 1px solid black;",
    )

# League identifiers. diff_points() uses each one as the base name of a CSV
# file inside the data_leagues/ directory (e.g. data_leagues/EPL.csv).
leagues = ['La_liga', 'EPL', 'Bundesliga', 'Serie_A', 'RFPL', 'Ligue_1']

# Disabled call: uncomment to render the styled summary table for these leagues.
#style_formatter(leagues)

In [None]:
import pandas as pd
import os
import json
import scipy.stats as stats

import numpy as np
from scipy.stats import ttest_ind

def cohen_d(group1, group2):
    """Return Cohen's d effect size between two samples.

    The difference of the sample means (``group1`` minus ``group2``) is divided
    by the pooled standard deviation, where each group's sample variance
    (``ddof=1``) is weighted by its degrees of freedom.
    """
    size_a = len(group1)
    size_b = len(group2)

    centre_a = np.mean(group1)
    centre_b = np.mean(group2)
    spread_a = np.std(group1, ddof=1)
    spread_b = np.std(group2, ddof=1)

    weighted_variance = (size_a - 1) * spread_a**2 + (size_b - 1) * spread_b**2
    degrees_of_freedom = size_a + size_b - 2
    pooled_sd = np.sqrt(weighted_variance / degrees_of_freedom)

    return (centre_a - centre_b) / pooled_sd

def perform_statistical_tests_on_results(df):
    """Compare home and away figures for every league/season in ``df``.

    ``df`` needs the columns ``league``, ``year``, ``results``, ``results_away``,
    ``xP_home``, ``xP_away``, ``xG_home`` and ``xG_away``. For each league and
    each year occurring in it, a Wilcoxon signed-rank test
    (``scipy.stats.wilcoxon``) and ``cohen_d`` are computed for home vs. away
    results, expected points and expected goals. Seasons with fewer than two
    rows are skipped.

    Returns a DataFrame with one row per tested league/season holding the
    test statistic, p-value and effect size for each of the three comparisons.
    """
    # (home column, away column, statistic key, p-value key, effect-size key)
    comparisons = (
        ('results', 'results_away', 'wilco-result', 'wilco-result-pvalue', 'result-cohend'),
        ('xP_home', 'xP_away', 'wilco-xPoints', 'wilco-xPoints-pvalue', 'xPoints-cohend'),
        ('xG_home', 'xG_away', 'wilco-xG', 'wilco-xG-pvalue', 'xG-cohend'),
    )

    records = []
    for league_name, league_matches in df.groupby('league'):
        for season in league_matches['year'].unique():
            season_matches = league_matches[league_matches['year'] == season]
            if len(season_matches) < 2:
                continue

            record = {'league': league_name, 'year': season}
            for home_col, away_col, stat_key, pvalue_key, effect_key in comparisons:
                home_values = season_matches[home_col]
                away_values = season_matches[away_col]
                record[stat_key], record[pvalue_key] = stats.wilcoxon(home_values, away_values)
                record[effect_key] = cohen_d(home_values, away_values)
            records.append(record)

    return pd.DataFrame(records)


# Same league identifiers as the list assigned in the previous cell.
leagues = ['La_liga', 'EPL', 'Bundesliga', 'Serie_A', 'RFPL', 'Ligue_1']
# diff_points() returns (season summary, match-level frame). The summary is
# discarded; despite its name, `df_leagues` here holds the match-level frame.
_, df_leagues = diff_points(leagues)
# One row of Wilcoxon / Cohen's d statistics per league and season.
results_df = perform_statistical_tests_on_results(df_leagues)

print(results_df)

        league  year  wilco-result  wilco-result-pvalue  result-cohend  \
0   Bundesliga  2014        8887.5         1.034611e-05       0.512082   
1   Bundesliga  2015       11800.0         2.242164e-02       0.259479   
2   Bundesliga  2016        9553.0         8.028642e-06       0.519632   
3   Bundesliga  2017        9408.0         2.304397e-04       0.423421   
4   Bundesliga  2018       11115.0         4.847095e-03       0.321458   
5   Bundesliga  2019       13742.5         6.040655e-01       0.058499   
6   Bundesliga  2020       10848.0         2.780690e-02       0.249430   
7   Bundesliga  2021       10530.0         5.163165e-04       0.398909   
8   Bundesliga  2022        9976.0         1.036371e-04       0.448062   
9   Bundesliga  2023       10283.0         4.148197e-03       0.326799   
10         EPL  2014       16560.0         7.665370e-04       0.345195   
11         EPL  2015       15892.0         1.308554e-02       0.252358   
12         EPL  2016       16186.5    