In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import os


def diff_points(leagues):
    df_leagues = pd.DataFrame()
    df_results = pd.DataFrame()
    for league in leagues:
        data = pd.read_csv(os.path.join('data_leagues', league + '.csv'))
        data['forecast_home'] = data['forecast'].apply(lambda x: eval(x))
        data['forecast_away'] = data['forecast'].apply(lambda x: eval(x))
        data['goals'] = data['goals'].apply(lambda x: eval(x))

        data['xP_home'] = data['forecast_home'].apply(lambda f: 3 * float(f['w']) + 1 * float(f['d']))
        data['xP_away'] = data['forecast_away'].apply(lambda f: 3 * float(f['l']) + 1 * float(f['d']))
        data['results'] = data['goals'].apply(lambda f: 3 if float(f['h']) > float(f['a']) else 1 if float(f['h']) == float(f['a']) else 0)
        data['results_away'] = data['goals'].apply(lambda f: 3 if float(f['a']) > float(f['h']) else 1 if float(f['h']) == float(f['a']) else 0)

        data['league'] = league
        data['year'] = data['year']

        df_results = pd.concat([df_leagues, data[['league', 'year', 'results', 'results_away']]], ignore_index=True)

        results_home = data.groupby('year')['results'].sum()
        results_away = data.groupby('year')['results_away'].sum()
        xp_home = data.groupby('year')['xP_home'].sum()
        xp_away = data.groupby('year')['xP_away'].sum()
        
        df = pd.DataFrame()
        df["diff_points"] = results_home - results_away
        df["diff_points_xp"] = xp_home - xp_away
        df["league"] = league
        df["year"] = df.index
        df.reset_index(drop=True, inplace=True)
        df_leagues = pd.concat([df_leagues, df], ignore_index=True)

    df_leagues.set_index('league', inplace=True)
    df_leagues.set_index('year', append=True, inplace=True)
    return df_leagues, df_results
    #.relabel_index(["row 1", "row 2"], axis=0)

def style_formatter(leagues):
    df_leagues = diff_points(leagues)
    return df_leagues.style \
    .format(precision=0, thousands=".", decimal=",") \
    .format_index(str.upper, axis=1) \
    .bar(subset= ["diff_points", "diff_points_xp"], color = ['#b0351a', '#069215'], align=0, vmin=df_leagues["diff_points"].min(), vmax=df_leagues["diff_points"].max(), height=60,
         width=80, props="width: 200px; border-right: 1px solid black;")

#leagues = ['La_liga', 'EPL', 'Bundesliga', 'Serie_A', 'RFPL', 'Ligue_1']

#style_formatter(leagues)

In [None]:
import pandas as pd
import os
import json
import scipy.stats as stats

def perform_statistical_tests_on_results(df):
    results = []
    grouped = df.groupby('league')
    for league, group in grouped:
        for year in group['year'].unique():
            subset = group[group['year'] == year]
            if len(subset) < 2:  
                continue
            
            home = subset['results']
            away = subset['results_away']
            
            wilcoxon_stat, wilcoxon_pvalue = stats.wilcoxon(home, away)
            
            results.append({
                'league': league,
                'year': year,
                'wilcoxon_stat': wilcoxon_stat,
                'wilcoxon_pvalue': wilcoxon_pvalue
            })
    
    results_df = pd.DataFrame(results)
    return results_df


leagues = ['La_liga', 'EPL', 'Bundesliga', 'Serie_A', 'RFPL', 'Ligue_1']
_, df_leagues = diff_points(leagues)
results_df = perform_statistical_tests_on_results(df_leagues)

print(results_df)

    league  year  wilcoxon_stat  wilcoxon_pvalue
0  Ligue_1  2014        16261.5     4.195519e-05
1  Ligue_1  2015        15288.0     3.609347e-03
2  Ligue_1  2016        14350.0     3.670713e-07
3  Ligue_1  2017        15817.5     2.341290e-04
4  Ligue_1  2018        14363.0     4.159249e-04
5  Ligue_1  2019         7875.0     4.482011e-05
6  Ligue_1  2020        20306.0     9.527650e-01
7  Ligue_1  2021        16182.0     5.799687e-03
8  Ligue_1  2022        18062.5     2.514476e-02
9  Ligue_1  2023        11865.0     3.173105e-01
