In [53]:
import ast
import os

import matplotlib.pyplot as plt
import pandas as pd


# Columns of the per-match frame returned by diff_points(), in output order.
_MATCH_COLUMNS = ['league', 'year', 'results', 'results_away',
                  'xP_home', 'xP_away', 'xG_home', 'xG_away']


def _match_points(goals_for, goals_against):
    """Return the points one side takes from a match: 3 win, 1 draw, 0 loss."""
    if goals_for > goals_against:
        return 3
    if goals_for == goals_against:
        return 1
    return 0


def _numeric_field(parsed, key):
    """Pull ``key`` out of every parsed dict in ``parsed`` as a float64 Series."""
    return pd.Series([float(cell[key]) for cell in parsed],
                     index=parsed.index, dtype='float64')


def _load_league_matches(league):
    """Read ``data_leagues/<league>.csv`` and derive the per-match columns.

    The 'forecast', 'goals' and 'xG' cells hold Python dict literals stored as
    text. They are parsed with ``ast.literal_eval`` (literals only), so a cell
    containing arbitrary code is rejected instead of being executed.
    """
    raw = pd.read_csv(os.path.join('data_leagues', league + '.csv'))

    # Parse each text column exactly once.
    forecast = raw['forecast'].map(ast.literal_eval)
    goals = raw['goals'].map(ast.literal_eval)
    xg = raw['xG'].map(ast.literal_eval)

    home_goals = _numeric_field(goals, 'h')
    away_goals = _numeric_field(goals, 'a')
    draw_prob = _numeric_field(forecast, 'd')

    return pd.DataFrame({
        'league': league,
        'year': raw['year'],
        'results': pd.Series(
            [_match_points(h, a) for h, a in zip(home_goals, away_goals)],
            index=raw.index, dtype='int64'),
        'results_away': pd.Series(
            [_match_points(a, h) for h, a in zip(home_goals, away_goals)],
            index=raw.index, dtype='int64'),
        # Expected points: 3 * P(win) + 1 * P(draw); the away side wins when
        # the home side loses, hence the 'l' probability.
        'xP_home': 3 * _numeric_field(forecast, 'w') + draw_prob,
        'xP_away': 3 * _numeric_field(forecast, 'l') + draw_prob,
        'xG_home': _numeric_field(xg, 'h'),
        'xG_away': _numeric_field(xg, 'a'),
    }, columns=_MATCH_COLUMNS)


def _season_differences(matches, league):
    """Sum one league's matches per year and return home-minus-away differences."""
    totals = matches.groupby('year')[
        ['results', 'results_away', 'xP_home', 'xP_away']].sum()
    summary = pd.DataFrame({
        'diff_points': totals['results'] - totals['results_away'],
        'diff_points_xp': totals['xP_home'] - totals['xP_away'],
    })
    summary.insert(0, 'league', league)
    # The groupby key lives in the index; turn it back into a 'year' column.
    return summary.reset_index()


def diff_points(leagues):
    """Compute home-vs-away point differences for each league and season.

    For every name in ``leagues`` the file ``data_leagues/<name>.csv`` is read.
    It must provide a 'year' column plus 'forecast' (keys 'w', 'd', 'l'),
    'goals' (keys 'h', 'a') and 'xG' (keys 'h', 'a') columns whose cells are
    dict literals stored as text.

    Parameters
    ----------
    leagues : iterable of str
        League names, each matching a CSV file name (without extension).

    Returns
    -------
    df_leagues : pandas.DataFrame
        Indexed by (league, year) with columns 'diff_points' (actual home
        points minus away points, summed per season) and 'diff_points_xp'
        (the same difference for expected points).
    df_results : pandas.DataFrame
        One row per match with columns 'league', 'year', 'results',
        'results_away', 'xP_home', 'xP_away', 'xG_home', 'xG_away'.

    Both frames are empty, but keep the schema above, when ``leagues`` is empty.

    Raises
    ------
    ValueError, SyntaxError
        If a 'forecast', 'goals' or 'xG' cell is not a plain Python literal.
    """
    match_frames = []
    season_frames = []
    for league in leagues:
        matches = _load_league_matches(league)
        match_frames.append(matches)
        season_frames.append(_season_differences(matches, league))

    if match_frames:
        # Concatenate once rather than growing the frames inside the loop.
        df_results = pd.concat(match_frames, ignore_index=True)
        df_leagues = pd.concat(season_frames, ignore_index=True)
    else:
        df_results = pd.DataFrame(columns=_MATCH_COLUMNS)
        df_leagues = pd.DataFrame(
            columns=['league', 'year', 'diff_points', 'diff_points_xp'])

    return df_leagues.set_index(['league', 'year']), df_results

def style_formatter(leagues):
    """Return a pandas Styler showing the per-season point differences as bars.

    The season summary from ``diff_points(leagues)`` is formatted with zero
    decimals ('.' as thousands separator, ',' as decimal mark), its column
    headers are upper-cased, and both difference columns get an in-cell bar
    drawn around zero. The bar scale for both columns is taken from the
    min/max of the actual 'diff_points' column only.
    """
    season_diffs, _ = diff_points(leagues)
    bar_columns = ["diff_points", "diff_points_xp"]

    styler = season_diffs.style
    styler = styler.format(precision=0, thousands=".", decimal=",")
    styler = styler.format_index(str.upper, axis=1)

    actual_diff = season_diffs["diff_points"]
    return styler.bar(
        subset=bar_columns,
        color=['#b0351a', '#069215'],
        align=0,
        vmin=actual_diff.min(),
        vmax=actual_diff.max(),
        height=60,
        width=80,
        props="width: 200px; border-right: 1px solid black;",
    )

# League names to analyse; diff_points() reads data_leagues/<name>.csv for each.
leagues = [
    'La_liga',
    'EPL',
    'Bundesliga',
    'Serie_A',
    'RFPL',
    'Ligue_1',
]

# Uncomment to render the styled comparison table:
#style_formatter(leagues)

In [None]:
import pandas as pd
import os
import json
import scipy.stats as stats

def perform_statistical_tests_on_results(df):
    """Run paired Wilcoxon signed-rank tests of home vs. away per league and season.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per match with columns 'league', 'year', 'results',
        'results_away', 'xP_home', 'xP_away', 'xG_home' and 'xG_away'.

    Returns
    -------
    pandas.DataFrame
        One row per (league, year) that has at least two matches, with columns
        'league', 'year' and, for each of the three home/away pairs, the test
        statistic ('wilco-result', 'wilco-xPoints', 'wilco-xG') and its
        '-pvalue' companion. The columns are present even when no season
        qualifies. Leagues come out in sorted order, seasons in order of first
        appearance within their league.

    Notes
    -----
    When the home and away values of a metric are identical in every match of
    a season, there is no non-zero difference to rank and the test is
    undefined; statistic and p-value are then recorded as NaN instead of
    letting ``scipy.stats.wilcoxon`` abort the whole table.
    """
    metric_pairs = (
        ('wilco-result', 'results', 'results_away'),
        ('wilco-xPoints', 'xP_home', 'xP_away'),
        ('wilco-xG', 'xG_home', 'xG_away'),
    )
    columns = ['league', 'year']
    for label, _, _ in metric_pairs:
        columns += [label, label + '-pvalue']

    undefined = float('nan')
    rows = []
    for league, league_matches in df.groupby('league'):
        # sort=False keeps seasons in order of first appearance.
        for year, season in league_matches.groupby('year', sort=False):
            if len(season) < 2:
                continue  # a paired test needs at least two matches

            row = {'league': league, 'year': year}
            for label, home_col, away_col in metric_pairs:
                home, away = season[home_col], season[away_col]
                if (home == away).all():
                    # Every paired difference is zero: nothing to rank.
                    statistic, pvalue = undefined, undefined
                else:
                    statistic, pvalue = stats.wilcoxon(home, away)
                row[label] = statistic
                row[label + '-pvalue'] = pvalue
            rows.append(row)

    return pd.DataFrame(rows, columns=columns)


# Redefined here with the same values as in the first cell.
leagues = [
    'La_liga',
    'EPL',
    'Bundesliga',
    'Serie_A',
    'RFPL',
    'Ligue_1',
]

# diff_points() returns (season summary, per-match frame). Only the second,
# per-match frame feeds the paired tests, even though it is bound to the name
# `df_leagues` here.
_, df_leagues = diff_points(leagues)
results_df = perform_statistical_tests_on_results(df_leagues)

print(results_df)

0     15014.0
1     17430.0
2     15813.0
3     14736.0
4     16636.0
5     20551.0
6     20344.0
7     17281.0
8     15673.0
9     16040.0
10    21473.0
11    27235.0
12    25923.0
13    24922.0
14    27516.0
15    29618.0
16    32998.0
17    28085.0
18    25798.5
19    23174.0
20    23934.0
21    25216.0
22    23064.0
23    20500.0
24    21746.0
25    23337.0
26    29153.0
27    25516.0
28    21496.0
29    21214.0
30    24379.0
31    23010.0
32    22658.0
33    24357.0
34    24923.0
35    13308.0
36    32028.0
37    26604.0
38    30293.0
39    15165.0
40     9764.0
41    10568.0
42     9801.0
43     9679.0
44     9549.0
45    10919.0
46    10528.0
47    11441.0
48    10918.0
49     9617.0
50    24711.0
51    23897.0
52    25511.0
53    29923.0
54    24706.0
55    28480.0
56    31022.0
57    30629.0
58    28778.0
59    27535.0
Name: wilco-xG, dtype: float64
