In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from scipy.stats import kstest
from scipy.stats import mannwhitneyu
from sklearn.utils import resample

In [17]:
def merge_tables(_pit: pd.DataFrame, _results: pd.DataFrame, _status: pd.DataFrame) -> pd.DataFrame:
    """
    merges the data files
    :param _pit:
    :param _results:
    :param _status:
    :return:
    """
    _select_col = ['raceId','driverId','positionOrder', 'laps', 'time','statusId']
    mg_df = pd.merge(_pit, _results[_select_col], on=['raceId','driverId'], how = 'left', suffixes=('_pit', '_result'))
    mg_df = pd.merge(mg_df, _status, on='statusId', how='left')

    _status_select = [1, 11, 12, 13, 14, 15, 16, 17, 18, 19]
    mg_df.drop(mg_df[~mg_df['statusId'].isin(_status_select)].index, inplace=True)
    return mg_df


def process_laps_stops(mg_df: pd.DataFrame) -> pd.DataFrame:
    _total_laps = mg_df[(mg_df['positionOrder'] == 1) & (mg_df['stop'] == 1)].reset_index(drop=True)[['raceId', 'laps']]
    _total_laps.columns = [str(_total_laps.columns[0]), 'total_laps']
    _total_stops = mg_df.groupby(by=['raceId', 'driverId'], as_index=False)['stop'].max()
    _total_stops.columns = list(_total_stops.columns[:2]) + ['total_stops']

    mg_df = pd.merge(mg_df, _total_laps, on='raceId')
    mg_df = pd.merge(mg_df, _total_stops, on=['raceId', 'driverId'])
    mg_df['lap_prop'] = mg_df.apply(lambda x: x['lap']/x['total_laps'], axis=1)
    mg_df['abs_err'] = mg_df.apply(lambda x: abs(x['stop']/(x['total_stops']+1) - x['lap_prop']), axis=1)
    return mg_df


def get_err_mean(mg_df: pd.DataFrame):
    avg_err = pd.DataFrame(mg_df.groupby(['raceId', 'driverId'])['abs_err'].mean())
    avg_err = avg_err.add_suffix('_mean').reset_index()
    select_columns = ['raceId', 'driverId', 'positionOrder', 'abs_err_mean']
    test_df = mg_df.merge(avg_err, on=['raceId', 'driverId'])[select_columns]
    test_df = test_df.drop_duplicates()
    top_number = 5
    _df_front = test_df[test_df['positionOrder'] <= top_number]['abs_err_mean']
    _df_back = test_df[test_df['positionOrder'] > top_number]['abs_err_mean']
    return _df_front, _df_back


def group_by_stop_num(df: pd.DataFrame) -> dict:
    pit_stop_max = df['total_stops'].max()
    _df_dict = {}
    for i in range(1, pit_stop_max+1):
        _df_dict[i] = df[df['total_stops']==i][['stop','positionOrder','lap_prop']]
    return _df_dict


def df_to_list(mg_df: pd.DataFrame, select_col='lap_prop', max_pit = 3, top_num = 5):
    df_front = []
    df_back = []
    for i in range(1, max_pit+1):
        df_tmp = mg_df[mg_df['total_stops']==i]
        for j in range(1, i+1):
            df_select = df_tmp[df_tmp['stop']==j]
            df_front.append(df_select[df_select['positionOrder'] <= top_num][['stop', select_col]])
            df_back.append(df_select[df_select['positionOrder'] > top_num][['stop', select_col]])
    return df_front, df_back


def comparison_plot(list_1: [pd.DataFrame], list_2: [pd.DataFrame], select_col='lap_prop',
                    show_mean=True, show_description=True, divide=True, non_para=False, save_fig=False):
    """
    d
    :param non_para:
    :param divide:
    :param select_col:
    :param list_1:
    :param list_2:
    :param save_fig:
    :param show_description:
    :param show_mean:
    :return:
    """
    bins = np.linspace(0,1,50)
    color_bin = ['tab:blue', 'tab:orange', 'tab:red']
    color_bin2 = ['deepskyblue', 'crimson', 'lavender']

    plot_index = [[1,1], [2,1], [2,2], [3,1], [3,2], [3,3]]
    plot_num = 6

    for _i in range(plot_num):
        _total = plot_index[_i][0]  # total pit stops
        _pit = plot_index[_i][1]  # pit stop number
        df_f = list_1[_i][select_col]  # front
        df_b = resample(list_2[_i][select_col],
                        replace=True, n_samples=len(df_f), random_state=123)  # back

        plt.figure(figsize=(12,6))
        plt.hist(df_b, bins, alpha=0.8, color=color_bin[2])
        plt.hist(df_f, bins, alpha=0.8, color=color_bin[0])

        df_f_mean = round(df_f.mean(), ndigits=3)
        df_b_mean = round(df_b.mean(), ndigits=3)
        if show_mean:
            plt.axvline(x=df_f_mean, color=color_bin2[0], linewidth=4)
            plt.axvline(x=df_b_mean, color=color_bin2[1], linewidth=4)
            if divide: plt.axvline(x=_pit/(_total+1), color='gold', linewidth=4)
        if show_description:
            if not non_para:
                p_value = ttest_ind(df_f, df_b).pvalue
            else:
                p_value = kstest(df_f, df_b).pvalue
            print(f'Total Pits: {_total}, no.{_pit} pit, p value={p_value}')

        if save_fig: plt.savefig(f'image/hypo3/distribution_{_total}_{_pit}.png', transparent=False)
        plt.show()


def err_mean_plot(_df_front: pd.DataFrame, _df_back: pd.DataFrame, save_fig=False):
    df_f = _df_front
    df_b = resample(_df_back, replace=True, n_samples=len(df_f), random_state=123)

    bins = np.linspace(0,1,50)
    color_bin = ['tab:blue', 'tab:orange', 'tab:red']
    color_bin2 = ['deepskyblue', 'crimson', 'lavender']

    plt.figure(figsize=(12,6))
    plt.hist(df_b, bins, alpha=0.8, color=color_bin[2])
    plt.hist(df_f, bins, alpha=0.8, color=color_bin[0])

    df_f_mean = round(df_f.mean(), ndigits=3)
    df_b_mean = round(df_b.mean(), ndigits=3)

    plt.axvline(x=df_f_mean, color=color_bin2[0], linewidth=4)
    plt.axvline(x=df_b_mean, color=color_bin2[1], linewidth=4)

    p_value = kstest(df_f, df_b).pvalue
    print(f'KS test p value={p_value}')

    if save_fig: plt.savefig(f'image/hypo3/err_mean.png', transparent=False)
    plt.show()

In [18]:
# Load data
pit = pd.read_csv('data/pit_stops.csv')
results = pd.read_csv('data/results.csv')
status = pd.read_csv('data/status.csv')

In [19]:
# Process the data files
merge_df = merge_tables(pit, results, status)
merge_df = process_laps_stops(merge_df)
df_dict = group_by_stop_num(merge_df)

In [20]:
merge_df

Unnamed: 0,raceId,driverId,stop,lap,time_pit,duration,milliseconds,positionOrder,laps,time_result,statusId,status,total_laps,total_stops,lap_prop,abs_err
0,841,153,1,1,17:05:23,26.898,26898,11,57,\N,11,+1 Lap,58,3,0.017241,0.232759
1,841,153,2,17,17:31:06,24.463,24463,11,57,\N,11,+1 Lap,58,3,0.293103,0.206897
2,841,153,3,35,17:59:45,26.348,26348,11,57,\N,11,+1 Lap,58,3,0.603448,0.146552
3,841,17,1,11,17:20:48,23.426,23426,5,58,+38.171,1,Finished,58,3,0.189655,0.060345
4,841,17,2,26,17:44:29,22.520,22520,5,58,+38.171,1,Finished,58,3,0.448276,0.051724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7913,1076,847,1,23,15:38:51,18.479,18479,3,58,+25.593,1,Finished,58,1,0.396552,0.103448
7914,1076,4,1,39,16:05:24,18.796,18796,17,57,\N,11,+1 Lap,58,2,0.672414,0.339080
7915,1076,4,2,53,16:25:28,18.394,18394,17,57,\N,11,+1 Lap,58,2,0.913793,0.247126
7916,1076,825,1,39,16:05:30,19.111,19111,14,57,\N,11,+1 Lap,58,1,0.672414,0.172414


In [27]:
rank_list = [df_dict[i]['positionOrder'] for i in range(1,4)]

In [32]:
mannwhitneyu(rank_list[0], rank_list[1])

MannwhitneyuResult(statistic=1629878.5, pvalue=7.2111508478847934e-06)

In [33]:
mannwhitneyu(rank_list[0], rank_list[2])

MannwhitneyuResult(statistic=1157671.5, pvalue=4.758075330042359e-19)

In [34]:
mannwhitneyu(rank_list[1], rank_list[2])

MannwhitneyuResult(statistic=3285969.5, pvalue=1.3113144352403548e-09)

In [36]:
rank_list[0].mean()

8.521591871295513

In [37]:
rank_list[1].mean()

9.334433806536811

In [38]:
rank_list[2].mean()

10.248853689037098