In [148]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind

In [132]:
def merge_tables(_pit: pd.DataFrame, _results: pd.DataFrame, _status: pd.DataFrame,
                 _status_select=None) -> pd.DataFrame:
    """
    Join pit-stop rows with race results and status labels, keeping only selected statuses.

    Every row of ``_pit`` is left-joined to its ``_results`` row on
    ``(raceId, driverId)`` and then to ``_status`` on ``statusId``. Rows whose
    ``statusId`` is not in ``_status_select`` are removed; this also removes
    pit-stop rows that found no matching result, because their ``statusId`` is
    NaN after the left join.

    :param _pit: pit-stop table; must contain ``raceId`` and ``driverId``.
    :param _results: results table; must contain ``raceId``, ``driverId``,
        ``positionOrder``, ``laps``, ``time`` and ``statusId`` (other columns are ignored).
    :param _status: lookup table containing ``statusId``.
    :param _status_select: iterable of ``statusId`` values to keep. Defaults to
        ``1`` and ``11``-``19`` (presumably "finished" / "lapped" classifications --
        TODO confirm against status.csv).
    :return: a new DataFrame (the inputs are not modified). Non-key columns present
        in both ``_pit`` and the selected result columns are suffixed ``_pit`` /
        ``_result``. The index keeps the labels of the surviving merged rows.
    """
    if _status_select is None:
        _status_select = [1, 11, 12, 13, 14, 15, 16, 17, 18, 19]

    _select_col = ['raceId', 'driverId', 'positionOrder', 'laps', 'time', 'statusId']
    mg_df = pd.merge(_pit, _results[_select_col], on=['raceId', 'driverId'],
                     how='left', suffixes=('_pit', '_result'))
    mg_df = pd.merge(mg_df, _status, on='statusId', how='left')

    # Boolean-mask filtering instead of drop(..., inplace=True): same rows and
    # index labels, but no in-place mutation and no dependence on a unique index.
    keep = mg_df['statusId'].isin(list(_status_select))
    return mg_df[keep].copy()


def process_laps_stops(mg_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add per-race lap totals, per-driver stop totals and the relative pit-stop lap.

    Three columns are appended:

    * ``total_laps``  -- the ``laps`` value taken from the row with
      ``positionOrder == 1`` and ``stop == 1`` of the same ``raceId``;
    * ``total_stops`` -- the maximum ``stop`` per ``(raceId, driverId)``;
    * ``lap_prop``    -- ``lap / total_laps``.

    Both joins are inner joins, so every row of a race that has no
    ``positionOrder == 1, stop == 1`` row is dropped from the result.

    :param mg_df: frame with ``raceId``, ``driverId``, ``positionOrder``,
        ``stop``, ``lap`` and ``laps`` columns. It is not modified.
    :return: a new DataFrame with the three extra columns.
    """
    # The first-stop row of the race winner is used as the source of the race length.
    winner_first_stop = (mg_df['positionOrder'] == 1) & (mg_df['stop'] == 1)
    _total_laps = (
        mg_df.loc[winner_first_stop, ['raceId', 'laps']]
        .rename(columns={'laps': 'total_laps'})
        .reset_index(drop=True)
    )

    _total_stops = (
        mg_df.groupby(by=['raceId', 'driverId'], as_index=False)['stop']
        .max()
        .rename(columns={'stop': 'total_stops'})
    )

    mg_df = pd.merge(mg_df, _total_laps, on='raceId')
    mg_df = pd.merge(mg_df, _total_stops, on=['raceId', 'driverId'])

    # Vectorised division instead of a row-wise apply: same values, far cheaper,
    # and well-defined on an empty frame.
    mg_df['lap_prop'] = mg_df['lap'] / mg_df['total_laps']

    return mg_df


def group_by_stop_num(df: pd.DataFrame) -> dict:
    """
    Bucket pit-stop rows by the driver's total number of stops in the race.

    :param df: frame with ``total_stops``, ``stop`` and ``lap_prop`` columns.
    :return: dict with one key for every integer ``n`` from 1 up to
        ``df['total_stops'].max()``; each value holds the ``stop`` and
        ``lap_prop`` columns of the rows where ``total_stops == n`` (it may be
        empty when no row has that count).
    """
    highest_stop_count = df['total_stops'].max()
    wanted_cols = ['stop', 'lap_prop']
    return {
        n_stops: df.loc[df['total_stops'] == n_stops, wanted_cols]
        for n_stops in range(1, highest_stop_count + 1)
    }


def distribution_plot(_df_dict: dict, show_mean=True, show_description=True, save_fig=False):
    """
    Plot histograms of ``lap_prop`` per stop number, one figure per total-stop count.

    Figures are produced for total-stop counts 1, 2 and 3. Inside each figure one
    histogram (50 evenly spaced bin edges on [0, 1]) is drawn for every stop
    number ``1..n``, using the ``lap_prop`` values of the rows with that ``stop``.

    :param _df_dict: mapping ``total stops -> DataFrame`` with ``stop`` and
        ``lap_prop`` columns (the shape returned by ``group_by_stop_num``).
        Stop counts that are missing from the mapping are skipped.
    :param show_mean: draw a vertical line at the mean of every histogram.
    :param show_description: print a separator, the mean / standard deviation of
        every stop, and the share of values within 1 and 2 standard deviations
        of the mean.
    :param save_fig: save each figure as ``image/hypo2/distribution_<n>.png``;
        the directory is created when it does not exist.
    :return: None
    """
    bins = np.linspace(0, 1, 50)
    color_bin = ['tab:blue', 'tab:orange', 'tab:red']
    color_bin2 = ['cyan', 'yellow', 'lavender']
    # Only three colour pairs are defined, so at most three stops are plotted.
    max_num_of_stops = 3

    for ps_num in range(1, max_num_of_stops + 1):
        # The mapping only has keys up to the largest stop count seen in the data.
        if ps_num not in _df_dict:
            continue
        df_tmp = _df_dict[ps_num]
        df_list = [df_tmp.loc[df_tmp['stop'] == i, 'lap_prop'] for i in range(1, ps_num + 1)]

        fig, ax = plt.subplots(figsize=(8, 6))
        ax.set(title=f'Drivers with {ps_num} pit stop(s) in total',
               xlabel='Pit-stop lap / total laps',
               ylabel='Count')

        if show_description:
            print('////////////////////////////////////////////////////////////////////////////////////////')
        # NOTE(review): this header is printed even when show_description is False
        # (kept as in the original) -- confirm whether that is intended.
        print('Total Pit Stops: ', ps_num)

        for plot_count, lap_props in enumerate(df_list, start=1):
            ax.hist(lap_props, bins, alpha=1, color=color_bin[plot_count - 1])
            n_obs = len(lap_props)
            mean_val = lap_props.mean()
            std_val = lap_props.std()
            if show_mean and n_obs:
                ax.axvline(x=mean_val, color=color_bin2[plot_count - 1])
            if not show_description:
                continue

            # Rounding is for display only; the coverage below uses the exact values.
            df_mean = round(mean_val, ndigits=3)
            df_std = round(std_val, ndigits=3)
            print('No. ', plot_count, ' pit stop: ', 'mean = ', df_mean, ' std = ', df_std)
            if n_obs == 0:
                # Nothing to summarise; avoids dividing by a zero sample size.
                print('    no observations')
                continue

            in_1_std = lap_props.between(mean_val - std_val, mean_val + std_val)
            in_2_std = lap_props.between(mean_val - 2 * std_val, mean_val + 2 * std_val)
            perc_1 = round(100 * int(in_1_std.sum()) / n_obs, ndigits=1)
            perc_2 = round(100 * int(in_2_std.sum()) / n_obs, ndigits=1)
            print(f'    {perc_1}% within mean ± 1 std')
            print(f'    {perc_2}% within mean ± 2 std')

        if save_fig:
            target = f'image/hypo2/distribution_{ps_num}.png'
            os.makedirs(os.path.dirname(target), exist_ok=True)
            fig.savefig(target, transparent=False)
        plt.show()
        # Release the figure explicitly so repeated calls do not accumulate open figures.
        plt.close(fig)

In [133]:
# Load the pit-stop, results and status CSV files from the dataset/ directory
pit, results, status = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/pit_stops.csv', 'dataset/results.csv', 'dataset/status.csv')
)

In [134]:
# Build the analysis table: join the three raw tables, then add the lap / stop
# totals and the relative pit-stop lap. Finally bucket the rows by each
# driver's total number of stops.
merge_df = process_laps_stops(merge_tables(pit, results, status))
df_dict = group_by_stop_num(merge_df)

In [137]:
# Plot distribution of Pit Stops, grouped by total number of pit stops
# distribution_plot(df_dict, show_mean=False, show_description=False)

In [None]:
# Draw the distributions with mean lines and the printed summary statistics,
# and save every figure to disk
distribution_plot(df_dict, show_mean=True, show_description=True, save_fig=True)

In [138]:
_df_dict = df_dict

# Flatten the lap_prop samples into one list, ordered by (total stops, stop number):
#   index 0 -> 1 total stop,  stop 1
#   index 1 -> 2 total stops, stop 1
#   index 2 -> 2 total stops, stop 2
#   index 3 -> 3 total stops, stop 1
#   index 4 -> 3 total stops, stop 2
#   index 5 -> 3 total stops, stop 3
df_list = []
for ps_num in range(1, 4):
    df_tmp = _df_dict[ps_num]
    df_list.extend(
        df_tmp.loc[df_tmp['stop'] == stop_no, 'lap_prop']
        for stop_no in range(1, ps_num + 1)
    )

In [149]:
# Drivers with 2 total stops: lap_prop of the 1st stop (df_list[1]) vs. the 2nd stop (df_list[2]).
# NOTE(review): both samples presumably come from the same driver-races, so the
# independence assumed by ttest_ind may not hold -- confirm.
two_stop_first, two_stop_second = df_list[1], df_list[2]
ttest_ind(two_stop_first, two_stop_second)

0.0

In [145]:
# Drivers with 3 total stops: lap_prop of the 1st stop (df_list[3]) vs. the 2nd stop (df_list[4]).
three_stop_first, three_stop_second = df_list[3], df_list[4]
ttest_ind(three_stop_first, three_stop_second)

Ttest_indResult(statistic=-43.56438449947029, pvalue=1.0935703077967233e-265)

In [146]:
# Drivers with 3 total stops: lap_prop of the 1st stop (df_list[3]) vs. the 3rd stop (df_list[5]).
three_stop_first, three_stop_third = df_list[3], df_list[5]
ttest_ind(three_stop_first, three_stop_third)

Ttest_indResult(statistic=-98.13031620949947, pvalue=0.0)

In [147]:
# Drivers with 3 total stops: lap_prop of the 2nd stop (df_list[4]) vs. the 3rd stop (df_list[5]).
three_stop_second, three_stop_third = df_list[4], df_list[5]
ttest_ind(three_stop_second, three_stop_third)

Ttest_indResult(statistic=-44.32175638822851, pvalue=6.000634849695472e-272)