# Test Field

In [82]:
import datetime
import re
from typing import List

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.stats import ttest_1samp
from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu
from scipy.stats import wilcoxon

from sklearn.utils import resample

import final_func as fn

In [83]:
import importlib
importlib.reload(fn)

<module 'final_func' from '/Users/andrewmo/Documents/Docs - Jupiter/Projects/Class Individuals/2022Spring_Finals/final_func.py'>

In [48]:
%load_ext Cython

In [49]:
def process_data(mg_df: pd.DataFrame, normal_status=True, totals=True, deviation=True, cython=True) -> pd.DataFrame:
    # 1. filtering normal status
    if normal_status:
        _status_select = [1, 11, 12, 13, 14, 15, 16, 17, 18, 19]
        mg_df.drop(mg_df[~mg_df['statusId'].isin(_status_select)].index, inplace=True)
    # 2&3. add total laps & total pit stops for each record
    if totals:
        _total_laps = mg_df[(mg_df['positionOrder'] == 1) & (mg_df['stop'] == 1)].reset_index(drop=True)[
            ['raceId', 'laps']]
        _total_laps.columns = [str(_total_laps.columns[0]), 'total_laps']
        _total_stops = mg_df.groupby(by=['raceId', 'driverId'], as_index=False)['stop'].max()
        _total_stops.columns = list(_total_stops.columns[:2]) + ['total_stops']
        mg_df = pd.merge(mg_df, _total_laps, on='raceId')
        mg_df = pd.merge(mg_df, _total_stops, on=['raceId', 'driverId'])
        # 4. calculate the proportion of lap when the driver pit for each pit record
        mg_df['lap_prop'] = mg_df.apply(lambda x: x['lap'] / x['total_laps'], axis=1)
        if deviation:
            # 5. calculate how far the lap proportion deviates from the ideal even distribution for each pit record
            if cython:
                mg_df['abs_deviation'] = mg_df.apply(lambda x: abs_deviation_cal(x['stop'], x['total_stops'], x['lap_prop']), axis=1)
            else:
                mg_df['abs_deviation'] = mg_df.apply(lambda x: abs(x['stop'] / (x['total_stops'] + 1) - x['lap_prop']),
                                                 axis=1)
            # 6. deviation mean, grouped by each driver in each race
            avg_deviation = pd.DataFrame(mg_df.groupby(['raceId', 'driverId'])['abs_deviation'].mean())
            avg_deviation = avg_deviation.add_suffix('_mean').reset_index()
            mg_df = pd.merge(mg_df, avg_deviation, on=['raceId', 'driverId'])
    return mg_df

In [84]:
# Load data
pit = pd.read_csv('data/pit_stops.csv')
results = pd.read_csv('data/results.csv')
status = pd.read_csv('data/status.csv')

In [64]:
# Process the data files
merge_df = fn.merge_data([pit, results, status])

In [65]:
%%cython
def abs_deviation_cal(_stop, _total, _lap_prop):
    cdef
    abs_dev = abs(_stop / (_total + 1) - _lap_prop)
    return abs_dev

In [68]:
def process_data_C(mg_df: pd.DataFrame, normal_status=True, totals=True, deviation=True) -> pd.DataFrame:
    # 1. filtering normal status
    if normal_status:
        _status_select = [1, 11, 12, 13, 14, 15, 16, 17, 18, 19]
        mg_df.drop(mg_df[~mg_df['statusId'].isin(_status_select)].index, inplace=True)
    # 2&3. add total laps & total pit stops for each record
    if totals:
        _total_laps = mg_df[(mg_df['positionOrder'] == 1) & (mg_df['stop'] == 1)].reset_index(drop=True)[
            ['raceId', 'laps']]
        _total_laps.columns = [str(_total_laps.columns[0]), 'total_laps']
        _total_stops = mg_df.groupby(by=['raceId', 'driverId'], as_index=False)['stop'].max()
        _total_stops.columns = list(_total_stops.columns[:2]) + ['total_stops']
        mg_df = pd.merge(mg_df, _total_laps, on='raceId')
        mg_df = pd.merge(mg_df, _total_stops, on=['raceId', 'driverId'])
        # 4. calculate the proportion of lap when the driver pit for each pit record
        mg_df['lap_prop'] = mg_df.apply(lambda x: x['lap'] / x['total_laps'], axis=1)
        if deviation:
            # 5. calculate how far the lap proportion deviates from the ideal even distribution for each pit record
            mg_df['abs_deviation'] = mg_df.apply(lambda x: abs_deviation_cal(x['stop'], x['total_stops'], x['lap_prop']), axis=1)
            # 6. deviation mean, grouped by each driver in each race
            avg_deviation = pd.DataFrame(mg_df.groupby(['raceId', 'driverId'])['abs_deviation'].mean())
            avg_deviation = avg_deviation.add_suffix('_mean').reset_index()
            mg_df = pd.merge(mg_df, avg_deviation, on=['raceId', 'driverId'])
    return mg_df

In [79]:
%timeit -r 50 -n 20 process_data(merge_df)

202 ms ± 9.83 ms per loop (mean ± std. dev. of 50 runs, 20 loops each)


In [80]:
%timeit -r 50 -n 20 process_data_C(merge_df)

199 ms ± 15.5 ms per loop (mean ± std. dev. of 50 runs, 20 loops each)


In [85]:
# Process the data files
merge_df = fn.merge_data([pit, results, status])
merge_df = fn.process_data(merge_df)
df_dict = fn.pit_stop_group(merge_df)

In [89]:
_df_dict =df_dict

max_num_of_stops = 3  # consider only total pit stops = 1,2,3
for ps_num in range(1, max_num_of_stops + 1):
    _df_tmp = _df_dict[ps_num]  # get dataframe of total pit stop = ps_num
    # _df_list: [<df: no.1 pit stop out of ps_num>, <df: no.2 pit stop out of ps_num>, ...]
    _df_list = [_df_tmp[_df_tmp['stop'] == i]['lap_prop'] for i in range(1, ps_num + 1)]

    print('Total Pit Stops: ', ps_num)
    plot_count = 0

    for df in _df_list:
        # mean, std calculation
        df_mean = round(df.mean(), ndigits=3)
        df_std = round(df.std(), ndigits=3)
        # show mean line (x = mean)
        # even dividing point:
        even_divide = (plot_count + 1) / (ps_num + 1)
        plot_count += 1
        # show distribution description
        if True:
            print('No. ', plot_count, ' pit stop: ', 'mean = ', df_mean, ' std = ', df_std)
            perc_1 = len(df[(df <= df_mean + df_std) & (df >= df_mean - df_std)]) / len(df)
            perc_2 = len(df[(df <= df_mean + 2 * df_std) & (df >= df_mean - 2 * df_std)]) / len(df)
            perc_1 = round(100 * perc_1, ndigits=1)
            perc_2 = round(100 * perc_2, ndigits=1)
            print(f'    {perc_1}% within mean ± 1 std')
            print(f'    {perc_2}% within mean ± 2 std')
            t_test = wilcoxon(df-even_divide)
            print(f'     One sample Test, mu={round(even_divide, ndigits=3)}')
            print(t_test)

Total Pit Stops:  1
No.  1  pit stop:  mean =  0.416  std =  0.147
    66.9% within mean ± 1 std
    95.7% within mean ± 2 std
     One sample Test, mu=0.5
WilcoxonResult(statistic=138257.0, pvalue=3.796253145662607e-68)
Total Pit Stops:  2
No.  1  pit stop:  mean =  0.252  std =  0.118
    71.7% within mean ± 1 std
    95.8% within mean ± 2 std
     One sample Test, mu=0.333
WilcoxonResult(statistic=173128.5, pvalue=3.861338109743407e-119)
No.  2  pit stop:  mean =  0.62  std =  0.134
    72.9% within mean ± 1 std
    93.1% within mean ± 2 std
     One sample Test, mu=0.667
WilcoxonResult(statistic=336365.0, pvalue=1.8738849390663688e-42)
Total Pit Stops:  3
No.  1  pit stop:  mean =  0.174  std =  0.098
    66.6% within mean ± 1 std
    97.5% within mean ± 2 std
     One sample Test, mu=0.25
WilcoxonResult(statistic=39831.5, pvalue=2.942177141115048e-72)
No.  2  pit stop:  mean =  0.427  std =  0.155
    74.4% within mean ± 1 std
    92.6% within mean ± 2 std
     One sample Test, mu