# Test Field

In [90]:
import datetime
import re
from typing import List

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.stats import ttest_1samp
from scipy.stats import ttest_ind
from scipy.stats import wilcoxon
from scipy.stats import mannwhitneyu
from sklearn.utils import resample

from sklearn.utils import resample

import final_func as fn

In [83]:
import importlib
importlib.reload(fn)

<module 'final_func' from '/Users/andrewmo/Documents/Docs - Jupiter/Projects/Class Individuals/2022Spring_Finals/final_func.py'>

In [94]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [49]:
def process_data(mg_df: pd.DataFrame, normal_status=True, totals=True, deviation=True, cython=True) -> pd.DataFrame:
    # 1. filtering normal status
    if normal_status:
        _status_select = [1, 11, 12, 13, 14, 15, 16, 17, 18, 19]
        mg_df.drop(mg_df[~mg_df['statusId'].isin(_status_select)].index, inplace=True)
    # 2&3. add total laps & total pit stops for each record
    if totals:
        _total_laps = mg_df[(mg_df['positionOrder'] == 1) & (mg_df['stop'] == 1)].reset_index(drop=True)[
            ['raceId', 'laps']]
        _total_laps.columns = [str(_total_laps.columns[0]), 'total_laps']
        _total_stops = mg_df.groupby(by=['raceId', 'driverId'], as_index=False)['stop'].max()
        _total_stops.columns = list(_total_stops.columns[:2]) + ['total_stops']
        mg_df = pd.merge(mg_df, _total_laps, on='raceId')
        mg_df = pd.merge(mg_df, _total_stops, on=['raceId', 'driverId'])
        # 4. calculate the proportion of lap when the driver pit for each pit record
        mg_df['lap_prop'] = mg_df.apply(lambda x: x['lap'] / x['total_laps'], axis=1)
        if deviation:
            # 5. calculate how far the lap proportion deviates from the ideal even distribution for each pit record
            if cython:
                mg_df['abs_deviation'] = mg_df.apply(lambda x: abs_deviation_cal(x['stop'], x['total_stops'], x['lap_prop']), axis=1)
            else:
                mg_df['abs_deviation'] = mg_df.apply(lambda x: abs(x['stop'] / (x['total_stops'] + 1) - x['lap_prop']),
                                                 axis=1)
            # 6. deviation mean, grouped by each driver in each race
            avg_deviation = pd.DataFrame(mg_df.groupby(['raceId', 'driverId'])['abs_deviation'].mean())
            avg_deviation = avg_deviation.add_suffix('_mean').reset_index()
            mg_df = pd.merge(mg_df, avg_deviation, on=['raceId', 'driverId'])
    return mg_df

In [84]:
# Load data
pit = pd.read_csv('data/pit_stops.csv')
results = pd.read_csv('data/results.csv')
status = pd.read_csv('data/status.csv')

In [64]:
# Process the data files
merge_df = fn.merge_data([pit, results, status])

In [95]:
%reload_ext Cython

In [98]:
%%cython --annotate
def abs_deviation_cal(_stop, _total, _lap_prop):
    abs_dev = abs(_stop / (_total + 1) - _lap_prop)
    return abs_dev

In [68]:
def process_data_C(mg_df: pd.DataFrame, normal_status=True, totals=True, deviation=True) -> pd.DataFrame:
    # 1. filtering normal status
    if normal_status:
        _status_select = [1, 11, 12, 13, 14, 15, 16, 17, 18, 19]
        mg_df.drop(mg_df[~mg_df['statusId'].isin(_status_select)].index, inplace=True)
    # 2&3. add total laps & total pit stops for each record
    if totals:
        _total_laps = mg_df[(mg_df['positionOrder'] == 1) & (mg_df['stop'] == 1)].reset_index(drop=True)[
            ['raceId', 'laps']]
        _total_laps.columns = [str(_total_laps.columns[0]), 'total_laps']
        _total_stops = mg_df.groupby(by=['raceId', 'driverId'], as_index=False)['stop'].max()
        _total_stops.columns = list(_total_stops.columns[:2]) + ['total_stops']
        mg_df = pd.merge(mg_df, _total_laps, on='raceId')
        mg_df = pd.merge(mg_df, _total_stops, on=['raceId', 'driverId'])
        # 4. calculate the proportion of lap when the driver pit for each pit record
        mg_df['lap_prop'] = mg_df.apply(lambda x: x['lap'] / x['total_laps'], axis=1)
        if deviation:
            # 5. calculate how far the lap proportion deviates from the ideal even distribution for each pit record
            mg_df['abs_deviation'] = mg_df.apply(lambda x: abs_deviation_cal(x['stop'], x['total_stops'], x['lap_prop']), axis=1)
            # 6. deviation mean, grouped by each driver in each race
            avg_deviation = pd.DataFrame(mg_df.groupby(['raceId', 'driverId'])['abs_deviation'].mean())
            avg_deviation = avg_deviation.add_suffix('_mean').reset_index()
            mg_df = pd.merge(mg_df, avg_deviation, on=['raceId', 'driverId'])
    return mg_df

In [79]:
%timeit -r 50 -n 20 process_data(merge_df)

202 ms ± 9.83 ms per loop (mean ± std. dev. of 50 runs, 20 loops each)


In [80]:
%timeit -r 50 -n 20 process_data_C(merge_df)

199 ms ± 15.5 ms per loop (mean ± std. dev. of 50 runs, 20 loops each)
