In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import chardet

In [206]:
# Load data from csv file (not the header)
race_data_22 = np.genfromtxt('2022 Lourdes Mens Elite Times.csv', 
                             delimiter = ',', 
                             skip_header = 1, 
                             autostrip = True,)
# View the data
race_data_22

array([[1.00000e+00, 6.00000e+00,         nan, ...,         nan,
        0.00000e+00, 2.00000e+02],
       [2.00000e+00, 1.40000e+01,         nan, ...,         nan,
        8.47000e-01, 1.60000e+02],
       [3.00000e+00, 1.00000e+00,         nan, ...,         nan,
        1.08600e+00, 1.40000e+02],
       ...,
       [5.80000e+01, 3.40000e+01,         nan, ...,         nan,
        3.91000e+01, 3.00000e+00],
       [5.90000e+01, 2.00000e+00,         nan, ...,         nan,
        5.68730e+01, 2.00000e+00],
       [6.00000e+01, 6.00000e+01,         nan, ...,         nan,
        1.01588e-03, 1.00000e+00]])

In [207]:
# Load the top row (headers) of the data as strings and display them
header_main = np.genfromtxt('2022 Lourdes Mens Elite Times.csv', 
                            delimiter = ',', 
                            skip_footer = race_data_22.shape[0], 
                            dtype = str)
header_main

array(['Position', 'World Ranking', 'Name', 'Team', 'UCI Number',
       'Nationality', 'YOB', 'Speed', 'Speed Ranking', 'Split 1',
       'Split 1 pos', 'Split 2', 'Split 2 pos', 'Split 3', 'Split 3 pos',
       'Split 4', 'Split 4 pos', 'Time', 'TOW', 'Points'], dtype='<U13')

In [208]:
# Create a temporary_mean variable that stores the nanmean of each column
temporary_mean = np.nanmean(race_data_22, axis = 0)

  temporary_mean = np.nanmean(race_data_22, axis = 0)


In [209]:
# Stores the index values from the temporary_mean array where the value is 'nan'. 
# This represents the index of the column which stores string data in the main data set
column_strings = np.argwhere(np.isnan(temporary_mean)).squeeze()
column_strings

array([ 2,  3,  5,  9, 11, 13, 15, 17], dtype=int64)

In [210]:
# Stores the index values from the temporary_mean array where the value is not 'nan'. 
# This represents the index of the column which stores numeric data in the main data set
column_numeric = np.argwhere(np.isnan(temporary_mean) == False).squeeze()
column_numeric

array([ 0,  1,  4,  6,  7,  8, 10, 12, 14, 16, 18, 19], dtype=int64)

In [211]:
# Load only the columns containing strings from the csv file (not headers)
race_data_string_22 = np.genfromtxt('2022 Lourdes Mens Elite Times.csv',
                                     delimiter = ',',
                                     skip_header = 1,
                                     usecols = column_strings,
                                     autostrip = True,
                                     dtype = str,
                                     filling_values = 'N/A')

In [212]:
# Load only the headers of the columns containing strings data into a seperate variable
header_strings = np.genfromtxt('2022 Lourdes Mens Elite Times.csv',
                               delimiter = ',',
                               skip_footer = race_data_22.shape[0],
                               usecols = column_strings,
                               dtype = str)
header_strings

array(['Name', 'Team', 'Nationality', 'Split 1', 'Split 2', 'Split 3',
       'Split 4', 'Time'], dtype='<U11')

In [213]:
# Load only the columns containing numeric data into a seperate variable
race_data_numeric_22 = np.genfromtxt('2022 Lourdes Mens Elite Times.csv',
                                     delimiter = ',',
                                     skip_header = 1,
                                     usecols = column_numeric,
                                     autostrip = True,
                                     filling_values = temporary_fill,
                                     dtype = [int]*2 + [np.int64] + [int] + [float] + [int]*4)
race_data_numeric_22

array([( 1,   6, 10008827283, 1996, 60.335, 11,  1,  7,  1),
       ( 2,  14, 10090907774, 1999, 59.081, 21,  2,  1,  2),
       ( 3,   1, 10007544358, 1994, 58.824, 25,  4,  3,  3),
       ( 4,   8, 10008194359, 1994, 59.081, 21,  7,  5,  6),
       ( 5,  17, 10008813442, 1996, 61.224,  3,  6,  4,  4),
       ( 6,   9, 10005470073, 1991, 60.067, 14, 21, 12,  7),
       ( 7,   5, 10009563271, 1996, 60.335, 11,  9,  6,  5),
       ( 8,   7, 10009404738, 1997, 58.127, 36,  8,  9,  8),
       ( 9,  64, 10070590924, 2001, 57.878, 40, 14, 15,  9),
       (10,   1, 10011016756, 1999, 61.714,  2, 15, 11, 10),
       (11,   1, 10010038167, 1992, 61.155,  4, 23, 21, 12),
       (12,   3, 10008723112, 1996, 59.081, 21,  3,  2, 14),
       (13,  30, 10023896336, 2000, 58.065, 38, 34, 24, 18),
       (14,   1, 10008831529, 1995, 58.315, 33, 19, 17, 16),
       (15,   2, 10011213584, 1999, 59.276, 20, 11, 13, 13),
       (16,   1, 10002818640, 1981, 58.315, 33, 13, 19, 17),
       (17, 109, 1001601

In [214]:
# Load only the header of the columns containing numeric data into a seperate variable
header_numeric = np.genfromtxt('2022 Lourdes Mens Elite Times.csv',
                               delimiter = ',',
                               skip_footer = race_data_22.shape[0],
                               usecols = column_numeric,
                               dtype = str)
header_numeric

array(['Position', 'World Ranking', 'UCI Number', 'YOB', 'Speed',
       'Speed Ranking', 'Split 1 pos', 'Split 2 pos', 'Split 3 pos',
       'Split 4 pos', 'TOW', 'Points'], dtype='<U13')

In [215]:
# The split time data is currently stored as a string due to it's time format (2:40.511) This will need to be converted into milliseconds for analysis
race_data_string_22

array([['IERRON Amaury', 'COMMENCAL/MUC-OFF BY RIDING', 'FRA',
        '0:44.269', '1:17.360', '1:53.519', '2:20.293', '02:47.0'],
       ['ILES Finn', 'SPECIALIZED GRAVITY', 'CAN', '0:44.924',
        '1:16.398', '1:53.573', '2:20.354', '02:48.0'],
       ['BRUNI Loic', 'SPECIALIZED GRAVITY', 'FRA', '0:45.023',
        '1:16.676', '1:53.788', '2:20.960', '02:48.0'],
       ['COULANGES Benoit', 'DORVAL AM COMMENCAL', 'FRA', '0:45.302',
        '1:16.893', '1:54.638', '2:21.349', '02:49.0'],
       ['SHAW Luca', 'CANYON COLLECTIVE FACTORY TEAM', 'USA', '0:45.223',
        '1:16.795', '1:54.231', '2:21.860', '02:50.0'],
       ['HART Danny', 'CUBE FACTORY RACING', 'GBR', '0:46.511',
        '1:18.379', '1:54.979', '2:22.370', '02:50.0'],
       ['WILSON Reece', 'TREK FACTORY RACING DH', 'GBR', '0:45.689',
        '1:17.073', '1:54.377', '2:22.392', '02:51.0'],
       ['GREENLAND Laurie', 'SANTA CRUZ SYNDICATE', 'GBR', '0:45.453',
        '1:18.220', '1:56.086', '2:23.325', '02:51.0'],
  

In [216]:
# Load the columns containing the split time data into a new variable as a string
split_times_strings = np.genfromtxt('2022 Lourdes Mens Elite Times.csv',
                              delimiter = ',',
                              skip_header = 1,
                              usecols = [9,11,13,15],
                              autostrip = True,
                              dtype = str)


In [217]:
# Function that converts split time strings to milliseconds as an integer
def string_split_to_int(value):
    # Split the string by the seperator
    minutes_str, seconds_str = value.split(':')
    seconds_str, milliseconds_str = seconds_str.split('.')
    
    # Convert to integer
    minutes = int(minutes_str)
    seconds = int(seconds_str)
    milliseconds = int(milliseconds_str)
        
    # Calculate the total milliseconds
    total_milliseconds = (minutes * 60 + seconds) * 1000 + milliseconds

    return(total_milliseconds) 

In [218]:
# Applies string_split_to_int function to all values in split_times_strings
conversion_func = np.vectorize(string_split_to_int)
split_times_int = np.array(conversion_func(split_times_strings))
split_times_int.shape

(60, 4)

In [259]:
race_data_numeric_22

array([( 1,   6, 10008827283, 1996, 60.335, 11,  1,  7,  1),
       ( 2,  14, 10090907774, 1999, 59.081, 21,  2,  1,  2),
       ( 3,   1, 10007544358, 1994, 58.824, 25,  4,  3,  3),
       ( 4,   8, 10008194359, 1994, 59.081, 21,  7,  5,  6),
       ( 5,  17, 10008813442, 1996, 61.224,  3,  6,  4,  4),
       ( 6,   9, 10005470073, 1991, 60.067, 14, 21, 12,  7),
       ( 7,   5, 10009563271, 1996, 60.335, 11,  9,  6,  5),
       ( 8,   7, 10009404738, 1997, 58.127, 36,  8,  9,  8),
       ( 9,  64, 10070590924, 2001, 57.878, 40, 14, 15,  9),
       (10,   1, 10011016756, 1999, 61.714,  2, 15, 11, 10),
       (11,   1, 10010038167, 1992, 61.155,  4, 23, 21, 12),
       (12,   3, 10008723112, 1996, 59.081, 21,  3,  2, 14),
       (13,  30, 10023896336, 2000, 58.065, 38, 34, 24, 18),
       (14,   1, 10008831529, 1995, 58.315, 33, 19, 17, 16),
       (15,   2, 10011213584, 1999, 59.276, 20, 11, 13, 13),
       (16,   1, 10002818640, 1981, 58.315, 33, 13, 19, 17),
       (17, 109, 1001601

In [220]:
split_times_int[0,:]

array([ 44269,  77360, 113519, 140293])

In [221]:
np.reshape(split_times_int, (60, 4))

array([[ 44269,  77360, 113519, 140293],
       [ 44924,  76398, 113573, 140354],
       [ 45023,  76676, 113788, 140960],
       [ 45302,  76893, 114638, 141349],
       [ 45223,  76795, 114231, 141860],
       [ 46511,  78379, 114979, 142370],
       [ 45689,  77073, 114377, 142392],
       [ 45453,  78220, 116086, 143325],
       [ 46027,  78853, 116199, 143371],
       [ 46072,  78374, 116357, 143966],
       [ 46652,  79716, 117219, 144453],
       [ 45003,  76399, 117561, 144706],
       [ 47053,  79796, 118072, 146002],
       [ 46381,  79222, 117803, 145928],
       [ 45839,  78516, 117233, 145913],
       [ 45994,  79277, 117850, 146252],
       [ 46739,  79186, 116962, 146290],
       [ 46632,  79852, 118839, 146901],
       [ 47705,  81562, 119479, 147393],
       [ 46435,  78632, 117740, 146432],
       [ 46984,  80048, 118301, 146836],
       [ 46994,  79757, 118658, 146971],
       [ 46661,  79947, 118645, 146911],
       [ 47016,  79744, 118813, 147133],
       [ 45933, 

In [252]:
array = np.array([[1,2,3],[4,5,6],[7,8,9]])

In [253]:
array

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [257]:
array = np.append(array, ([1],[1],[1]) )

In [258]:
array

array([1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 1, 1])

In [7]:
race_times_raw_data = pd.read_csv('2022 Lourdes Mens Elite Times.csv', encoding='windows-1252')

In [5]:
with open('2022 Lourdes Mens Elite Times.csv', 'rb') as file:
    result = chardet.detect(file.read())
    
print(result['encoding'])

Windows-1252
