In [1]:
!pip install xgboost




[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: C:\Users\rob\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb

## Loading and Combining Data

In [15]:
# Folder containing the CSV files (relative to the notebook's working directory)
folder_path = 'raw_data/'

# Collect every file in the folder whose name ends with '.csv'.
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so the
# names are sorted to keep the row order of combined_df reproducible.
file_paths = [
    os.path.join(folder_path, file)
    for file in sorted(os.listdir(folder_path))
    if file.endswith('.csv')
]

# One DataFrame per successfully read file
dataframes = []

for file_path in file_paths:
    try:
        # Read the CSV using the Windows-1252 code page
        df = pd.read_csv(file_path, encoding='cp1252')
    except UnicodeDecodeError:
        # Best effort: report the unreadable file and continue with the rest
        print(f"Error reading {file_path}. Check encoding or file contents.")
    else:
        # Drop rows in which every column is NaN
        df = df.dropna(how='all')
        dataframes.append(df)

# pd.concat() on an empty list fails with an opaque "No objects to concatenate";
# raise the same exception type with a message that points at the actual cause.
if not dataframes:
    raise ValueError(
        f"No readable CSV files found in {folder_path!r}; nothing to combine."
    )

# Stack the per-file frames vertically and renumber the index 0..n-1
combined_df = pd.concat(dataframes, axis=0, ignore_index=True)

# head(-10) returns every row except the last 10
combined_df.head(-10)


Unnamed: 0,Plats,År,Plac,Klass,#,Namn,Klubb,Märke/Anmälare,Varvtider,Varv,Tid
0,FMK Skövde,2023.0,1.0,Motion 40-49,1111.0,Magnus Edberg,Huskvarna MK,KTM,14:08.2 14:17.5 11:47.5 12:52.1,4.0,53:05.5
1,FMK Skövde,2023.0,2.0,Motion -39,518.0,Alexander Fält,Försvarsmaktens EK,Honda,14:38.8 14:31.4 12:13.6 12:21.2,4.0,53:45.2
2,FMK Skövde,2023.0,3.0,Motion -39,577.0,Måns Dalén,FMCK Skövde,KTM,14:20.4 14:53.3 12:03.4 12:34.3,4.0,53:51.5
3,FMK Skövde,2023.0,4.0,Ungdom E1,175.0,William Almén,SMK Värnamo,Husqvarna,14:26.6 14:50.0 12:07.0 12:42.5,4.0,54:06.2
4,FMK Skövde,2023.0,5.0,Motion 40-49,944.0,Niklas Strömberg,Götene MK,Honda,14:40.2 15:07.9 11:56.7 12:28.9,4.0,54:13.8
...,...,...,...,...,...,...,...,...,...,...,...
3433,Tibro MK,2023.0,47.0,Bredd,213.0,Björn Levin,Kinna MK,Husqvarna,18:39.1 18:19.7 18:30.7 18:26.4,4.0,13:56.1
3434,Tibro MK,2023.0,48.0,Bredd,332.0,Ulf Åström,Wäxjö MS,Sherco,19:03.6 19:05.7 19:10.9 18:58.2,4.0,16:18.5
3435,Tibro MK,2023.0,49.0,Dam,441.0,Hilda Sjöberg,Tibro MK,Honda,19:31.1 19:36.3 19:18.4 19:05.1,4.0,17:31.0
3436,Tibro MK,2023.0,50.0,Bredd,315.0,Jesper Thorsson,Kungsbacka MA,Beta,19:36.0 19:27.1 19:18.8 19:30.9,4.0,17:53.0


## Fix columns:
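Split the `Varvtider` column into separate columns (`Varv 1`, `Varv 2`, and so on) using whitespace as the delimiter, and store the result in a new DataFrame. The lap columns are then added to `combined_df`, and the original `Varvtider` and `Märke/Anmälare` columns are dropped.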

In [16]:
# Expand the lap-time string in 'Varvtider' into one column per lap.
# The guard keeps the cell safe to re-run: after the first execution
# 'Varvtider' has been dropped from combined_df, so running the cell again
# would otherwise raise KeyError: 'Varvtider'.
if 'Varvtider' in combined_df.columns:
    # str.split() without a separator splits on any run of whitespace;
    # expand=True returns a DataFrame with one column per piece (0, 1, 2, ...).
    split_varvtider = combined_df['Varvtider'].str.split(expand=True)

    # Map each piece to a 1-based lap column name: 'Varv 1', 'Varv 2', ...
    lap_columns = {
        f'Varv {i}': split_varvtider[col]
        for i, col in enumerate(split_varvtider.columns, start=1)
    }

    # Append the lap columns, then drop the original 'Varvtider' column
    # together with 'Märke/Anmälare'.
    combined_df = (
        combined_df
        .assign(**lap_columns)
        .drop(columns=['Varvtider', 'Märke/Anmälare'])
    )

# head(-10) returns every row except the last 10
combined_df.head(-10)

Unnamed: 0,Plats,År,Plac,Klass,#,Namn,Klubb,Varv,Tid,Varv 1,Varv 2,Varv 3,Varv 4,Varv 5,Varv 6,Varv 7,Varv 8,Varv 9,Varv 10,Varv 11
0,FMK Skövde,2023.0,1.0,Motion 40-49,1111.0,Magnus Edberg,Huskvarna MK,4.0,53:05.5,14:08.2,14:17.5,11:47.5,12:52.1,,,,,,,
1,FMK Skövde,2023.0,2.0,Motion -39,518.0,Alexander Fält,Försvarsmaktens EK,4.0,53:45.2,14:38.8,14:31.4,12:13.6,12:21.2,,,,,,,
2,FMK Skövde,2023.0,3.0,Motion -39,577.0,Måns Dalén,FMCK Skövde,4.0,53:51.5,14:20.4,14:53.3,12:03.4,12:34.3,,,,,,,
3,FMK Skövde,2023.0,4.0,Ungdom E1,175.0,William Almén,SMK Värnamo,4.0,54:06.2,14:26.6,14:50.0,12:07.0,12:42.5,,,,,,,
4,FMK Skövde,2023.0,5.0,Motion 40-49,944.0,Niklas Strömberg,Götene MK,4.0,54:13.8,14:40.2,15:07.9,11:56.7,12:28.9,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3433,Tibro MK,2023.0,47.0,Bredd,213.0,Björn Levin,Kinna MK,4.0,13:56.1,18:39.1,18:19.7,18:30.7,18:26.4,,,,,,,
3434,Tibro MK,2023.0,48.0,Bredd,332.0,Ulf Åström,Wäxjö MS,4.0,16:18.5,19:03.6,19:05.7,19:10.9,18:58.2,,,,,,,
3435,Tibro MK,2023.0,49.0,Dam,441.0,Hilda Sjöberg,Tibro MK,4.0,17:31.0,19:31.1,19:36.3,19:18.4,19:05.1,,,,,,,
3436,Tibro MK,2023.0,50.0,Bredd,315.0,Jesper Thorsson,Kungsbacka MA,4.0,17:53.0,19:36.0,19:27.1,19:18.8,19:30.9,,,,,,,
