In [28]:
# imports

## allow interaction with the plots
%matplotlib widget

%load_ext autoreload
%autoreload 2

# add the main directory reference and import 'imports.py'
import sys

if ".." not in sys.path:
    sys.path.insert(0, "..")
from imports import *

logger = logging.getLogger()
logger.setLevel(level=logging.WARN)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Load dataset

# imports
import numpy as np
import polars as pl
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
import polars.selectors as cs
from lib.dfs import *


# Read the climbing spreadsheet and run the `custom.clean_str()` expression
# over every non-numeric column.
# Column types are inferred from the first 50 rows; an explicit
# `schema=dtype` argument could be passed to `read_excel` instead.
# NOTE(review): the `custom` namespace is presumably registered by `lib.dfs` — confirm.
non_numeric = ~cs.numeric()
df = pl.read_excel(
    "resources/climb_symlink.xlsx",
    infer_schema_length=50,
).with_columns(non_numeric.custom.clean_str())

# Quick look at the result: first rows, then a per-column summary.
print(df.head())
df.glimpse()


shape: (5, 21)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ partcipan ┆ year      ┆ climbing  ┆ climbing  ┆ … ┆ board     ┆ 20mm      ┆ power     ┆ vertical │
│ t         ┆ athlete   ┆ experienc ┆ frequency ┆   ┆ grade #   ┆ crimp     ┆ slap test ┆ jump     │
│ ---       ┆ was born  ┆ e         ┆ per week  ┆   ┆ ---       ┆ strength  ┆ (ratio of ┆ height   │
│ i64       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ f64       ┆ to        ┆ reac…     ┆ (cm)     │
│           ┆ i64       ┆ f64       ┆ f64       ┆   ┆           ┆ bodywei…  ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆ ---       ┆ f64       ┆ f64      │
│           ┆           ┆           ┆           ┆   ┆           ┆ f64       ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 1         ┆ 2007      ┆ 4.0       ┆ 5.5       ┆ … ┆ 5.0       ┆ 0.778523  

In [None]:
# Drop rows whose target is missing, then mean-impute the numeric columns.
print(df.columns)

# Column(s) to predict. Alternatives:
# target = ["route grade # (IRCRA)"]
# target = ["boulder grade #", "route grade # (IRCRA)"]
target = ["boulder grade #"]

# `drop_rows` is a project helper; it is handed a callable that counts how
# many target columns are null in a given row.
df = drop_rows(
    df,
    count_null=lambda row: sum(row[k] is None for k in target),
)

# Every numeric column is imputed with its column mean; the remaining
# columns are passed through unchanged and placed after the numeric ones.
# NOTE(review): this also covers id-like columns such as 'partcipant' and
# 'year athlete was born ', which come out as floats — confirm that is intended.
numeric_columns = df.select(cs.numeric()).columns

transformer = ColumnTransformer(
    transformers=[
        (
            "num",
            SimpleImputer(missing_values=np.nan, strategy="mean"),
            numeric_columns,
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

# Keep the result as a polars DataFrame instead of a NumPy array.
transformer.set_output(transform="polars")

df = transformer.fit_transform(df)

# `glimpse()` prints the summary itself and returns None, so it must not be
# wrapped in `print()` (that would emit a stray "None" line).
df.null_count().glimpse()
df.shape


['partcipant', 'year athlete was born ', 'climbing experience', 'climbing frequency per week', 'sex', '20mm crimp strength absolute (KG)', 'power-slap test (distance in cm)', 'pull-ups (reps)', 'horizontl reach (cm)', 'vertical reach (cm)', 'vertical jump reach height (cm)', 'intermittent crimp hang test (seconds)', 'bodyweight (kg)', 'Gym', 'category', 'boulder grade #', 'route grade # (IRCRA)', 'board grade #', '20mm crimp strength to bodyweight ratio', 'power slap test (ratio of reach height to test distance)', 'vertical jump height (cm)']
Rows: 1
Columns: 21
$ partcipant                                         <u32> 0
$ year athlete was born                              <u32> 0
$ climbing experience                                <u32> 0
$ climbing frequency per week                        <u32> 0
$ 20mm crimp strength absolute (KG)                  <u32> 0
$ power-slap test (distance in cm)                   <u32> 0
$ pull-ups (reps)                                    <u32> 0
$ ho

(96, 21)

In [3]:
# Process categorical columns.
# (Missing numeric values are imputed earlier, in the cell that builds the
# ColumnTransformer; nothing in this cell performs imputation.)
df.glimpse()  # snapshot before processing, for comparison
# `process_categoricals` is a project helper not defined in this notebook;
# presumably it encodes or casts the non-numeric columns — TODO confirm.
df = process_categoricals(df)

df.glimpse()  # snapshot after processing

Rows: 96
Columns: 21
$ partcipant                                         <f64> 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0
$ year athlete was born                              <f64> 2007.0, 2006.0, 2006.0, 2005.0, 2010.0, 2008.0, 2008.0, 2007.0, 2006.0, 2006.0
$ climbing experience                                <f64> 4.0, 2.0, 5.0, 8.0, 2.0, 6.5, 8.0, 10.0, 10.0, 8.0
$ climbing frequency per week                        <f64> 5.5, 4.0, 3.5, 4.0, 3.5, 4.0, 3.5, 5.0, 4.0, 4.0
$ 20mm crimp strength absolute (KG)                  <f64> 46.4, 49.1, 31.071051999999998, 43.4994728, 39.2, 54.7, 39.3, 38.4, 44.3, 46.2
$ power-slap test (distance in cm)                   <f64> 76.2, 79.66537234042553, 50.8, 68.58, 78.74, 86.36, 80.01, 91.44, 81.28, 87.63
$ pull-ups (reps)                                    <f64> 12.961538461538462, 12.961538461538462, 2.0, 5.0, 21.0, 17.0, 21.0, 14.0, 13.0, 15.0
$ horizontl reach (cm)                               <f64> 172.0, 166.25304347826085, 161.0, 1