In [1]:
# imports

## allow interaction with the plots
%matplotlib widget

%load_ext autoreload
%autoreload 2

# add the main directory reference and import 'imports.py'
import sys

# Put the parent directory at the front of the module search path so that
# modules living one level up can be imported; skipped when it is already
# listed, so re-running the cell does not add duplicates.
parent_dir = ".."
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)
from imports import *

# Grab the root logger and only let WARNING-and-above records through.
# (logging.WARNING is the canonical name of the level previously spelled WARN.)
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

In [2]:
# Load dataset

# imports
import numpy as np
import polars as pl
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
import polars.selectors as cs
from lib.dfs import *


# Loading Dataset
# Empty CSV fields are read as nulls (see `null_values` below).
# NOTE(review): the original note here read "0 values all null"; only empty
# strings are mapped to null by this call — confirm what was intended.
df = pl.read_csv(
    "resources/Cell viability and extrusion dataset V1.csv",
    has_header=True,
    null_values=[""],
    # schema=dtype,
    infer_schema_length=1000,
)

# `297.22 (day unknown)` in 'Fiber_Diameter_(µm)' (column number 33) so we have to treat as string and clean it
# (`.custom.clean_numeric()` is a project expression namespace — presumably registered by `lib.dfs`; verify)
df = df.with_columns(pl.col("Fiber_Diameter_(µm)").custom.clean_numeric())

print(df.head())
# glimpse() prints its summary itself and returns None, so it must not be
# wrapped in print() — doing so emitted a stray "None" after the summary.
df.glimpse()


shape: (5, 51)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Reference ┆ DOI       ┆ Final_Alg ┆ Final_Gel ┆ … ┆ Primary/N ┆ Viability ┆ Acceptabl ┆ Acceptab │
│ ---       ┆ ---       ┆ inate_Con ┆ atin_Conc ┆   ┆ ot_Primar ┆ _at_time_ ┆ e_Viabili ┆ le_Press │
│ str       ┆ str       ┆ c_(%w/v)  ┆ _(%w/v)   ┆   ┆ y         ┆ of_observ ┆ ty_(Yes/N ┆ ure_(Yes │
│           ┆           ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ati…      ┆ o)        ┆ /No)     │
│           ┆           ┆ f64       ┆ f64       ┆   ┆ str       ┆ ---       ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆ f64       ┆ str       ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ Fedorovic ┆ 10.1089/t ┆ 2.0       ┆ 0.0       ┆ … ┆ Primary   ┆ 96.0      ┆ Y         ┆ Y        │
│ h et. al. ┆ en.a.2007 ┆           ┆           ┆   ┆           ┆           

In [3]:
# Impute the two temperature columns, then drop unusable columns and rows

# Fill missing temperatures with the most frequent value of each column;
# every other column is passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        (
            "num",
            SimpleImputer(missing_values=np.nan, strategy="most_frequent"),
            ["Syringe_Temperature_(°C)", "Substrate_Temperature_(°C)"],
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)
# Ask scikit-learn to hand back a polars DataFrame instead of a numpy array
transformer.set_output(transform="polars")

df = transformer.fit_transform(df)

# Drop "effect" variables
# Since we don't have the same dataset, we try to follow the spirit instead of the letter of https://scholarscompass.vcu.edu/cgi/viewcontent.cgi?article=7979&context=etd
# The paper only mentions dropping Fiber diameter for extrusion but since it's an "effect" variable, we decided to drop it as well
df = df.drop(
    [
        "Fiber_Diameter_(µm)",
        "Reference",
        "DOI",
        "Acceptable_Viability_(Yes/No)",
        "Acceptable_Pressure_(Yes/No)",
        "Final_PEGTA_Conc_(%w/v)",
        "Final_PEGMA_Conc_(%w/v)",
    ]
)

# Drop every column in which at least half of the rows (integer division) are null.
# `drop_cols` is a project helper — presumably it drops the columns for which
# the criterion is truthy; verify against lib.dfs.
df = drop_cols(
    df,
    drop_criterion=lambda col: sum(x is None for x in col) >= df.shape[0] // 2,
)

# Flag rows whose viability is missing or exactly 0.
# `drop_rows` is a project helper — presumably it removes the flagged rows;
# verify against lib.dfs.
df = drop_rows(
    df,
    count_null=lambda row: 1
    if row["Viability_at_time_of_observation_(%)"] in [None, 0]
    else 0,
)


# glimpse() prints its summary itself and returns None, so it must not be
# wrapped in print() — doing so emitted a stray "None" after the summary.
df.null_count().glimpse()
df.shape


Rows: 1
Columns: 40
$ Syringe_Temperature_(°C)              <u32> 0
$ Substrate_Temperature_(°C)            <u32> 0
$ Final_Alginate_Conc_(%w/v)            <u32> 0
$ Final_Gelatin_Conc_(%w/v)             <u32> 0
$ Final_GelMA_Conc_(%w/v)               <u32> 0
$ Final_Hyaluronic_Acid_Conc_(%w/v)     <u32> 0
$ Final_MeHA_Conc_(%w/v)                <u32> 0
$ Final_NorHA_Conc_(%w/v)               <u32> 0
$ Final_Fibroin/Fibrinogen_Conc_(%w/v)  <u32> 0
$ Final_P127_Conc_(%w/v)                <u32> 0
$ Final_Collagen_Conc_(%w/v)            <u32> 1
$ Final_Chitosan_Conc_(%w/v)            <u32> 0
$ Final_CS-AEMA_Conc_(%w/v)             <u32> 0
$ Final_TCP_Conc_(%w/v)                 <u32> 0
$ Final_Gellan_Conc_(%w/v)              <u32> 0
$ Final_Nano/Methycellulose_Conc_(%w/v) <u32> 0
$ Final_PEGDA_Conc_(%w/v)               <u32> 0
$ Final_Agarose_Conc_(%w/v)             <u32> 0
$ CaCl2_Conc_(mM)                       <u32> 34
$ NaCl2_Conc_(mM)                       <u32> 4
$ BaCl2_Conc_(mM)  

(617, 40)

In [4]:
# Imputing Values

# Viability (%) strictly above this value is labelled 1, everything else 0
VIABILITY_THRESHOLD = 70
# Number of neighbours the KNN imputer averages over for numeric features
KNN_NEIGHBORS = 30

# Threshold to binary val
target = ["Viability_at_time_of_observation_(%)"]
target_col = target[0]

y = df.select(
    pl.when(pl.col(target_col) > VIABILITY_THRESHOLD)
    .then(1)
    .otherwise(0)
    .alias(target_col)
)
# The target must not remain among the features
df = df.drop(target)

# Numeric KNN
# NOTE(review): the imputer is fitted on every row here, before the
# train/test split performed in the following cell, so test rows influence
# the imputed training values — confirm whether this leakage is acceptable.
numeric_cols = df.select(cs.numeric()).columns
transformer = ColumnTransformer(
    transformers=[
        (
            "num",
            KNNImputer(n_neighbors=KNN_NEIGHBORS, weights="uniform"),
            numeric_cols,
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)
# Ask scikit-learn to hand back a polars DataFrame instead of a numpy array
transformer.set_output(transform="polars")
df = transformer.fit_transform(df)

# categorical mode impute + one hot
# (`process_categoricals` is a project helper; behaviour as described by the
# comment above — verify against lib.dfs)
df = process_categoricals(df)

df.null_count().glimpse()

Rows: 1
Columns: 39
$ Syringe_Temperature_(°C)              <u32> 0
$ Substrate_Temperature_(°C)            <u32> 0
$ Final_Alginate_Conc_(%w/v)            <u32> 0
$ Final_Gelatin_Conc_(%w/v)             <u32> 0
$ Final_GelMA_Conc_(%w/v)               <u32> 0
$ Final_Hyaluronic_Acid_Conc_(%w/v)     <u32> 0
$ Final_MeHA_Conc_(%w/v)                <u32> 0
$ Final_NorHA_Conc_(%w/v)               <u32> 0
$ Final_Fibroin/Fibrinogen_Conc_(%w/v)  <u32> 0
$ Final_P127_Conc_(%w/v)                <u32> 0
$ Final_Collagen_Conc_(%w/v)            <u32> 0
$ Final_Chitosan_Conc_(%w/v)            <u32> 0
$ Final_CS-AEMA_Conc_(%w/v)             <u32> 0
$ Final_TCP_Conc_(%w/v)                 <u32> 0
$ Final_Gellan_Conc_(%w/v)              <u32> 0
$ Final_Nano/Methycellulose_Conc_(%w/v) <u32> 0
$ Final_PEGDA_Conc_(%w/v)               <u32> 0
$ Final_Agarose_Conc_(%w/v)             <u32> 0
$ CaCl2_Conc_(mM)                       <u32> 0
$ NaCl2_Conc_(mM)                       <u32> 0
$ BaCl2_Conc_(mM)   

In [5]:
from sklearn.model_selection import train_test_split


# Hold out 15% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.15, random_state=42
)

# glimpse() prints its summary and returns None, so each call is its own
# statement; combining them into a tuple made the notebook echo "(None, None)".
X_train.glimpse()
y_train.glimpse()

Rows: 524
Columns: 39
$ Syringe_Temperature_(°C)              <f64> 10.0, 22.0, 22.0, 22.0, 22.0, 22.0, 37.0, 22.0, 37.0, 22.0
$ Substrate_Temperature_(°C)            <f64> 10.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0
$ Final_Alginate_Conc_(%w/v)            <f64> 8.0, 0.0, 4.0, 3.0, 3.0, 1.0, 3.0, 3.25, 0.0, 4.0
$ Final_Gelatin_Conc_(%w/v)             <f64> 20.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 4.0, 0.0, 0.0
$ Final_GelMA_Conc_(%w/v)               <f64> 0.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 6.0
$ Final_Hyaluronic_Acid_Conc_(%w/v)     <f64> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
$ Final_MeHA_Conc_(%w/v)                <f64> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
$ Final_NorHA_Conc_(%w/v)               <f64> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
$ Final_Fibroin/Fibrinogen_Conc_(%w/v)  <f64> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0
$ Final_P127_Conc_(%w/v)                <f64> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
$ Fina

(None, None)