# Pre-processing BMI Data

In this example, we will process height/weight data for MLB players using Gota, demonstrating how little code this takes. We will remove unnecessary columns, normalise columns, and convert data types in a few lines of code.

In [3]:
import (
    "fmt"
    "github.com/kniren/gota/dataframe"
    "github.com/kniren/gota/series"
    "io/ioutil"
    "bytes"
    "math/rand"
)

In [4]:
const path = "../datasets/bmi/SOCR_Data_MLB_HeightsWeights.csv"

In [5]:
b, err := ioutil.ReadFile(path)
if err != nil {
    fmt.Println("Error!", err)
}
df := dataframe.ReadCSV(bytes.NewReader(b))

In [6]:
df

[1034x6] DataFrame

    Name            Team     Position       Height(inches) Weight(pounds) ...
 0: Adam_Donachie   BAL      Catcher        74             180            ...
 1: Paul_Bako       BAL      Catcher        74             215            ...
 2: Ramon_Hernandez BAL      Catcher        72             210            ...
 3: Kevin_Millar    BAL      First_Baseman  72             210            ...
 4: Chris_Gomez     BAL      First_Baseman  73             188            ...
 5: Brian_Roberts   BAL      Second_Baseman 69             176            ...
 6: Miguel_Tejada   BAL      Shortstop      69             209            ...
 7: Melvin_Mora     BAL      Third_Baseman  71             200            ...
 8: Aubrey_Huff     BAL      Third_Baseman  76             231            ...
 9: Adam_Stern      BAL      Outfielder     71             180            ...
    ...             ...      ...            ...            ...            ...
    <string>        <string> <string>       

In [7]:
df = df.Select([]string{"Position", "Height(inches)", "Weight(pounds)", "Age"})
df = df.Rename("Height","Height(inches)")
df = df.Rename("Weight","Weight(pounds)")

In [8]:
df

[1034x4] DataFrame

    Position       Height Weight Age      
 0: Catcher        74     180    22.990000
 1: Catcher        74     215    34.690000
 2: Catcher        72     210    30.780000
 3: First_Baseman  72     210    35.430000
 4: First_Baseman  73     188    35.710000
 5: Second_Baseman 69     176    29.390000
 6: Shortstop      69     209    30.770000
 7: Third_Baseman  71     200    35.070000
 8: Third_Baseman  76     231    30.190000
 9: Outfielder     71     180    27.050000
    ...            ...    ...    ...      
    <string>       <int>  <int>  <float>  


In [9]:
df = df.Mutate(series.New(df.Col("Height"), series.Float, "Height"))
df = df.Mutate(series.New(df.Col("Weight"), series.Float, "Weight"))

ERROR: reflect.Value.Convert: value of type reflect.Value cannot be converted to type series.Type

In [10]:
df = df.Filter(dataframe.F{"Weight", "<", 260})

ERROR: reflect.Value.Convert: value of type reflect.Value cannot be converted to type series.Comparator

In [11]:
df.Col("Height").Min()

67

In [12]:
//  rescale maps the given column values onto the range [0,1]
func rescale(df dataframe.DataFrame, col string) dataframe.DataFrame {
    s := df.Col(col)
    min := s.Min()
    max := s.Max()
    v := make([]float64, s.Len(), s.Len())
    for i := 0; i < s.Len(); i++ {
        v[i] = (s.Elem(i).Float()-min)/(max-min)
    }
    rs := series.Floats(v)
    rs.Name = col
    return df.Mutate(rs)
}

//  meanNormalise maps the given column values onto the range [-1,1] by subtracting mean and dividing by max - min
func meanNormalise(df dataframe.DataFrame, col string) dataframe.DataFrame {
    s := df.Col(col)
    min := s.Min()
    max := s.Max()
    mean := s.Mean()
    v := make([]float64, s.Len(), s.Len())
    for i := 0; i < s.Len(); i++ {
        v[i] = (s.Elem(i).Float()-mean)/(max-min)
    }
    rs := series.Floats(v)
    rs.Name = col
    return df.Mutate(rs)
}

//  meanNormalise maps the given column values onto the range [-1,1] by subtracting mean and dividing by max - min
func standardise(df dataframe.DataFrame, col string) dataframe.DataFrame {
    s := df.Col(col)
    std := s.StdDev()
    mean := s.Mean()
    v := make([]float64, s.Len(), s.Len())
    for i := 0; i < s.Len(); i++ {
        v[i] = (s.Elem(i).Float()-mean)/std
    }
    rs := series.Floats(v)
    rs.Name = col
    return df.Mutate(rs)
}

In [13]:
df = rescale(df, "Height")
df = rescale(df, "Weight")

In [14]:
df

[1034x4] DataFrame

    Position       Height   Weight   Age      
 0: Catcher        0.437500 0.214286 22.990000
 1: Catcher        0.437500 0.464286 34.690000
 2: Catcher        0.312500 0.428571 30.780000
 3: First_Baseman  0.312500 0.428571 35.430000
 4: First_Baseman  0.375000 0.271429 35.710000
 5: Second_Baseman 0.125000 0.185714 29.390000
 6: Shortstop      0.125000 0.421429 30.770000
 7: Third_Baseman  0.250000 0.357143 35.070000
 8: Third_Baseman  0.562500 0.578571 30.190000
 9: Outfielder     0.250000 0.214286 27.050000
    ...            ...      ...      ...      
    <string>       <float>  <float>  <float>  


In [15]:
perm := rand.Perm(df.Nrow())

In [16]:
df.Subset(perm[0:0.7*len(perm)])

ERROR: untyped constant {float64 7/10} overflows <int>

In [17]:
int(0.7*float64(len(perm)))

723

In [20]:
//  split splits the dataframe into training and validation subsets. valFraction (0 <= valFraction <= 1) of the samples
//  are reserved for validation and the rest are for training. 
func split(df dataframe.DataFrame, valFraction float64) (training dataframe.DataFrame, validation dataframe.DataFrame){
    perm := rand.Perm(df.Nrow())
    cutoff := int(valFraction*float64(len(perm)))
    training = df.Subset(perm[:cutoff])
    validation = df.Subset(perm[cutoff:len(perm)])
    return training, validation
}

In [21]:
split(df, 0.7)

[723x4] DataFrame

    Position       Height   Weight   Age      
 0: Third_Baseman  0.250000 0.500000 27.900000
 1: Relief_Pitcher 0.375000 0.214286 26.560000
 2: Relief_Pitcher 0.437500 0.428571 24.490000
 3: Relief_Pitcher 0.250000 0.142857 26.430000
 4: Outfielder     0.500000 0.428571 26.670000
 5: Catcher        0.375000 0.357143 34.070000
 6: Relief_Pitcher 0.375000 0.221429 29.040000
 7: First_Baseman  0.500000 0.528571 28.450000
 8: Catcher        0.375000 0.192857 30.420000
 9: Relief_Pitcher 0.375000 0.392857 25.650000
    ...            ...      ...      ...      
    <string>       <float>  <float>  <float>  
 [311x4] DataFrame

    Position         Height   Weight   Age      
 0: Relief_Pitcher   0.562500 0.428571 25.890000
 1: Outfielder       0.125000 0.214286 27.550000
 2: First_Baseman    0.500000 0.500000 26.890000
 3: Relief_Pitcher   0.625000 0.535714 29.710000
 4: Starting_Pitcher 0.562500 0.450000 31.440000
 5: Outfielder       0.250000 0.357143 24.770000
 6: Rel

In [76]:
df.Col("Position")

[Catcher Catcher Catcher First_Baseman First_Baseman Second_Baseman Shortstop Third_Baseman Third_Baseman Outfielder Outfielder Outfielder Outfielder Outfielder Outfielder Outfielder Designated_Hitter Starting_Pitcher Starting_Pitcher Starting_Pitcher Starting_Pitcher Starting_Pitcher Starting_Pitcher Starting_Pitcher Relief_Pitcher Relief_Pitcher Relief_Pitcher Relief_Pitcher Relief_Pitcher Relief_Pitcher Relief_Pitcher Relief_Pitcher Relief_Pitcher Relief_Pitcher Relief_Pitcher Catcher Catcher First_Baseman Second_Baseman Shortstop Shortstop Third_Baseman Third_Baseman Outfielder Outfielder Outfielder Outfielder Outfielder Outfielder Outfielder Outfielder Designated_Hitter Designated_Hitter Starting_Pitcher Starting_Pitcher Starting_Pitcher Starting_Pitcher Starting_Pitcher Starting_Pitcher Starting_Pitcher Relief_Pitcher Relief_Pitcher Relief_Pitcher Relief_Pitcher Relief_Pitcher Relief_Pitcher Relief_Pitcher Relief_Pitcher Catcher Catcher Catcher First_Baseman First_Baseman First_B

In [79]:
func UniqueValues(df dataframe.DataFrame, col string) []string {
    var ret []string
    m := make(map[string]bool)
    for _, val := range df.Col(col).Records() {
        m[val] = true
    }
    for key := range m {
        ret = append(ret, key)
    }
    return ret
}

In [80]:
UniqueValues(df, "Position")

[Shortstop Outfielder Starting_Pitcher Relief_Pitcher Second_Baseman First_Baseman Third_Baseman Designated_Hitter Catcher]

In [111]:
func OneHotSeries(df dataframe.DataFrame, col string, vals []string) ([]series.Series){
    m := make(map[string]int)
    s := make([]series.Series, len(vals), len(vals))
    //cache the mapping for performance reasons
    for i := range vals {
        m[vals[i]] = i
    }
    for i := range s {
        vals := make([]int, df.Col(col).Len(),df.Col(col).Len())
        for j, val := range df.Col(col).Records() {
            if i == m[val] {
                vals[j] = 1
            }
        }
        s[i] = series.Ints(vals)
    }
    for i := range vals {
        s[i].Name = vals[i]
    }
    return s
}

In [112]:
ohSeries := OneHotSeries(df, "Position", UniqueValues(df, "Position"))
dfEncoded := df.Mutate(ohSeries[0])
for i := 1; i < len(ohSeries); i++ {
    dfEncoded = dfEncoded.Mutate(ohSeries[i])
}

In [115]:
dfEncoded

[1034x13] DataFrame

    Position       Height   Weight   Age       Shortstop Catcher ...
 0: Catcher        0.437500 0.214286 22.990000 0         1       ...
 1: Catcher        0.437500 0.464286 34.690000 0         1       ...
 2: Catcher        0.312500 0.428571 30.780000 0         1       ...
 3: First_Baseman  0.312500 0.428571 35.430000 0         0       ...
 4: First_Baseman  0.375000 0.271429 35.710000 0         0       ...
 5: Second_Baseman 0.125000 0.185714 29.390000 0         0       ...
 6: Shortstop      0.125000 0.421429 30.770000 1         0       ...
 7: Third_Baseman  0.250000 0.357143 35.070000 0         0       ...
 8: Third_Baseman  0.562500 0.578571 30.190000 0         0       ...
 9: Outfielder     0.250000 0.214286 27.050000 0         0       ...
    ...            ...      ...      ...       ...       ...     ...
    <string>       <float>  <float>  <float>   <int>     <int>   ...

Not Showing: Second_Baseman <int>, Outfielder <int>, Designated_Hitter <int>,
Sta

In [118]:
dfEncoded = dfEncoded.Drop("Position")

In [119]:
dfEncoded

[1034x12] DataFrame

    Height   Weight   Age       Shortstop Catcher Second_Baseman Outfielder ...
 0: 0.437500 0.214286 22.990000 0         1       0              0          ...
 1: 0.437500 0.464286 34.690000 0         1       0              0          ...
 2: 0.312500 0.428571 30.780000 0         1       0              0          ...
 3: 0.312500 0.428571 35.430000 0         0       0              0          ...
 4: 0.375000 0.271429 35.710000 0         0       0              0          ...
 5: 0.125000 0.185714 29.390000 0         0       1              0          ...
 6: 0.125000 0.421429 30.770000 1         0       0              0          ...
 7: 0.250000 0.357143 35.070000 0         0       0              0          ...
 8: 0.562500 0.578571 30.190000 0         0       0              0          ...
 9: 0.250000 0.214286 27.050000 0         0       0              1          ...
    ...      ...      ...       ...       ...     ...            ...        ...
    <float>  <float