# Create a Linear Regression

In this example we will load California house price data to a gota dataframe and perform a linear regression to predict the median house price based on other variables such as the median income. 

In [2]:
import (
    "fmt"
    "github.com/kniren/gota/dataframe"
    "github.com/kniren/gota/series"
    "math/rand"
    "image"
    "bytes"
    "math"
    "github.com/gonum/stat"
    "github.com/gonum/integrate"
    "github.com/sajari/regression"
    "io/ioutil"
)

In [3]:
const path = "../datasets/housing/CaliforniaHousing/cal_housing.data"

In [4]:
columns := []string{"longitude", "latitude", "housingMedianAge", "totalRooms", "totalBedrooms", "population", "households", "medianIncome", "medianHouseValue"}
b, err := ioutil.ReadFile(path)
if err != nil {
    fmt.Println("Error!", err)
}
df := dataframe.ReadCSV(bytes.NewReader(b), dataframe.Names(columns...))

In [5]:
// Divide divides two series and returns a series with the given name. The series must have the same length.
func Divide(s1 series.Series, s2 series.Series, name string) series.Series {
    if s1.Len() != s2.Len() {
        panic("Series must have the same length!")
    }
    
    ret := make([]interface{}, s1.Len(), s1.Len())
    for i := 0; i < s1.Len(); i ++ {
        ret[i] = s1.Elem(i).Float()/s2.Elem(i).Float()
    }
    s := series.Floats(ret)
    s.Name = name
    return s
}

//  MultiplyConst multiplies the series by a constant and returns another series with the same name.
func MultiplyConst(s series.Series, f float64) series.Series {
    ret := make([]interface{}, s.Len(), s.Len())
    for i := 0; i < s.Len(); i ++ {
        ret[i] = s.Elem(i).Float()*f
    }
    ss := series.Floats(ret)
    ss.Name = s.Name
    return ss
}

In [6]:
df

[20639x9] DataFrame

    longitude   latitude  housingMedianAge totalRooms  totalBedrooms ...
 0: -122.220000 37.860000 21.000000        7099.000000 1106.000000   ...
 1: -122.240000 37.850000 52.000000        1467.000000 190.000000    ...
 2: -122.250000 37.850000 52.000000        1274.000000 235.000000    ...
 3: -122.250000 37.850000 52.000000        1627.000000 280.000000    ...
 4: -122.250000 37.850000 52.000000        919.000000  213.000000    ...
 5: -122.250000 37.840000 52.000000        2535.000000 489.000000    ...
 6: -122.250000 37.840000 52.000000        3104.000000 687.000000    ...
 7: -122.260000 37.840000 42.000000        2555.000000 665.000000    ...
 8: -122.250000 37.840000 52.000000        3549.000000 707.000000    ...
 9: -122.260000 37.850000 52.000000        2202.000000 434.000000    ...
    ...         ...       ...              ...         ...           ...
    <float>     <float>   <float>          <float>     <float>       ...

Not Showing: population <floa

In [7]:
df = df.Mutate(Divide(df.Col("totalRooms"), df.Col("households"), "averageRooms"))
df = df.Mutate(Divide(df.Col("totalBedrooms"), df.Col("households"), "averageBedrooms"))
df = df.Mutate(Divide(df.Col("population"), df.Col("households"), "averageOccupancy"))
df = df.Mutate(MultiplyConst(df.Col("medianHouseValue"), 0.00001))
df = df.Select([]string{"medianIncome", "housingMedianAge", "averageRooms", "averageBedrooms", "population", "averageOccupancy", "latitude", "longitude", "medianHouseValue" })

In [8]:
df

[20639x9] DataFrame

    medianIncome housingMedianAge averageRooms averageBedrooms population  ...
 0: 8.301400     21.000000        6.238137     0.971880        2401.000000 ...
 1: 7.257400     52.000000        8.288136     1.073446        496.000000  ...
 2: 5.643100     52.000000        5.817352     1.073059        558.000000  ...
 3: 3.846200     52.000000        6.281853     1.081081        565.000000  ...
 4: 4.036800     52.000000        4.761658     1.103627        413.000000  ...
 5: 3.659100     52.000000        4.931907     0.951362        1094.000000 ...
 6: 3.120000     52.000000        4.797527     1.061824        1157.000000 ...
 7: 2.080400     42.000000        4.294118     1.117647        1206.000000 ...
 8: 3.691200     52.000000        4.970588     0.990196        1551.000000 ...
 9: 3.203100     52.000000        5.477612     1.079602        910.000000  ...
    ...          ...              ...          ...             ...         ...
    <float>      <float>       

In [9]:
func Split(df dataframe.DataFrame, valFraction float64) (training dataframe.DataFrame, validation dataframe.DataFrame){
    perm := rand.Perm(df.Nrow())
    cutoff := int(valFraction*float64(len(perm)))
    training = df.Subset(perm[:cutoff])
    validation = df.Subset(perm[cutoff:])
    return training, validation
}

In [10]:
training, validation := Split(df, 0.75)

In [12]:
//  DataFrameToXYs converts a dataframe with float64 columns to a slice of independent variable columns as floats
//  and the dependent variable (yCol). This can then be used with eg. goml's linear ML algorithms.
//  yCol is optional - if it doesn't exist only the x (independent) variables will be returned.
func DataFrameToXYs(df dataframe.DataFrame, yCol string) ([][]float64, []float64){
    var (
        x [][]float64
        y []float64
        yColIx = -1
    )
    
    //find dependent variable column index
    for i, col := range df.Names() {
        if col == yCol {
            yColIx = i
            break
        }
    }
    if yColIx == -1 {
        fmt.Println("Warning - no dependent variable")
    }
    x = make([][]float64, df.Nrow(), df.Nrow())   
    y = make([]float64, df.Nrow())
    for i := 0; i < df.Nrow(); i++ {
        var xx []float64
        for j := 0; j < df.Ncol(); j ++ {
            if j == yColIx {
                y[i] = df.Elem(i, j).Float()
                continue
            }
            xx = append(xx, df.Elem(i,j).Float())
        }
        x[i] = xx        
    }
    return x, y
}

In [21]:
trainingX, trainingY := DataFrameToXYs(training, "medianHouseValue")
validationX, validationY := DataFrameToXYs(validation, "medianHouseValue")

## Linear Regression for Median House Price

In [33]:
model := new(regression.Regression)

In [34]:
for i := range trainingX {
    model.Train(regression.DataPoint(trainingY[i], trainingX[i]))
}
if err := model.Run(); err != nil {
    fmt.Println(err)
}

## Calculate the Mean Square Error

In [35]:
//On validation set
errors := make([]float64, len(validationX), len(validationX))
for i := range validationX {
    prediction, err := model.Predict(validationX[i])
    if err != nil {
        panic(fmt.Println("Prediction error", err))
    }
    errors[i] = (prediction - validationY[i])*(prediction - validationY[i])
}

fmt.Printf("MSE: %5.2f\n", stat.Mean(errors, nil))

MSE:  0.51


11 <nil>

In [36]:
// On training set
errors := make([]float64, len(trainingX), len(trainingX))
for i := range trainingX {
    prediction, err := model.Predict(trainingX[i])
    if err != nil {
        panic(fmt.Println("Prediction error", err))
    }
    errors[i] = (prediction - trainingY[i])*(prediction - trainingY[i])
}

fmt.Printf("MSE: %5.2f\n", stat.Mean(errors, nil))

MSE:  0.53


11 <nil>