In [1]:
import Graphics.EasyPlot
import Database.HDBC
import Database.HDBC.Sqlite3
import Numeric.LinearAlgebra.Data
import Numeric.LinearAlgebra.HMatrix

In [2]:
-- Open the SQLite database file that the queries in the following cells read.
db <- connectSqlite3 "../data/regression.sqlite3"

In [3]:
-- Fetch the year column. The two columns are fetched by separate queries and
-- later paired by position, so both queries sort on the same key; without
-- ORDER BY the row order of a SELECT is not guaranteed.
yearRaw <- quickQuery db "SELECT year FROM uspop ORDER BY year" []

In [4]:
-- Fetch the population column in the same year order as yearRaw.
populationRaw <- quickQuery db "SELECT population FROM uspop ORDER BY year" []

In [5]:
-- Take the first (only selected) SQL value of every row and convert it to a Double.
year = [fromSql (head row) | row <- yearRaw] :: [Double]

In [6]:
-- Take the first (only selected) SQL value of every row and convert it to a Double.
population = [fromSql (head row) | row <- populationRaw] :: [Double]

In [7]:
-- Plot population against year in an X11 window.
plot X11
  [ Data2D [Title "US Population"] [] (zip year population)
  ]

True

In [8]:
-- Arithmetic mean: the sum of the values divided by how many there are.
average xs = total / count
  where
    total = sum xs
    count = fromIntegral (length xs)

In [9]:
-- Population covariance of two paired samples: the mean of the products of
-- each pair's deviations from its own sample mean. zipWith stops at the
-- shorter list, while each mean is taken over its whole list.
covariance xs ys = average $ zipWith (\xi yi -> (xi - meanX) * (yi - meanY)) xs ys
  where
    -- Bound once here rather than inside the lambda, where each mean could be
    -- re-evaluated for every element (nothing guarantees it is floated out).
    meanX = average xs
    meanY = average ys

In [10]:
-- Population variance, expressed as the covariance of a sample with itself.
variance sample = covariance sample sample

In [11]:
-- Least-squares straight-line fit of ys against xs.
-- Returns (gradient, intercept) of the line y = gradient * x + intercept.
linearRegression :: [Double] -> [Double] -> (Double, Double)
linearRegression xs ys =
  let slope  = covariance xs ys / variance xs
      offset = average ys - slope * average xs
  in (slope, offset)

In [12]:
-- Fit a straight line to population as a function of year.
(gradient, intercept) = linearRegression year population

In [13]:
gradient

1359349.0321146245

In [14]:
intercept

-2.478829416756917e9

In [15]:
-- Value of the fitted straight line at each observed year.
regressionLine = [gradient * yr + intercept | yr <- year]

In [16]:
-- Overlay the straight-line fit (drawn with lines) on the observed data.
plot X11
  [ Data2D [Title "US Population"] [] (zip year population)
  , Data2D [Style Lines, Title "US Population Est."] [] (zip year regressionLine)
  ]

True

In [17]:
-- Evaluate the fitted straight line at year 2016.
2016 * gradient + intercept

2.61618231986166e8

In [18]:
population

[3929214.0,5308483.0,7239881.0,9638453.0,1.286602e7,1.7069453e7,2.3191876e7,3.1443321e7,3.9818449e7,5.0155783e7,6.2947714e7,7.5994575e7,9.1972266e7,1.0571062e8,1.22775046e8,1.31669275e8,1.50697361e8,1.79323175e8,2.03302031e8,2.26545805e8,2.48709873e8,2.81421906e8,3.08745538e8]

In [19]:
-- Standard deviation: the square root of the variance.
stdev xs = sqrt spread
  where
    spread = variance xs

In [20]:
-- Pearson correlation coefficient: the covariance of the two samples divided
-- by the product of their standard deviations.
pearsonr xs ys = jointSpread / (spreadX * spreadY)
  where
    jointSpread = covariance xs ys
    spreadX     = stdev xs
    spreadY     = stdev ys

In [21]:
-- Square of the Pearson correlation coefficient.
pearsonrsqrd xs ys = r * r
  where
    r = pearsonr xs ys

In [22]:
pearsonr year population

0.9585941214046018

In [23]:
pearsonrsqrd year population

0.9189026895914604

In [24]:
-- Natural logarithm of every population value.
logPopulation = [log count | count <- population]

In [25]:
-- Fit a straight line to log(population) as a function of year.
(logGradient, logIntercept) = linearRegression year logPopulation

In [26]:
logGradient

1.9620975905436842e-2

In [27]:
logIntercept

-19.462473558322817

In [28]:
-- Map the log-space fit back to population scale with exp, once per observed year.
logRegressionLine = [exp (logGradient * yr + logIntercept) | yr <- year]

In [29]:
-- Overlay the exponential fit (drawn with lines) on the observed data.
plot X11
  [ Data2D [Title "US Population"] [] (zip year population)
  , Data2D [Style Lines, Title "US Population Est."] [] (zip year logRegressionLine)
  ]

True

In [30]:
-- Correlation between year and the exponential fit's values.
-- NOTE(review): this compares the fitted values with year, not with the
-- observed population; confirm this is the intended goodness-of-fit measure.
pearsonr year logRegressionLine

0.8785406437938305

In [31]:
-- Squared correlation between year and the exponential fit's values.
-- NOTE(review): this compares the fitted values with year, not with the
-- observed population; confirm this is the intended goodness-of-fit measure.
pearsonrsqrd year logRegressionLine

0.7718336627976781

In [32]:
-- Evaluate the exponential model at year 2016.
exp (logGradient * 2016 + logIntercept)

5.326706464555088e8

In [33]:
-- Raise a single base to every exponent in a list,
-- e.g. power 2 [0..2] yields [1,2,4].
power base exponents = [base ^ e | e <- exponents]

In [34]:
power 2 [0..2]

[1,2,4]

In [35]:
-- Observed populations laid out as a one-column matrix.
ym = matrix 1 population

In [36]:
ym

(23><1)
 [    3929214.0
 ,    5308483.0
 ,    7239881.0
 ,    9638453.0
 ,   1.286602e7
 ,  1.7069453e7
 ,  2.3191876e7
 ,  3.1443321e7
 ,  3.9818449e7
 ,  5.0155783e7
 ,  6.2947714e7
 ,  7.5994575e7
 ,  9.1972266e7
 ,  1.0571062e8
 , 1.22775046e8
 , 1.31669275e8
 , 1.50697361e8
 , 1.79323175e8
 , 2.03302031e8
 , 2.26545805e8
 , 2.48709873e8
 , 2.81421906e8
 , 3.08745538e8 ]

In [37]:
-- Three-column matrix for a quadratic fit: each row holds x^0, x^1 and x^2
-- for one year.
xm = matrix 3 (concat [power yr [0 .. 2] | yr <- year])

In [38]:
xm

(23><3)
 [ 1.0, 1790.0, 3204100.0
 , 1.0, 1800.0, 3240000.0
 , 1.0, 1810.0, 3276100.0
 , 1.0, 1820.0, 3312400.0
 , 1.0, 1830.0, 3348900.0
 , 1.0, 1840.0, 3385600.0
 , 1.0, 1850.0, 3422500.0
 , 1.0, 1860.0, 3459600.0
 , 1.0, 1870.0, 3496900.0
 , 1.0, 1880.0, 3534400.0
 , 1.0, 1890.0, 3572100.0
 , 1.0, 1900.0, 3610000.0
 , 1.0, 1910.0, 3648100.0
 , 1.0, 1920.0, 3686400.0
 , 1.0, 1930.0, 3724900.0
 , 1.0, 1940.0, 3763600.0
 , 1.0, 1950.0, 3802500.0
 , 1.0, 1960.0, 3841600.0
 , 1.0, 1970.0, 3880900.0
 , 1.0, 1980.0, 3920400.0
 , 1.0, 1990.0, 3960100.0
 , 1.0, 2000.0, 4000000.0
 , 1.0, 2010.0, 4040100.0 ]

In [39]:
-- Polynomial coefficients from the normal equations:
-- p = (X^T X)^-1 (X^T y), with X = xm and y = ym.
-- NOTE(review): explicitly inverting X^T X may lose precision when the columns
-- of X differ greatly in scale, as they do here (1 vs. roughly 4e6); a
-- least-squares solver from hmatrix may be worth considering -- confirm.
p = mul (inv gram) moment
  where
    xt     = tr xm
    gram   = mul xt xm
    moment = mul xt ym

In [40]:
p

(3><1)
 [    2.1987320480625e10
 , -2.4425920995040894e7
 ,     6785.597375553101 ]

In [41]:
-- Constant term of the quadratic: row 0 of the coefficient column.
c = p `atIndex` (0, 0)

In [42]:
c

2.1987320480625e10

In [43]:
-- Linear coefficient of the quadratic: row 1 of the coefficient column.
b = p `atIndex` (1, 0)

In [44]:
b

-2.4425920995040894e7

In [45]:
-- Quadratic coefficient: row 2 of the coefficient column.
a = p `atIndex` (2, 0)

In [46]:
a

6785.597375553101

In [47]:
-- Value of the fitted quadratic a*x^2 + b*x + c at each observed year.
polyRegressionLine = [a * yr * yr + b * yr + c | yr <- year]

In [52]:
-- Overlay the quadratic fit (drawn with lines) on the observed data.
plot X11
  [ Data2D [Title "US Population"] [] (zip year population)
  , Data2D [Style Lines, Title "US Population Est."] [] (zip year polyRegressionLine)
  ]

True

In [49]:
-- Evaluate the fitted quadratic at year 2016.
a*2016*2016 + b*2016 + c

3.230686017985039e8

In [50]:
-- Correlation between year and the quadratic fit's values.
-- NOTE(review): this compares the fitted values with year, not with the
-- observed population; confirm this is the intended goodness-of-fit measure.
pearsonr year polyRegressionLine

0.9590530711738275

In [51]:
-- Squared correlation between year and the quadratic fit's values.
-- NOTE(review): this compares the fitted values with year, not with the
-- observed population; confirm this is the intended goodness-of-fit measure.
pearsonrsqrd year polyRegressionLine

0.9197827933279508