# Creating and summarizing a correlation matrix with daru and statsample

## This notebook also serves as a demonstration of `Daru.lazy_update` and the `clone: false` option of `Daru::DataFrame.new`

In [12]:
require 'statsample'

Statsample::Analysis.store("Statsample::Bivariate.correlation_matrix") do
  # Daru::Vector and Daru::DataFrame must update metadata, such as the
  # positions of missing values, every time they are created.
  #
  # The data created below has no missing values, so we set
  # Daru.lazy_update = true to skip that update and make things much faster.
  #
  # If you do have missing data while lazy_update is *true*, you _SHOULD_
  # call `#update` on the concerned Vector or DataFrame object every time an
  # assignment or deletion cycle is complete.
  Daru.lazy_update = true

  begin
    # Create a Daru::DataFrame containing 4 vectors a, b, c and d.
    #
    # The `clone` option is set to *false*. This tells Daru not to clone the
    # Daru::Vectors supplied by `rnorm`; copying them would be wasted work
    # because nothing else holds a reference to them.
    samples = 1000
    ds = Daru::DataFrame.new({
      a: rnorm(samples),
      b: rnorm(samples),
      c: rnorm(samples),
      d: rnorm(samples)
    }, clone: false)

    puts "== DataFrame ==\n"
    IRuby.display ds.head

    # Calculate the correlation matrix with the fully qualified method
    # (statsample also offers a `cor` shorthand for this call).
    cm = Statsample::Bivariate.correlation_matrix(ds)

    puts "\n== Correlation Matrix ==\n"
    IRuby.display cm
  ensure
    # Always switch lazy_update back to *false*, even when something above
    # raises, so that this analysis does not accidentally affect code
    # elsewhere.
    Daru.lazy_update = false
  end
end

# Execute the analysis stored above.
Statsample::Analysis.run_batch

"== DataFrame ==\n"

Unnamed: 0,a,b,c,d
0,-2.440821265161855,-0.2663867705446803,-0.9667049163587536,-0.6247662665284197
1,-0.9973263526986712,0.8965870460177997,-0.4924093636219239,-1.0325364294957489
2,-0.2744289410160191,1.808868753188392,-0.5910605176882341,1.3789993200304744
3,1.8688690135894492,-0.5845393024341371,2.140052050767279,-0.1187944782672861
4,1.0841575504132723,0.2981946173486942,0.5801339485682966,1.935529360750203
5,0.8365312603239075,-2.592250495057871,-0.5377005060865632,-1.3156474227148434
6,-0.6143096811863075,0.9303368054803663,-0.2762842748435172,0.695374402222615
7,0.9288749463298932,0.1734287561052509,-0.1894849397856298,0.5439278783192012
8,-1.231559058162765,-0.5070225404828463,0.7560851316374544,-0.5377282599496545
9,-0.8552039210385985,0.1061517520214962,-1.285999135739217,-0.7772438525282614


"\n== Correlation Matrix ==\n"

Matrix[[1.0, -0.07093352677886748, 0.013477672801161788, 0.051849860816432954], [-0.07093352677886748, 1.0, 0.010458169946423295, 0.0163511125525614], [0.013477672801161788, 0.010458169946423295, 1.0, 0.01824910238325418], [0.051849860816432954, 0.0163511125525614, 0.01824910238325418, 1.0]]

Analysis 2015-06-03 15:17:33 +0530
= Statsample::Bivariate.correlation_matrix

