From 30c616af04d56391a5810b93df825d58017c39de Mon Sep 17 00:00:00 2001
From: Joshua-Dias-Barreto
Date: Mon, 15 May 2023 11:34:32 +0530
Subject: [PATCH 1/2] Added describe method for numeric dataframes

---
Review notes (commentary only; this section is not part of the diff):
* DataFrame>>describe creates a frame with `self class new: nCol @ 9`,
  names its rows after the receiver's columnNames and its nine columns
  count, mean, std, min, 25%, 50%, 75%, max, dtype.
* For each column the statistics are taken from `col removeNils`; when
  countNonNils is 0 the statistic temporaries are never assigned, and
  calculateDataType is sent to the unfiltered column.
* The temporary nRow is assigned but never read.
* testDescribe writes SmallInteger inside a literal array and then
  overwrites cells (1,9) and (2,9) with the class SmallInteger through
  at:at:put:.

 src/DataFrame-Tests/DataFrameTest.class.st | 20 ++++++++++
 src/DataFrame/DataFrame.class.st           | 45 ++++++++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/src/DataFrame-Tests/DataFrameTest.class.st b/src/DataFrame-Tests/DataFrameTest.class.st
index 5751ce49..dafa8ee6 100644
--- a/src/DataFrame-Tests/DataFrameTest.class.st
+++ b/src/DataFrame-Tests/DataFrameTest.class.st
@@ -1553,6 +1553,26 @@ DataFrameTest >> testDataTypesWithNil [
 	self assert: df dataTypes equals: expected
 ]
 
+{ #category : #tests }
+DataFrameTest >> testDescribe [
+
+	| dataFrame expected |
+	dataFrame := DataFrame
+		withRows: #( #( 1 1 ) #( 2 nil ) #( 3 1 ) )
+		columnNames: #( 'A' 'B' ).
+
+	expected := DataFrame withRows:
+		#( #( 3 2 1 1 1 2 3 3 SmallInteger )
+		#( 2 1 0 1 1 1 1 1 SmallInteger ) ).
+
+	expected rowNames: #( 'A' 'B' ).
+	expected columnNames:
+		#( 'count' 'mean' 'std' 'min' '25%' '50%' '75%' 'max' 'dtype' ).
+	expected at: 1 at: 9 put: SmallInteger.
+	expected at: 2 at: 9 put: SmallInteger.
+	self assert: dataFrame describe equals: expected
+]
+
 { #category : #tests }
 DataFrameTest >> testDetect [
 	| actual expected |
diff --git a/src/DataFrame/DataFrame.class.st b/src/DataFrame/DataFrame.class.st
index 73dfb830..d7f6e126 100644
--- a/src/DataFrame/DataFrame.class.st
+++ b/src/DataFrame/DataFrame.class.st
@@ -942,6 +942,51 @@ DataFrame >> defaultHeadTailSize [
 	^ 5
 ]
 
+{ #category : #statistics }
+DataFrame >> describe [
+	"method to statistically describe a numerical dataframe"
+
+	| nCol nRow describeDF col count dtype |
+	nCol := self numberOfColumns.
+	nRow := self numberOfRows.
+	describeDF := self class new: nCol @ 9.
+	describeDF columnNames:
+		#( 'count' 'mean' 'std' 'min' '25%' '50%' '75%' 'max' 'dtype' ).
+	describeDF rowNames: self columnNames.
+	1 to: nCol do: [ :i |
+		| mean std mini fQ sQ tQ maxi |
+		col := self columnAt: i.
+		count := col countNonNils.
+		count = 0 ifFalse: [
+			col := col removeNils.
+			mean := col average.
+			std := col stdev.
+			mini := col min.
+			fQ := col firstQuartile.
+			sQ := col secondQuartile.
+			tQ := col thirdQuartile.
+			maxi := col max ].
+		dtype := col calculateDataType.
+		describeDF at: i at: 1 put: count.
+
+		describeDF at: i at: 2 put: mean.
+
+		describeDF at: i at: 3 put: std.
+
+		describeDF at: i at: 4 put: mini.
+
+		describeDF at: i at: 5 put: fQ.
+
+		describeDF at: i at: 6 put: sQ.
+
+		describeDF at: i at: 7 put: tQ.
+
+		describeDF at: i at: 8 put: maxi.
+
+		describeDF at: i at: 9 put: dtype ].
+	^ describeDF
+]
+
 { #category : #accessing }
 DataFrame >> dimensions [
 	"Returns the number of rows and number of columns in a DataFrame"

From e13f4d5b6f25e4f59905359aa1787c0929635d9c Mon Sep 17 00:00:00 2001
From: Joshua-Dias-Barreto
Date: Thu, 25 May 2023 02:15:36 +0530
Subject: [PATCH 2/2] Implemented a cleaner and more efficient describe method.

---
Review notes (commentary only; this section is not part of the diff):
* describe now collects one row per column answered by
  `self numericalColumns` and labels the rows with
  `self numericalColumnNames`, instead of walking every column by index.
* The removeNils / `count = 0` guard from PATCH 1/2 is removed; the
  statistics selectors are sent directly to each column.
  NOTE(review): confirm average, stdev, min, max and the quartile
  selectors cope with nil values -- testDescribe has a nil in column 'B'.
* columnNames: receives a literal array that mixes bare words with quoted
  strings. NOTE(review): bare words in a literal array are Symbols while
  testDescribe builds the expected frame with Strings -- confirm the
  equality check holds on the targeted Pharo versions.

 src/DataFrame/DataFrame.class.st | 60 ++++++++++----------------------
 1 file changed, 19 insertions(+), 41 deletions(-)

diff --git a/src/DataFrame/DataFrame.class.st b/src/DataFrame/DataFrame.class.st
index d26aab62..dbc5aa20 100644
--- a/src/DataFrame/DataFrame.class.st
+++ b/src/DataFrame/DataFrame.class.st
@@ -943,47 +943,25 @@ DataFrame >> defaultHeadTailSize [
 
 { #category : #statistics }
 DataFrame >> describe [
-	"method to statistically describe a numerical dataframe"
-
-	| nCol nRow describeDF col count dtype |
-	nCol := self numberOfColumns.
-	nRow := self numberOfRows.
-	describeDF := self class new: nCol @ 9.
-	describeDF columnNames:
-		#( 'count' 'mean' 'std' 'min' '25%' '50%' '75%' 'max' 'dtype' ).
-	describeDF rowNames: self columnNames.
-	1 to: nCol do: [ :i |
-		| mean std mini fQ sQ tQ maxi |
-		col := self columnAt: i.
-		count := col countNonNils.
-		count = 0 ifFalse: [
-			col := col removeNils.
-			mean := col average.
-			std := col stdev.
-			mini := col min.
-			fQ := col firstQuartile.
-			sQ := col secondQuartile.
-			tQ := col thirdQuartile.
-			maxi := col max ].
-		dtype := col calculateDataType.
-		describeDF at: i at: 1 put: count.
-
-		describeDF at: i at: 2 put: mean.
-
-		describeDF at: i at: 3 put: std.
-
-		describeDF at: i at: 4 put: mini.
-
-		describeDF at: i at: 5 put: fQ.
-
-		describeDF at: i at: 6 put: sQ.
-
-		describeDF at: i at: 7 put: tQ.
-
-		describeDF at: i at: 8 put: maxi.
-
-		describeDF at: i at: 9 put: dtype ].
-	^ describeDF
+	"Answer another data frame with statistics describing the numerical columns of this data frame: one row per column answered by numericalColumns, with the columns count, mean, std, min, 25%, 50%, 75%, max and dtype"
+
+	| content |
+	content := self numericalColumns collect: [ :column |
+		{
+			column countNonNils.
+			column average.
+			column stdev.
+			column min.
+			column firstQuartile.
+			column secondQuartile.
+			column thirdQuartile.
+			column max.
+			column calculateDataType } ].
+
+	^ self class
+		withRows: content
+		rowNames: self numericalColumnNames
+		columnNames: #( count mean std min '25%' '50%' '75%' max dtype )
 ]
 
 { #category : #accessing }