In [1]:
#!fsharp
// #i "nuget: https://api.nuget.org/v3/index.json"
// #i "nuget: https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-eng/nuget/v3/index.json"
// #i "nuget: https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-tools/nuget/v3/index.json"
// #i "nuget: https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet3.1/nuget/v3/index.json"
// #i "nuget: https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet5/nuget/v3/index.json"
// #i "nuget: https://pkgs.dev.azure.com/dnceng/public/_packaging/MachineLearning/nuget/v3/index.json"
// #i "nuget: https://www.powershellgallery.com/api/v2/"

#r "nuget: Microsoft.Data.Analysis, 0.4.0"
// #r "nuget: Microsoft.DotNet.Interactive.ExtensionLab, 1.0.0-beta.20574.9"

Installed package Microsoft.Data.Analysis version 0.4.0

In [1]:
#!fsharp
open System
open Microsoft.Data.Analysis
open Microsoft.ML
open Microsoft.DotNet.Interactive.FSharp.FSharpKernelHelpers.DisplayFunctions

In [1]:
#!fsharp
module DataViewFormatter =
    // Placeholder module: the tabular-data formatter registration for DataFrame below is
    // left commented out because a lot of different questions are still open here, so this
    // module currently evaluates to unit and registers nothing.
    // NOTE(review): the original note was cut off mid-sentence; the remaining open
    // questions are not recorded here.
    // open System.Text

    // Formatter.SetPreferredMimeTypeFor(typeof<DataFrame>, TabularDataFormatter.MimeType)

    // Formatter.Register<DataFrame>((fun (dataView: DataFrame) (writer: TextWriter) ->
    //     let tabular = dataView.ToTabularJsonString()
    //     writer.Write(tabular)), TabularDataFormatter.MimeType)
    ()

In [1]:
#!fsharp
module DateFrameFormatter = 
    
    // Locally open the F# HTML DSL.
    open Microsoft.DotNet.Interactive.FSharp.FSharpKernelHelpers.Html

    /// Maximum number of data rows rendered for a DataFrame; rows beyond this are not shown.
    let maxRows = 20

    // HTML formatter for DataFrame: one header row ("Index" followed by every column name)
    // and one body row per data row, limited to the first `maxRows` rows.
    // Returns false without writing anything when the content threshold has been lowered
    // (presumably so that another formatter handles the value -- confirm against the
    // formatter API), and true once the table has been written.
    Formatter.Register<DataFrame>((fun (context: FormatContext) (df: DataFrame) (writer: TextWriter) ->

        // Don't generate nested tables
        if context.ContentThreshold < 1.0 then false else

        // Ask other formatters to reduce information generation
        context.ReduceContent(0.2) |> ignore

        // Clamp while still in int64 so a very large row count cannot overflow the
        // narrowing conversion; the result is the exact number of rows to render.
        let shownRows = int (min (int64 maxRows) df.Rows.Count)

        table [] [
          thead [] [
            // Header cells must live inside a row for the markup to be valid HTML.
            tr [] [
              th [] [ str "Index" ]
              for c in df.Columns do
                th [] [ str c.Name ]
            ]
          ]
          tbody [] [
            // `0 .. shownRows - 1` is empty for an empty frame and never exceeds maxRows rows.
            for i in 0 .. shownRows - 1 do
              tr [] [
                td [] [ embed context i ]
                for o in df.Rows.[int64 i] do
                  td [] [ embed context o ]
              ]
          ]
        ]
        |> writer.Write

        true
    ), mimeType = "text/html")
    
    // HTML formatter for a single DataFrameRow: a one-row table with one cell per value.
    // Uses the same threshold check and true/false contract as the DataFrame formatter above.
    Formatter.Register<DataFrameRow>((fun (context: FormatContext) (row: DataFrameRow) (writer: TextWriter) ->

        // Don't generate nested tables
        if context.ContentThreshold < 1.0 then false else

        // Ask other formatters to reduce information generation
        context.ReduceContent(0.2) |> ignore

        table [] [
          tbody [] [
            tr [] [
              for o in row do
                td [] [ embed context o ] 
            ]
          ]
        ]
        |> writer.Write

        true
    ), mimeType = "text/html")

In [1]:
#!fsharp
// Sample data for the walkthrough: four employees, positionally aligned across the arrays.
let names = [| "John"; "Ahmed"; "Sarah"; "Elif" |]
let salaries = [| 20000; 30000; 40000; 15000 |]
// Built with the DateTime(year, month, day) constructor instead of DateTime.Parse:
// Parse reads the string with the current culture (including its default calendar),
// so the same literal could yield a different date depending on the machine's locale.
let birthdays = [| DateTime(1990, 4, 23); DateTime(1982, 5, 4); DateTime(1980, 1, 2); DateTime(1994, 10, 9) |]
// The trailing null is deliberate: it provides a missing value for the FillNulls step.
let departments = [| "HR"; "Development"; "Development"; null |]

In [1]:
#!fsharp
// Columns of the employee table. Element types are spelled out explicitly so the
// column type is visible at the construction site.
let idColumn = PrimitiveDataFrameColumn<int>("Id", [| 1; 2; 3; 4 |])
let nameColumn = StringDataFrameColumn("Name", names)
let birthdaysColumn = PrimitiveDataFrameColumn<DateTime>("Birthday", birthdays)
let salaryColumn = PrimitiveDataFrameColumn<int>("Salary", salaries)

// Columns of the department table; EmployeeId holds the same ids as Id, in a different order.
let employeeIdColumn = PrimitiveDataFrameColumn<int>("EmployeeId", [| 3; 1; 2; 4 |])
let departmentColumn = StringDataFrameColumn("Department", departments)

In [1]:
#!fsharp
// Employee frame: Id, Name, Birthday, Salary (in that column order).
let employeesDf =
    DataFrame(
        [| idColumn :> DataFrameColumn
           nameColumn :> DataFrameColumn
           birthdaysColumn :> DataFrameColumn
           salaryColumn :> DataFrameColumn |])

// Department frame: EmployeeId, Department.
let departmentsDf =
    DataFrame(
        [| employeeIdColumn :> DataFrameColumn
           departmentColumn :> DataFrameColumn |])

In [1]:
#!fsharp
// Render both source frames, employees first, through the registered formatters.
employeesDf |> display
departmentsDf |> display

Index,Id,Name,Birthday,Salary
0,1,John,1990-04-23 00:00:00Z,20000
1,2,Ahmed,1982-05-04 00:00:00Z,30000
2,3,Sarah,1980-01-02 00:00:00Z,40000
3,4,Elif,1994-10-09 00:00:00Z,15000


Index,EmployeeId,Department
0,3,HR
1,1,Development
2,2,Development
3,4,<null>


In [1]:
#!fsharp
// Replace the missing department with "Other". inPlace = true mutates departmentColumn
// itself, and the column returned by the call is what the cell renders.
departmentColumn.FillNulls("Other", inPlace = true)

index,value
0,HR
1,Development
2,Development
3,Other


In [1]:
#!fsharp
// Take "today" once so every row is aged against the same reference date.
let today = DateTime.Today
let currentYear = today.Year

/// Whole years elapsed from `birthday` up to `asOf`.
/// A plain year subtraction over-counts by one for anyone whose birthday has not yet
/// occurred in the `asOf` year, so that case is corrected explicitly.
let ageInYears (asOf: DateTime) (birthday: DateTime) =
    let yearDifference = asOf.Year - birthday.Year
    // Shifting `asOf` back by the year difference puts it in the birth year; if the
    // birthday falls after that date, the birthday has not been reached yet this year.
    if birthday.Date > asOf.AddYears(-yearDifference) then yearDifference - 1 else yearDifference

// Add (or replace) the "Age" column; rows without a birthday get a null age.
employeesDf.Columns.["Age"] <-
    let birthdayValues = employeesDf.Columns.GetPrimitiveColumn<DateTime>("Birthday")
    birthdayValues.Apply(fun (d: Nullable<DateTime>) ->
        if d.HasValue then Nullable<int>(ageInYears today d.Value) else Nullable<int>())

In [1]:
#!fsharp
// Show the employee frame, which now includes the Age column.
employeesDf

Index,Id,Name,Birthday,Salary,Age
0,1,John,1990-04-23 00:00:00Z,20000,30
1,2,Ahmed,1982-05-04 00:00:00Z,30000,38
2,3,Sarah,1980-01-02 00:00:00Z,40000,40
3,4,Elif,1994-10-09 00:00:00Z,15000,26


In [1]:
#!fsharp
// Smallest and largest salary in the frame. Min()/Max() are downcast to int here,
// matching the column's element type.
let minSalary, maxSalary =
    let salaryValues = employeesDf.Columns.GetPrimitiveColumn<int>("Salary")
    (salaryValues.Min() :?> int), (salaryValues.Max() :?> int)

// Min-max scaling: (salary - min) / (max - min), stored as "NormalizedSalary".
employeesDf.Columns.["NormalizedSalary"] <-
    let offsetFromMin = employeesDf.Columns.GetPrimitiveColumn<int>("Salary") - minSalary
    let salaryRange = float (maxSalary - minSalary)
    offsetFromMin / salaryRange

In [1]:
#!fsharp
// Show the employee frame, which now includes the NormalizedSalary column.
employeesDf

Index,Id,Name,Birthday,Salary,Age,NormalizedSalary
0,1,John,1990-04-23 00:00:00Z,20000,30,0.2
1,2,Ahmed,1982-05-04 00:00:00Z,30000,38,0.6
2,3,Sarah,1980-01-02 00:00:00Z,40000,40,1.0
3,4,Elif,1994-10-09 00:00:00Z,15000,26,0.0


In [1]:
#!fsharp
// Inner-join employees to departments, matching employeesDf."Id" against
// departmentsDf."EmployeeId" (both int key columns).
let df =
    employeesDf.Merge<int>(
        departmentsDf,
        "Id",
        "EmployeeId",
        joinAlgorithm = JoinAlgorithm.Inner)

// Show the joined frame.
df

Index,Id,Name,Birthday,Salary,Age,NormalizedSalary,EmployeeId,Department
0,3,Sarah,1980-01-02 00:00:00Z,40000,40,1.0,3,HR
1,1,John,1990-04-23 00:00:00Z,20000,30,0.2,1,Development
2,2,Ahmed,1982-05-04 00:00:00Z,30000,38,0.6,2,Development
3,4,Elif,1994-10-09 00:00:00Z,15000,26,0.0,4,Other


In [1]:
#!fsharp
// EmployeeId duplicates Id after the join, so drop it from the merged frame
// (this mutates the existing frame's column collection).
df.Columns.Remove("EmployeeId")
// Shadow df with a copy ordered by Id; the statement order matters because the
// removal above acts on the frame that is being sorted here.
let df = df.OrderBy("Id")

// Show the cleaned-up, ordered frame.
df

Index,Id,Name,Birthday,Salary,Age,NormalizedSalary,Department
0,1,John,1990-04-23 00:00:00Z,20000,30,0.2,Development
1,2,Ahmed,1982-05-04 00:00:00Z,30000,38,0.6,Development
2,3,Sarah,1980-01-02 00:00:00Z,40000,40,1.0,HR
3,4,Elif,1994-10-09 00:00:00Z,15000,26,0.0,Other


In [1]:
#!fsharp
// Group the joined frame by department; the grouping is kept so it can be reused.
let byDepartment = df.GroupBy("Department")

// Mean salary per department; the resulting frame is the cell output.
byDepartment.Mean([| "Salary" |])

Index,Department,Salary
0,Development,25000
1,HR,40000
2,Other,15000
