In [327]:
#r "nuget:XPlot.Plotly"
#r "nuget:Deedle"
#r "nuget:DotNetZip"
#r "nuget:NodaTime"

**Type real password here**

In [363]:
// Password for the encrypted sample archives; replace the placeholder with the
// real value before running the loading cells.
let password : string = "type password here"

In [364]:
/// Every available sample, as (folder name, line numbers stored in that folder).
let allLinesSource =
    [ "BadLines",       [ 490; 675; 677; 681; 671 ]
      "GoodLines",      [ 783; 834; 782; 837; 785 ]
      "InnerCityLines", [ 403; 505; 216; 906; 302 ] ]
/// Builds a source list containing exactly one line from one folder,
/// in the same shape as `allLinesSource`.
let singleLineSource folder lineName =
    List.singleton (folder, List.singleton lineName)

**Choose source lines to load**

In [365]:
// Sources to load. To work with a single line, swap in the commented binding:
//let sources = singleLineSource "BadLines" 490
let sources : (string * int list) list = allLinesSource

**Choose stops to work with**

In [366]:
// Stop codes to keep when loading. An empty list disables the stop filter;
// swap in the commented binding to restrict the data to specific stops.
//let stops = [ "40870" ]
let stops = List.empty

**Specify desired daily time periods**

In [367]:
open NodaTime

/// Returns the label of the first (label, check) pair whose check accepts `t`,
/// or "Unknown" when no check matches. Pairs are tried in list order.
let getPeriod t candidates =
    candidates
    |> List.tryPick (fun (label, check) -> if check t then Some label else None)
    |> Option.defaultValue "Unknown"
    
/// True when the hour component of `t` lies in the half-open range [from', to').
let inPeriod from' to' (t:ZonedDateTime) =
    let hour = t.Hour
    hour >= from' && hour < to'
/// Daily time periods as (label, membership check). `getPeriod` tries them in
/// order, so the final "Other" entry (which accepts every hour) only receives
/// timestamps that none of the four-hour blocks claimed.
let periods =
    [ "06:00 - 10:00", inPeriod 6  10
      "10:00 - 14:00", inPeriod 10 14
      "14:00 - 18:00", inPeriod 14 18
      "18:00 - 22:00", inPeriod 18 22
      "Other",         inPeriod 0  24 ]

In [368]:
// Opened here because this cell already needs Deedle's Frame type and module;
// relying on the `open Deedle` of a later cell breaks a top-to-bottom run.
open Deedle

/// Additions to Deedle's `Frame` module.
module Frame =
    /// Groups the rows of `f` by column `c`, with the group key type `'T` given
    /// explicitly by the caller. The result is keyed by (group key, original row key).
    let groupByType<'T, 'O when 'T : equality and 'O : equality> c (f:Frame<'O, string>) : Frame<'T * 'O, string> =
        Frame.groupRowsBy c f

In [369]:
open Deedle
open Ionic.Zip
open System.IO
open System
open System.Globalization
open NodaTime.Text

// Time zone used to express scheduled instants as local wall-clock times.
// The provider indexer throws immediately for an unknown id, whereas
// GetZoneOrNull would return null and postpone the failure to the first InZone call.
let tz = DateTimeZoneProviders.Tzdb.["Australia/Melbourne"]

/// Loads `Data/Samples/<folder>/Departures-<lineName>.zip` (first archive entry, read as CSV)
/// into a Deedle frame and enriches it for analysis.
///
/// Rows without a ScheduledArrivalTime are dropped; when `stops` is non-empty only rows
/// whose IrsStopCode is listed are kept. Added columns:
///   Deviation          - actual minus scheduled arrival, in seconds
///   OperatingDay       - replaced by a NodaTime LocalDate
///   ScheduledTimeZoned - scheduled arrival in the `tz` zone
///   DayOfWeek          - day of week of OperatingDay
///   Period             - label chosen by `getPeriod` from `periods`
/// The returned frame is grouped by DayOfWeek and Period, giving row keys of
/// (day of week, period, original row key). A one-line summary is displayed.
let LoadData folder lineName stops = 
    let sw = System.Diagnostics.Stopwatch.StartNew ()

    // An empty stop list means "keep every stop".
    let findStop =
        match stops with 
        | [] -> fun _ -> true
        | _  -> fun x -> stops |> List.contains x

    // `use` releases the archive handle when loading finishes or fails;
    // it is disposed after `stream`, which is declared later.
    use zip = ZipFile.Read(sprintf @"Data/Samples/%s/Departures-%s.zip" folder lineName)
    zip.set_Password password
    use stream = zip.Entries |> Seq.head |> fun e -> e.OpenReader ()

    let f = 
        Frame.ReadCsv(stream)
        |> Frame.filterRows (fun _ s -> String.IsNullOrEmpty (s.GetAs<string> "ScheduledArrivalTime") |> not)
        |> Frame.filterRows (fun _ s -> s.GetAs<string> "IrsStopCode" |> string |> findStop)
    
    // Piping ArrivalTime into zipInner yields (scheduled, actual) pairs.
    // DateTimeOffset keeps the offset of each timestamp, so the difference is taken
    // between instants and is not skewed when the two local wall-clock times fall
    // on different sides of a daylight-saving transition.
    f?Deviation <- 
        f.GetColumn<string> "ArrivalTime"
        |> Series.zipInner (f.GetColumn<string> "ScheduledArrivalTime" ) 
        |> Series.mapValues (fun (scheduled, actual) ->
            (DateTimeOffset.Parse(actual) - DateTimeOffset.Parse(scheduled)).TotalSeconds)
    f?OperatingDay <- f.Columns.Get("OperatingDay").As<DateTime>() |> Series.mapValues (fun x -> LocalDate(x.Year, x.Month, x.Day))
    f?ScheduledTimeZoned <- 
        f.GetColumn<string> "ScheduledArrivalTime"
        |> Series.mapValues (InstantPattern.General.Parse)
        |> Series.mapValues (fun x -> x.Value.InZone tz)
    f?DayOfWeek <-
        f.GetColumn<LocalDate> "OperatingDay"
        |> Series.mapValues (fun x -> x.DayOfWeek)
    f?Period <-
        f.GetColumn<ZonedDateTime>("ScheduledTimeZoned")
        |> Series.mapValues (fun x -> getPeriod x periods)
    
    // Group by Period first, then DayOfWeek, and flatten the nested keys
    // into (DayOfWeek, Period, original key).
    let f = 
        f
        |> Frame.groupByType<string, _> "Period"
        |> Frame.groupByType<IsoDayOfWeek, _> "DayOfWeek"
        |> Frame.mapRowKeys Pair.flatten3
    
    display (sprintf "Successfully loaded data for line %s/%s: %d rows in %d ms" folder lineName f.RowCount sw.ElapsedMilliseconds)
    
    f

**This will load all data as specified by code above**

In [370]:
/// Every requested line, loaded eagerly and keyed by line name.
/// Each value is (frame, line name, folder).
let allFrames =
    [ for (folder, lineNumbers) in sources do
        for lineNumber in lineNumbers do
            let lineName = string lineNumber
            yield lineName, (LoadData folder lineName stops, lineName, folder) ]
    |> dict

display "All Loaded!"

Successfully loaded data for line BadLines/490: 16114 rows in 1314 ms

Successfully loaded data for line BadLines/675: 10756 rows in 954 ms

Successfully loaded data for line BadLines/677: 27804 rows in 2585 ms

Successfully loaded data for line BadLines/681: 50754 rows in 4217 ms

Successfully loaded data for line BadLines/671: 54641 rows in 4271 ms

Successfully loaded data for line GoodLines/783: 17251 rows in 1889 ms

Successfully loaded data for line GoodLines/834: 231720 rows in 18920 ms

Successfully loaded data for line GoodLines/782: 154610 rows in 12104 ms

Successfully loaded data for line GoodLines/837: 55637 rows in 4964 ms

Successfully loaded data for line GoodLines/785: 164520 rows in 13272 ms

Successfully loaded data for line InnerCityLines/403: 3104 rows in 710 ms

Successfully loaded data for line InnerCityLines/505: 45695 rows in 3440 ms

Successfully loaded data for line InnerCityLines/216: 507730 rows in 39037 ms

Successfully loaded data for line InnerCityLines/906: 541946 rows in 42412 ms

Successfully loaded data for line InnerCityLines/302: 304044 rows in 23936 ms

All Loaded!

In [371]:
/// Summary statistics of a deviation series, tagged with the slice of data they describe.
/// The tag fields are left empty by `CalcStats` and filled in by callers.
type StatsValues =
    { /// Smallest value.
      Min : float
      /// Largest value.
      Max : float
      /// Arithmetic mean.
      Mean : float
      /// Median.
      Median : float
      /// Standard deviation.
      StdDev : float
      /// Number of observations summarised.
      NumberOfValues : float

      /// Line the statistics belong to.
      LineName : string
      /// Data folder the line was loaded from.
      Folder : string
      /// Day of week of the slice, when resampled.
      DayOfWeek : string
      /// Daily period label of the slice, when resampled.
      Period : string }

/// Summarises a numeric series: min, max, mean, median and standard deviation,
/// each passed through Math.Round, plus the number of observations.
/// The LineName / Folder / DayOfWeek / Period tags are left empty for the caller to set.
let CalcStats (series:Series<_,_>) = 
    { StatsValues.Min    = series |> Stats.min    |> Math.Round
      StatsValues.Max    = series |> Stats.max    |> Math.Round
      StatsValues.Mean   = series |> Stats.mean   |> Math.Round      
      StatsValues.Median = series |> Stats.median |> Math.Round
      StatsValues.StdDev = series |> Stats.stdDev |> Math.Round
      // Count only keys that actually carry a value: a row without an actual
      // arrival time has no Deviation, so KeyCount would overstate the sample
      // size behind the statistics above.
      StatsValues.NumberOfValues = float series.ValueCount
      LineName   = ""
      Folder     = ""
      DayOfWeek  = ""
      Period     = ""
    }

**This will calculate overall statistical measurements on each loaded line**

In [372]:
/// Overall deviation statistics per loaded line, ordered by ascending standard deviation.
let allStats =
    [| for (frame, lineName, folder) in allFrames.Values ->
        { CalcStats (frame?Deviation) with LineName = lineName; Folder = folder } |]
    |> Seq.sortBy (fun stats -> stats.StdDev)
    |> Seq.toArray

display allStats

/// The same statistics as a frame, one row per line.
let allStatsFrame = allStats |> Frame.ofRecords

index,Min,Max,Mean,Median,StdDev,NumberOfValues,LineName,Folder,DayOfWeek,Period
0,-349,1780,71,46,145,55637,837,GoodLines,,
1,-359,3593,141,110,165,27804,677,BadLines,,
2,-170,1712,140,105,196,10756,675,BadLines,,
3,-4722,5029,116,70,200,231720,834,GoodLines,,
4,-548,2556,147,94,214,54641,671,BadLines,,
5,-4448,1887,91,45,224,50754,681,BadLines,,
6,-692,1815,137,124,228,17251,783,GoodLines,,
7,-868,2353,175,127,230,164520,785,GoodLines,,
8,-947,5146,107,65,241,304044,302,InnerCityLines,,
9,-881,2341,60,89,244,16114,490,BadLines,,


In [373]:
/// Splits the value range of an integer series into histogram buckets.
/// Returns (number of buckets, bucket width, size of the fullest bucket).
let CalcBuckets series =
    let stats = CalcStats series 
    let bucketsNumber = 100.0

    // A constant series has a zero range and an empty one a NaN range; fall back
    // to a unit-wide bucket so neither the bucket index nor the histogram bin
    // size ends up as a division by zero.
    let range = stats.Max - stats.Min
    let bucketSize = if range > 0.0 then range / bucketsNumber else 1.0

    // Index of the bucket a value falls into, counted from the (rounded) minimum.
    let getBucket _ (v:int) = (float v - stats.Min) / bucketSize |> int

    let distribution = series |> Series.groupInto getBucket (fun _ s -> s.KeyCount)
    let maxBucket = distribution.Max()
    bucketsNumber, bucketSize, maxBucket


In [374]:
open XPlot.Plotly

/// Builds a histogram of schedule deviations with marker lines for zero, mean,
/// median and mean +/- one standard deviation.
///   series  - deviation values to plot
///   forWhat - description appended to the chart title
///   w, h    - chart width and height
/// Displays the series statistics as a side effect and returns the chart.
let MakePlot series forWhat w h =
    let title = sprintf "Deviation from Schedule (sec) for %s" forWhat
    let stats = CalcStats series
    let _, bucketSize, maxBucket = CalcBuckets series

    display (stats)

    let hist = 
        Histogram(
            x = series.Values, 
            xbins   = Xbins(start = stats.Min, ``end`` = stats.Max, size = bucketSize), 
            marker  = Marker(color = "yellow", line = Line(color = "gray", width = 1)),
            opacity = 0.75, 
            name = "Distribution"
        )

    // Vertical marker lines, drawn from the axis up to the height of the fullest bucket.
    let zero   = Scatter ( name = "Zero",   x = [ 0; 0 ], y = [ 0; maxBucket ])
    let mean   = Scatter ( name = "Mean",   x = [ stats.Mean; stats.Mean ],     y = [ 0; maxBucket ])
    let median = Scatter ( name = "Median", x = [ stats.Median; stats.Median ], y = [ 0; maxBucket ])

    // U-shaped outline spanning mean - stdDev .. mean + stdDev.
    let stdDev = 
        Scatter(
            x = [ stats.Mean-stats.StdDev; stats.Mean-stats.StdDev; stats.Mean+stats.StdDev; stats.Mean+stats.StdDev ],
            y = [ maxBucket; 0; 0; maxBucket ],
            name = "StdDev"
        )

    let traces = [ hist :> Trace; mean :> Trace; median :> Trace; stdDev :> Trace; zero :> Trace ]

    let plot = 
        traces
        |> Chart.Plot
        |> Chart.WithXTitle "Deviation"
        |> Chart.WithYTitle "Number of arrivals"
        |> Chart.WithTitle title
        |> Chart.WithWidth w
        |> Chart.WithHeight h
    plot


In [375]:
/// Looks up a loaded line by name and returns (frame, "<line name>, <folder>").
/// A null or empty name selects the first loaded line.
let GetFrame lineName =
    let frame, loadedName, folder =
        match lineName with
        | null | "" -> Seq.head allFrames.Values
        | name -> allFrames.[name]

    frame, sprintf "%s, %s" loadedName folder

In [376]:
/// Plots the distribution of a frame's Deviation column (read as integers)
/// in an 800 x 600 chart titled with `title`.
let CreatePlot (f:Frame<_,_>) title =
    let deviations = f.Columns.Get("Deviation").As<int>()
    MakePlot deviations title 800 600

**The following will display a distribution chart for the first loaded line.**

In [377]:
// Distribution chart for the first loaded line (an empty name selects it).
GetFrame ""
||> CreatePlot
|> display

Min,Max,Mean,Median,StdDev,NumberOfValues,LineName,Folder,DayOfWeek,Period
-881,2341,60,89,244,16114,,,,


In [378]:
/// Applies each (row predicate, label) pair in order: rows failing the predicate
/// are filtered out and the label is appended to the title as ", <label>".
/// Returns the filtered frame together with the extended title.
let PrepareFrame predicates f title =
    let applyFilter (frame, caption) (predicate, label) =
        let kept = frame |> Frame.filterRows (fun _ row -> predicate row)
        kept, sprintf "%s, %s" caption label

    predicates |> List.fold applyFilter (f, title)
    
/// Builds a (row predicate, label) pair for `PrepareFrame`: the predicate holds
/// when column `c` of a row, read as 'T, equals `v`; the label is `v` as a string.
let Compare c (v:'T) =
    let matches (row:ObjectSeries<_>) = row.GetAs<'T> c = v
    matches, string v

**The following will display a distribution chart for the first loaded line, filtered down to the desired day-of-week and daily timeframe.**

In [379]:
// First loaded line, narrowed to Monday mornings, shown as a distribution chart.
GetFrame ""
||> PrepareFrame [ Compare "DayOfWeek" IsoDayOfWeek.Monday; Compare "Period" "06:00 - 10:00" ]
||> CreatePlot
|> display

Min,Max,Mean,Median,StdDev,NumberOfValues,LineName,Folder,DayOfWeek,Period
-539,499,41,70,189,979,,,,


**The following code resamples the source data into (Day-Of-Week x Period) buckets.**

In [380]:
/// Deviation statistics for every (line, day of week, period) bucket,
/// leaving out the catch-all "Other" period.
let resampledStats =
    [|
        for (frame, lineName, folder) in allFrames.Values do
            let timer = System.Diagnostics.Stopwatch.StartNew ()

            // Row keys are (day of week, period, row); after sorting, rows with the
            // same (day of week, period) are adjacent and collapse into one bucket.
            let bucketStats =
                frame?Deviation
                |> Series.sortByKey
                |> Series.resampleEquiv (fun (dow, period, _) -> dow, period)
                |> Series.map (fun (dow, period) bucket ->
                    { CalcStats bucket with
                        LineName = lineName
                        Folder = folder
                        DayOfWeek = string dow
                        Period = period })

            let kept = bucketStats.Values |> Seq.filter (fun s -> s.Period <> "Other")
            display (sprintf "Resampled %s/%s in %d ms" folder lineName timer.ElapsedMilliseconds )
            yield! kept
    |]

display resampledStats

/// The resampled statistics as a frame, one row per bucket.
let resampledStatsFrame = Frame.ofRecords resampledStats

Resampled BadLines/490 in 78 ms

Resampled BadLines/675 in 48 ms

Resampled BadLines/677 in 113 ms

Resampled BadLines/681 in 253 ms

Resampled BadLines/671 in 293 ms

Resampled GoodLines/783 in 85 ms

Resampled GoodLines/834 in 1169 ms

Resampled GoodLines/782 in 709 ms

Resampled GoodLines/837 in 218 ms

Resampled GoodLines/785 in 688 ms

Resampled InnerCityLines/403 in 7 ms

Resampled InnerCityLines/505 in 187 ms

Resampled InnerCityLines/216 in 2218 ms

Resampled InnerCityLines/906 in 2321 ms

Resampled InnerCityLines/302 in 1337 ms

index,Min,Max,Mean,Median,StdDev,NumberOfValues,LineName,Folder,DayOfWeek,Period
0,-539,499,41,70,189,979,490,BadLines,Monday,06:00 - 10:00
1,-523,548,108,153,218,582,490,BadLines,Monday,10:00 - 14:00
2,-715,2205,102,113,319,1253,490,BadLines,Monday,14:00 - 18:00
3,-570,1929,16,16,369,303,490,BadLines,Monday,18:00 - 22:00
4,-481,522,27,59,167,1029,490,BadLines,Tuesday,06:00 - 10:00
5,-501,536,22,85,233,485,490,BadLines,Tuesday,10:00 - 14:00
6,-881,2341,131,140,380,1135,490,BadLines,Tuesday,14:00 - 18:00
7,-565,2055,51,33,386,245,490,BadLines,Tuesday,18:00 - 22:00
8,-578,491,38,83,188,1087,490,BadLines,Wednesday,06:00 - 10:00
9,-461,518,75,128,208,531,490,BadLines,Wednesday,10:00 - 14:00


# What line has the smallest actual arrival time distribution? What is it?

**Top 10 buckets with smallest StdDev(Deviation):**

In [388]:
/// The ten resampled buckets with the smallest standard deviation, smallest first.
/// The sort is stable, so ties keep their order from `resampledStats`.
let top10Smallest =
    resampledStats
    |> Seq.sortBy (fun stats -> stats.StdDev)
    |> Seq.truncate 10
    |> Array.ofSeq

display top10Smallest

index,Min,Max,Mean,Median,StdDev,NumberOfValues,LineName,Folder,DayOfWeek,Period
0,-283,76,-81,-69,75,26,681,BadLines,Sunday,18:00 - 22:00
1,-271,245,-36,-34,82,601,681,BadLines,Saturday,18:00 - 22:00
2,-140,306,43,32,86,544,837,GoodLines,Sunday,06:00 - 10:00
3,-182,324,73,59,88,321,677,BadLines,Saturday,06:00 - 10:00
4,-139,689,99,95,88,1255,671,BadLines,Tuesday,18:00 - 22:00
5,-184,526,92,85,90,2006,671,BadLines,Thursday,10:00 - 14:00
6,-196,537,70,58,90,2177,671,BadLines,Saturday,14:00 - 18:00
7,-154,460,97,78,96,2428,671,BadLines,Saturday,10:00 - 14:00
8,-137,525,149,142,98,1239,677,BadLines,Wednesday,10:00 - 14:00
9,-161,305,42,44,99,213,675,BadLines,Monday,18:00 - 22:00


# What line has the largest actual arrival time distribution? What is it?

**Top 10 buckets with largest StdDev(Deviation):**

In [390]:
/// The ten resampled buckets with the largest standard deviation, largest first.
/// The sort is stable, so ties keep their order from `resampledStats`.
let top10Largest =
    resampledStats
    |> Seq.sortByDescending (fun stats -> stats.StdDev)
    |> Seq.truncate 10
    |> Array.ofSeq

display top10Largest

index,Min,Max,Mean,Median,StdDev,NumberOfValues,LineName,Folder,DayOfWeek,Period
0,-333,4225,410,137,986,1610,505,InnerCityLines,Friday,10:00 - 14:00
1,-342,3349,258,-29,800,435,403,InnerCityLines,Tuesday,10:00 - 14:00
2,-227,2845,425,126,716,62,403,InnerCityLines,Tuesday,06:00 - 10:00
3,-665,7198,287,152,574,18231,216,InnerCityLines,Wednesday,14:00 - 18:00
4,-1062,4851,335,180,549,18196,216,InnerCityLines,Thursday,14:00 - 18:00
5,-590,5028,357,196,539,17962,216,InnerCityLines,Friday,14:00 - 18:00
6,-638,6341,230,110,513,17979,216,InnerCityLines,Tuesday,14:00 - 18:00
7,-615,4169,154,70,498,1815,505,InnerCityLines,Friday,14:00 - 18:00
8,-297,2406,57,-35,486,117,403,InnerCityLines,Tuesday,14:00 - 18:00
9,-538,5136,204,114,455,13612,216,InnerCityLines,Saturday,14:00 - 18:00


# What is the mean distribution per day per 4 hours block? (for all lines, 06:00-10:00 / 10:00-14:00 / 14:00-18:00 / 18:00-22:00)

**Don't quite know how to answer this. Need clarification**

In [381]:
/// Builds a scatter chart of one statistics column against another, one trace per group.
///   w, h                     - chart width and height
///   (maxx, maxy, minx, miny) - bounds to reserve on the axes; drawn as two white "aux"
///                              points so that charts given the same bounds span the same area
///   subTitle                 - prefix of the chart title
///   hc, vc                   - column names for the horizontal and vertical axes (read as float)
///   gc                       - column whose value names the trace a row belongs to
///   tc                       - column supplying the text attached to each point
///   getColor                 - maps a trace name to its marker colour
///   f                        - frame with one row per point
/// Returns the chart and the extent of the plotted data as (maxx, maxy, minx, miny),
/// all 0.0 when the frame has no rows.
let MakeStatsPlot w h (maxx, maxy, minx, miny) subTitle hc vc gc tc getColor (f:Frame<int, string>) =
    let title = sprintf "Stats %s (%s)" vc hc
    let initx = [ minx; maxx ]
    let inity = [ miny; maxy ]
    let dots = 
        f.Rows
        |> Series.mapValues (fun s -> 
            let x    = s.GetAs<float> hc
            let y    = s.GetAs<float> vc
            let key  = s.GetAs<string> gc
            let text = s.GetAs<string> tc
            key, ((x, y), text))
        |> fun x -> x.Values
        |> Seq.toArray

    // Extent of the data along one coordinate. Previously the head of an ascending
    // sort was returned as the "max" and the head of the reversed sort as the "min",
    // so the returned bounds were labelled the wrong way round.
    let extreme (pick: seq<float> -> float) coord =
        if Array.isEmpty dots then 0.0
        else dots |> Seq.map (fun (_, (xy, _)) -> coord xy) |> pick

    let dataMaxX = extreme Seq.max fst
    let dataMaxY = extreme Seq.max snd
    let dataMinX = extreme Seq.min fst
    let dataMinY = extreme Seq.min snd

    let traces = 
        dots
        |> Seq.groupBy fst
        |> Seq.map (fun (key, s) -> 
            Scatter ( 
                name = key,  
                x    = (s |> Seq.map (snd >> fst >> fst) |> Seq.toArray),
                y    = (s |> Seq.map (snd >> fst >> snd) |> Seq.toArray),
                text = (s |> Seq.map (snd >> snd) |> Seq.toArray),
                mode = "markers", marker = Marker ( color = getColor key )))
        |> Seq.toList
        |> fun x -> (Scatter (x = initx, y = inity, mode = "markers", name = "aux", marker = Marker (color = "white")))::x

    let plot = 
        traces
        |> Chart.Plot
        |> Chart.WithXTitle hc
        |> Chart.WithYTitle vc
        |> Chart.WithTitle (sprintf "%s - %s" subTitle title)
        |> Chart.WithWidth w
        |> Chart.WithHeight h

    plot, (dataMaxX, dataMaxY, dataMinX, dataMinY)


In [382]:
/// Marker colour for a day-of-week name; "black" for anything unrecognised.
let colorByDow day =
    let palette =
        Map.ofList
            [ "Monday",    "red"
              "Tuesday",   "orange"
              "Wednesday", "yellow"
              "Thursday",  "green"
              "Friday",    "cyan"
              "Saturday",  "blue"
              "Sunday",    "violet" ]
    defaultArg (Map.tryFind day palette) "black"

/// Marker colour for a daily period label; "black" for anything unrecognised.
let colorByPeriod period =
    let palette =
        Map.ofList
            [ "06:00 - 10:00", "red"
              "10:00 - 14:00", "yellow"
              "14:00 - 18:00", "blue"
              "18:00 - 22:00", "violet" ]
    defaultArg (Map.tryFind period palette) "black"
    
/// Marker colour for a data folder name; "black" for anything unrecognised.
let colorByFolder folder =
    let palette =
        Map.ofList
            [ "GoodLines",      "blue"
              "BadLines",       "red"
              "InnerCityLines", "yellow" ]
    defaultArg (Map.tryFind folder palette) "black"

/// Displays an overview scatter chart of the resampled statistics, followed by one
/// chart per distinct value of the `splitBy` column. The per-value charts reuse the
/// bounds returned for the overview chart.
///   title    - subtitle of the overview chart (per-value charts use the split value)
///   hc, vc   - columns for the horizontal and vertical axes
///   colorBy  - column that groups points into traces
///   splitBy  - column whose distinct values each get their own chart
///   tc       - column supplying the text attached to each point
///   getColor - maps a trace name to its marker colour
let MakePlots title hc vc colorBy splitBy tc getColor =
    let render bounds caption frame =
        MakeStatsPlot 800 600 bounds caption hc vc colorBy tc getColor frame

    let overview, bounds = render (0.0, 0.0, 0.0, 0.0) title resampledStatsFrame
    display overview |> ignore

    let splitValues =
        resampledStatsFrame.GetColumn<string>(splitBy).Values
        |> Seq.distinct

    // Build every chart before showing the first one.
    let plots =
        [| for tf in splitValues ->
            resampledStatsFrame
            |> Frame.filterRows (fun _ row -> row.GetAs<string> splitBy = tf)
            |> render bounds tf
            |> fst |]

    for plot in plots do
        display plot |> ignore
   

**The following cells visualize resampled statistics in various ways**

In [383]:
// Mean (x) vs StdDev (y): one trace per DayOfWeek, an overview chart titled
// "All Timeframes" plus one chart per Period; point text is the LineName.
MakePlots "All Timeframes" "Mean" "StdDev" "DayOfWeek" "Period" "LineName"  colorByDow

In [384]:
// Mean (x) vs StdDev (y): one trace per Period, an overview chart titled
// "All Days Of Week" plus one chart per DayOfWeek; point text is the LineName.
MakePlots "All Days Of Week" "Mean" "StdDev" "Period" "DayOfWeek" "LineName"  colorByPeriod

In [385]:
// Mean (x) vs StdDev (y): one trace per Folder, an overview chart titled
// "All Timeframes" plus one chart per Period; point text is the LineName.
MakePlots "All Timeframes" "Mean" "StdDev" "Folder" "Period" "LineName"  colorByFolder

In [386]:
// Mean (x) vs StdDev (y): one trace per Folder, an overview chart titled
// "All Days Of Week" plus one chart per DayOfWeek; point text is the LineName.
MakePlots "All Days Of Week" "Mean" "StdDev" "Folder" "DayOfWeek" "LineName"  colorByFolder