In [1]:
#!fsharp
#r "nuget: canopy"
#r "nuget: Selenium.WebDriver.ChromeDriver, 87.0.4280.8800"
#r "nuget: SixLabors.ImageSharp, 1.0.2"

Installed package Selenium.WebDriver.ChromeDriver version 87.0.4280.8800

In [1]:
#!fsharp
/// Builds the Google image-search URL for `query`.
/// The query text is inserted into the `q` parameter exactly as given; no URL
/// escaping is applied, so callers must pass an already URL-safe value.
let getSearchUrl (query: string) =
    let prefix = "https://www.google.com/search?q="
    let fixedParameters = "&sclient=img&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiJwLa-7s_tAhUH9IUKHfwYCaYQ_AUoAXoECBIQAw&biw=1536&bih=719&dpr=1.25"
    prefix + query + fixedParameters

In [1]:
#!fsharp
open System
open canopy.configuration
open canopy.classic
open OpenQA.Selenium
open SixLabors.ImageSharp
open SixLabors.ImageSharp.Processing

In [1]:
#!fsharp
// Point canopy at the ChromeDriver folder inside the local NuGet package cache
// (machine-specific path, version matches the package referenced above).
// This has to be assigned before the browser is started on the next line.
canopy.configuration.chromeDir <- @"C:\Users\grego\.nuget\packages\selenium.webdriver.chromedriver\87.0.4280.8800\driver\win32"
start chrome

In [1]:
#!fsharp
// Smoke test: open the image search results for a sample query.
"nikolaus" |> getSearchUrl |> url

In [1]:
#!fsharp
// First <img> found under the `div#islmp` container.
let elem = List.head (elements "div#islmp img")

elem |> click

In [1]:
#!fsharp
// <img> inside the `div#islsp` container; the cell evaluates to its `src` attribute.
let img_elem = "div#islsp img" |> element

img_elem.GetAttribute "src"

In [1]:
#!fsharp
/// Runs an image search for `query` and collects the image URLs of up to `n` results.
///
/// n     - maximum number of result thumbnails to process; values <= 0 yield an empty list.
/// query - search text, passed unchanged to `getSearchUrl`.
///
/// Returns the decoded image URLs in result order. Thumbnails whose link could not be
/// read or parsed are skipped, so the list may be shorter than `n`.
let getImgUrls (n: int) (query: string) =
    url (getSearchUrl query)
    sleep 1

    let imagesToClick =
        elements "div#islmp a.wXeWr.islib.nfEiy.mM5pbd img"

    // Extracts and decodes the `imgurl` query parameter of a result link.
    // The parameter is looked up by name rather than by position, so a reordered
    // query string yields None instead of a garbage URL.
    let tryParseImgUrl (href: string) =
        let key = "imgurl="

        match href.IndexOf('?') with
        | -1 -> None
        | queryStart ->
            href.Substring(queryStart + 1).Split('&')
            |> Array.tryFind (fun part -> part.StartsWith(key, StringComparison.Ordinal))
            |> Option.map (fun part -> part.Substring(key.Length) |> Uri.UnescapeDataString)

    // Clicks the thumbnail, waits, then reads the `href` of the element two levels
    // above the <img>. NOTE(review): presumably the page only fills in that href once
    // the thumbnail has been activated - verify if the selectors ever change.
    // Any failure (stale element, missing attribute, ...) skips this thumbnail on
    // purpose: the scrape is best-effort.
    let getImageUrl (thumbnail: IWebElement) =
        try
            click thumbnail
            sleep 1

            let anchor = thumbnail |> parent |> parent
            anchor.GetAttribute("href") |> tryParseImgUrl
        with
        | _ -> None

    imagesToClick
    |> List.truncate n
    |> List.choose getImageUrl

// Search terms ('+'-separated) and the image URLs scraped for them (at most 50).
let queryString = "african+person+fur+clothing"
let imgUrls = queryString |> getImgUrls 50

In [1]:
#!fsharp
// Show the collected URLs; the returned display handle is not needed.
DisplayFunctions.display imgUrls |> ignore

// Preview the result at index 8 inline. With fewer than nine URLs, show a note
// instead of throwing.
match List.tryItem 8 imgUrls with
| Some previewUrl -> DisplayFunctions.HTML $"<img src=\"{previewUrl}\"></img>"
| None -> DisplayFunctions.HTML "<p>Fewer than 9 image URLs were collected; nothing to preview.</p>"

index,value
0,https://d279m997dpfwgl.cloudfront.net/wp/2019/03/IMG_0589-e1551711522339-1000x1180.jpg
1,https://static01.nyt.com/images/2019/01/31/fashion/31FUR1/31FUR1-superJumbo-v2.jpg
2,https://d279m997dpfwgl.cloudfront.net/wp/2019/03/AP_19031558773838.jpg
3,https://static01.nyt.com/images/2019/01/31/fashion/31FUR4/merlin_148714461_7f00a1a2-7f83-4860-995d-15c19e79a6cc-jumbo.jpg
4,https://i.pinimg.com/236x/23/f1/83/23f1831e4154ecb83a0c96c4a7612885.jpg
5,https://i.pinimg.com/236x/65/9d/44/659d44699fbe8abc5e1f0435d82c708b.jpg
6,https://i.pinimg.com/564x/3d/de/89/3dde890950976ad89ad7a57ff807d769.jpg
7,https://media.istockphoto.com/photos/its-this-big-picture-id184910757
8,https://static.messynessychic.com/wp-content/uploads/2020/02/The_negro_who_reached_the_Pole.jpg
9,https://static01.nyt.com/images/2019/01/31/fashion/31FUR6/merlin_149951085_f7eb4bc6-78a7-4dfa-b39c-fe87e6ce5ce2-jumbo.jpg


In [1]:
#!fsharp
// TSV file that receives one "<query>\t<image url>" line per scraped image
// (machine-specific absolute path).
let imageSourcesTarget = @"C:\Users\grego\source\repos\IsItKrampus.NET\data\image_sources.tsv"

In [1]:
#!fsharp
// Append one "<query>\t<url>" line per scraped URL to the sources file.
File.AppendAllLines(imageSourcesTarget, [ for imgUrl in imgUrls -> $"{queryString}\t{imgUrl}" ])

In [1]:
#!fsharp
// To rebuild the whole dataset from the recorded sources instead, use:
// let urls =
//     File.ReadAllLines imageSourcesTarget
//     |> Array.skip 1
//     |> Array.map (fun s -> s.Split('\t').[1])

// Download list for this run: the URLs scraped above, as an array.
let urls = List.toArray imgUrls

In [1]:
#!fsharp
open System.Net.Http

// Single HttpClient instance that the download cells below reuse for every request.
let httpClient = new HttpClient()

In [1]:
#!fsharp
// Trial request: fetch the first URL synchronously so the response can be inspected.
let req =
    let firstUrl = Array.head urls

    httpClient.GetAsync firstUrl
    |> Async.AwaitTask
    |> Async.RunSynchronously

In [1]:
#!fsharp
// Body of the trial response, read fully into memory.
let bytes =
    async { return! Async.AwaitTask (req.Content.ReadAsByteArrayAsync()) }
    |> Async.RunSynchronously

In [1]:
#!fsharp
// Detect the image format from the downloaded bytes and show its lower-cased name.
let format = Image.DetectFormat(bytes)
format.Name.ToLower() |> display

jpeg

In [1]:
#!fsharp
// Folder that the downloaded image files are written to (machine-specific absolute path).
let rawFolder = @"C:\Users\grego\source\repos\IsItKrampus.NET\data\raw"

In [1]:
#!fsharp
/// Downloads the image at `uri` and stores it in `rawFolder` under a fresh GUID file name.
///
/// Returns `Some (uri, guid, fileName)` once the file has been written, or `None` when
/// the request failed or came back without a success status / body. In the `None`
/// case the reason is shown via `display`.
let downloadImage (uri: string) =
    // Only the network part is guarded: one unreachable host or timeout must not
    // abort a whole batch, whereas local file-system errors should still surface.
    let fetched =
        try
            // `use` disposes the response once the body has been read.
            use response =
                httpClient.GetAsync uri
                |> Async.AwaitTask
                |> Async.RunSynchronously

            if response.IsSuccessStatusCode && not (isNull response.Content) then
                response.Content.ReadAsByteArrayAsync()
                |> Async.AwaitTask
                |> Async.RunSynchronously
                |> Ok
            else
                Error $"status code {response.StatusCode}"
        with
        | ex -> Error $"request failed: {ex.Message}"

    match fetched with
    | Ok bytes ->
        let format = Image.DetectFormat(bytes)
        let guid = Guid.NewGuid()

        // Content whose format is not recognised is still saved, just without an extension.
        let ext =
            if isNull format || isNull format.Name then
                String.Empty
            else
                "." + format.Name.ToLower()

        let fileName = $"{guid}{ext}"
        File.WriteAllBytes(Path.Combine(rawFolder, fileName), bytes)

        Some (uri, guid, fileName)
    | Error reason ->
        display $"{uri}: {reason}" |> ignore
        None

In [1]:
#!fsharp
// Download every URL in order; each entry is the result of `downloadImage` for that URL.
let processedImages = Array.map downloadImage urls

In [1]:
#!fsharp
// TSV file that maps each source URL to the GUID and file name it was saved under
// (machine-specific absolute path).
let imageDownloadsPathFile = @"C:\Users\grego\source\repos\IsItKrampus.NET\data\image_downloads.tsv"

// Keep only the successful downloads (`Array.choose id` drops the `None` entries
// without needing a placeholder tuple) and append one
// "<uri>\t<guid>\t<file name>" line for each.
processedImages
|> Array.choose id
|> Array.map (fun (uri, guid, fileName) -> $"{uri}\t{guid}\t{fileName}")
|> fun lines -> File.AppendAllLines(imageDownloadsPathFile, lines)