In [1]:
#!fsharp
#r "nuget: canopy"
#r "nuget: Selenium.WebDriver.ChromeDriver, 87.0.4280.8800"
#r "nuget: FSharp.Data"
#r "nuget: SixLabors.ImageSharp, 1.0.2"

Installed package Selenium.WebDriver.ChromeDriver version 87.0.4280.8800

In [1]:
#!fsharp
/// Builds the Google image-search results URL for `query`.
/// `query` is placed into the `q` parameter exactly as given (nothing is URL-encoded
/// here), so callers pass an already URL-safe term such as "person+in+summer".
let getSearchUrl (query: string) =
    sprintf
        "https://www.google.com/search?q=%s&sclient=img&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiJwLa-7s_tAhUH9IUKHfwYCaYQ_AUoAXoECBIQAw&biw=1536&bih=719&dpr=1.25"
        query

In [1]:
#!fsharp
open System
open canopy.runner.classic
open canopy.configuration
open canopy.classic
open OpenQA.Selenium

In [1]:
#!fsharp
// Tell canopy which folder contains the ChromeDriver executable. The path points into
// the local NuGet cache for Selenium.WebDriver.ChromeDriver 87.0.4280.8800 and is
// specific to this machine/user.
canopy.configuration.chromeDir <- @"C:\Users\grego\.nuget\packages\selenium.webdriver.chromedriver\87.0.4280.8800\driver\win32"
// Open a Chrome browser session. chromeDir is assigned first, presumably because
// `start` reads it to locate the driver — keep this order.
start chrome

In [1]:
#!fsharp
// Navigate the browser to the image search results for "nikolaus".
getSearchUrl "nikolaus" |> url

In [1]:
#!fsharp
// Take the first <img> matched under div#islmp and click it.
let elem = List.head (elements "div#islmp img")

click elem

In [1]:
#!fsharp
// The <img> matched under div#islsp; its `src` attribute is the cell's result.
let img_elem = "div#islsp img" |> element

img_elem.GetAttribute "src"

https://bilder.t-online.de/b/82/85/69/34/id_82856934/c_Master-1-1-Large/tid_da/mann-als-bischof-von-myra-verkleidet-der-6-dezember-gilt-als-todestag-des-heiligen-nikolaus-.jpg

In [1]:
#!fsharp
/// Collects up to `n` image URLs from the Google image search for `query`.
///
/// Navigates the canopy browser to the search page, then for each of the first `n`
/// matched result thumbnails: clicks it, waits a second, and reads the image URL out of
/// the `href` of the thumbnail's grandparent element.
///
/// Returns the decoded URLs in page order. Results whose URL cannot be extracted
/// (click failure, missing href, no image parameter) are skipped, so the list may be
/// shorter than `n`.
let getImgUrls (n: int) (query: string) =
    // Query-string parameter that carries the URL-encoded image address. The previous
    // implementation dropped a 7-character prefix from the FIRST parameter, which
    // matches "imgurl="; looking it up by name no longer depends on parameter order.
    // NOTE(review): confirm the result links still use `imgurl=`.
    let imgUrlParam = "imgurl="

    /// Extracts and URL-decodes the image address from a result link's href, if present.
    let tryParseImageUrl (href: string) =
        if isNull href then
            None
        else
            match href.IndexOf('?') with
            | -1 -> None
            | queryStart ->
                href.Substring(queryStart + 1).Split('&')
                |> Array.tryFind (fun p -> p.StartsWith(imgUrlParam, StringComparison.Ordinal))
                |> Option.map (fun p -> p.Substring(imgUrlParam.Length) |> Uri.UnescapeDataString)

    /// Clicks one thumbnail and returns the image URL found on its enclosing link.
    let getImageUrl (elem: IWebElement) =
        try
            // The click and pause precede the href read; presumably the link target is
            // only populated after the thumbnail has been activated — TODO confirm.
            click elem
            sleep 1

            let link = elem |> parent |> parent
            link.GetAttribute("href") |> tryParseImageUrl
        with
        // Deliberate best-effort: any browser/driver failure on a single thumbnail
        // just drops that result instead of aborting the whole scrape.
        | _ -> None

    url (getSearchUrl query)
    sleep 1

    // This CSS class chain is tied to Google's current markup and is likely to break
    // when the page layout changes.
    elements "div#islmp a.wXeWr.islib.nfEiy.mM5pbd img"
    |> List.truncate n // at most n; fewer when the page has fewer results
    |> List.choose getImageUrl

// Search term in URL form ('+' between words); it is also written next to every
// collected URL in the sources file.
let queryString = "person+in+summer"
// Scrape up to 50 image URLs for that term.
let imgUrls = queryString |> getImgUrls 50

In [1]:
#!fsharp
// Show the collected URLs as a table.
DisplayFunctions.display imgUrls

// Render the URL at index 8 as an inline image to eyeball one result.
let previewUrl = List.head (List.skip 8 imgUrls)
DisplayFunctions.HTML $"<img src=\"{previewUrl}\"></img>"

index,value
0,https://imgix.bustle.com/uploads/getty/2020/1/30/421b844f-3adb-441f-87ac-15712a2f685c-getty-1201679705.jpg
1,https://www.champneys.com/getattachment/e91862c3-78d3-4754-8a0a-b1d1c9fe707d/shutterstock_558593737-Copy.jpg?lang=en-US&width=2000&height=1000&ext=.jpg
2,https://previews.123rf.com/images/olesiabilkei/olesiabilkei1505/olesiabilkei150500186/40593756-happy-fashionable-kid-boy-enjoys-life-on-summer-beach.jpg
3,https://cdn.pixabay.com/photo/2015/08/28/14/55/girl-911983_960_720.jpg
4,https://www.raleighmedicalgroup.com/Portals/15/Blog%20Pictures/10%20Ways%20To%20Stay%20Hydrated%20This%20Summer.jpg
5,https://www.jetsetter.com/wp-content/uploads/sites/7/2018/06/GettyImages-627010960.jpg
6,https://www.jacuzzisaunas.ca/wp-content/uploads/2017/07/Fit-Woman-Standing-on-the-Beach-in-Summer.jpg
7,https://2zadtx2dccrd43bc0r18hhqj-wpengine.netdna-ssl.com/wp-content/uploads/sites/8/2019/09/Sitting-At-Beach_G_1055265486.jpg
8,https://images.theconversation.com/files/283408/original/file-20190709-44487-zuosr4.jpg?ixlib=rb-1.1.0&rect=126%2C387%2C5754%2C3476&q=45&auto=format&w=496&fit=clip
9,https://wilsonkubwayo.com/wp-content/uploads/2017/09/Wilson-Beach-1080x675.jpg


In [1]:
#!fsharp
// Tab-separated file that accumulates the scraped image URLs; rows are appended as
// "<query>\t<url>" and read back later (machine-specific absolute path).
let imageSourcesTarget = @"C:\Users\grego\source\repos\IsItKrampus.NET\data\image_sources.tsv"

In [1]:
#!fsharp
// Append one "<query>\t<url>" row per collected image URL to the sources file.
File.AppendAllLines(
    imageSourcesTarget,
    imgUrls |> List.map (fun imgUrl -> $"{queryString}\t{imgUrl}")
)

In [1]:
#!fsharp
/// Image URLs read back from the sources TSV: the second tab-separated column of every
/// line after the first (the first line is skipped, presumably a header row).
///
/// Lines without a second column (e.g. a trailing blank line) are ignored instead of
/// raising an index error, and an empty file yields an empty array.
let urls =
    File.ReadAllLines imageSourcesTarget
    |> Array.indexed
    |> Array.choose (fun (lineNo, line) ->
        if lineNo = 0 then
            None
        else
            line.Split('\t') |> Array.tryItem 1)

In [1]:
#!fsharp
open System.Net.Http

// One HttpClient instance, reused by every image request made further down.
let httpClient = new HttpClient()

In [1]:
#!fsharp
// Trial request: fetch the first stored URL, blocking until the HTTP response
// (bound to `req`) is available.
let req =
    httpClient.GetAsync(Array.head urls)
    |> Async.AwaitTask
    |> Async.RunSynchronously

In [1]:
#!fsharp
// Read the whole body of the trial response into memory, blocking until done.
let bytes =
    Async.RunSynchronously (Async.AwaitTask (req.Content.ReadAsByteArrayAsync()))

In [1]:
#!fsharp
open SixLabors.ImageSharp
open SixLabors.ImageSharp.Processing

// Ask ImageSharp to detect the image format of the downloaded bytes and show its
// lower-cased name.
let format = Image.DetectFormat(bytes)
format.Name.ToLower() |> display

jpeg

In [1]:
#!fsharp
// Folder that receives the downloaded image files (machine-specific absolute path).
let rawFolder = @"C:\Users\grego\source\repos\IsItKrampus.NET\data\raw"

In [1]:
#!fsharp
/// Downloads the image at `uri` into `rawFolder` under a freshly generated GUID name.
///
/// The file extension is the lower-cased name of the format ImageSharp detects from
/// the bytes; when no format is detected the file is written without an extension.
///
/// Returns `Some (uri, guid, fileName)` on success. Returns `None`, after displaying
/// the reason, when the request fails or the server answers with a non-success status,
/// so one bad URL does not abort a whole batch. Errors while writing the file are not
/// caught and propagate to the caller.
let downloadImage (uri: string) =
    /// Fetches the response body, or explains why it could not be fetched.
    let tryFetchBytes () =
        try
            // `use` disposes the response (and its connection) once the body is read.
            use response =
                httpClient.GetAsync uri
                |> Async.AwaitTask
                |> Async.RunSynchronously

            if response.IsSuccessStatusCode && not (isNull response.Content) then
                response.Content.ReadAsByteArrayAsync()
                |> Async.AwaitTask
                |> Async.RunSynchronously
                |> Ok
            else
                Error $"status code {response.StatusCode}"
        with
        // Transport-level problems (DNS, timeout, malformed URI, dropped connection)
        // are reported like HTTP errors rather than thrown.
        | ex -> Error $"request failed ({ex.Message})"

    match tryFetchBytes () with
    | Ok bytes ->
        let format = Image.DetectFormat(bytes)

        let guid = Guid.NewGuid()

        // Invariant casing keeps the extension stable regardless of the machine's
        // culture settings.
        let ext =
            if isNull format || isNull format.Name then
                String.Empty
            else
                "." + format.Name.ToLowerInvariant()

        let fileName = $"{guid}{ext}"
        File.WriteAllBytes(Path.Combine(rawFolder, fileName), bytes)

        Some (uri, guid, fileName)
    | Error reason ->
        display $"{uri}: {reason}" |> ignore
        None

In [1]:
#!fsharp
// Download every stored URL in order; each entry is the download's result
// (None where the download was not successful).
let processedImages = Array.map downloadImage urls

https://www.wallpaperup.com/uploads/wallpapers/2013/02/22/43089/89d0ac7b29a3a70028ebd517cc10796c.jpg: status code BadRequest

https://www.viehhofen.at/user_upload/eisstockschiessen-gruppe.jpg: status code NotFound

In [1]:
#!fsharp
// Tab-separated log of successful downloads: source URL, generated GUID, file name
// (machine-specific absolute path).
let imageDownloadsPathFile = @"C:\Users\grego\source\repos\IsItKrampus.NET\data\image_downloads.tsv"

// Append one "<url>\t<guid>\t<file name>" row per successful download.
// `Array.choose id` drops the failed entries (None) and unwraps the rest in a single
// pass, so no placeholder tuple is needed.
processedImages
|> Array.choose id
|> Array.map (fun (uri, guid, fileName) -> $"{uri}\t{guid}\t{fileName}")
|> fun lines -> File.AppendAllLines(imageDownloadsPathFile, lines)