In [1]:
#!fsharp
#r "nuget: canopy"
#r "nuget: Selenium.WebDriver.ChromeDriver, 87.0.4280.8800"
#r "nuget: SixLabors.ImageSharp, 1.0.2"

Installed package SixLabors.ImageSharp version 1.0.2

Installed package canopy version 2.1.5

Installing package Selenium.WebDriver.ChromeDriver, version 87.0.4280.8800................

In [1]:
#!fsharp
/// Builds the Google search URL (with this notebook's fixed image-search
/// parameters) for `query`.
/// The query text is spliced into the `q=` parameter exactly as given; no URL
/// encoding is applied here.
let getSearchUrl (query: string) =
    let searchUrl =
        $"https://www.google.com/search?q={query}&sclient=img&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiJwLa-7s_tAhUH9IUKHfwYCaYQ_AUoAXoECBIQAw&biw=1536&bih=719&dpr=1.25"

    searchUrl

In [1]:
#!fsharp
open System
open canopy.configuration
open canopy.classic
open OpenQA.Selenium
open SixLabors.ImageSharp
open SixLabors.ImageSharp.Processing

In [1]:
#!fsharp
// Set canopy's `chromeDir` to the win32 driver folder of the ChromeDriver
// NuGet package (same version as the `#r` reference above), then start a
// Chrome session. The path is an absolute, machine-specific location.
canopy.configuration.chromeDir <- @"C:\Users\grego\.nuget\packages\selenium.webdriver.chromedriver\87.0.4280.8800\driver\win32"
start chrome

In [1]:
#!fsharp
// Open the search page for a sample query in the running browser.
"nikolaus" |> getSearchUrl |> url

In [1]:
#!fsharp
// Take the first <img> found under div#islmp and click it.
let elem = List.head (elements "div#islmp img")

click elem

In [1]:
#!fsharp
// Look up the <img> under div#islsp and show its `src` attribute as the cell result.
let img_elem = element "div#islsp img"

img_elem.GetAttribute "src"

In [1]:
#!fsharp
/// Collects up to `n` image URLs from the search results for `query`.
///
/// Navigates the canopy browser to the search page, clicks each of the first
/// `n` matching thumbnails in page order, and reads the target address out of
/// the `imgurl` query parameter of the link two levels above the thumbnail.
///
/// Returns the percent-decoded URLs in page order. Thumbnails whose link cannot
/// be read or parsed are reported via `display` and skipped, so the result may
/// hold fewer than `n` entries. A non-positive `n` yields an empty list.
let getImgUrls (n: int) (query: string) =
    let imgUrlKey = "imgurl="

    // Extracts the decoded `imgurl` value from a result link; None when absent.
    let tryParseImgUrl (href: string) =
        if String.IsNullOrEmpty href then
            None
        else
            match href.IndexOf('?') with
            | -1 -> None
            | queryStart ->
                // Look the parameter up by name instead of relying on it being first.
                href.Substring(queryStart + 1).Split('&')
                |> Array.tryFind (fun pair -> pair.StartsWith(imgUrlKey, StringComparison.Ordinal))
                |> Option.map (fun pair -> pair.Substring(imgUrlKey.Length) |> Uri.UnescapeDataString)

    url (getSearchUrl query)
    // Fixed one-second pause before querying the page (presumably to let results load).
    sleep 1

    let getImageUrl (elem: IWebElement) =
        try
            click elem
            sleep 1

            // NOTE(review): the click seems to be what makes the link's href
            // carry the `imgurl` parameter - confirm against the live page.
            let href = (elem |> parent |> parent).GetAttribute("href")
            let parsed = tryParseImgUrl href

            if Option.isNone parsed then
                display $"No imgurl parameter found in link: {href}" |> ignore

            parsed
        with e ->
            // Best effort: report the failure and move on to the next thumbnail.
            display $"Could not read image URL. Message: {e.Message}" |> ignore
            None

    elements "div#islmp a.wXeWr.islib.nfEiy.mM5pbd img"
    |> List.truncate n
    |> List.choose getImageUrl

// Search phrase (words joined with '+') and the URLs scraped for it, capped at 50.
let queryString = "person+standing+in+front+of+door"
let imgUrls = queryString |> getImgUrls 50

In [1]:
#!fsharp
// Show the full URL list; the returned display handle is not needed.
DisplayFunctions.display imgUrls |> ignore

// Preview a single image (zero-based index 8) as the cell result, falling back
// to a short message when fewer URLs were collected instead of throwing.
let previewIndex = 8

match List.tryItem previewIndex imgUrls with
| Some previewUrl -> DisplayFunctions.HTML $"<img src=\"{previewUrl}\"></img>"
| None -> DisplayFunctions.HTML $"<p>Fewer than {previewIndex + 1} image URLs were collected.</p>"

index,value
0,https://as1.ftcdn.net/jpg/00/73/66/14/500_F_73661493_gbPKqQ0ngt2oQXFCJp1k9PBkdjr9f7RC.jpg
1,https://c8.alamy.com/comp/C788MG/silhouette-of-a-man-standing-in-front-of-an-open-door-C788MG.jpg
2,https://st3.depositphotos.com/2299955/14949/i/1600/depositphotos_149493202-stock-photo-mature-man-standing-in-front.jpg
3,https://previews.123rf.com/images/enki/enki1303/enki130300009/18708238-business-man-standing-in-front-of-the-door.jpg
4,https://image.freepik.com/free-photo/man-casual-outfit-standing-front-door_23-2148248475.jpg
5,https://previews.123rf.com/images/dglimages/dglimages1705/dglimages170500043/78146911-happy-senior-man-standing-outside-the-front-door-of-his-home-he-has-his-arms-crossed-and-is-looking-.jpg
6,https://c8.alamy.com/comp/C788NB/silhouette-of-a-man-standing-in-front-of-an-open-door-C788NB.jpg
7,https://st.focusedcollection.com/14026668/i/1800/focused_183337978-stock-photo-mature-man-standing-front-door.jpg
8,https://c8.alamy.com/comp/C3HKTA/young-woman-standing-at-front-door-C3HKTA.jpg
9,https://cdn7.dissolve.com/p/D943_217_971/D943_217_971_1200.jpg


In [1]:
#!fsharp
// TSV file that receives one "<query>\t<image url>" row per scraped URL.
// Absolute, machine-specific path.
let imageSourcesTarget = @"C:\Users\grego\source\repos\IsItKrampus.NET\data\image_sources.tsv"

In [1]:
#!fsharp
// Append one "<query>\t<url>" row per collected image URL to the sources file.
File.AppendAllLines(
    imageSourcesTarget,
    [ for imageUrl in imgUrls -> $"{queryString}\t{imageUrl}" ]
)

In [1]:
#!fsharp
// Alternative source: uncomment to rebuild the whole dataset from the saved
// sources file instead of the URLs scraped in this session.
// let urls =
//     File.ReadAllLines imageSourcesTarget
//     |> Array.skip 1
//     |> Array.map (fun s -> s.Split('\t').[1])

// URLs to download, as an array.
let urls = List.toArray imgUrls

In [1]:
#!fsharp
open System.Net.Http

// Single HttpClient instance reused by the request cells and by downloadImage below.
let httpClient = new HttpClient()

In [1]:
#!fsharp
// Request the first URL and block until the response is available.
let req =
    let firstUrl = Array.head urls

    httpClient.GetAsync firstUrl
    |> Async.AwaitTask
    |> Async.RunSynchronously

In [1]:
#!fsharp
// Read the whole response body into memory, blocking until it is complete.
let bytes =
    let readBody = req.Content.ReadAsByteArrayAsync() |> Async.AwaitTask
    Async.RunSynchronously readBody

In [1]:
#!fsharp
// Ask ImageSharp which image format the payload is and show its lower-cased name.
let format = Image.DetectFormat(bytes)
format.Name.ToLower() |> display

jpeg

In [1]:
#!fsharp
// Folder that downloadImage writes the downloaded image files into.
// Absolute, machine-specific path.
let rawFolder = @"C:\Users\grego\source\repos\IsItKrampus.NET\data\raw"

In [1]:
#!fsharp
/// Downloads the resource at `uri` and stores it in `rawFolder` under a fresh
/// GUID-based file name.
///
/// The extension comes from the image format ImageSharp detects in the payload
/// (lower-cased with invariant casing); when no format is detected the file is
/// written without an extension.
///
/// Returns `Some (uri, guid, fileName)` on success. Returns `None`, after
/// reporting the reason via `display`, when the request throws, the response
/// is not a success (or has no content), or reading/saving the body fails.
let downloadImage (uri: string) =
    let response =
        try
            httpClient.GetAsync uri
            |> Async.AwaitTask
            |> Async.RunSynchronously
            |> Some
        with e ->
            display $"Req failed. Message: {e.Message}" |> ignore
            None

    match response with
    | None ->
        display $"{uri}: could not be processed" |> ignore
        None
    | Some response ->
        // Dispose the response once this branch is done with it.
        use response = response

        if response.IsSuccessStatusCode && not (isNull response.Content) then
            try
                let bytes =
                    response.Content.ReadAsByteArrayAsync()
                    |> Async.AwaitTask
                    |> Async.RunSynchronously

                let format = Image.DetectFormat(bytes)
                let guid = Guid.NewGuid()

                // Invariant casing keeps extensions identical on every machine culture.
                let ext =
                    if isNull format || isNull format.Name then
                        String.Empty
                    else
                        "." + format.Name.ToLowerInvariant()

                let fileName = $"{guid}{ext}"

                // Make sure the target folder is there before writing into it.
                Directory.CreateDirectory(rawFolder) |> ignore
                File.WriteAllBytes(Path.Combine(rawFolder, fileName), bytes)

                Some (uri, guid, fileName)
            with e ->
                // One failed body read or write must not abort a whole batch.
                display $"{uri}: could not be saved. Message: {e.Message}" |> ignore
                None
        else
            display $"{uri}: could not be processed" |> ignore
            None

In [1]:
#!fsharp
// Download every URL in order; each entry is the Some/None result for one URL.
let processedImages = Array.map downloadImage urls

In [1]:
#!fsharp
// TSV file that records which URL was saved under which GUID and file name.
// Absolute, machine-specific path.
let imageDownloadsPathFile = @"C:\Users\grego\source\repos\IsItKrampus.NET\data\image_downloads.tsv"

// Append one "<uri>\t<guid>\t<file name>" row per successful download;
// failed downloads (None) are dropped.
processedImages
|> Array.choose id
|> Array.map (fun (uri, guid, name) -> $"{uri}\t{guid}\t{name}")
|> fun lines -> File.AppendAllLines(imageDownloadsPathFile, lines)