diff --git a/Commands/Get-Web.ps1 b/Commands/Get-Web.ps1 index b54a6d5..3ce2890 100644 --- a/Commands/Get-Web.ps1 +++ b/Commands/Get-Web.ps1 @@ -3,7 +3,7 @@ .Synopsis Gets content from the web, or parses web content. .Description - Gets content from the web. + Gets content from the web. If -Tag is passed, extracts out tags from within the document. @@ -14,7 +14,7 @@ .Example # Extract the rows from ConvertTo-HTML $text = Get-ChildItem | Select Name, LastWriteTime | ConvertTo-HTML | Out-String - Get-Web "tr" $text + Get-Web -Tag "tr" -Html $text .Example # Extract all PHP elements from a directory of .php scripts Get-ChildItem -Recurse -Filter *.php | @@ -39,13 +39,7 @@ ForEach-Object { Get-Web -Html $_.Xml.InnerText -AsMicrodata -ItemType $schema } - .Example - # List the top 1000 sites on the web: - Get-Web "http://www.google.com/adplanner/static/top1000/" -Tag 'a' | - where-Object {$_.Tag -like "*_blank*" } | - ForEach-Object { - ([xml]$_.StartTag.Replace('"t', '" t')).a.href - } + .Link http://schema.org #> @@ -53,58 +47,46 @@ [CmdletBinding(DefaultParameterSetName='HTML')] [OutputType([PSObject],[string])] param( + # The Url + [Parameter(Mandatory=$true,Position=0,ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] + [Alias('Uri')] + [string]$Url, + # The tags to extract. - [Parameter( - ValueFromPipelineByPropertyName=$true)] + [Parameter(ValueFromPipelineByPropertyName=$true)] [string[]]$Tag, # If used with -Tag, -RequireAttribute will only match tags with a given keyword in the tag + [Parameter(ValueFromPipelineByPropertyName=$true)] [string[]]$TextInTag, # The source HTML. 
- [Parameter(Mandatory=$true, - ParameterSetName='HTML', - ValueFromPipelineByPropertyName=$true)] + [Parameter(Mandatory=$true,ParameterSetName='HTML',ValueFromPipelineByPropertyName=$true)] [string]$Html, - # The Url - [Parameter(Mandatory=$true, - Position=0, - ParameterSetName='Url', - ValueFromPipelineByPropertyName=$true)] - [Alias('Uri')] - [string]$Url, - # The root of the website. # All images, css, javascript, related links, and pages beneath this root will be downloaded into a hashtable - [Parameter(Mandatory=$true, - ParameterSetName='WGet', - ValueFromPipelineByPropertyName=$true)] + [Parameter(Mandatory=$true,ParameterSetName='WGet',ValueFromPipelineByPropertyName=$true)] [string]$Root, # Any parameters to the URL - [Parameter(ParameterSetName='Url', - Position=1, - ValueFromPipelineByPropertyName=$true)] - [Hashtable]$Parameter, + [Parameter(ParameterSetName='Url',Position=1,ValueFromPipelineByPropertyName=$true)] + [Collections.IDictionary]$Parameter, # Filename - [Parameter(Mandatory=$true, - ParameterSetName='FileName', - ValueFromPipelineByPropertyName=$true)] + [Parameter(Mandatory=$true,ParameterSetName='FileName',ValueFromPipelineByPropertyName=$true)] [Alias('Fullname')] [ValidateScript({$ExecutionContext.SessionState.Path.GetResolvedPSPathFromPSPath($_)})] [string]$FileName, # The User Agent - [Parameter(ParameterSetName='Url', - ValueFromPipelineByPropertyName=$true)] + [Parameter(ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] [string]$UserAgent = "PowerShellPipeworks/Get-Web (1.0 powershellpipeworks.com)", # If set, will not show progress for long-running operations [Switch]$HideProgress, - # If set, returns resutls as bytes + # If set, returns results as bytes [Alias('Byte', 'Bytes')] [Switch]$AsByte, @@ -114,10 +96,25 @@ # If set, returns results as json [Switch]$AsJson, + + # If set, will output the results of a web request to a file. 
+ # This is the best option for large content, as it avoids excessive memory consumption. + [Parameter(ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] + [Parameter(ParameterSetName='WGet',ValueFromPipelineByPropertyName=$true)] + [string] + $OutputPath, + # An output stream. + [Parameter(ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] + [IO.Stream] + $OutputStream, + # If set, extracts Microdata out of a page [Alias('Microdata')] [Switch]$AsMicrodata, + + # If set, extracts data attributes. + [switch]$DataAttribute, # If set, will get back microdata from the page that matches an itemtype [string[]]$ItemType, @@ -129,375 +126,217 @@ [Switch]$MetaData, # The MIME content type you're requesting from the web site + [Alias('CT')] [string]$ContentType, + # A list of acceptable content types. These are used for the Accept header, and to compare the final content type to determine if it was unexpected + [string[]]$Accept, + # The credential used to connect to the web site - [Parameter(ParameterSetName='Url', - ValueFromPipelineByPropertyName=$true)] + [Parameter(ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] + [Alias('Credential','C')] [Management.Automation.PSCredential] $WebCredential, # If set, will use the default user credential to connect to the web site - [Parameter(ParameterSetName='Url', - ValueFromPipelineByPropertyName=$true)] + [Parameter(ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] [switch] - $UseDefaultCredential, - + $UseDefaultCredential, - # The HTTP method to use - [Parameter(ParameterSetName='Url', - ValueFromPipelineByPropertyName=$true)] + # The HTTP method. + [Parameter(ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] [ValidateSet('GET','POST', 'PUT', 'DELETE', 'OPTIONS', 'HEAD', 'TRACE', 'CONNECT', 'MERGE')] + [Alias('M')] [string]$Method = "GET", # a hashtable of headers to send with the request. 
- [Hashtable]$Header, + [Parameter(ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] + [Alias('Headers','H')] + [Collections.IDictionary]$Header, # The Request Body. This can be either a string, or bytes + [Parameter(ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] + [PSObject] $RequestBody, + # If set, will request the web site asynchronously, and return the results + [Parameter(ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] + [Switch] + $Async, + + # The Request String Encoding + [Parameter(ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] + [ValidateScript({[Text.Encoding]::$_ -ne $null})] + [string] + $RequestStringEncoding = "UTF8", + + # The signature message. This parameter is used with -SignatureKey, -SignaturePrefix, and -SignatureAlgorithmn to create an Authorization header. + [Parameter(ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] + [string] + $Signature, + + # The signature prefix. This will be prepended to the computed authorization header. + # This parameter is used with -Signature, -SignatureKey, and -SignatureAlgorithmn to create an Authorization header + [Parameter(ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] + [string] + $SignaturePrefix, + + # The signature key. This is used to compute the signature hash. This can be either a byte array or a Base64 encoded string + # This parameter is used with -Signature, -SignaturePrefix, and -SignatureAlgorithmn to create an Authorization header + [Parameter(ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] + [string] + $SignatureKey, + + # The signature algorithm is the hashing algorithm that is used to compute a signature hash. 
The default is HMACSHA256 + # This parameter is used with -Signature, -SignatureKey, and -SignaturePrefix to create an Authorization header + [Parameter(ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] + [ValidateSet('MD5', 'SHA1', 'SHA256', 'SHA384', 'SHA512', 'HMAC', 'HMACSHA1','HMACSHA256')] + [string] + $SignatureAlgorithmn = 'HMACSHA256', + + # If set, the signature will be URL encoded + [Parameter(ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] + [Switch] + $EncodeSignature, + + # One or more thumbprints for certificates + [Parameter(ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] + [string[]] + $ThumbPrint, + # Any request ascii data. Data will be joined together with &, and will be sent in the request body. + [Parameter(ParameterSetName='Url',ValueFromPipelineByPropertyName=$true)] + [Alias('d')] [string[]] $Data, - # If set, will use a the Net.WebRequest class to download. Otherwise, will use the xmlhttprequest. - # Xmlhttprequest adds some extra headers and caches GET requests, so, if you wish to avoid this, -UseWebRequest. - [Switch] - $UseWebRequest, + [Parameter(Mandatory=$true,ParameterSetName='AsyncResponse',ValueFromPipelineByPropertyName=$true)] + [Alias('AsyncResult')] + [IAsyncResult] + $IASyncResult, + + [Parameter(Mandatory=$true,ParameterSetName='AsyncResponse',ValueFromPipelineByPropertyName=$true)] + [PSObject] + $WebRequest, # A Progress Identifier. This is used to show progress inside of an existing layer of progress bars. - [int] - $ProgressIdentifier, + [int]$ProgressIdentifier, # If set, the server error will be turned into a result. # This is useful for servers that provide complex error information inside of XML or JSON. 
+ [Parameter(ValueFromPipelineByPropertyName=$true)] [Switch] $UseErrorAsResult, # If set, then a note property will be added to the result containing the response headers + [Parameter(ValueFromPipelineByPropertyName=$true)] [Switch] $OutputResponseHeader, # The amount of time before a web request times out. + [Parameter(ValueFromPipelineByPropertyName=$true)] [Timespan] $Timeout, - # If set, will request the web site asynchronously, and return the results - [Switch] - $Async - ) - - begin { - #region Escape Special Characters - $replacements = @{ - "
" = "
" - "
" = "
" - " " = " " - '¯'='¯' - 'Ð'='Ð' - '¶'='¶' - '¥'='¥' - 'º'='º' - '¹'='¹' - 'ª'='ª' - '­'='­' - '²'='²' - 'Ç'='Ç' - 'Î'='Î' - '¤'='¤' - '½'='½' - '§'='§' - 'Â'='â' - 'Û'='Û' - '±'='±' - '®'='®' - '´'='´' - 'Õ'='Õ' - '¦'='¦' - '£'='£' - 'Í'='Í' - '·'='·' - 'Ô'='Ô' - '¼'='¼' - '¨'='¨' - 'Ó'='Ó' - '°'='°' - 'Ý'='Ý' - 'À'='À' - 'Ö'='Ö' - '"'='"' - 'Ã'='Ã' - 'Þ'='Þ' - '¾'='¾' - '¿'='¿' - '×'='×' - 'Ø'='Ø' - '÷'='÷' - '¡'='¡' - '³'='³' - 'Ï'='Ï' - '¢'='¢' - '©'='©' - 'Ä'='Ä' - 'Ò'='Ò' - 'Å'='Å' - 'È'='È' - 'Ü'='Ü' - 'Á'='Á' - 'Ì'='Ì' - 'Ñ'='Ñ' - 'Ê'='Ê' - '¸'='¸' - 'Ù'='Ù' - 'ß'='ß' - '»'='»' - 'ë'='ë' - 'É'='É' - 'µ'='µ' - '¬'='¬' - 'Ú'='Ú' - 'Æ'='Æ' - '€'= "€" - '—' = '—' - } - #endregion Escape Special Characters - $quotes = '"', "'" - function Convert-Json - { - <# - .Synopsis - Inline JSON converter - .Description - Converts JSON into PowerShell hashtables using regular expressions - #> - param( - # The JSON - [Parameter(ValueFromPipeline=$true)] - [string]$Json, - - # If set, will use full language mode when parsing the data. 
- # If not set, the data will be parsed in "data-language" mode, which allows for the declaration of hashtables but prevents the execution of code - [switch]$FullLanguage) - - begin { - function ConvertFrom-Hashtable - { - param($results) - $psObject = New-Object PSObject - foreach ($key in $results.Keys) { - $result = $null - if ($results[$key] -is [Hashtable]) { - $result = ConvertFrom-Hashtable $results[$key] - } elseif ($results[$key] -is [Array]) { - $result = foreach ($result in $results[$key]){ - if ($result -is [Hashtable]) { - ConvertFrom-Hashtable $result - } else { - $result - } - } - } else { - $result = $results[$key] - } - - if ($key) { - $psObject.psObject.Properties.Add( - (New-Object Management.Automation.PSNoteProperty $key, $result) - ) - } - - - - } - $psobject - } - } - process { - $json = [Regex]::Replace($Json, - "\\u([\dabcdefABCDEF]{4,4})", { - ("0x" + $args[0].Groups[1].Value) -as [Uint32] -as [Char] - }) - - $json = $Json.Replace('$', '$ ') - - $script = - $json -replace - '“|”', '`"' -replace - '"\s{0,}:', '"=' -replace - "\\{2,2}", "\" -replace - "\[", "$([Environment]::NewLine)@(" -replace - "\]", ")" -replace - ',\[', ", $([Environment]::NewLine)@(" -replace - "\],",")," -replace - '\{"', "@{$([Environment]::NewLine)`"" -replace - "\[\]", "@()" -replace - "=(\w)*(\[)", '=@(' -replace - "=(\d{1,}),",'=$1;' -replace - "=(\d{1,}.\d{1,}),",'=$1;' -replace - "=-(\d{1,}.\d{1,}),",'=-$1;' -replace - "true", "`$true" -replace - "false", "`$false" -replace - "null", '$null' -replace - "\]}", ")}" -replace - "{", "@{" -replace - '\\"', '`"' -replace - "@@", "@" -replace - '(["})]),', "`$1$([Environment]::NewLine)" -replace - '(\$true),', "`$1$([Environment]::NewLine)" -replace - '(\$false),', "`$1$([Environment]::NewLine)" -replace - '(\$null),', "`$1$([Environment]::NewLine)" -replace - "(-{0,1})(\d{1,}),", "`$1`$2$([Environment]::NewLine)" -replace - "\\/","/" -replace - '\$true(\w{1,})', 'true$1' -replace - '\$false(\w{1,})', 'false$1' 
-replace - '\$null(\w{1,})', 'null$1' - - - $replacements = @(@{ - Find = '}\s{1,}@{' - Replace = '},@{' - }) - foreach ($r in $replacements) { - foreach ($f in $r.find) { - $regex =New-Object Regex $f, "Multiline, IgnoreCase" - $script = $regex.Replace($script , $r.Replace) - } - } - - if ($script.Startswith("[")) - { - $script = "@(" + $script.Substring(1).TrimEnd("]") + ")" - } - $results = $null - Write-Verbose $script - if ($FullLanguage) { - $results = Invoke-Expression "$script" - } else { - $results = Invoke-Expression "data { $script }" - } - - if ($results) { - foreach ($result in $results) {ConvertFrom-Hashtable $result } - } - } - } - - # Add system.web, in case it's not loaded - Add-Type -AssemblyName System.Web + # The size of the upload buffer. + # If you upload a file larger than this size, it will be uploaded in chunks and a progress bar will be displayed. + # Each chunk will be the size of the upload buffer + [Parameter(ValueFromPipelineByPropertyName=$true)] + [Alias('UploadBufferSize','DownloadBufferSize')] + [Uint32] + $BufferSize = 512kb, + # The HTTP HOST + [string] + $HostHeader, - if ($ProgressIdentifier) { - $script:CachedProgressId = $ProgressIdentifier - } + # If set, will preauthenticate the web request. + [Parameter(ValueFromPipelineByPropertyName=$true)] + [Switch] + $PreAuthenticate, - if (-not $script:CachedProgressId) { - $script:CachedProgressId = Get-Random + # If set, will run in a background job. + [Parameter(ValueFromPipelineByPropertyName=$true)] + [Switch] + $AsJob, - } - $progressId = $script:CachedProgressId - } + # If set, will use the Net.WebRequest class to download. + # Included for backwards compatibility. Prior versions of Get-Web allowed use of xmlHttpRequest with COM. 
+ [Switch] + $UseWebRequest + ) - process { - if ($psCmdlet.ParameterSetName -eq 'WGet') { - if (-not $script:cachedContentTypes) { - $script:cachedContentTypes = @{} - $ctKey = [Microsoft.Win32.Registry]::ClassesRoot.OpenSubKey("MIME\Database\Content Type") - $ctKey.GetSubKeyNames() | - ForEach-Object { - $extension= $ctKey.OpenSubKey($_).GetValue("Extension") - if ($extension) { - $script:cachedContentTypes["${extension}"] = $_ - } - } - - } + begin { + #region Declarations + $WGet = { + # First, we make a copy of root. $currentRoot = "$Root" - + # If the current root looks like it is missing an end slash, if ($currentRoot -like "http*//*" -and $currentRoot -notlike "http*//*/") { - $currentRoot+= '/' + $currentRoot+= '/' # we'll add it. } + # Next, find the host name of the current root. + # This is how we'll know if we should follow the link down. $hostname = ([uri]$currentRoot).DnsSafeHost - $followMeDown = New-OBject Collections.Queue + # Speaking of which, we need to create a queue of links to follow, + $followMeDown = [Collections.Queue]::new() + # add the current root, $null = $followMeDown.Enqueue($currentRoot) - - $pages = @{} + # and create a hashtable to store the results. $pagedata = @{} - while ($followMeDown.Count -gt 0) { - $pageRoot = $followMeDown.Dequeue() - - $pageHost = ([uri]$pageRoot).DnsSafeHost - - if ($pageHost -ne $hostname) { - continue - } - + while ($followMeDown.Count -gt 0) # While the queue isn't empty, + { + # Get the next link + $pageRoot = $followMeDown.Dequeue() + # and determine what host it came from. + $pageHost = ([uri]$pageRoot).DnsSafeHost + # If the link was to a different host, continue to the next. 
+ if ($pageHost -ne $hostname) { continue } + # Now we determine the relative root, $relativeRoot = $pageRoot.Substring(0, $pageRoot.LastIndexOf("/")) - - - - $pageMimetype= - if ($pageRoot -like "http*//*/*.*") { - $extension = $pageRoot.Substring($pageRoot.LastIndexOf(".")) - if ($script:cachedContentTypes[$extension]) { - $script:cachedContentTypes[$extension] - } else { - "unknown/unknown" + # clear the page HTML, + $pageHtml = "" + # and get the content bytes and the headers + $pageContent = Get-Web -Url $pageRoot -AsByte -OutputResponseHeader + # If the returned content type was text, + if ($pageContent.Headers.'Content-Type' -like 'text/*') { + $ms = [IO.MemoryStream]::new($pageContent) + $reader = [IO.StreamReader]::new($ms, $true) + $pagedata[$relativeRoot] = $reader.ReadToEnd() # treat it as text. + $reader.Close();$reader.Dispose() + $ms.Close();$ms.Dispose() + # If it was HTML, save it to $PageHTML, so we can follow subsequent links + if ($pageContent.Headers.'Content-Type' -like '*html*') { + $pageHtml = $pagedata[$relativeRoot] } - } elseif ($pageRoot -like "http*//*/") { - "text/html" } else { - "unknown/unknown" + # If the content wasn't text, save off the bytes. + $pagedata[$relativeRoot] = $pageContent } + # If we don't have any HTML to parse, continue to the next link. + if (-not $pageHtml) { continue } - $pageHtml = "" - if ($pageMimetype -like "text/*") { - $pageHtml = Get-Web -Url $pageRoot -UseWebRequest - $pagedata[$pageRoot] = $pageHtml - } else { - $pagedata[$pageRoot] = Get-Web -Url $pageRoot -UseWebRequest -AsByte - - } - + # Since we have an HTML response, parse out any tags that could contain a link: + # anchors,links,images, and