Skip to content

Commit

Permalink
Start work on a curated search query list and evaluator
Browse files Browse the repository at this point in the history
  • Loading branch information
joelverhagen committed Aug 6, 2019
1 parent 01b963f commit 990ec69
Show file tree
Hide file tree
Showing 16 changed files with 1,467 additions and 107 deletions.
213 changes: 213 additions & 0 deletions SearchScorer/SearchScorer/Common/CuratedSearchQueriesCsvReader.cs
@@ -0,0 +1,213 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Web;
using CsvHelper;
using SearchScorer.Feedback;
using SearchScorer.IREvalutation;

namespace SearchScorer.Common
{
public class GoogleAnalyticsSearchReferralsCsvReader
{
    /// <summary>
    /// Reads a Google Analytics search referral CSV export and aggregates session counts per search term.
    /// Rows that landed on a result page other than page 1 are skipped.
    /// </summary>
    /// <param name="path">Path to the Google Analytics CSV export.</param>
    /// <returns>A mapping from search term to the total number of sessions.</returns>
    public static IReadOnlyDictionary<string, int> Read(string path)
    {
        using (var fileStream = File.OpenRead(path))
        using (var streamReader = new StreamReader(fileStream))
        using (var csvReader = new CsvReader(streamReader))
        {
            csvReader.Configuration.HasHeaderRecord = true;
            csvReader.Configuration.IgnoreBlankLines = true;

            var output = new Dictionary<string, int>();

            // Google Analytics exports start with five comment lines and an empty line
            // before the real header row, so skip those six lines.
            for (var i = 0; i < 6; i++)
            {
                csvReader.Read();
            }

            csvReader.ReadHeader();

            while (csvReader.Read())
            {
                var landingPage = csvReader.GetField<string>("Landing Page");

                // Prepend a dummy authority so the relative landing page path parses as an absolute URI.
                var landingUri = new Uri("http://example" + landingPage);
                var queryString = HttpUtility.ParseQueryString(landingUri.Query);

                // Skip queries where we are not hitting the first page.
                if (int.TryParse(queryString["page"], out var page) && page != 1)
                {
                    continue;
                }

                var searchTerm = csvReader.GetField<string>("Search Term");

                // Session counts are formatted with thousands separators (e.g. "1,234").
                var sessions = int.Parse(csvReader.GetField<string>("Sessions").Replace(",", string.Empty));

                // The same search term can appear on multiple rows (e.g. different landing
                // pages), so accumulate sessions. Single lookup instead of TryGetValue + indexer.
                if (output.TryGetValue(searchTerm, out var existingSessions))
                {
                    output[searchTerm] = existingSessions + sessions;
                }
                else
                {
                    output.Add(searchTerm, sessions);
                }
            }

            return output;
        }
    }

    // Shape of a data row in the export. Not currently used by Read, which reads
    // fields by header name directly.
    private class Record
    {
        public string LandingPage { get; set; }
        public string SearchTerm { get; set; } // fixed typo: was "SearcTerm"
        public int Sessions { get; set; }
    }
}

public static class CuratedSearchQueriesCsvReader
{
    /// <summary>
    /// Reads the curated search query CSV. Each row contains a search query plus up to ten
    /// (PackageIdX, ScoreX) column pairs. Validates that queries and package IDs are not
    /// duplicated and that every score is in the allowed range.
    /// </summary>
    /// <param name="path">Path to the curated search query CSV file.</param>
    /// <returns>The curated search queries, each with its package-ID-to-score map.</returns>
    /// <exception cref="InvalidOperationException">Thrown when the file contains duplicates, missing scores, unparsable scores, out-of-range scores, or scores without a package ID.</exception>
    public static IReadOnlyList<CuratedSearchQuery> Read(string path)
    {
        using (var fileStream = File.OpenRead(path))
        using (var streamReader = new StreamReader(fileStream))
        using (var csvReader = new CsvReader(streamReader))
        {
            // Using case sensitive comparison, a search query should only appear once.
            var caseSensitive = new HashSet<string>();

            // Using case insensitive comparison, PackageIdX and ScoreX should only be set on the first search
            // query in the file. It's a reasonable user expectation for search to be case insensitive. It is not
            // in reality (we have camel case splitting) but the expected results for scoring purposes should be
            // the same for all casings and the expected package ID scores should only be defined once.
            var existingScores = new Dictionary<string, IReadOnlyDictionary<string, int>>(StringComparer.OrdinalIgnoreCase);

            var output = new List<CuratedSearchQuery>();
            int lineNumber = 1; // The header is read automatically
            foreach (var record in csvReader.GetRecords<Record>())
            {
                // NOTE(review): assumes each record occupies exactly one physical line
                // (no embedded newlines in quoted fields) — confirm against the data.
                lineNumber++;

                var searchQuery = record.SearchQuery.Trim();
                if (!caseSensitive.Add(searchQuery))
                {
                    throw new InvalidOperationException($"The search query '{searchQuery}' is a duplicate in file, line {lineNumber}: {path}");
                }

                // Flatten the ten positional column pairs so they can be validated uniformly.
                var pairs = new[]
                {
                    new { PackageId = record.PackageId0, Score = record.Score0 },
                    new { PackageId = record.PackageId1, Score = record.Score1 },
                    new { PackageId = record.PackageId2, Score = record.Score2 },
                    new { PackageId = record.PackageId3, Score = record.Score3 },
                    new { PackageId = record.PackageId4, Score = record.Score4 },
                    new { PackageId = record.PackageId5, Score = record.Score5 },
                    new { PackageId = record.PackageId6, Score = record.Score6 },
                    new { PackageId = record.PackageId7, Score = record.Score7 },
                    new { PackageId = record.PackageId8, Score = record.Score8 },
                    new { PackageId = record.PackageId9, Score = record.Score9 },
                };

                var packageIdToScore = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
                foreach (var pair in pairs)
                {
                    var packageId = pair.PackageId?.Trim();
                    if (!string.IsNullOrWhiteSpace(packageId))
                    {
                        // ContainsKey before Add so we can produce a friendlier error than
                        // the generic ArgumentException from Dictionary.Add.
                        if (packageIdToScore.ContainsKey(packageId))
                        {
                            throw new InvalidOperationException($"The package ID '{packageId}' is duplicate for search query '{searchQuery}' in file, line {lineNumber}: {path}");
                        }

                        if (string.IsNullOrWhiteSpace(pair.Score))
                        {
                            throw new InvalidOperationException($"The package ID '{packageId}' has a missing score for search query '{searchQuery}' in file, line {lineNumber}: {path}");
                        }

                        if (!int.TryParse(pair.Score.Trim(), out var score))
                        {
                            throw new InvalidOperationException($"The package ID '{packageId}' has an invalid score for search query '{searchQuery}' in file, line {lineNumber}: {path}");
                        }

                        if (score < 1 || score > RelevancyScoreBuilder.MaximumRelevancyScore)
                        {
                            throw new InvalidOperationException(
                                $"The package ID '{packageId}' has a score out of range [1, " +
                                $"{RelevancyScoreBuilder.MaximumRelevancyScore}] for search query " +
                                $"'{searchQuery}' in file, line {lineNumber}: {path}");
                        }

                        packageIdToScore.Add(packageId, score);
                    }
                    else
                    {
                        // A score with no package ID is a data entry error (misaligned columns).
                        if (!string.IsNullOrWhiteSpace(pair.Score))
                        {
                            throw new InvalidOperationException($"There is a score without a package ID for search query '{searchQuery}' in file, line {lineNumber}: {path}");
                        }
                    }
                }

                if (existingScores.TryGetValue(searchQuery, out var existingPackageIdToScore))
                {
                    // A different casing of this query already defined scores; this row must not redefine them.
                    if (packageIdToScore.Any())
                    {
                        // Fixed garbled message: was "There scores ... are defined multiple time".
                        throw new InvalidOperationException($"The scores for case insensitive search query '{searchQuery}' are defined multiple times in file, line {lineNumber}: {path}");
                    }

                    // Reuse the scores defined by the first casing of this query.
                    output.Add(new CuratedSearchQuery(
                        record.Source,
                        searchQuery,
                        existingPackageIdToScore));
                }
                else
                {
                    if (packageIdToScore.Any())
                    {
                        existingScores.Add(searchQuery, packageIdToScore);

                        output.Add(new CuratedSearchQuery(
                            record.Source,
                            searchQuery,
                            packageIdToScore));
                    }
                    else
                    {
                        Console.WriteLine($"WARNING: Skipping search query '{searchQuery}' since it has no scores.");
                    }
                }
            }

            return output;
        }
    }

    // Shape of a CSV row: the query's source, the query, and ten positional (package ID, score) column pairs.
    // Scores are kept as strings so missing/invalid values can be reported with context.
    private class Record
    {
        public SearchQuerySource Source { get; set; }
        public string SearchQuery { get; set; }
        public string PackageId0 { get; set; }
        public string Score0 { get; set; }
        public string PackageId1 { get; set; }
        public string Score1 { get; set; }
        public string PackageId2 { get; set; }
        public string Score2 { get; set; }
        public string PackageId3 { get; set; }
        public string Score3 { get; set; }
        public string PackageId4 { get; set; }
        public string Score4 { get; set; }
        public string PackageId5 { get; set; }
        public string Score5 { get; set; }
        public string PackageId6 { get; set; }
        public string Score6 { get; set; }
        public string PackageId7 { get; set; }
        public string Score7 { get; set; }
        public string PackageId8 { get; set; }
        public string Score8 { get; set; }
        public string PackageId9 { get; set; }
        public string Score9 { get; set; }
    }
}
}
19 changes: 19 additions & 0 deletions SearchScorer/SearchScorer/Common/CuratedSearchQuery.cs
@@ -0,0 +1,19 @@
using System.Collections.Generic;
using SearchScorer.Feedback;

namespace SearchScorer.Common
{
/// <summary>
/// A search query with human-curated relevancy judgments, used as ground truth
/// when evaluating search result quality.
/// </summary>
public class CuratedSearchQuery
{
    public CuratedSearchQuery(SearchQuerySource source, string searchQuery, IReadOnlyDictionary<string, int> packageIdToScore)
    {
        PackageIdToScore = packageIdToScore;
        SearchQuery = searchQuery;
        Source = source;
    }

    /// <summary>Where this curated query came from.</summary>
    public SearchQuerySource Source { get; }

    /// <summary>The raw search query text.</summary>
    public string SearchQuery { get; }

    /// <summary>Maps each expected package ID to its relevancy score.</summary>
    public IReadOnlyDictionary<string, int> PackageIdToScore { get; }
}
}
Expand Up @@ -10,7 +10,7 @@

namespace SearchScorer.Common
{
public static class FeedbackSearchQueryCsvReader
public static class FeedbackSearchQueriesCsvReader
{
public static IReadOnlyList<FeedbackSearchQuery> Read(string path)
{
Expand Down
80 changes: 80 additions & 0 deletions SearchScorer/SearchScorer/Common/TopSearchQueriesCsvReader.cs
@@ -0,0 +1,80 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using CsvHelper;

namespace SearchScorer.Common
{
public static class TopSearchQueriesCsvReader
{
    /* This is the query that generates the data:
let minTimestamp = todatetime('2019-07-02T18:57:00Z');
customMetrics
| where timestamp > minTimestamp
| where name == "BrowserSearchPage"
| where customDimensions.PageIndex == 0
| extend Query = trim("\\s", tostring(customDimensions.SearchTerm))
| distinct Query, session_Id
| summarize QueryCount = count() by Query
| order by QueryCount desc
| take 10000
    This query is an attempt to remove search queries where the first page in the session is the search query
    indicating that it was a non-organic search.
let minTimestamp = todatetime('2019-07-02T18:57:00Z');
pageViews
| where timestamp > minTimestamp
| where session_Id != ""
| summarize min(timestamp), min(url) by session_Id
| project session_Id, firstPageViewTimestamp = min_timestamp, firstPageViewUrl = min_url
| join kind=inner (
pageViews
| where timestamp > minTimestamp
| where session_Id != ""
| extend parsedUrl = parse_url(url)
| where parsedUrl.Path == "/packages"
| extend searchQuery = url_decode(trim("\\s", tostring(parsedUrl["Query Parameters"]["q"])))
| extend page = tostring(parsedUrl["Query Parameters"]["page"])
| extend prerel = tolower(tostring(parsedUrl["Query Parameters"]["prerel"])) != "false"
| extend page = iff(page == "", 1, toint(page))
| where page > 0
| project session_Id, timestamp, searchQuery, page, prerel, url
) on session_Id
| project timestamp, session_Id, firstPageViewTimestamp, firstPageViewUrl, searchQuery, page, prerel, url
| join kind=innerunique (
customMetrics
| where timestamp > minTimestamp
| where name == "BrowserSearchPage"
| project session_Id
) on session_Id
| project timestamp, session_Id, firstPageViewTimestamp, firstPageViewUrl, searchQuery, page, prerel, url
| where page == 1
| where searchQuery != ""
| summarize searchCount = count(), nonLandingSearchCount = countif(timestamp != firstPageViewTimestamp) by searchQuery
| order by nonLandingSearchCount desc
| project Query = searchQuery, QueryCount = nonLandingSearchCount
| take 10000
    */

    /// <summary>
    /// Reads a CSV of top search queries (produced by the query above) into a
    /// query-to-count map.
    /// </summary>
    /// <param name="path">Path to the top search queries CSV file.</param>
    /// <returns>A mapping from search query to its occurrence count.</returns>
    public static IReadOnlyDictionary<string, int> Read(string path)
    {
        using (var fileStream = File.OpenRead(path))
        using (var streamReader = new StreamReader(fileStream))
        using (var csvReader = new CsvReader(streamReader))
        {
            var queryCounts = new Dictionary<string, int>();

            // Dictionary.Add throws on a repeated query, matching ToDictionary's
            // behavior for a malformed export with duplicate rows.
            foreach (var record in csvReader.GetRecords<Record>())
            {
                queryCounts.Add(record.Query, record.QueryCount);
            }

            return queryCounts;
        }
    }

    // Shape of a CSV row: a search query and how many times it occurred.
    private class Record
    {
        public string Query { get; set; }
        public int QueryCount { get; set; }
    }
}
42 changes: 0 additions & 42 deletions SearchScorer/SearchScorer/Common/TopSearchQueryCsvReader.cs

This file was deleted.

Expand Up @@ -9,8 +9,9 @@ public static class TopSearchSelectionsCsvReader
{
/* This is the query that generates the data:
let minTimestamp = todatetime('2019-07-02T18:57:00Z');
customMetrics
| where timestamp > ago(90d)
| where timestamp > minTimestamp
| where name == "BrowserSearchSelection"
| extend SearchQuery = trim("\\s", tostring(customDimensions.SearchTerm))
| extend selectedPackageId = tolower(tostring(customDimensions.PackageId))
Expand Down

0 comments on commit 990ec69

Please sign in to comment.