Skip to content

Loading…

Perform wild card queries on full ids. ... #449

Closed
wants to merge 1 commit into from

2 participants

@pranavkm
NuGet member

Tokenize search using Id tokenizer

Work Item: #448

@half-ogre

I like this change, but I think we should open an issue for Test explaining why we did it and what we changed so they can test it thoroughly.

@pranavkm
NuGet member

Pulling this in as part of my relevance changes

@pranavkm pranavkm closed this
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Mar 7, 2012
  1. @pranavkm
This page is out of date. Refresh to see the latest.
View
39 Website/Infrastructure/Lucene/LuceneFieldTokenizer.cs
@@ -0,0 +1,39 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace NuGetGallery
+{
+ public static class LuceneIdTokenizer
+ {
+ private static readonly char[] idSeparators = new[] { '.', '-', ' ' };
+
+ public static IList<string> Tokenize(string term)
+ {
+ return TokenizeCamelCase(term).SelectMany(s => s.Split(idSeparators, StringSplitOptions.RemoveEmptyEntries)).ToList();
+ }
+
+ internal static IEnumerable<string> TokenizeCamelCase(string term)
+ {
+ if (term.Length < 2)
+ {
+ yield break;
+ }
+
+ int tokenStart = 0;
+ for (int i = 1; i < term.Length; i++)
+ {
+ if (Char.IsUpper(term[i]) && (i - tokenStart > 2))
+ {
+ yield return term.Substring(tokenStart, i - tokenStart);
+ tokenStart = i;
+ }
+ }
+ if (term.Length - tokenStart < 2)
+ {
+ yield break;
+ }
+ yield return term.Substring(tokenStart);
+ }
+ }
+}
View
30 Website/Infrastructure/Lucene/LuceneIndexingService.cs
@@ -15,7 +15,6 @@ namespace NuGetGallery
public class LuceneIndexingService : IIndexingService
{
private static readonly TimeSpan indexRecreateTime = TimeSpan.FromDays(3);
- private static readonly char[] idSeparators = new[] { '.', '-' };
public void UpdateIndex()
{
@@ -132,7 +131,7 @@ protected internal virtual void EnsureIndexDirectory()
return dateTime;
}
}
-
+
return null;
}
@@ -160,7 +159,7 @@ protected internal virtual void UpdateLastWriteTime()
if (Directory.Exists(LuceneCommon.IndexMetadataPath))
{
// If the directory exists, then assume that the index has been created.
- File.WriteAllText(LuceneCommon.IndexMetadataPath, DateTime.UtcNow.ToString("R"));
+ File.WriteAllText(LuceneCommon.IndexMetadataPath, DateTime.UtcNow.ToString("R", CultureInfo.InvariantCulture));
}
}
else
@@ -171,35 +170,12 @@ protected internal virtual void UpdateLastWriteTime()
internal static IEnumerable<string> TokenizeId(string term)
{
- var result = CamelCaseTokenize(term).SelectMany(s => s.Split(idSeparators, StringSplitOptions.RemoveEmptyEntries)).ToList();
+ var result = LuceneIdTokenizer.Tokenize(term);
if (result.Count == 1)
{
return Enumerable.Empty<string>();
}
return result;
}
-
- private static IEnumerable<string> CamelCaseTokenize(string term)
- {
- if (term.Length < 2)
- {
- yield break;
- }
-
- int tokenStart = 0;
- for (int i = 1; i < term.Length; i++)
- {
- if (Char.IsUpper(term[i]) && (i - tokenStart > 2))
- {
- yield return term.Substring(tokenStart, i - tokenStart);
- tokenStart = i;
- }
- }
- if (term.Length - tokenStart < 2)
- {
- yield break;
- }
- yield return term.Substring(tokenStart);
- }
}
}
View
16 Website/Infrastructure/Lucene/LuceneSearchService.cs
@@ -1,12 +1,13 @@
using System;
using System.Collections.Generic;
+using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.IO;
using System.Linq;
using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
-using Lucene.Net.Index;
namespace NuGetGallery
{
@@ -84,9 +85,10 @@ private static IEnumerable<int> SearchCore(string searchTerm)
}
}
+ [SuppressMessage("Microsoft.Globalization", "CA1308:NormalizeStringsToUppercase", Justification = "Search index is in lower case")]
private static Query ParseQuery(string searchTerm)
{
- var fields = new Dictionary<string, float> { { "Id", 1.2f }, { "Title", 1.0f }, { "Tags", 1.0f}, { "Description", 0.8f }, { "Author", 0.6f } };
+ var fields = new Dictionary<string, float> { { "Id", 1.2f }, { "Title", 1.0f }, { "Tags", 1.0f }, { "Description", 0.8f }, { "Author", 0.6f }, { "Id-Exact", 2.0f } };
var analyzer = new StandardAnalyzer(LuceneCommon.LuceneVersion);
searchTerm = QueryParser.Escape(searchTerm).ToLowerInvariant();
@@ -97,14 +99,12 @@ private static Query ParseQuery(string searchTerm)
var disjunctionQuery = new BooleanQuery();
var wildCardQuery = new BooleanQuery();
wildCardQuery.SetBoost(0.7f);
- var exactIdQuery = new TermQuery(new Term("Id-Exact", searchTerm));
- exactIdQuery.SetBoost(2.5f);
-
- foreach(var term in searchTerm.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
+
+ foreach (var term in LuceneIdTokenizer.Tokenize(searchTerm))
{
conjuctionQuery.Add(queryParser.Parse(term), BooleanClause.Occur.MUST);
disjunctionQuery.Add(queryParser.Parse(term), BooleanClause.Occur.SHOULD);
-
+
foreach (var field in fields)
{
var wildCardTermQuery = new WildcardQuery(new Term(field.Key, term + "*"));
@@ -113,7 +113,7 @@ private static Query ParseQuery(string searchTerm)
}
}
- return conjuctionQuery.Combine(new Query[] { exactIdQuery, conjuctionQuery, disjunctionQuery, wildCardQuery });
+ return conjuctionQuery.Combine(new Query[] { conjuctionQuery, disjunctionQuery, wildCardQuery });
}
}
}
View
1 Website/Website.csproj
@@ -190,6 +190,7 @@
<Compile Include="Infrastructure\HttpHeaderValueProvider.cs" />
<Compile Include="Infrastructure\HttpHeaderValueProviderFactory.cs" />
<Compile Include="Infrastructure\HttpStatusCodeWithBodyResult.cs" />
+ <Compile Include="Infrastructure\Lucene\LuceneFieldTokenizer.cs" />
<Compile Include="Infrastructure\Lucene\LuceneFileSystem.cs" />
<Compile Include="Infrastructure\Lucene\LuceneCommon.cs" />
<Compile Include="Infrastructure\Lucene\LuceneIndexingJob.cs" />
Something went wrong with that request. Please try again.