Permalink
Browse files

* Store id tokens in title if no title is available

* Add download count as a scoring factor for relevance
* Tweak boosts for description and disjunction queries
  • Loading branch information...
1 parent b798101 commit 278979217a65cf398f40f6394978183a6766887c @pranavkm pranavkm committed Jun 29, 2012
@@ -11,19 +11,19 @@ namespace NuGetGallery.Infrastructure
public class LuceneIndexingServiceFacts
{
[Theory]
- [InlineData("NHibernate", new string[0])]
- [InlineData("NUnit", new string[0])]
- [InlineData("EntityFramework", new[] { "Framework", "Entity" })]
- [InlineData("Sys-netFX", new[] { "Sys", "netFX" })]
- [InlineData("xUnit", new string[0])]
- [InlineData("jQueryUI", new string[0])]
- [InlineData("jQuery-UI", new[] { "jQuery", "UI" })]
- [InlineData("NuGetPowerTools", new[] { "NuGet", "Power", "Tools" } )]
- [InlineData("microsoft-web-helpers", new[] { "microsoft", "web", "helpers" } )]
- [InlineData("EntityFramework.sample", new[] { "EntityFramework", "sample", "Framework", "Entity" })]
- [InlineData("SignalR.MicroSliver", new[] { "SignalR", "MicroSliver", "Micro", "Sliver" })]
- [InlineData("ABCMicroFramework", new[] { "ABC", "Micro", "Framework" })]
- [InlineData("SignalR.Hosting.AspNet", new[] { "SignalR", "Hosting", "AspNet", "Asp", "Net"})]
+ [InlineData("NHibernate", new[] { "NHibernate" })]
+ [InlineData("NUnit", new[] { "NUnit" })]
+ [InlineData("EntityFramework", new[] { "EntityFramework", "Framework", "Entity" })]
+ [InlineData("Sys-netFX", new[] { "Sys-netFX", "Sys", "netFX" })]
+ [InlineData("xUnit", new[] { "xUnit" })]
+ [InlineData("jQueryUI", new [] { "jQueryUI" })]
+ [InlineData("jQuery-UI", new[] { "jQuery-UI", "jQuery", "UI" })]
+ [InlineData("NuGetPowerTools", new[] { "NuGetPowerTools", "NuGet", "Power", "Tools" } )]
+ [InlineData("microsoft-web-helpers", new[] { "microsoft-web-helpers", "microsoft", "web", "helpers" } )]
+ [InlineData("EntityFramework.sample", new[] { "EntityFramework.sample", "EntityFramework", "sample", "Framework", "Entity" })]
+ [InlineData("SignalR.MicroSliver", new[] { "SignalR.MicroSliver", "SignalR", "MicroSliver", "Micro", "Sliver" })]
+ [InlineData("ABCMicroFramework", new[] { "ABCMicroFramework", "ABC", "Micro", "Framework" })]
+ [InlineData("SignalR.Hosting.AspNet", new[] { "SignalR.Hosting.AspNet", "SignalR", "Hosting", "AspNet", "Asp", "Net"})]
public void CamelCaseTokenizer(string term, IEnumerable<string> tokens)
{
// Act
@@ -89,19 +89,25 @@ private static void AddPackages(IndexWriter indexWriter, List<PackageIndexEntity
var document = new Document();
document.Add(new Field("Key", package.Key.ToString(CultureInfo.InvariantCulture), Field.Store.YES, Field.Index.NO));
- document.Add(new Field("Id-Exact", package.Id, Field.Store.NO, Field.Index.ANALYZED));
+ document.Add(new Field("Id-Exact", package.Id.ToLowerInvariant(), Field.Store.NO, Field.Index.NOT_ANALYZED));
document.Add(new Field("Description", package.Description, Field.Store.NO, Field.Index.ANALYZED));
- foreach (var idToken in TokenizeId(package.Id))
+ var tokenizedId = TokenizeId(package.Id);
+ foreach (var idToken in tokenizedId)
{
document.Add(new Field("Id", idToken, Field.Store.NO, Field.Index.ANALYZED));
}
- if (!String.IsNullOrEmpty(package.Title))
+ // If an element does not have a Title, then add all the tokenized Id components as Title.
+ // Lucene's StandardTokenizer does not tokenize items of the format a.b.c which does not play well with things like "xunit.net".
+ // We will feed it values that are already tokenized.
+ var titleTokens = String.IsNullOrEmpty(package.Title) ? tokenizedId : package.Title.Split(idSeparators, StringSplitOptions.RemoveEmptyEntries);
+ foreach (var idToken in titleTokens)
{
- document.Add(new Field("Title", package.Title, Field.Store.NO, Field.Index.ANALYZED));
+ document.Add(new Field("Title", idToken, Field.Store.NO, Field.Index.ANALYZED));
}
+
if (!String.IsNullOrEmpty(package.Tags))
{
document.Add(new Field("Tags", package.Tags, Field.Store.NO, Field.Index.ANALYZED));
@@ -176,15 +182,10 @@ internal static IEnumerable<string> TokenizeId(string term)
var tokens = term.Split(idSeparators, StringSplitOptions.RemoveEmptyEntries);
// For each token, further attempt to tokenize camelcase values. e.g. .EventStream -> Event, Stream.
- // Skip the exact term since we index it indep
- var result = tokens.Concat(tokens.SelectMany(CamelCaseTokenize))
+ var result = tokens.Concat(new[] { term })
+ .Concat(tokens.SelectMany(CamelCaseTokenize))
.Distinct(StringComparer.OrdinalIgnoreCase)
- .Where(t => !term.Equals(t))
.ToList();
- if (result.Count == 1)
- {
- return Enumerable.Empty<string>();
- }
return result;
}
@@ -7,6 +7,7 @@
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
+using Lucene.Net.Search.Function;
namespace NuGetGallery
{
@@ -86,27 +87,28 @@ private static IEnumerable<int> SearchCore(string searchTerm)
private static Query ParseQuery(string searchTerm)
{
- var fields = new Dictionary<string, float> { { "Id", 1.2f }, { "Title", 1.0f }, { "Tags", 0.8f }, { "Description", 0.3f },
+ var fields = new Dictionary<string, float> { { "Id", 1.2f }, { "Title", 1.0f }, { "Tags", 0.8f }, { "Description", 0.1f },
{ "Author", 1.0f } };
var analyzer = new StandardAnalyzer(LuceneCommon.LuceneVersion);
searchTerm = QueryParser.Escape(searchTerm).ToLowerInvariant();
var queryParser = new MultiFieldQueryParser(LuceneCommon.LuceneVersion, fields.Keys.ToArray(), analyzer, fields);
var conjuctionQuery = new BooleanQuery();
- conjuctionQuery.SetBoost(1.2f);
+ conjuctionQuery.SetBoost(2.0f);
var disjunctionQuery = new BooleanQuery();
- disjunctionQuery.SetBoost(0.3f);
+ disjunctionQuery.SetBoost(0.1f);
var wildCardQuery = new BooleanQuery();
wildCardQuery.SetBoost(0.5f);
var exactIdQuery = new TermQuery(new Term("Id-Exact", searchTerm));
exactIdQuery.SetBoost(2.5f);
var wildCardIdQuery = new WildcardQuery(new Term("Id-Exact", "*" + searchTerm + "*"));
-
- foreach (var term in searchTerm.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
+
+ foreach(var term in GetSearchTerms(searchTerm))
{
- conjuctionQuery.Add(queryParser.Parse(term), BooleanClause.Occur.MUST);
- disjunctionQuery.Add(queryParser.Parse(term), BooleanClause.Occur.SHOULD);
+ var termQuery = queryParser.Parse(term);
+ conjuctionQuery.Add(termQuery, BooleanClause.Occur.MUST);
+ disjunctionQuery.Add(termQuery, BooleanClause.Occur.SHOULD);
foreach (var field in fields)
{
@@ -116,7 +118,16 @@ private static Query ParseQuery(string searchTerm)
}
}
- return conjuctionQuery.Combine(new Query[] { exactIdQuery, wildCardIdQuery, conjuctionQuery, disjunctionQuery, wildCardQuery });
+ var downloadCountBooster = new FieldScoreQuery("DownloadCount", FieldScoreQuery.Type.INT);
+ return new CustomScoreQuery(conjuctionQuery.Combine(new Query[] { exactIdQuery, wildCardIdQuery, conjuctionQuery, disjunctionQuery, wildCardQuery }),
+ downloadCountBooster);
+ }
+
+ /// <summary>
+ /// Breaks the search text into the individual terms to query: every fragment obtained by splitting on
+ /// ' ', '.' and '-', followed by the complete search text, with case-insensitive duplicates removed.
+ /// </summary>
+ /// <param name="searchTerm">
+ /// Search text. ParseQuery passes it after QueryParser.Escape and ToLowerInvariant have been applied.
+ /// </param>
+ /// <returns>The distinct fragments, in order of appearance, followed by the full search text.</returns>
+ private static IEnumerable<string> GetSearchTerms(string searchTerm)
+ {
+     // QueryParser.Escape rewrites '-' as "\-", so splitting on '-' leaves the escape character dangling at
+     // the end of the preceding fragment (e.g. "jquery\-ui" yields "jquery\"). A lone trailing backslash is
+     // an incomplete escape sequence when handed to queryParser.Parse, so it is removed here; fragments that
+     // consisted only of that escape character are dropped.
+     return searchTerm.Split(new[] { ' ', '.', '-' }, StringSplitOptions.RemoveEmptyEntries)
+         .Select(RemoveDanglingEscape)
+         .Where(fragment => fragment.Length > 0)
+         .Concat(new[] { searchTerm })
+         .Distinct(StringComparer.OrdinalIgnoreCase);
+ }
+
+ /// <summary>
+ /// Removes an unpaired trailing backslash from a fragment. An even-length run of trailing backslashes is
+ /// left untouched because each pair is an escaped literal backslash.
+ /// </summary>
+ private static string RemoveDanglingEscape(string fragment)
+ {
+     var trailingBackslashes = fragment.Length - fragment.TrimEnd('\\').Length;
+     return trailingBackslashes % 2 == 1 ? fragment.Substring(0, fragment.Length - 1) : fragment;
+ }
}
}

0 comments on commit 2789792

Please sign in to comment.