Skip to content

Commit

Permalink
Introduced per field analyzer for purpose of better Title field token…
Browse files Browse the repository at this point in the history
…ization. Motive - trying to get all my new tests to pass.
  • Loading branch information
Tim Lovell-Smith committed Dec 12, 2012
1 parent 84129e8 commit ae9ad9e
Show file tree
Hide file tree
Showing 4 changed files with 147 additions and 32 deletions.
19 changes: 9 additions & 10 deletions Website/Infrastructure/Lucene/LuceneIndexingService.cs
Expand Up @@ -16,7 +16,8 @@ public class LuceneIndexingService : IIndexingService
{
private static readonly object IndexWriterLock = new object();
private static readonly TimeSpan IndexRecreateInterval = TimeSpan.FromDays(3);
private static readonly char[] IdSeparators = new[] { '.', '-' };

internal static readonly char[] IdSeparators = new[] { '.', '-' };

private Lucene.Net.Store.Directory _directory;
private IndexWriter _indexWriter;
Expand Down Expand Up @@ -132,14 +133,12 @@ private void AddPackage(PackageIndexEntity package)
// Lucene's StandardTokenizer does not tokenize items of the format a.b.c which does not play well with things like "xunit.net".
// We will feed it values that are already tokenized.
var titleTokens = String.IsNullOrEmpty(package.Title)
? tokenizedId
: package.Title.Split(IdSeparators, StringSplitOptions.RemoveEmptyEntries);
foreach (var idToken in titleTokens)
{
field = new Field("Title", idToken, Field.Store.NO, Field.Index.ANALYZED);
field.SetBoost(0.9f);
document.Add(field);
}
? string.Join(" ", tokenizedId)
: package.Title;

field = new Field("Title", titleTokens, Field.Store.NO, Field.Index.ANALYZED);
field.SetBoost(0.9f);
document.Add(field);

if (!String.IsNullOrEmpty(package.Tags))
{
Expand Down Expand Up @@ -193,7 +192,7 @@ protected void EnsureIndexWriter(bool creatingIndex)

private void EnsureIndexWriterCore(bool creatingIndex)
{
var analyzer = new StandardAnalyzer(LuceneCommon.LuceneVersion);
var analyzer = new PerFieldAnalyzer();
_indexWriter = new IndexWriter(_directory, analyzer, create: creatingIndex, mfl: IndexWriter.MaxFieldLength.UNLIMITED);

// Should always be add, due to locking
Expand Down
118 changes: 96 additions & 22 deletions Website/Infrastructure/Lucene/LuceneSearchService.cs
Expand Up @@ -13,6 +13,8 @@ public class LuceneSearchService : ISearchService
{
private Lucene.Net.Store.Directory _directory;

private static readonly string[] Fields = new[] { "Id", "Title", "Tags", "Description", "Author" };

public LuceneSearchService(Lucene.Net.Store.Directory directory)
{
_directory = directory;
Expand Down Expand Up @@ -97,9 +99,8 @@ private static Query ParseQuery(SearchFilter searchFilter)
return new MatchAllDocsQuery();
}

var fields = new[] { "Id", "Title", "Tags", "Description", "Author" };
var analyzer = new StandardAnalyzer(LuceneCommon.LuceneVersion);
var queryParser = new MultiFieldQueryParser(LuceneCommon.LuceneVersion, fields, analyzer);
var analyzer = new PerFieldAnalyzer();
var queryParser = new MultiFieldQueryParser(LuceneCommon.LuceneVersion, Fields, analyzer);

// All terms in the multi-term query appear in at least one of the fields.
var conjuctionQuery = new BooleanQuery();
Expand All @@ -113,37 +114,110 @@ private static Query ParseQuery(SearchFilter searchFilter)
var wildCardQuery = new BooleanQuery();
wildCardQuery.SetBoost(0.5f);

// Escape the entire term we use for exact searches.
var escapedSearchTerm = Escape(searchFilter.SearchTerm);
var exactIdQuery = new TermQuery(new Term("Id-Exact", escapedSearchTerm));
exactIdQuery.SetBoost(2.5f);
var wildCardIdQuery = new WildcardQuery(new Term("Id-Exact", "*" + escapedSearchTerm + "*"));
// Cleanup the search terms, and analyze the user intent - is this an ID search?
bool specificallySearchingNonIdFields = false;
var sanitizedTerms = GetSanitizedTerms(searchFilter.SearchTerm, out specificallySearchingNonIdFields);

foreach (var term in GetSearchTerms(searchFilter.SearchTerm))
Query executionQuery = null;
if (specificallySearchingNonIdFields)
{
// Don't do exact ID search or wildcard ID search
// Don't do our fancy optimizations
// Just rely on Lucene Query parser to do the right thing
executionQuery = queryParser.Parse(searchFilter.SearchTerm);
}
else
{
var termQuery = queryParser.Parse(term);
conjuctionQuery.Add(termQuery, BooleanClause.Occur.MUST);
disjunctionQuery.Add(termQuery, BooleanClause.Occur.SHOULD);
// Escape the final term for exact ID search
string exactId = string.Join(" ", sanitizedTerms);
string escapedExactId = Escape(exactId);

var exactIdQuery = new TermQuery(new Term("Id-Exact", escapedExactId));
exactIdQuery.SetBoost(2.5f);

foreach (var field in fields)
var wildCardIdQuery = new WildcardQuery(new Term("Id-Exact", "*" + escapedExactId + "*"));

foreach (var term in GetSearchTerms(searchFilter.SearchTerm))
{
var wildCardTermQuery = new WildcardQuery(new Term(field, term + "*"));
wildCardTermQuery.SetBoost(0.7f);
wildCardQuery.Add(wildCardTermQuery, BooleanClause.Occur.SHOULD);
var termQuery = queryParser.Parse(term);
conjuctionQuery.Add(termQuery, BooleanClause.Occur.MUST);
disjunctionQuery.Add(termQuery, BooleanClause.Occur.SHOULD);

// It might have been a field specific query?
string justOneField = null;
if (termQuery is TermQuery)
{
justOneField = (termQuery as TermQuery).GetTerm().Field();
}

// Or it might not.
foreach (var field in (justOneField == null ? Fields : new[] { justOneField }))
{
var wildCardTermQuery = new WildcardQuery(new Term(field, term + "*"));
wildCardTermQuery.SetBoost(0.7f);
wildCardQuery.Add(wildCardTermQuery, BooleanClause.Occur.SHOULD);
}
}
}

// Create an OR of all the queries that we have
var combinedQuery =
conjuctionQuery.Combine(new Query[] { exactIdQuery, wildCardIdQuery, conjuctionQuery, disjunctionQuery, wildCardQuery });
// Create an OR of all the queries that we have
executionQuery =
conjuctionQuery.Combine(new Query[] { exactIdQuery, wildCardIdQuery, conjuctionQuery, disjunctionQuery, wildCardQuery });
}

if (searchFilter.SortProperty == SortProperty.Relevance)
{
// If searching by relevance, boost scores by download count.
var downloadCountBooster = new FieldScoreQuery("DownloadCount", FieldScoreQuery.Type.INT);
return new CustomScoreQuery(combinedQuery, downloadCountBooster);
return new CustomScoreQuery(executionQuery, downloadCountBooster);
}
else
{
return executionQuery;
}
return combinedQuery;
}

/// <summary>
/// Strips Lucene query syntax from a raw search string and returns the bare terms.
/// </summary>
/// <remarks>
/// The input is split on spaces. For each part:
/// 'OR', 'AND' (any casing) and '-[term]' are dropped;
/// '+[term]' is returned as 'term';
/// '[field]:[term]', where [field] is one of <c>Fields</c> (any casing), is returned as 'term'.
/// </remarks>
/// <param name="searchTerm">The raw search text; must not be null.</param>
/// <param name="searchesNonIdFields">
/// Set to true when at least one part carries a field prefix other than "Id"; otherwise false.
/// </param>
/// <returns>The remaining terms, in their original order.</returns>
private static IEnumerable<string> GetSanitizedTerms(string searchTerm, out bool searchesNonIdFields)
{
    List<String> ret = new List<string>();
    searchesNonIdFields = false;
    var parts = searchTerm.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    foreach (var part in parts)
    {
        // Query operators and exclusions contribute nothing to the sanitized term list.
        // These are syntax tokens, not linguistic text, so compare ordinally.
        if (string.Equals(part, "OR", StringComparison.OrdinalIgnoreCase)
            || string.Equals(part, "AND", StringComparison.OrdinalIgnoreCase)
            || part.StartsWith("-", StringComparison.Ordinal))
        {
            continue;
        }

        string p = part;
        if (p.StartsWith("+", StringComparison.Ordinal))
        {
            p = p.Substring(1);
        }

        foreach (var field in Fields)
        {
            // The length check must come first: a part that is exactly a field name
            // (e.g. "id" or "Title") has no character at index field.Length, and
            // indexing it would throw IndexOutOfRangeException.
            if (p.Length > field.Length
                && p[field.Length] == ':'
                && p.StartsWith(field, StringComparison.OrdinalIgnoreCase))
            {
                if (field != "Id")
                {
                    searchesNonIdFields = true;
                }

                p = p.Substring(field.Length + 1);
                break;
            }
        }

        ret.Add(p);
    }

    return ret;
}

private static IEnumerable<string> GetSearchTerms(string searchTerm)
Expand Down
41 changes: 41 additions & 0 deletions Website/Infrastructure/Lucene/PerFieldAnalyzer.cs
@@ -0,0 +1,41 @@
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;

namespace NuGetGallery
{
/// <summary>
/// A <see cref="PerFieldAnalyzerWrapper"/> configured with a <see cref="StandardAnalyzer"/>
/// as its default analyzer and a dedicated analyzer registered for the "Title" field.
/// </summary>
public class PerFieldAnalyzer : PerFieldAnalyzerWrapper
{
    public PerFieldAnalyzer()
        : base(new StandardAnalyzer(LuceneCommon.LuceneVersion), CreateFieldAnalyzers())
    {
    }

    // Builds the field-name -> analyzer map handed to the wrapper base class.
    private static IDictionary CreateFieldAnalyzers()
    {
        var perField = new Dictionary<string, Analyzer>();
        perField.Add("Title", new TitleAnalyzer());
        return perField;
    }

    // Analyzer for the "Title" field: replaces the id separator characters with spaces
    // before delegating to a StandardAnalyzer.
    private class TitleAnalyzer : Analyzer
    {
        private readonly Analyzer _standardAnalyzer = new StandardAnalyzer(LuceneCommon.LuceneVersion);

        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            Debug.Assert(fieldName == "Title");

            // Break the title apart on LuceneIndexingService.IdSeparators and glue the
            // pieces back together with single spaces.
            string[] pieces = reader.ReadToEnd().Split(LuceneIndexingService.IdSeparators, StringSplitOptions.RemoveEmptyEntries);
            var respaced = new StringReader(string.Join(" ", pieces));

            return _standardAnalyzer.TokenStream(fieldName, respaced);
        }
    }
}
}
1 change: 1 addition & 0 deletions Website/Website.csproj
Expand Up @@ -302,6 +302,7 @@
<Compile Include="Infrastructure\Lucene\LuceneIndexingService.cs" />
<Compile Include="Infrastructure\Jobs\WorkItemCleanupJob.cs" />
<Compile Include="Infrastructure\Jobs\UpdateStatisticsJob.cs" />
<Compile Include="Infrastructure\Lucene\PerFieldAnalyzer.cs" />
<Compile Include="Infrastructure\Lucene\PackageIndexEntity.cs" />
<Compile Include="JsonApiController.generated.cs">
<DependentUpon>T4MVC.tt</DependentUpon>
Expand Down

0 comments on commit ae9ad9e

Please sign in to comment.