Introduced a per-field analyzer for the purpose of better Title field tokenization. Motive: trying to get all my new tests to pass.
NuGet · Dec 12, 2012 · ae9ad9e · ae9ad9e
1 parent 84129e8
commit ae9ad9e
Show file tree

Hide file tree

Showing 4 changed files with 147 additions and 32 deletions.
diff --git a/Website/Infrastructure/Lucene/LuceneIndexingService.cs b/Website/Infrastructure/Lucene/LuceneIndexingService.cs
@@ -16,7 +16,8 @@ public class LuceneIndexingService : IIndexingService
     {
         private static readonly object IndexWriterLock = new object();
         private static readonly TimeSpan IndexRecreateInterval = TimeSpan.FromDays(3);
-        private static readonly char[] IdSeparators = new[] { '.', '-' };
+
+        internal static readonly char[] IdSeparators = new[] { '.', '-' };
 
         private Lucene.Net.Store.Directory _directory;
         private IndexWriter _indexWriter;
@@ -132,14 +133,12 @@ private void AddPackage(PackageIndexEntity package)
             // Lucene's StandardTokenizer does not tokenize items of the format a.b.c which does not play well with things like "xunit.net". 
             // We will feed it values that are already tokenized.
             var titleTokens = String.IsNullOrEmpty(package.Title)
-                                  ? tokenizedId
-                                  : package.Title.Split(IdSeparators, StringSplitOptions.RemoveEmptyEntries);
-            foreach (var idToken in titleTokens)
-            {
-                field = new Field("Title", idToken, Field.Store.NO, Field.Index.ANALYZED);
-                field.SetBoost(0.9f);
-                document.Add(field);
-            }
+                                  ? string.Join(" ", tokenizedId)
+                                  : package.Title;
+
+            field = new Field("Title", titleTokens, Field.Store.NO, Field.Index.ANALYZED);
+            field.SetBoost(0.9f);
+            document.Add(field);
 
             if (!String.IsNullOrEmpty(package.Tags))
             {
@@ -193,7 +192,7 @@ protected void EnsureIndexWriter(bool creatingIndex)
 
         private void EnsureIndexWriterCore(bool creatingIndex)
         {
-            var analyzer = new StandardAnalyzer(LuceneCommon.LuceneVersion);
+            var analyzer = new PerFieldAnalyzer();
             _indexWriter = new IndexWriter(_directory, analyzer, create: creatingIndex, mfl: IndexWriter.MaxFieldLength.UNLIMITED);
 
             // Should always be add, due to locking

diff --git a/Website/Infrastructure/Lucene/LuceneSearchService.cs b/Website/Infrastructure/Lucene/LuceneSearchService.cs
@@ -13,6 +13,8 @@ public class LuceneSearchService : ISearchService
     {
         private Lucene.Net.Store.Directory _directory;
 
+        private static readonly string[] Fields = new[] { "Id", "Title", "Tags", "Description", "Author" };
+
         public LuceneSearchService(Lucene.Net.Store.Directory directory)
         {
             _directory = directory;
@@ -97,9 +99,8 @@ private static Query ParseQuery(SearchFilter searchFilter)
                 return new MatchAllDocsQuery();
             }
 
-            var fields = new[] { "Id", "Title", "Tags", "Description", "Author" };
-            var analyzer = new StandardAnalyzer(LuceneCommon.LuceneVersion);
-            var queryParser = new MultiFieldQueryParser(LuceneCommon.LuceneVersion, fields, analyzer);
+            var analyzer = new PerFieldAnalyzer();
+            var queryParser = new MultiFieldQueryParser(LuceneCommon.LuceneVersion, Fields, analyzer);
 
             // All terms in the multi-term query appear in at least one of the fields.
             var conjuctionQuery = new BooleanQuery();
@@ -113,37 +114,110 @@ private static Query ParseQuery(SearchFilter searchFilter)
             var wildCardQuery = new BooleanQuery();
             wildCardQuery.SetBoost(0.5f);
 
-            // Escape the entire term we use for exact searches.
-            var escapedSearchTerm = Escape(searchFilter.SearchTerm);
-            var exactIdQuery = new TermQuery(new Term("Id-Exact", escapedSearchTerm));
-            exactIdQuery.SetBoost(2.5f);
-            var wildCardIdQuery = new WildcardQuery(new Term("Id-Exact", "*" + escapedSearchTerm + "*"));
+            // Cleanup the search terms, and analyze the user intent - is this an ID search?
+            bool specificallySearchingNonIdFields = false;
+            var sanitizedTerms = GetSanitizedTerms(searchFilter.SearchTerm, out specificallySearchingNonIdFields);
 
-            foreach (var term in GetSearchTerms(searchFilter.SearchTerm))
+            Query executionQuery = null;
+            if (specificallySearchingNonIdFields)
+            {
+                // Don't do exact ID search or wildcard ID search
+                // Don't do our fancy optimizations
+                // Just rely on Lucene Query parser to do the right thing
+                executionQuery = queryParser.Parse(searchFilter.SearchTerm);
+            }
+            else 
             {
-                var termQuery = queryParser.Parse(term);
-                conjuctionQuery.Add(termQuery, BooleanClause.Occur.MUST);
-                disjunctionQuery.Add(termQuery, BooleanClause.Occur.SHOULD);
+                // Escape the final term for exact ID search
+                string exactId = string.Join(" ", sanitizedTerms);
+                string escapedExactId = Escape(exactId);
+
+                var exactIdQuery = new TermQuery(new Term("Id-Exact", escapedExactId));
+                exactIdQuery.SetBoost(2.5f);
 
-                foreach (var field in fields)
+                var wildCardIdQuery = new WildcardQuery(new Term("Id-Exact", "*" + escapedExactId + "*"));
+
+                foreach (var term in GetSearchTerms(searchFilter.SearchTerm))
                 {
-                    var wildCardTermQuery = new WildcardQuery(new Term(field, term + "*"));
-                    wildCardTermQuery.SetBoost(0.7f);
-                    wildCardQuery.Add(wildCardTermQuery, BooleanClause.Occur.SHOULD);
+                    var termQuery = queryParser.Parse(term);
+                    conjuctionQuery.Add(termQuery, BooleanClause.Occur.MUST);
+                    disjunctionQuery.Add(termQuery, BooleanClause.Occur.SHOULD);
+
+                    // It might have been a field specific query?
+                    string justOneField = null;
+                    if (termQuery is TermQuery)
+                    {
+                        justOneField = (termQuery as TermQuery).GetTerm().Field();
+                    }
+
+                    // Or it might not.
+                    foreach (var field in (justOneField == null ? Fields : new[] { justOneField }))
+                    {
+                        var wildCardTermQuery = new WildcardQuery(new Term(field, term + "*"));
+                        wildCardTermQuery.SetBoost(0.7f);
+                        wildCardQuery.Add(wildCardTermQuery, BooleanClause.Occur.SHOULD);
+                    }
                 }
-            }
 
-            // Create an OR of all the queries that we have
-            var combinedQuery =
-                conjuctionQuery.Combine(new Query[] { exactIdQuery, wildCardIdQuery, conjuctionQuery, disjunctionQuery, wildCardQuery });
+                // Create an OR of all the queries that we have
+                executionQuery = 
+                    conjuctionQuery.Combine(new Query[] { exactIdQuery, wildCardIdQuery, conjuctionQuery, disjunctionQuery, wildCardQuery });
+            }
 
             if (searchFilter.SortProperty == SortProperty.Relevance)
             {
                 // If searching by relevance, boost scores by download count.
                 var downloadCountBooster = new FieldScoreQuery("DownloadCount", FieldScoreQuery.Type.INT);
-                return new CustomScoreQuery(combinedQuery, downloadCountBooster);
+                return new CustomScoreQuery(executionQuery, downloadCountBooster);
+            }
+            else
+            {
+                return executionQuery;
             }
-            return combinedQuery;
+        }
+
+        // Strips Lucene query syntax from a raw search string and returns the bare terms, in order.
+        //   'OR', 'AND' (in any casing) and '-[term]' are dropped.
+        //   '+[term]' and '[field]:[term]' are returned as 'term'.
+        // searchesNonIdFields is set to true when at least one term carries a '[field]:' prefix
+        // naming one of Fields other than "Id"; otherwise it is false.
+        private static IEnumerable<string> GetSanitizedTerms(string searchTerm, out bool searchesNonIdFields)
+        {
+            List<String> ret = new List<string>();
+            searchesNonIdFields = false;
+            var parts = searchTerm.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
+            foreach (var part in parts)
+            {
+                // Operators and excluded terms contribute nothing to the sanitized output.
+                // Ordinal comparisons are used throughout: these are syntax markers and
+                // identifiers, not linguistic text.
+                if (string.Equals(part, "OR", StringComparison.OrdinalIgnoreCase)
+                    || string.Equals(part, "AND", StringComparison.OrdinalIgnoreCase)
+                    || part.StartsWith("-", StringComparison.Ordinal))
+                {
+                    continue;
+                }
+
+                string p = part;
+                if (p.StartsWith("+", StringComparison.Ordinal))
+                {
+                    p = p.Substring(1);
+                }
+
+                foreach (var field in Fields)
+                {
+                    // The length guard matters: a bare term that is itself a field name
+                    // (e.g. "id" or "Tags") has no character at index field.Length, and
+                    // indexing it would throw IndexOutOfRangeException.
+                    if (p.Length > field.Length
+                        && p.StartsWith(field, StringComparison.OrdinalIgnoreCase)
+                        && p[field.Length] == ':')
+                    {
+                        if (field != "Id")
+                        {
+                            searchesNonIdFields = true;
+                        }
+
+                        // Keep only what follows the '[field]:' prefix.
+                        p = p.Substring(field.Length + 1);
+                        break;
+                    }
+                }
+
+                ret.Add(p);
+            }
+
+            return ret;
         }
 
         private static IEnumerable<string> GetSearchTerms(string searchTerm)

diff --git a/Website/Infrastructure/Lucene/PerFieldAnalyzer.cs b/Website/Infrastructure/Lucene/PerFieldAnalyzer.cs
@@ -0,0 +1,41 @@
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Standard;
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.Linq;
+
+namespace NuGetGallery
+{
+    /// <summary>
+    /// A <see cref="PerFieldAnalyzerWrapper"/> constructed with a StandardAnalyzer as its
+    /// default analyzer and a dedicated analyzer registered for the "Title" field.
+    /// </summary>
+    public class PerFieldAnalyzer : PerFieldAnalyzerWrapper
+    {
+        public PerFieldAnalyzer()
+            : base(new StandardAnalyzer(LuceneCommon.LuceneVersion), CreateFieldAnalyzers())
+        {
+        }
+
+        /// <summary>
+        /// Builds the field-name to analyzer map handed to the wrapper's constructor.
+        /// </summary>
+        private static IDictionary CreateFieldAnalyzers()
+        {
+            var analyzersByField = new Dictionary<string, Analyzer>();
+            analyzersByField.Add("Title", new TitleAnalyzer());
+            return analyzersByField;
+        }
+
+        /// <summary>
+        /// Analyzer for the "Title" field: separates the text on
+        /// LuceneIndexingService.IdSeparators before delegating to a StandardAnalyzer.
+        /// </summary>
+        class TitleAnalyzer : Analyzer
+        {
+            private Analyzer _standardAnalyzer = new StandardAnalyzer(LuceneCommon.LuceneVersion);
+
+            public override TokenStream TokenStream(string fieldName, TextReader reader)
+            {
+                Debug.Assert(fieldName == "Title");
+
+                // Runs of separator characters become a single space (leading and trailing
+                // ones vanish), so the inner analyzer receives the pieces as separate words.
+                string rawTitle = reader.ReadToEnd();
+                string[] pieces = rawTitle.Split(LuceneIndexingService.IdSeparators, StringSplitOptions.RemoveEmptyEntries);
+                var spacedTitle = new StringReader(string.Join(" ", pieces));
+
+                return _standardAnalyzer.TokenStream(fieldName, spacedTitle);
+            }
+        }
+    }
+}
diff --git a/Website/Website.csproj b/Website/Website.csproj
@@ -302,6 +302,7 @@
     <Compile Include="Infrastructure\Lucene\LuceneIndexingService.cs" />
     <Compile Include="Infrastructure\Jobs\WorkItemCleanupJob.cs" />
     <Compile Include="Infrastructure\Jobs\UpdateStatisticsJob.cs" />
+    <Compile Include="Infrastructure\Lucene\PerFieldAnalyzer.cs" />
     <Compile Include="Infrastructure\Lucene\PackageIndexEntity.cs" />
     <Compile Include="JsonApiController.generated.cs">
       <DependentUpon>T4MVC.tt</DependentUpon>