Permalink
Browse files

Moving over to better word-skipping detection.

The old method was getting too convoluted and had various bugs. The new method will likely be a bit slower, but should be better about which words to skip. Also introduced unit tests.
  • Loading branch information...
Noah Richards
Noah Richards committed Apr 22, 2010
1 parent 0d7b897 commit cb178d7a1e647a498926153cccd0f6fd36ad006f
@@ -43,3 +43,4 @@
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]
+[assembly: InternalsVisibleTo("SpellChecker.Test")]
@@ -65,12 +65,6 @@ public ITagSpan<MisspellingTag> ToTagSpan(ITextSnapshot snapshot)
sealed class SpellingTagger : ITagger<MisspellingTag>
{
- struct SpanToParse
- {
- public SnapshotSpan Span;
- public NormalizedSnapshotSpanCollection NaturalTextSpans;
- }
-
ITextBuffer _buffer;
ITagAggregator<INaturalTextTag> _naturalTextAggregator;
ITagAggregator<IUrlTag> _urlAggregator;
@@ -282,8 +276,6 @@ void StartUpdateThread(object sender, EventArgs e)
// Normalize the dirty spans
ITextSnapshot snapshot = _buffer.CurrentSnapshot;
var normalizedSpans = new NormalizedSnapshotSpanCollection(dirtySpans.Select(s => s.TranslateTo(snapshot, SpanTrackingMode.EdgeInclusive)));
- var spansToParse = normalizedSpans.Select(s => new SpanToParse() { Span = s, NaturalTextSpans = GetNaturalLanguageSpansForDirtySpan(s) })
- .ToList();
_updateThread = new Thread(GuardedCheckSpellings)
{
@@ -294,24 +286,24 @@ void StartUpdateThread(object sender, EventArgs e)
if (!_updateThread.TrySetApartmentState(ApartmentState.STA))
Debug.Fail("Unable to set thread apartment state to STA, things *will* break.");
- _updateThread.Start(spansToParse);
+ _updateThread.Start(normalizedSpans);
}
- void GuardedCheckSpellings(object spansToParseObject)
+ void GuardedCheckSpellings(object dirtySpansObject)
{
if (_isClosed)
return;
try
{
- IEnumerable<SpanToParse> spansToParse = spansToParseObject as IEnumerable<SpanToParse>;
- if (spansToParse == null)
+ IEnumerable<SnapshotSpan> dirtySpans = dirtySpansObject as IEnumerable<SnapshotSpan>;
+ if (dirtySpans == null)
{
Debug.Fail("Being asked to check a null list of dirty spans. What gives?");
return;
}
- CheckSpellings(spansToParse);
+ CheckSpellings(dirtySpans);
}
catch (Exception ex)
{
@@ -322,18 +314,23 @@ void GuardedCheckSpellings(object spansToParseObject)
}
}
- void CheckSpellings(IEnumerable<SpanToParse> spansToParse)
+ void CheckSpellings(IEnumerable<SnapshotSpan> dirtySpans)
{
TextBox textBox = new TextBox();
textBox.SpellCheck.IsEnabled = true;
ITextSnapshot snapshot = _buffer.CurrentSnapshot;
- foreach (var spanToParse in spansToParse)
+ foreach (var dirtySpan in dirtySpans)
{
- var dirty = spanToParse.Span.TranslateTo(snapshot, SpanTrackingMode.EdgeInclusive);
+ var dirty = dirtySpan.TranslateTo(snapshot, SpanTrackingMode.EdgeInclusive);
+
+ // We have to go back to the UI thread to get natural text spans
+ List<SnapshotSpan> naturalTextSpans = new List<SnapshotSpan>();
+ OnForegroundThread(() => naturalTextSpans = GetNaturalLanguageSpansForDirtySpan(dirty).ToList());
+
var naturalText = new NormalizedSnapshotSpanCollection(
- spanToParse.NaturalTextSpans.Select(span => span.TranslateTo(snapshot, SpanTrackingMode.EdgeInclusive)));
+ naturalTextSpans.Select(span => span.TranslateTo(snapshot, SpanTrackingMode.EdgeInclusive)));
List<MisspellingTag> currentMisspellings = new List<MisspellingTag>(_misspellings);
List<MisspellingTag> newMisspellings = new List<MisspellingTag>();
@@ -380,62 +377,9 @@ IEnumerable<MisspellingTag> GetMisspellingsInSpans(NormalizedSnapshotSpanCollect
{
string text = span.GetText();
- // We need to break this up for WPF, because it is *incredibly* slow at checking the spelling
- for (int i = 0; i < text.Length; i++)
+ foreach (var word in GetWordsInText(text))
{
- if (!IsSpellingWordChar(text[i]))
- continue;
-
- // We've found a word (or something), so search for the next piece of whitespace or punctuation to get the entire word span.
- // However, we will ignore words in a few cases:
- // 1) Words that are CamelCased, since those are probably not "real" words to begin with.
- // 2) Things that look like filenames (contain a "." followed by something other than a "."). We may miss a few "real" misspellings
- // here due to a missed space after a period, but that's acceptable.
- // 3) Words that include digits
- // 4) Words that include underscores
- // 5) Words in ALL CAPS
- int end = i;
- bool foundLower = false;
- bool ignoreWord = false;
- bool lastLetterWasADot = false;
-
- for (; end < text.Length; end++)
- {
- char c = text[end];
-
- if (!ignoreWord)
- {
- bool isUppercase = char.IsUpper(c);
-
- if (foundLower && isUppercase)
- ignoreWord = true;
- else if (c == '_')
- ignoreWord = true;
- else if (char.IsDigit(c))
- ignoreWord = true;
- else if (lastLetterWasADot && c != '.')
- ignoreWord = true;
-
- foundLower = foundLower || char.IsLower(c);
- lastLetterWasADot = (c == '.');
- }
-
- if (!IsSpellingWordChar(c))
- break;
- }
-
- // If this word is in ALL CAPS, ignore it
- if (!foundLower)
- ignoreWord = true;
-
- // Skip this word and move on to the next
- if (ignoreWord)
- {
- i = end - 1;
- continue;
- }
-
- string textToParse = text.Substring(i, end - i);
+ string textToParse = span.Snapshot.GetText(span.Start + word.Start, word.Length);
// Now pass these off to WPF.
textBox.Text = textToParse;
@@ -453,9 +397,9 @@ IEnumerable<MisspellingTag> GetMisspellingsInSpans(NormalizedSnapshotSpanCollect
if (nextChars.StartsWith("'s"))
length += 2;
- SnapshotSpan errorSpan = new SnapshotSpan(span.Snapshot, span.Start + i + nextSpellingErrorIndex, length);
+ SnapshotSpan errorSpan = new SnapshotSpan(span.Start + word.Start + nextSpellingErrorIndex, length);
- if (!_dictionary.ShouldIgnoreWord(errorSpan.GetText()))
+ if (ProbablyARealWord(errorSpan.GetText()) && !_dictionary.ShouldIgnoreWord(errorSpan.GetText()))
{
yield return new MisspellingTag(errorSpan, spellingError.Suggestions.ToArray());
}
@@ -464,19 +408,66 @@ IEnumerable<MisspellingTag> GetMisspellingsInSpans(NormalizedSnapshotSpanCollect
if (nextSearchIndex >= textToParse.Length)
break;
}
+ }
+ }
+ }
+
+        /// <summary>
+        /// Heuristically decide whether <paramref name="word"/> is natural-language text that is
+        /// worth reporting as a misspelling. A token is rejected when it is any of the following:
+        ///  1) CamelCased (an uppercase character appears after the first letter), since those
+        ///     are probably not "real" words to begin with.
+        ///  2) Filename-like: it contains a "." directly followed by a letter ("readme.txt").
+        ///     A trailing period or an ellipsis ("end.", "wait...") does not count. We may miss a
+        ///     few real misspellings caused by a missing space after a period; that is acceptable.
+        ///  3) Contains a digit.
+        ///  4) Contains an underscore.
+        ///  5) Written in ALL CAPS (covered by the same uppercase-after-first-letter test as 1).
+        /// </summary>
+        /// <param name="word">Candidate token; surrounding whitespace is ignored.</param>
+        /// <returns>
+        /// <c>true</c> if the token looks like an ordinary word; <c>false</c> if it is null, empty,
+        /// whitespace-only, or matches one of the rules above.
+        /// </returns>
+        internal static bool ProbablyARealWord(string word)
+        {
+            if (string.IsNullOrWhiteSpace(word))
+                return false;
+
+            word = word.Trim();
+
+            // Rules 3 and 4: digits or underscores mark the token as an identifier, not prose.
+            if (word.Any(c => c == '_' || char.IsDigit(c)))
+                return false;
+
+            // Rule 2: a '.' immediately followed by a letter looks like a filename or a dotted
+            // name. Only letters are tested so that sentence-ending punctuation such as
+            // "end." or "wait..." is still treated as a word.
+            for (int i = 0; i < word.Length - 1; i++)
+            {
+                if (word[i] == '.' && char.IsLetter(word[i + 1]))
+                    return false;
+            }
+
+            // Rules 1 and 5: skip everything up to and including the first letter, then reject the
+            // token if any later character is uppercase. A leading capital ("Hello") is allowed.
+            if (word.SkipWhile(c => !char.IsLetter(c)).Skip(1).Any(c => char.IsUpper(c)))
+                return false;
+
+            return true;
+        }
- // Move past this word
- i = end - 1;
+        /// <summary>
+        /// Split <paramref name="text"/> into whitespace-delimited tokens and yield the span
+        /// (start offset and length within <paramref name="text"/>) of each one, in order.
+        /// Callers hand the tokens to WPF one at a time, because WPF is *incredibly* slow at
+        /// spell checking a large block of text in a single pass.
+        /// </summary>
+        /// <param name="text">Text to tokenize; null, empty or all-whitespace yields nothing.</param>
+        /// <returns>One span per maximal run of non-whitespace characters.</returns>
+        static internal IEnumerable<Microsoft.VisualStudio.Text.Span> GetWordsInText(string text)
+        {
+            if (string.IsNullOrWhiteSpace(text))
+                yield break;
+
+            int position = 0;
+            while (position < text.Length)
+            {
+                // Step over separators until a token begins.
+                if (char.IsWhiteSpace(text[position]))
+                {
+                    position++;
+                    continue;
+                }
+
+                // Advance to the first whitespace character (or the end of the text).
+                int wordStart = position;
+                while (position < text.Length && !char.IsWhiteSpace(text[position]))
+                    position++;
+
+                yield return Microsoft.VisualStudio.Text.Span.FromBounds(wordStart, position);
+            }
        }
- /// <summary>
- /// Determine if the given character is a "spelling" word char, which includes a few more things than just characters
- /// </summary>
- bool IsSpellingWordChar(char c)
+ void OnForegroundThread(Action action, DispatcherPriority priority = DispatcherPriority.ApplicationIdle)
{
- return c == '\'' || c == '`' || c == '-' || c == '.' || char.IsLetter(c);
+ _dispatcher.Invoke(action, priority);
}
#endregion
View
@@ -5,7 +5,19 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SpellChecker.Definitions",
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SpellChecker", "SpellChecker.Implementation\SpellChecker.csproj", "{34115C0C-E2B4-42D7-9B23-F613C042B416}"
EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{B3E23353-454E-40CD-8871-7D2EA1EC6355}"
+ ProjectSection(SolutionItems) = preProject
+ Local.testsettings = Local.testsettings
+ SpellChecker.vsmdi = SpellChecker.vsmdi
+ TraceAndTestImpact.testsettings = TraceAndTestImpact.testsettings
+ EndProjectSection
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SpellChecker.Test", "Test\SpellChecker.Test.csproj", "{C1EDBDEC-D389-4EB1-8768-1D6F37C0B203}"
+EndProject
Global
+ GlobalSection(TestCaseManagementSettings) = postSolution
+ CategoryFile = SpellChecker.vsmdi
+ EndGlobalSection
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
@@ -19,6 +31,10 @@ Global
{34115C0C-E2B4-42D7-9B23-F613C042B416}.Debug|Any CPU.Build.0 = Debug|Any CPU
{34115C0C-E2B4-42D7-9B23-F613C042B416}.Release|Any CPU.ActiveCfg = Release|Any CPU
{34115C0C-E2B4-42D7-9B23-F613C042B416}.Release|Any CPU.Build.0 = Release|Any CPU
+ {C1EDBDEC-D389-4EB1-8768-1D6F37C0B203}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {C1EDBDEC-D389-4EB1-8768-1D6F37C0B203}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {C1EDBDEC-D389-4EB1-8768-1D6F37C0B203}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {C1EDBDEC-D389-4EB1-8768-1D6F37C0B203}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -0,0 +1,35 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("Test")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("Microsoft")]
+[assembly: AssemblyProduct("Test")]
+[assembly: AssemblyCopyright("Copyright © Microsoft 2010")]
+[assembly: AssemblyTrademark("")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components. If you need to access a type in this assembly from
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("b117041a-fbd9-4b62-945e-e366945a788a")]
+
+// Version information for an assembly consists of the following four values:
+//
+// Major Version
+// Minor Version
+// Build Number
+// Revision
+//
+// You can specify all the values or you can default the Build and Revision Numbers
+// by using the '*' as shown below:
+[assembly: AssemblyVersion("1.0.0.0")]
+[assembly: AssemblyFileVersion("1.0.0.0")]
@@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <PropertyGroup>
+ <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+ <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+ <ProductVersion>
+ </ProductVersion>
+ <SchemaVersion>2.0</SchemaVersion>
+ <ProjectGuid>{C1EDBDEC-D389-4EB1-8768-1D6F37C0B203}</ProjectGuid>
+ <OutputType>Library</OutputType>
+ <AppDesignerFolder>Properties</AppDesignerFolder>
+ <RootNamespace>SpellChecker.Test</RootNamespace>
+ <AssemblyName>SpellChecker.Test</AssemblyName>
+ <TargetFrameworkVersion>v4.0</TargetFrameworkVersion>
+ <FileAlignment>512</FileAlignment>
+ <ProjectTypeGuids>{3AC096D0-A1C2-E12C-1390-A8335801FDAB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
+ </PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+ <DebugSymbols>true</DebugSymbols>
+ <DebugType>full</DebugType>
+ <Optimize>false</Optimize>
+ <OutputPath>bin\Debug\</OutputPath>
+ <DefineConstants>DEBUG;TRACE</DefineConstants>
+ <ErrorReport>prompt</ErrorReport>
+ <WarningLevel>4</WarningLevel>
+ </PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+ <DebugType>pdbonly</DebugType>
+ <Optimize>true</Optimize>
+ <OutputPath>bin\Release\</OutputPath>
+ <DefineConstants>TRACE</DefineConstants>
+ <ErrorReport>prompt</ErrorReport>
+ <WarningLevel>4</WarningLevel>
+ </PropertyGroup>
+ <ItemGroup>
+ <Reference Include="Microsoft.CSharp" />
+ <Reference Include="Microsoft.VisualStudio.CoreUtility, Version=10.0.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL" />
+ <Reference Include="Microsoft.VisualStudio.Language.Intellisense, Version=10.0.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL" />
+ <Reference Include="Microsoft.VisualStudio.Language.StandardClassification, Version=10.0.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL" />
+ <Reference Include="Microsoft.VisualStudio.QualityTools.UnitTestFramework, Version=10.0.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL" />
+ <Reference Include="Microsoft.VisualStudio.Text.Data, Version=10.0.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL" />
+ <Reference Include="Microsoft.VisualStudio.Text.Logic, Version=10.0.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL" />
+ <Reference Include="Microsoft.VisualStudio.Text.UI, Version=10.0.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL" />
+ <Reference Include="Microsoft.VisualStudio.Text.UI.Wpf, Version=10.0.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL" />
+ <Reference Include="PresentationCore" />
+ <Reference Include="PresentationFramework" />
+ <Reference Include="System" />
+ <Reference Include="System.ComponentModel.Composition" />
+ <Reference Include="System.Core">
+ <RequiredTargetFramework>3.5</RequiredTargetFramework>
+ </Reference>
+ <Reference Include="System.Xaml" />
+ <Reference Include="WindowsBase" />
+ </ItemGroup>
+ <ItemGroup>
+ <CodeAnalysisDependentAssemblyPaths Condition=" '$(VS100COMNTOOLS)' != '' " Include="$(VS100COMNTOOLS)..\IDE\PrivateAssemblies">
+ <Visible>False</Visible>
+ </CodeAnalysisDependentAssemblyPaths>
+ </ItemGroup>
+ <ItemGroup>
+ <Compile Include="Properties\AssemblyInfo.cs" />
+ <Compile Include="WordLogicTests.cs" />
+ </ItemGroup>
+ <ItemGroup>
+ <ProjectReference Include="..\SpellChecker.Definitions\SpellChecker.Definitions.csproj">
+ <Project>{87D22AC6-424B-48DD-A5E8-DCB7CB3DDD63}</Project>
+ <Name>SpellChecker.Definitions</Name>
+ </ProjectReference>
+ <ProjectReference Include="..\SpellChecker.Implementation\SpellChecker.csproj">
+ <Project>{34115C0C-E2B4-42D7-9B23-F613C042B416}</Project>
+ <Name>SpellChecker</Name>
+ </ProjectReference>
+ </ItemGroup>
+ <Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
+ <!-- To modify your build process, add your task inside one of the targets below and uncomment it.
+ Other similar extension points exist, see Microsoft.Common.targets.
+ <Target Name="BeforeBuild">
+ </Target>
+ <Target Name="AfterBuild">
+ </Target>
+ -->
+</Project>
Oops, something went wrong.

1 comment on commit cb178d7

@kamaunyrroh

This comment has been minimized.

Show comment Hide comment
@kamaunyrroh

kamaunyrroh Jul 14, 2015

How does your spellchecker work? I want to install it and test it.

How does your spellchecker work? I want to install it and test it.

Please sign in to comment.