Skip to content

Commit

Permalink
Implement improvements suggested in #61
Browse files Browse the repository at this point in the history
  • Loading branch information
gabriele-tomassetti committed Apr 29, 2024
1 parent 3b0f3b3 commit 338b14b
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 25 deletions.
1 change: 1 addition & 0 deletions docfx_project/articles/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ All notable changes to this project will be documented in this file.
- Added fixes from latest updates of Readability up until April 2024
- Fix parsing of JSON-LD element
- Fix issue #60, unexpected exception thrown for forbidden content (thanks to [doggy8088](https://github.com/doggy8088))
- Added the suggested performance improvements to the conversion to plain text (thanks to [malv007](https://github.com/malv007))

## 0.9.4 - 2023/08/27
- Fix issue #58, data URIs in IMG SRC not preserved, treated as relative URL (thanks to [Acidus](https://github.com/acidus99))
Expand Down
27 changes: 11 additions & 16 deletions src/SmartReader/Article.cs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ public bool Completed
public static Func<IElement, string> Serializer { get; set; } = new Func<IElement, string>(el => el.InnerHtml);

/// <summary>The function that will extract the text from the HTML content</summary>
/// <value>Default: return InnerHTML property</value>
/// <value>Default: the ConvertToPlaintext method</value>
public static Func<IElement, string> Converter { get; set; } = ConvertToPlaintext;

private readonly IElement? _element = null;
Expand Down Expand Up @@ -99,6 +99,9 @@ public string TextContent
/// <value>The length in chars of <c>TextContent</c></value>
public int Length => TextContent.Length;

private static readonly Regex RE_EliminateTabs = new Regex("\t+", RegexOptions.Compiled);
private static readonly Regex RE_NormalizeNewLines = new Regex("(\\r?\\n){3,}", RegexOptions.Compiled);


internal Article(Uri uri, string title, string? byline, string? dir, string? language, string? author, IElement element, Metadata metadata, bool readable, Reader reader)
{
Expand Down Expand Up @@ -250,22 +253,18 @@ public async Task ConvertImagesToDataUriAsync(long minSize = 75000)
/// </returns>
private static string ConvertToPlaintext(IElement doc)
{
var writer = new StringWriter();
var sb = new StringBuilder();

var sb = new StringBuilder();

string text = ConvertToText(doc, sb);
ConvertToText(doc, sb);

bool previousSpace = false;
bool previousNewline = false;
int index = 0;

string text = sb.ToString();
// fix whitespace
// replace tabs with one space
text = Regex.Replace(text, "\t+", " ");

// replace multiple newlines with max two
text = Regex.Replace(text, "(\\r?\\n){3,}", $"{writer.NewLine}{writer.NewLine}");
text = RE_EliminateTabs.Replace(text, " ");

var stringBuilder = new StringBuilder(text);

Expand Down Expand Up @@ -295,21 +294,19 @@ private static string ConvertToPlaintext(IElement doc)
text = stringBuilder.ToString().Trim();

// replace multiple newlines with max two
text = Regex.Replace(text, "(\\r?\\n){3,}", $"{writer.NewLine}{writer.NewLine}");

writer.Dispose();
text = RE_NormalizeNewLines.Replace(text, $"{Environment.NewLine}{Environment.NewLine}");

return text;
}

/// <summary>
/// The function that converts HTML markup to text
/// </summary>
public static string ConvertToText(IElement doc, StringBuilder text)
private static void ConvertToText(IElement doc, StringBuilder text)
{
if (doc.NodeType == NodeType.Element && doc.NodeName is "P" or "BR")
{
text.AppendLine();
text.AppendLine();
}

if (doc.HasChildNodes)
Expand All @@ -332,8 +329,6 @@ public static string ConvertToText(IElement doc, StringBuilder text)

if (doc.NodeType is NodeType.Element && doc.NodeName is "P")
text.AppendLine();

return text.ToString();
}
}
}
24 changes: 15 additions & 9 deletions src/SmartReaderTests/BasicTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -350,15 +350,21 @@ public void TestPlaintextConversion()
{
// creating element
var parser = new HtmlParser(new HtmlParserOptions());
var doc = parser.ParseDocument(@"<html>
<head></head>
<body>
<p> </p>
<p>This is a paragraph with some text.</p>
<p>This is a paragraph with some other text and lots of whitespace .</p>
<p>This is a paragraph with different<br> other text.</p>
</body>
</html>");
var text = "<html>\r\n" +
"<head></head>\r\n" +
"<body>\r\n" +
" <p> </p>\r\n" +
" <p>This is a paragraph with some text.</p>\r\n" +
"\r\n" +
" <p>This is a paragraph with some other text and lots of whitespace .</p>\r\n" +
"\r\n" +
"\r\n" +
"\r\n" +
" <p>This is a paragraph with different<br> other text.</p>\r\n" +
"</body>\r\n" +
"</html>";

var doc = parser.ParseDocument(text);

var reader = new Reader("https://localhost/article");

Expand Down

0 comments on commit 338b14b

Please sign in to comment.