-
-
Notifications
You must be signed in to change notification settings - Fork 659
/
MarkdownParser.cs
291 lines (244 loc) · 14.2 KB
/
MarkdownParser.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using DiscordChatExporter.Core.Markdown.Ast;
using DiscordChatExporter.Core.Markdown.Matching;
namespace DiscordChatExporter.Core.Markdown
{
// The following parsing logic is meant to replicate Discord's markdown grammar as close as possible
internal static partial class MarkdownParser
{
private const RegexOptions DefaultRegexOptions =
RegexOptions.Compiled |
RegexOptions.CultureInvariant |
RegexOptions.Multiline;
/* Formatting */
// Capture any character until the earliest double asterisk not followed by an asterisk
private static readonly IMatcher<MarkdownNode> BoldFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("\\*\\*(.+?)\\*\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattedNode(TextFormatting.Bold, Parse(p.Slice(m.Groups[1])))
);
// Capture any character until the earliest single asterisk not preceded or followed by an asterisk
// Opening asterisk must not be followed by whitespace
// Closing asterisk must not be preceded by whitespace
private static readonly IMatcher<MarkdownNode> ItalicFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("\\*(?!\\s)(.+?)(?<!\\s|\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Slice(m.Groups[1])))
);
// Capture any character until the earliest triple asterisk not followed by an asterisk
private static readonly IMatcher<MarkdownNode> ItalicBoldFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("\\*(\\*\\*.+?\\*\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Slice(m.Groups[1]), BoldFormattedNodeMatcher))
);
// Capture any character except underscore until an underscore
// Closing underscore must not be followed by a word character
private static readonly IMatcher<MarkdownNode> ItalicAltFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("_([^_]+)_(?!\\w)", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Slice(m.Groups[1])))
);
// Capture any character until the earliest double underscore not followed by an underscore
private static readonly IMatcher<MarkdownNode> UnderlineFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattedNode(TextFormatting.Underline, Parse(p.Slice(m.Groups[1])))
);
// Capture any character until the earliest triple underscore not followed by an underscore
private static readonly IMatcher<MarkdownNode> ItalicUnderlineFormattedNodeMatcher =
new RegexMatcher<MarkdownNode>(
new Regex("_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattedNode(TextFormatting.Italic,
Parse(p.Slice(m.Groups[1]), UnderlineFormattedNodeMatcher))
);
// Capture any character until the earliest double tilde
private static readonly IMatcher<MarkdownNode> StrikethroughFormattedNodeMatcher =
new RegexMatcher<MarkdownNode>(
new Regex("~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattedNode(TextFormatting.Strikethrough, Parse(p.Slice(m.Groups[1])))
);
// Capture any character until the earliest double pipe
private static readonly IMatcher<MarkdownNode> SpoilerFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("\\|\\|(.+?)\\|\\|", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattedNode(TextFormatting.Spoiler, Parse(p.Slice(m.Groups[1])))
);
// Capture any character until the end of the line
// Opening 'greater than' character must be followed by whitespace
private static readonly IMatcher<MarkdownNode> SingleLineQuoteNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("^>\\s(.+\n?)", DefaultRegexOptions),
(p, m) => new FormattedNode(TextFormatting.Quote, Parse(p.Slice(m.Groups[1])))
);
// Repeatedly capture any character until the end of the line
// This one is tricky as it ends up producing multiple separate captures which need to be joined
private static readonly IMatcher<MarkdownNode> RepeatedSingleLineQuoteNodeMatcher =
new RegexMatcher<MarkdownNode>(
new Regex("(?:^>\\s(.+\n?)){2,}", DefaultRegexOptions),
(_, m) =>
{
var content = string.Concat(m.Groups[1].Captures.Select(c => c.Value));
return new FormattedNode(TextFormatting.Quote, Parse(content));
}
);
// Capture any character until the end of the input
// Opening 'greater than' characters must be followed by whitespace
private static readonly IMatcher<MarkdownNode> MultiLineQuoteNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("^>>>\\s(.+)", DefaultRegexOptions | RegexOptions.Singleline),
(p, m) => new FormattedNode(TextFormatting.Quote, Parse(p.Slice(m.Groups[1])))
);
/* Code blocks */
// Capture any character except backtick until a backtick
// Blank lines at the beginning and end of content are trimmed
// There can be either one or two backticks, but equal number on both sides
private static readonly IMatcher<MarkdownNode> InlineCodeBlockNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("(`{1,2})([^`]+)\\1", DefaultRegexOptions | RegexOptions.Singleline),
m => new InlineCodeBlockNode(m.Groups[2].Value.Trim('\r', '\n'))
);
// Capture language identifier and then any character until the earliest triple backtick
// Language identifier is one word immediately after opening backticks, followed immediately by newline
// Blank lines at the beginning and end of content are trimmed
private static readonly IMatcher<MarkdownNode> MultiLineCodeBlockNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("```(?:(\\w*)\\n)?(.+?)```", DefaultRegexOptions | RegexOptions.Singleline),
m => new MultiLineCodeBlockNode(m.Groups[1].Value, m.Groups[2].Value.Trim('\r', '\n'))
);
/* Mentions */
// Capture @everyone
private static readonly IMatcher<MarkdownNode> EveryoneMentionNodeMatcher = new StringMatcher<MarkdownNode>(
"@everyone",
_ => new MentionNode("everyone", MentionType.Meta)
);
// Capture @here
private static readonly IMatcher<MarkdownNode> HereMentionNodeMatcher = new StringMatcher<MarkdownNode>(
"@here",
_ => new MentionNode("here", MentionType.Meta)
);
// Capture <@123456> or <@!123456>
private static readonly IMatcher<MarkdownNode> UserMentionNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("<@!?(\\d+)>", DefaultRegexOptions),
m => new MentionNode(m.Groups[1].Value, MentionType.User)
);
// Capture <#123456>
private static readonly IMatcher<MarkdownNode> ChannelMentionNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("<#(\\d+)>", DefaultRegexOptions),
m => new MentionNode(m.Groups[1].Value, MentionType.Channel)
);
// Capture <@&123456>
private static readonly IMatcher<MarkdownNode> RoleMentionNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("<@&(\\d+)>", DefaultRegexOptions),
m => new MentionNode(m.Groups[1].Value, MentionType.Role)
);
/* Emojis */
// Capture any country flag emoji (two regional indicator surrogate pairs)
// ... or "miscellaneous symbol" character
// ... or surrogate pair
// ... or digit followed by enclosing mark
// (this does not match all emojis in Discord but it's reasonably accurate enough)
private static readonly IMatcher<MarkdownNode> StandardEmojiNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("((?:[\\uD83C][\\uDDE6-\\uDDFF]){2}|[\\u2600-\\u26FF]|\\p{Cs}{2}|\\d\\p{Me})",
DefaultRegexOptions),
m => new EmojiNode(m.Groups[1].Value)
);
// Capture <:lul:123456> or <a:lul:123456>
private static readonly IMatcher<MarkdownNode> CustomEmojiNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("<(a)?:(.+?):(\\d+?)>", DefaultRegexOptions),
m => new EmojiNode(m.Groups[3].Value, m.Groups[2].Value, !string.IsNullOrWhiteSpace(m.Groups[1].Value))
);
/* Links */
// Capture [title](link)
private static readonly IMatcher<MarkdownNode> TitledLinkNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("\\[(.+?)\\]\\((.+?)\\)", DefaultRegexOptions),
m => new LinkNode(m.Groups[2].Value, m.Groups[1].Value)
);
// Capture any non-whitespace character after http:// or https:// until the last punctuation character or whitespace
private static readonly IMatcher<MarkdownNode> AutoLinkNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("(https?://\\S*[^\\.,:;\"\'\\s])", DefaultRegexOptions),
m => new LinkNode(m.Groups[1].Value)
);
// Same as auto link but also surrounded by angular brackets
private static readonly IMatcher<MarkdownNode> HiddenLinkNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("<(https?://\\S*[^\\.,:;\"\'\\s])>", DefaultRegexOptions),
m => new LinkNode(m.Groups[1].Value)
);
/* Text */
// Capture the shrug emoticon
// This escapes it from matching for formatting
private static readonly IMatcher<MarkdownNode> ShrugTextNodeMatcher = new StringMatcher<MarkdownNode>(
@"¯\_(ツ)_/¯",
p => new TextNode(p.ToString())
);
// Capture some specific emojis that don't get rendered
// This escapes it from matching for emoji
private static readonly IMatcher<MarkdownNode> IgnoredEmojiTextNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("(\\u26A7|\\u2640|\\u2642|\\u2695|\\u267E|\\u00A9|\\u00AE|\\u2122)", DefaultRegexOptions),
m => new TextNode(m.Groups[1].Value)
);
// Capture any "symbol/other" character or surrogate pair preceded by a backslash
// This escapes it from matching for emoji
private static readonly IMatcher<MarkdownNode> EscapedSymbolTextNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("\\\\(\\p{So}|\\p{Cs}{2})", DefaultRegexOptions),
m => new TextNode(m.Groups[1].Value)
);
// Capture any non-whitespace, non latin alphanumeric character preceded by a backslash
// This escapes it from matching for formatting or other tokens
private static readonly IMatcher<MarkdownNode> EscapedCharacterTextNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("\\\\([^a-zA-Z0-9\\s])", DefaultRegexOptions),
m => new TextNode(m.Groups[1].Value)
);
// Combine all matchers into one
// Matchers that have similar patterns are ordered from most specific to least specific
private static readonly IMatcher<MarkdownNode> AggregateNodeMatcher = new AggregateMatcher<MarkdownNode>(
// Escaped text
ShrugTextNodeMatcher,
IgnoredEmojiTextNodeMatcher,
EscapedSymbolTextNodeMatcher,
EscapedCharacterTextNodeMatcher,
// Formatting
ItalicBoldFormattedNodeMatcher,
ItalicUnderlineFormattedNodeMatcher,
BoldFormattedNodeMatcher,
ItalicFormattedNodeMatcher,
UnderlineFormattedNodeMatcher,
ItalicAltFormattedNodeMatcher,
StrikethroughFormattedNodeMatcher,
SpoilerFormattedNodeMatcher,
MultiLineQuoteNodeMatcher,
RepeatedSingleLineQuoteNodeMatcher,
SingleLineQuoteNodeMatcher,
// Code blocks
MultiLineCodeBlockNodeMatcher,
InlineCodeBlockNodeMatcher,
// Mentions
EveryoneMentionNodeMatcher,
HereMentionNodeMatcher,
UserMentionNodeMatcher,
ChannelMentionNodeMatcher,
RoleMentionNodeMatcher,
// Links
TitledLinkNodeMatcher,
AutoLinkNodeMatcher,
HiddenLinkNodeMatcher,
// Emoji
StandardEmojiNodeMatcher,
CustomEmojiNodeMatcher
);
// Minimal set of matchers for non-multimedia formats (e.g. plain text)
private static readonly IMatcher<MarkdownNode> MinimalAggregateNodeMatcher = new AggregateMatcher<MarkdownNode>(
// Mentions
EveryoneMentionNodeMatcher,
HereMentionNodeMatcher,
UserMentionNodeMatcher,
ChannelMentionNodeMatcher,
RoleMentionNodeMatcher,
// Emoji
CustomEmojiNodeMatcher
);
private static IReadOnlyList<MarkdownNode> Parse(StringPart stringPart, IMatcher<MarkdownNode> matcher) =>
matcher
.MatchAll(stringPart, p => new TextNode(p.ToString()))
.Select(r => r.Value)
.ToArray();
}
internal static partial class MarkdownParser
{
private static IReadOnlyList<MarkdownNode> Parse(StringPart stringPart) => Parse(stringPart, AggregateNodeMatcher);
private static IReadOnlyList<MarkdownNode> ParseMinimal(StringPart stringPart) => Parse(stringPart, MinimalAggregateNodeMatcher);
public static IReadOnlyList<MarkdownNode> Parse(string input) => Parse(new StringPart(input));
public static IReadOnlyList<MarkdownNode> ParseMinimal(string input) => ParseMinimal(new StringPart(input));
}
}