/
CsvLexer.flex
109 lines (91 loc) · 2.07 KB
/
CsvLexer.flex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
package net.seesharpsoft.intellij.plugins.csv;
import com.intellij.psi.tree.IElementType;
import net.seesharpsoft.intellij.plugins.csv.psi.CsvTypes;
import com.intellij.psi.TokenType;
import com.intellij.lexer.FlexLexer;
import java.util.regex.Pattern;
%%
/* Scanner options: the generated class is named CsvLexer and implements FlexLexer. */
%class CsvLexer
%implements FlexLexer
/* Scan over the Unicode character set. */
%unicode
/* The scanning method is named advance and its return type is IElementType. */
%function advance
%type IElementType
%{
    /** Compiled once; matches exactly one separator candidate (comma, semicolon, pipe), tab, CR or LF. */
    private static final Pattern ESCAPE_TEXT_PATTERN = Pattern.compile("[,;|\\t\\r\\n]");

    // Neither field can be final: both are assigned only after delegating to the
    // one-argument constructor through this(in).

    /** Asked by the separator rule whether a matched candidate is a value separator. */
    private CsvValueSeparator myValueSeparator;

    /** Asked by the quoted-text rule whether a matched sequence is an escaped quote. */
    private CsvEscapeCharacter myEscapeCharacter;

    /**
     * Creates a lexer that reads from the given reader and is configured with a
     * value separator and an escape character.
     *
     * @param in              the character source, passed on to the one-argument constructor
     * @param valueSeparator  the separator configuration consulted when a separator candidate is matched
     * @param escapeCharacter the escape configuration consulted inside quoted text
     */
    CsvLexer(java.io.Reader in, CsvValueSeparator valueSeparator, CsvEscapeCharacter escapeCharacter) {
        this(in);
        this.myEscapeCharacter = escapeCharacter;
        this.myValueSeparator = valueSeparator;
    }
%}
/* End-of-file code: nothing is done beyond returning. */
%eof{ return;
%eof}
/* A run of characters other than space, comma, semicolon, pipe, tab, CR, LF, double quote and backslash. */
TEXT=[^ ,;|\t\r\n\"\\]+
/* One separator candidate, tab, CR, LF or backslash, or one of the two-character sequences
   quote-quote and backslash-quote. Only used by the rule for the quoted-text state. */
ESCAPED_TEXT=[,;|\t\r\n\\]|\"\"|\\\"
/* A single double quote. */
QUOTE=\"
/* Separator candidates: comma, semicolon, pipe or tab. The rule action decides whether a match really separates values. */
COMMA=[,;|\t]
/* A line break is LF only. NOTE(review): CR is not part of EOL; presumably line endings are normalized to LF before lexing, confirm. */
EOL=\n
/* One or more spaces or form feeds. */
WHITE_SPACE=[ \f]+
/* Lexical states used in addition to the default YYINITIAL, which is re-entered after every separator and line break. */
/* AFTER_TEXT: entered after the closing quote of a quoted value. */
%state AFTER_TEXT
/* ESCAPED_TEXT: entered after an opening quote. It shares its name with the ESCAPED_TEXT macro;
   in the rules, angle brackets refer to the state and curly braces to the macro. */
%state ESCAPED_TEXT
/* UNESCAPED_TEXT: entered once unquoted text has been read. */
%state UNESCAPED_TEXT
%%
<YYINITIAL> {QUOTE}
{
// A quote in the initial state opens a quoted value: switch to the quoted-text state.
yybegin(ESCAPED_TEXT);
return CsvTypes.QUOTE;
}
<ESCAPED_TEXT> {QUOTE}
{
// A lone quote inside quoted text closes the value. A doubled quote does not reach this rule,
// because the two-character ESCAPED_TEXT alternative is the longer match.
yybegin(AFTER_TEXT);
return CsvTypes.QUOTE;
}
<YYINITIAL> {TEXT}
{
// Plain text in the initial state starts an unquoted value.
yybegin(UNESCAPED_TEXT);
return CsvTypes.TEXT;
}
<UNESCAPED_TEXT, ESCAPED_TEXT> {TEXT}
{
// More plain text inside an unquoted or quoted value; the current state is kept.
return CsvTypes.TEXT;
}
<ESCAPED_TEXT> {ESCAPED_TEXT}
{
    // Inside quotes the match is valid when it is an escaped quote according to the
    // configured escape character, or a single separator candidate, tab, CR or LF.
    // Anything else the macro matched is reported as a bad character.
    final String matched = yytext().toString();
    final boolean accepted = myEscapeCharacter.isEscapedQuote(matched)
            || ESCAPE_TEXT_PATTERN.matcher(matched).matches();
    return accepted ? CsvTypes.ESCAPED_TEXT : TokenType.BAD_CHARACTER;
}
<YYINITIAL, AFTER_TEXT, UNESCAPED_TEXT> {COMMA}
{
    // The macro only yields separator candidates; the configured separator decides.
    final String candidate = yytext().toString();
    if (!myValueSeparator.isValueSeparator(candidate)) {
        // Not the active separator: after a closing quote it is invalid,
        // anywhere else it is ordinary unquoted text.
        if (yystate() == AFTER_TEXT) {
            return TokenType.BAD_CHARACTER;
        }
        yybegin(UNESCAPED_TEXT);
        return CsvTypes.TEXT;
    }
    // The active separator ends the current value; the next one starts from the initial state.
    yybegin(YYINITIAL);
    return CsvTypes.COMMA;
}
<YYINITIAL, AFTER_TEXT, UNESCAPED_TEXT> {EOL}
{
// A line break outside quoted text ends the record; the next value starts from the initial state.
yybegin(YYINITIAL);
return CsvTypes.CRLF;
}
{WHITE_SPACE}
{
// No state list: this rule applies in every state and never changes the state.
return TokenType.WHITE_SPACE;
}
[^]
{
    // Catch-all for any single character no earlier rule accepted. The pattern is [^]
    // rather than '.', because '.' does not match line terminators: a CR outside
    // quoted text (for example right after a closing quote) would match no rule at
    // all and make the scanner fail instead of producing a token. As the last rule it
    // loses every tie against the rules above, so already-matched input is unaffected.
    return TokenType.BAD_CHARACTER;
}