-
Notifications
You must be signed in to change notification settings - Fork 44
/
CsvLexer.flex
144 lines (123 loc) · 3.04 KB
/
CsvLexer.flex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
package net.seesharpsoft.intellij.plugins.csv;
import com.intellij.psi.tree.IElementType;
import net.seesharpsoft.intellij.plugins.csv.psi.CsvTypes;
import com.intellij.psi.TokenType;
import com.intellij.lexer.FlexLexer;import org.intellij.grammar.livePreview.LivePreviewElementType;
import java.util.regex.Pattern;
%%
/* Name of the Java class that JFlex generates from this specification. */
%class CsvLexer
/* The generated class declares that it implements the FlexLexer interface. */
%implements FlexLexer
/* Scan Unicode input rather than 7/8-bit character sets. */
%unicode
/* The generated scanning method is called advance() ... */
%function advance
/* ... and its return type is IElementType (one value per matched token). */
%type IElementType
%{
    /**
     * Matches exactly one separator candidate (comma, semicolon, pipe, tab)
     * or one line-break character (CR, LF).
     */
    private static final Pattern ESCAPE_TEXT_PATTERN = Pattern.compile("[,;|\\t\\r\\n]");

    /** Consulted by the separator rule to decide whether a candidate character really separates values. */
    private CsvValueSeparator myValueSeparator;

    /** Consulted by the escape rules to decide whether a backslash or quote sequence acts as an escape. */
    private CsvEscapeCharacter myEscapeCharacter;

    /**
     * Creates a lexer that reads from the given reader and uses the supplied
     * separator and escape-character settings while tokenizing.
     *
     * @param in              source of the characters to tokenize; handed to the Reader-only
     *                        constructor, which is not defined in this specification
     *                        (presumably the one JFlex generates)
     * @param valueSeparator  setting used to recognize value separators
     * @param escapeCharacter setting used to recognize escape sequences
     */
    CsvLexer(java.io.Reader in, CsvValueSeparator valueSeparator, CsvEscapeCharacter escapeCharacter) {
        this(in);
        this.myValueSeparator = valueSeparator;
        this.myEscapeCharacter = escapeCharacter;
    }
%}
/* Code executed when the end of input is reached; nothing extra has to happen here. */
%eof{ return;
%eof}
/* Plain content: one or more characters that are neither a space, a separator candidate (comma, semicolon, pipe, tab), a line break (CR, LF), a double quote nor a backslash. */
TEXT=[^ ,;|\t\r\n\"\\]+
/* Sequences that get special treatment inside a quoted value: a single separator candidate or line-break character, a doubled quote, or a backslash followed by a quote. */
ESCAPED_TEXT=[,;|\t\r\n]|\"\"|\\\"
/* A single backslash. */
ESCAPE_CHAR=\\
/* A single double quote. */
QUOTE=\"
/* Any one of the separator candidates; the action code decides whether it is the active separator. */
COMMA=[,;|\t]
/* Line feed only; a carriage return is not part of this macro. */
EOL=\n
/* A run of spaces and/or form feeds. */
WHITE_SPACE=[ \f]+
/* Lexical states in addition to the implicit YYINITIAL. They are declared with %state (inclusive), so rules without a state list are active in them as well. Note that ESCAPED_TEXT names both a macro (above) and a state (below). */
/* Entered after the closing quote of a quoted value. */
%state AFTER_TEXT
/* Entered after the opening quote of a quoted value. */
%state ESCAPED_TEXT
/* Entered once unquoted content has started. */
%state UNESCAPED_TEXT
/* Entered after the active escape character was seen inside a quoted value. */
%state ESCAPING
%%
/* A double quote in YYINITIAL (the state re-entered after every separator and line feed) opens a quoted value. */
<YYINITIAL> {QUOTE} {
    yybegin(ESCAPED_TEXT);
    return CsvTypes.QUOTE;
}
/* A single double quote inside a quoted value closes it; only separators, line feeds and whitespace are expected afterwards. */
<ESCAPED_TEXT> {QUOTE} {
    yybegin(AFTER_TEXT);
    return CsvTypes.QUOTE;
}
/* First chunk of plain content in YYINITIAL: emit it and remember that an unquoted value has started. */
<YYINITIAL> {TEXT} {
    yybegin(UNESCAPED_TEXT);
    return CsvTypes.TEXT;
}
/* Plain content while already inside an unquoted or quoted value: emit it and stay in the current state. */
<UNESCAPED_TEXT, ESCAPED_TEXT> {TEXT} {
    return CsvTypes.TEXT;
}
/*
 * A backslash outside of quotes. When the backslash is the configured escape
 * character it is reported as a bad character (state unchanged); otherwise it
 * is ordinary unquoted content.
 */
<YYINITIAL, UNESCAPED_TEXT> {ESCAPE_CHAR} {
    final String matched = yytext().toString();
    final boolean isConfiguredEscape = myEscapeCharacter.getCharacter().equals(matched);
    if (!isConfiguredEscape) {
        yybegin(UNESCAPED_TEXT);
        return CsvTypes.TEXT;
    }
    return TokenType.BAD_CHARACTER;
}
/*
 * A backslash inside a quoted value, or directly after an escape backslash.
 * If the backslash is not the configured escape character it is plain TEXT.
 * Otherwise it is emitted as ESCAPED_TEXT and the lexer toggles between
 * ESCAPED_TEXT and ESCAPING, so a second backslash returns to ESCAPED_TEXT.
 */
<ESCAPED_TEXT, ESCAPING> {ESCAPE_CHAR} {
    final String matched = yytext().toString();
    if (!myEscapeCharacter.getCharacter().equals(matched)) {
        return CsvTypes.TEXT;
    }

    final int currentState = yystate();
    if (currentState == ESCAPED_TEXT) {
        yybegin(ESCAPING);
    } else if (currentState == ESCAPING) {
        yybegin(ESCAPED_TEXT);
    } else {
        // Defensive: the rule is only declared for the two states handled above.
        throw new RuntimeException("unhandled state: " + currentState);
    }
    return CsvTypes.ESCAPED_TEXT;
}
/*
 * Inside a quoted value: a single separator candidate or line-break character,
 * a doubled quote, or a backslash followed by a quote.
 */
<ESCAPED_TEXT> {ESCAPED_TEXT} {
    final String matched = yytext().toString();

    // Accepted as escaped content: whatever the escape setting reports as an
    // escaped quote, or a lone separator candidate / line-break character.
    final boolean isEscapedContent = myEscapeCharacter.isEscapedQuote(matched)
            || ESCAPE_TEXT_PATTERN.matcher(matched).matches();
    if (isEscapedContent) {
        return CsvTypes.ESCAPED_TEXT;
    }

    // A rejected sequence that begins with the quote character cannot be split sensibly.
    final boolean beginsWithQuote = matched.startsWith(CsvEscapeCharacter.QUOTE.getCharacter());
    if (beginsWithQuote) {
        return TokenType.BAD_CHARACTER;
    }

    // Otherwise give back the last character so that only the leading one is
    // emitted as TEXT and the remainder is scanned again.
    yypushback(1);
    return CsvTypes.TEXT;
}
/*
 * A separator candidate outside of quotes. Only the configured separator ends
 * the value; any other candidate is unquoted content, unless it follows a
 * closing quote, where it is reported as a bad character.
 */
<YYINITIAL, AFTER_TEXT, UNESCAPED_TEXT> {COMMA} {
    final String candidate = yytext().toString();
    if (!myValueSeparator.isValueSeparator(candidate)) {
        if (yystate() == AFTER_TEXT) {
            return TokenType.BAD_CHARACTER;
        }
        yybegin(UNESCAPED_TEXT);
        return CsvTypes.TEXT;
    }
    yybegin(YYINITIAL);
    return CsvTypes.COMMA;
}
/* A line feed outside of a quoted value is emitted as CRLF and resets the lexer to YYINITIAL. */
<YYINITIAL, AFTER_TEXT, UNESCAPED_TEXT> {EOL} {
    yybegin(YYINITIAL);
    return CsvTypes.CRLF;
}
/* Runs of spaces/form feeds; no state list, so this applies in every declared state and never changes the state. */
{WHITE_SPACE} {
    return TokenType.WHITE_SPACE;
}
/*
 * Catch-all: any single character that no earlier rule accepted in the current
 * state is reported as a bad character.
 *
 * This uses [^] rather than '.', because '.' does not match line terminators.
 * With '.', a carriage return outside of quotes or a line feed while in the
 * ESCAPING state matched no rule at all, and the generated scanner aborts with
 * "could not match input". [^] matches every character; being the last rule
 * and one character long, it only wins where nothing else matches, so all
 * previously accepted input is tokenized exactly as before.
 */
[^]
{
    return TokenType.BAD_CHARACTER;
}