Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[JVM] Improve handling of unicode numbers #779

Merged
merged 2 commits into from
Oct 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
61 changes: 45 additions & 16 deletions src/vm/jvm/runtime/org/raku/nqp/runtime/Ops.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.lang.Character;
import java.lang.invoke.MethodHandle;
import java.lang.invoke.MethodHandles;
import java.lang.invoke.MethodType;
Expand Down Expand Up @@ -4085,28 +4086,30 @@ public static SixModelObject radix(long radix, String str, long zpos, long flags
int chars_really_converted = chars_converted;
long pos = -1;
char ch;
int char_value = -2;
boolean neg = false;

if (radix > 36) {
throw ExceptionHandling.dieInternal(tc, "Cannot convert radix of " + radix + " (max 36)");
}

ch = (zpos < chars) ? str.charAt((int)zpos) : 0;

/* Flag 0x02 asks for parsing a leading +/-.
* For negation we accept both HYPHEN-MINUS (U+002D) and MINUS SIGN (U+2212). */
if ((flags & 0x02) != 0 && (ch == '+' || ch == '-' || ch == '−')) {
neg = (ch == '-' || ch == '−');
zpos++;
ch = (zpos < chars) ? str.charAt((int)zpos) : 0;
}

while (zpos < chars) {
if (ch >= '0' && ch <= '9') ch = (char)(ch - '0');
else if (ch >= 'a' && ch <= 'z') ch = (char)(ch - 'a' + 10);
else if (ch >= 'A' && ch <= 'Z') ch = (char)(ch - 'A' + 10);
else break;
if (ch >= radix) break;
zvalue = zvalue * radix + ch;
char_value = Character.digit(ch, (int)radix);
if (char_value == -1) break;
zvalue = zvalue * radix + char_value;
chars_converted++;
zpos++; pos = zpos;
if (ch != 0 || (flags & 0x04) == 0) { value=zvalue; chars_really_converted=chars_converted; }
if (char_value != 0 || (flags & 0x04) == 0) { value=zvalue; chars_really_converted=chars_converted; }
if (zpos >= chars) break;
ch = str.charAt((int)zpos);
if (ch != '_') continue;
Expand Down Expand Up @@ -7324,6 +7327,7 @@ public static SixModelObject radix_I(long radix_l, String str, long zpos, long f
int chars_really_converted = chars_converted;
long pos = -1;
char ch;
int char_value = -2;
boolean neg = false;
BigInteger radix = BigInteger.valueOf(radix_l);

Expand All @@ -7332,21 +7336,22 @@ public static SixModelObject radix_I(long radix_l, String str, long zpos, long f
}

ch = (zpos < chars) ? str.charAt((int)zpos) : 0;

/* Flag 0x02 asks for parsing a leading +/-.
* For negation we accept both HYPHEN-MINUS (U+002D) and MINUS SIGN (U+2212). */
if ((flags & 0x02) != 0 && (ch == '+' || ch == '-' || ch == '−')) {
neg = (ch == '-' || ch == '−');
zpos++;
ch = (zpos < chars) ? str.charAt((int)zpos) : 0;
}

while (zpos < chars) {
if (ch >= '0' && ch <= '9') ch = (char)(ch - '0');
else if (ch >= 'a' && ch <= 'z') ch = (char)(ch - 'a' + 10);
else if (ch >= 'A' && ch <= 'Z') ch = (char)(ch - 'A' + 10);
else break;
if (ch >= radix_l) break;
zvalue = zvalue.multiply(radix).add(BigInteger.valueOf(ch));
char_value = Character.digit(ch, (int)radix_l);
if (char_value == -1) break;
zvalue = zvalue.multiply(radix).add(BigInteger.valueOf(char_value));
chars_converted++;
zpos++; pos = zpos;
if (ch != 0 || (flags & 0x04) == 0) { value=zvalue; chars_really_converted=chars_converted; }
if (char_value != 0 || (flags & 0x04) == 0) { value=zvalue; chars_really_converted=chars_converted; }
if (zpos >= chars) break;
ch = str.charAt((int)zpos);
if (ch != '_') continue;
Expand Down Expand Up @@ -7643,12 +7648,36 @@ else if ((0xE000 <= codePoint && codePoint <= 0xF8FF)
return name;
}

/* Property codes shared by unipropcode (which returns them for a property
 * name) and getuniprop_str (which accepts them as its `property` argument).
 * NOTE(review): the concrete values 19 and 10 presumably mirror the numbering
 * used by another backend -- confirm before changing them. */
private static final int UNIPROP_NUMERIC_VALUE_NUMERATOR = 19;
private static final int UNIPROP_NUMERIC_VALUE_DENOMINATOR = 10;

/**
 * Returns the string value of a Unicode property for a code point.
 *
 * <p>Only the numerator and denominator of the numeric value are supported;
 * every other property yields the empty string. The value is derived from the
 * NFKD decomposition of the code point: vulgar fractions decompose into
 * {@code numerator FRACTION SLASH (U+2044) denominator}, anything else is
 * treated as having denominator {@code "1"}.
 *
 * <p>TODO: Make this handle more properties.
 *
 * @param codepoint the Unicode code point to inspect (supplementary code
 *                  points above U+FFFF are supported)
 * @param property  a property code as returned by {@code unipropcode}
 * @param tc        the current thread context (unused here)
 * @return the requested part of the numeric value, or {@code ""} for an
 *         unsupported property or a value outside the Unicode code point range
 */
public static String getuniprop_str(long codepoint, long property, ThreadContext tc) {
    if (property != UNIPROP_NUMERIC_VALUE_NUMERATOR && property != UNIPROP_NUMERIC_VALUE_DENOMINATOR) {
        return "";
    }
    /* Reject values that are not code points at all instead of letting the
     * narrowing cast below wrap them into an unrelated character. */
    if (codepoint < Character.MIN_CODE_POINT || codepoint > Character.MAX_CODE_POINT) {
        return "";
    }

    /* Character.toChars yields a surrogate pair for supplementary code points;
     * a plain (char) cast would silently truncate them to 16 bits. */
    String source = new String(Character.toChars((int) codepoint));

    /* NFKD will decompose fractions into numerator and denominator,
     * separated by "FRACTION SLASH" (\u2044). */
    String[] fraction = Normalizer.normalize(source, Normalizer.Form.NFKD).split("\u2044");

    if (property == UNIPROP_NUMERIC_VALUE_DENOMINATOR) {
        return fraction.length == 2 ? fraction[1] : "1";
    }
    /* split() drops trailing empty strings, so U+2044 itself produces an
     * empty array; guard the index instead of throwing. */
    return fraction.length > 0 ? fraction[0] : "";
}

/**
 * Maps a Unicode property name to its numeric property code.
 *
 * <p>TODO: Make this handle more properties.
 *
 * @param prop the property name; matching is exact and case-sensitive
 * @param tc   the current thread context (unused here)
 * @return the property code, or {@code -1} when the name is {@code null} or
 *         not one of the supported properties
 */
public static long unipropcode(String prop, ThreadContext tc) {
    /* A switch on a null String throws NullPointerException; treat null like
     * any other unknown property name. */
    if (prop == null) {
        return -1;
    }
    switch (prop) {
        case "Numeric_Value_Numerator":
            return UNIPROP_NUMERIC_VALUE_NUMERATOR;
        case "Numeric_Value_Denominator":
            return UNIPROP_NUMERIC_VALUE_DENOMINATOR;
        default:
            return -1;
    }
}

public static SixModelObject force_gc(ThreadContext tc) {
Expand Down
16 changes: 6 additions & 10 deletions t/nqp/081-radix.t
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,15 @@ test_radix_I(10,"9883481620585741369158_914214988194663201633129_269524237910230
test_radix_both(3,"3",0,2, 0,0,-1, "no digits consumed with digit outside radix");
test_radix_both(3,"۳",0,2, 0,0,-1, "no digits consumed with unicode digit outside radix");

if nqp::getcomp('nqp').backend.name eq 'jvm' {
skip("radix can't yet handle fancy unicode stuff on the jvm", 4*(6 + 3));
} else {
test_radix_both(10,"۳",0,2, 3,1,1, "extended arabic-indic digit three");
test_radix_both(10,"۳۳",0,2, 33,2,2, "extended arabic-indic digit three");
test_radix_both(10,"۳",0,2, 3,1,1, "extended arabic-indic digit three");
test_radix_both(10,"۳۳",0,2, 33,2,2, "extended arabic-indic digit three");

my $full_width_capital := "\c[FULLWIDTH LATIN CAPITAL LETTER C]\c[FULLWIDTH LATIN CAPITAL LETTER A]\c[FULLWIDTH LATIN CAPITAL LETTER F]\c[FULLWIDTH LATIN CAPITAL LETTER E]";
my $full_width_capital := "\c[FULLWIDTH LATIN CAPITAL LETTER C]\c[FULLWIDTH LATIN CAPITAL LETTER A]\c[FULLWIDTH LATIN CAPITAL LETTER F]\c[FULLWIDTH LATIN CAPITAL LETTER E]";

my $full_width_small := "\c[FULLWIDTH LATIN SMALL LETTER C]\c[FULLWIDTH LATIN SMALL LETTER A]\c[FULLWIDTH LATIN SMALL LETTER F]\c[FULLWIDTH LATIN SMALL LETTER E]";
my $full_width_small := "\c[FULLWIDTH LATIN SMALL LETTER C]\c[FULLWIDTH LATIN SMALL LETTER A]\c[FULLWIDTH LATIN SMALL LETTER F]\c[FULLWIDTH LATIN SMALL LETTER E]";

test_radix_both(16,$full_width_capital,0,2, 51966,4,4, "fullwidth capital letters");
test_radix_both(16,$full_width_small,0,2, 51966,4,4, "fullwidth small letters");
}
test_radix_both(16,$full_width_capital,0,2, 51966,4,4, "fullwidth capital letters");
test_radix_both(16,$full_width_small,0,2, 51966,4,4, "fullwidth small letters");

test_radix_both(8,"8238321",0,2, 0,0,-1, "all digits outside of radix");
test_radix_both(8,"1838321",0,2, 1,1,1, "all but one digits outside of radix");
Expand Down