Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add alternatives key for listing other possible language matches #19

Merged
merged 14 commits into from Jun 29, 2023
3 changes: 3 additions & 0 deletions changelog.md
@@ -1,5 +1,8 @@
# Changelog

## Next
- Add `alternatives` key to list all possible language matches for files that do not have a definite match.

## 2.5.6
*2023-06-28*
- Changed fetching of data files to fallback to using the packaged files if the fetch request fails ([#21](https://github.com/Nixinova/LinguistJS/issues/21)).
Expand Down
2 changes: 1 addition & 1 deletion package.json
Expand Up @@ -13,7 +13,7 @@
"scripts": {
"download-files": "npx tsx@3 build/download-files",
"pre-publish": "npm run download-files && npm test && npm run test:perf",
"test:perf": "tsc && node test/perf",
"perf": "tsc && node test/perf",
"test": "tsc && node test/folder && echo --- && node test/unit"
},
"files": [
Expand Down
20 changes: 13 additions & 7 deletions readme.md
Expand Up @@ -38,29 +38,35 @@ As an example, take the following file structure:
| | index.ts 2kB
| readme.md 3kB
| no-lang 10B
| x.pluginspec 10B
```

Running LinguistJS on this folder will return the following JSON:

```json
{
"files": {
"count": 4,
"bytes": 6010,
"count": 5,
"bytes": 6020,
"results": {
"/src/index.ts": "TypeScript",
"/src/cli.js": "JavaScript",
"/readme.md": "Markdown",
"/no-lang": null,
}
"/x.pluginspec": "Ruby",
},
"alternatives": {
"/x.pluginspec": ["XML"],
},
},
"languages": {
"count": 4,
"bytes": 6000,
"bytes": 6010,
"results": {
"JavaScript": { "type": "programming", "bytes": 1000, "color": "#f1e05a" },
"TypeScript": { "type": "programming", "bytes": 2000, "color": "#2b7489" },
"Markdown": { "type": "prose", "bytes": 3000, "color": "#083fa1" },
"JavaScript": { "type": "programming", "bytes": 1000, "color": "#f1e05a" },
"Markdown": { "type": "prose", "bytes": 3000, "color": "#083fa1" },
"Ruby": { "type": "programming", "bytes": 10, "color": "#701516" },
"TypeScript": { "type": "programming", "bytes": 2000, "color": "#2b7489" },
},
},
"unknown": {
Expand Down
25 changes: 21 additions & 4 deletions src/index.ts
Expand Up @@ -33,7 +33,7 @@ async function analyse(input?: string | string[], opts: T.Options = {}): Promise
const extensions: Record<T.FilePath, string> = {};
const overrides: Record<T.FilePath, T.LanguageResult> = {};
const results: T.Results = {
files: { count: 0, bytes: 0, results: {} },
files: { count: 0, bytes: 0, results: {}, alternatives: {} },
languages: { count: 0, bytes: 0, results: {} },
unknown: { count: 0, bytes: 0, extensions: {}, filenames: {} },
};
Expand Down Expand Up @@ -157,8 +157,11 @@ async function analyse(input?: string | string[], opts: T.Options = {}): Promise
fileAssociations[file] = [];
extensions[file] = '';
}
const parent = !opts.childLanguages && result && langData[result].group || false;
fileAssociations[file].push(parent || result);
// Use the language's group (its parent language) as the final result when one exists
// and `opts.childLanguages` is not set; otherwise fall back to the matched language itself
const finalResult = !opts.childLanguages && result && langData[result].group || result;
if (!fileAssociations[file].includes(finalResult))
fileAssociations[file].push(finalResult);
extensions[file] = paths.extname(file).toLowerCase();
};
const overridesArray = Object.entries(overrides);
Expand Down Expand Up @@ -279,12 +282,14 @@ async function analyse(input?: string | string[], opts: T.Options = {}): Promise
if (Array.isArray(heuristic.language)) {
heuristic.language = heuristic.language[0];
}

// Make sure the results includes this language
const languageGroup = langData[heuristic.language]?.group;
const matchesLang = fileAssociations[file].includes(heuristic.language);
const matchesParent = languageGroup && fileAssociations[file].includes(languageGroup);
if (!matchesLang && !matchesParent)
continue;

// Normalise heuristic data
const patterns: string[] = [];
const normalise = (contents: string | string[]) => patterns.push(...[contents].flat());
Expand All @@ -296,17 +301,29 @@ async function analyse(input?: string | string[], opts: T.Options = {}): Promise
if (data.named_pattern) normalise(heuristicsData.named_patterns[data.named_pattern]);
}
}

// Check file contents and apply heuristic patterns
const fileContent = opts.fileContent?.length ? opts.fileContent[files.indexOf(file)] : await readFile(file).catch(() => null);
// Skip if file read errors
if (fileContent === null) continue;
// Apply heuristics
if (!patterns.length || patterns.some(pattern => pcre(pattern).test(fileContent))) {
results.files.results[file] = heuristic.language;
break;
}
}
}
// If no heuristics, assign a language
results.files.results[file] ??= fileAssociations[file][0];
if (!results.files.results[file]) {
const possibleLangs = fileAssociations[file];
// Assign first language as a default option
const defaultLang = possibleLangs[0];
const alternativeLangs = possibleLangs.slice(1)
results.files.results[file] = defaultLang;
// List alternative languages if there are any
if (alternativeLangs.length > 0)
results.files.alternatives[file] = alternativeLangs;
}
}

// Skip specified categories
Expand Down
1 change: 1 addition & 0 deletions src/types.ts
Expand Up @@ -29,6 +29,7 @@ export interface Results {
bytes: Bytes
/** Note: Results use slashes as delimiters even on Windows. */
results: Record<FilePath, LanguageResult>
alternatives: Record<FilePath, LanguageResult[]>
}
languages: {
count: Integer
Expand Down
13 changes: 9 additions & 4 deletions test/expected.json
@@ -1,22 +1,27 @@
{
"files": {
"count": 8,
"bytes": 47,
"count": 9,
"bytes": 61,
"results": {
"~/al.al": "Perl",
"~/alternatives.asc": "AGS Script",
"~/file.txt": "JavaScript",
"~/folder/sub.txt": "Text",
"~/hashbang": "JavaScript",
"~/modeline.txt": "C++",
"~/Pipfile": "TOML",
"~/unknown": null
},
"alternatives": {
"~/alternatives.asc": [ "AsciiDoc", "Public Key" ]
}
},
"languages": {
"count": 5,
"bytes": 38,
"count": 6,
"bytes": 52,
"results": {
"Perl": { "type": "programming", "bytes": 0, "color": "#0298c3" },
"AGS Script": { "type": "programming", "bytes": 14, "color": "#B9D9FF" },
"JavaScript": { "type": "programming", "bytes": 23, "color": "#f1e05a" },
"Text": { "type": "prose", "bytes": 0 },
"C++": { "type": "programming", "bytes": 15, "color": "#f34b7d" },
Expand Down
4 changes: 2 additions & 2 deletions test/perf.js
Expand Up @@ -5,14 +5,14 @@ async function perfTest() {
const amount = +process.argv[2] || 75;
for (let i = 0; i < amount; i++) {
let t1 = +new Date();
await linguist('.');
await linguist('.', { offline: true });
let t2 = +new Date();
time += t2 - t1;
}
const unit = 'ms';
const total = time;
const average = total / amount;
const EXPECTED_MAX = 75; // 2.3
const EXPECTED_MAX = 100; // 2.6
console.log('\n<Performance test results>');
console.log('Total:', total, unit, `(n=${amount})`);
console.log('Average:', average, unit);
Expand Down
1 change: 1 addition & 0 deletions test/samples/alternatives.asc
@@ -0,0 +1 @@
Alternatives
2 changes: 2 additions & 0 deletions test/unit.js
Expand Up @@ -13,6 +13,7 @@ async function test([filename, fileContent = ''], [type, testVal]) {
'files': actual.files.results[filename],
'size': actual.files.bytes,
'count': actual.files.count,
'alternatives_count': Object.entries(actual.files.alternatives).length,
}[type];
const result = testContent === testVal;
i = `${+i + 1}`.padStart(2, '0');
Expand All @@ -36,6 +37,7 @@ async function unitTest() {
await test(['x.cpp'], ['files', 'C++']);
await test(['x.c'], ['files', 'C']);
await test(['x.R'], ['files', 'R']);
await test(['.m'], ['alternatives_count', 1])
desc('filenames');
await test(['Dockerfile'], ['files', 'Dockerfile']);
await test(['CMakeLists.txt'], ['files', 'CMake']);
Expand Down