diff --git a/python-frontend/src/main/java/org/sonar/python/frontend/PythonParser.java b/python-frontend/src/main/java/org/sonar/python/frontend/PythonParser.java index 28e25cf9e3..e21974ea54 100644 --- a/python-frontend/src/main/java/org/sonar/python/frontend/PythonParser.java +++ b/python-frontend/src/main/java/org/sonar/python/frontend/PythonParser.java @@ -99,7 +99,7 @@ private static PyFile parseAs(String content, LanguageLevel languageLevel) { } @NotNull - private static String normalizeEol(String content) { + public static String normalizeEol(String content) { return content.replaceAll("\\r\\n?", "\n"); } diff --git a/python-frontend/src/main/java/org/sonar/python/frontend/PythonTokenLocation.java b/python-frontend/src/main/java/org/sonar/python/frontend/PythonTokenLocation.java index a866d5bba0..0633d02dd1 100644 --- a/python-frontend/src/main/java/org/sonar/python/frontend/PythonTokenLocation.java +++ b/python-frontend/src/main/java/org/sonar/python/frontend/PythonTokenLocation.java @@ -30,12 +30,13 @@ public class PythonTokenLocation { private final int endLineOffset; public PythonTokenLocation(@NotNull PsiElement element) { - Document psiDocument = element.getContainingFile().getViewProvider().getDocument(); - int startOffset = element.getTextRange().getStartOffset(); + this(element.getTextRange().getStartOffset(), element.getTextRange().getEndOffset(), element.getContainingFile().getViewProvider().getDocument()); + } + + public PythonTokenLocation(int startOffset, int endOffset, Document psiDocument) { startLine = psiDocument.getLineNumber(startOffset); int startLineNumberOffset = psiDocument.getLineStartOffset(startLine); startLineOffset = startOffset - startLineNumberOffset; - int endOffset = element.getTextRange().getEndOffset(); endLine = psiDocument.getLineNumber(endOffset); int endLineNumberOffset = psiDocument.getLineStartOffset(endLine); endLineOffset = endOffset - endLineNumberOffset; diff --git 
a/sonar-python-plugin/src/main/java/org/sonar/plugins/python/PythonScanner.java b/sonar-python-plugin/src/main/java/org/sonar/plugins/python/PythonScanner.java index 0fcdf9eaa4..76197387bd 100644 --- a/sonar-python-plugin/src/main/java/org/sonar/plugins/python/PythonScanner.java +++ b/sonar-python-plugin/src/main/java/org/sonar/plugins/python/PythonScanner.java @@ -95,7 +95,7 @@ private void scanFile(InputFile inputFile) { try { visitorContext = new PythonVisitorContext(parser.parse(fileContent), pythonFile); pyFile = new org.sonar.python.frontend.PythonParser().parse(fileContent); - saveMeasures(inputFile, visitorContext, pyFile); + saveMeasures(inputFile, visitorContext, pyFile, fileContent); } catch (RecognitionException e) { visitorContext = new PythonVisitorContext(pythonFile, e); LOG.error("Unable to parse file: " + inputFile.toString()); @@ -162,12 +162,12 @@ private static NewIssueLocation newLocation(InputFile inputFile, NewIssue issue, return newLocation; } - private void saveMeasures(InputFile inputFile, PythonVisitorContext visitorContext, PyFile pyFile) { + private void saveMeasures(InputFile inputFile, PythonVisitorContext visitorContext, PyFile pyFile, String fileContent) { boolean ignoreHeaderComments = new PythonConfiguration(context.fileSystem().encoding()).getIgnoreHeaderComments(); FileMetrics fileMetrics = new FileMetrics(visitorContext, ignoreHeaderComments, pyFile); MetricsVisitor metricsVisitor = fileMetrics.metricsVisitor(); - cpdAnalyzer.pushCpdTokens(inputFile, visitorContext); + cpdAnalyzer.pushCpdTokens(inputFile, pyFile, fileContent); noSonarFilter.noSonarInFile(inputFile, metricsVisitor.getLinesWithNoSonar()); Set<Integer> linesOfCode = metricsVisitor.getLinesOfCode(); diff --git a/sonar-python-plugin/src/main/java/org/sonar/plugins/python/cpd/PythonCpdAnalyzer.java b/sonar-python-plugin/src/main/java/org/sonar/plugins/python/cpd/PythonCpdAnalyzer.java index 9741b84bb5..8fd804a82d 100.644 --- 
a/sonar-python-plugin/src/main/java/org/sonar/plugins/python/cpd/PythonCpdAnalyzer.java +++ b/sonar-python-plugin/src/main/java/org/sonar/plugins/python/cpd/PythonCpdAnalyzer.java @@ -19,56 +19,79 @@ */ package org.sonar.plugins.python.cpd; -import com.sonar.sslr.api.AstNode; -import com.sonar.sslr.api.GenericTokenType; -import com.sonar.sslr.api.Token; -import com.sonar.sslr.api.TokenType; -import java.util.List; +import com.intellij.openapi.editor.Document; +import com.intellij.psi.PsiElement; +import com.intellij.psi.tree.IElementType; +import com.jetbrains.python.PyTokenTypes; +import com.jetbrains.python.lexer.PythonIndentingLexer; +import com.jetbrains.python.psi.PyElementType; +import com.jetbrains.python.psi.PyFile; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; +import javax.annotation.CheckForNull; import org.sonar.api.batch.fs.InputFile; import org.sonar.api.batch.sensor.SensorContext; import org.sonar.api.batch.sensor.cpd.NewCpdTokens; -import org.sonar.python.PythonVisitorContext; -import org.sonar.python.TokenLocation; -import org.sonar.python.api.PythonTokenType; +import org.sonar.api.utils.log.Logger; +import org.sonar.api.utils.log.Loggers; +import org.sonar.python.frontend.PythonParser; +import org.sonar.python.frontend.PythonTokenLocation; public class PythonCpdAnalyzer { private final SensorContext context; + private static final Set<PyElementType> IGNORED_TOKEN_TYPES = new HashSet<>(Arrays.asList( + PyTokenTypes.LINE_BREAK, PyTokenTypes.DEDENT, PyTokenTypes.INDENT, PyTokenTypes.END_OF_LINE_COMMENT, PyTokenTypes.SPACE, PyTokenTypes.STATEMENT_BREAK)); + private static final Logger LOG = Loggers.get(PythonCpdAnalyzer.class); public PythonCpdAnalyzer(SensorContext context) { this.context = context; } - public void pushCpdTokens(InputFile inputFile, PythonVisitorContext visitorContext) { - AstNode root = visitorContext.rootTree(); - if (root != null) { - NewCpdTokens cpdTokens = context.newCpdTokens().onFile(inputFile); - List<Token> tokens = 
root.getTokens(); - for (int i = 0; i < tokens.size(); i++) { - Token token = tokens.get(i); - TokenType currentTokenType = token.getType(); - TokenType nextTokenType = i + 1 < tokens.size() ? tokens.get(i + 1).getType() : GenericTokenType.EOF; - // INDENT/DEDENT could not be completely ignored during CPD see https://docs.python.org/3/reference/lexical_analysis.html#indentation - // Just taking into account DEDENT is enough, but because the DEDENT token has an empty value, it's the - // preceding new line which is added in its place to create a difference - if (isNewLineWithIndentationChange(currentTokenType, nextTokenType) || !isIgnoredType(currentTokenType)) { - TokenLocation location = new TokenLocation(token); - cpdTokens.addToken(location.startLine(), location.startLineOffset(), location.endLine(), location.endLineOffset(), token.getValue()); + public void pushCpdTokens(InputFile inputFile, PyFile pyFile, String fileContent) { + Document document = getDocument(pyFile); + if (document == null) { + LOG.debug("Cannot complete CPD analysis: PSIDocument is null."); + return; + } + PythonIndentingLexer lexer = new PythonIndentingLexer(); + lexer.start(PythonParser.normalizeEol(fileContent)); + NewCpdTokens cpdTokens = context.newCpdTokens().onFile(inputFile); + IElementType prevTokenType = null; + while (lexer.getTokenType() != null) { + IElementType currentTokenType = lexer.getTokenType(); + // INDENT/DEDENT could not be completely ignored during CPD see https://docs.python.org/3/reference/lexical_analysis.html#indentation + // Just taking into account DEDENT is enough, but because the DEDENT token has an empty value, it's the + // following new line which is added in its place to create a difference + if (isNewLineWithIndentationChange(prevTokenType, currentTokenType) || !IGNORED_TOKEN_TYPES.contains(currentTokenType)) { + int tokenEnd = lexer.getTokenEnd(); + String tokenText = lexer.getTokenText(); + if (currentTokenType == PyTokenTypes.LINE_BREAK) { + tokenText 
= "\n"; + tokenEnd = lexer.getTokenStart() + 1; } + PythonTokenLocation location = new PythonTokenLocation(lexer.getTokenStart(), tokenEnd, document); + cpdTokens.addToken(location.startLine(), location.startLineOffset(), location.endLine(), location.endLineOffset(), tokenText); } - cpdTokens.save(); + prevTokenType = currentTokenType; + lexer.advance(); } + + cpdTokens.save(); } - private static boolean isNewLineWithIndentationChange(TokenType currentTokenType, TokenType nextTokenType) { - return currentTokenType.equals(PythonTokenType.NEWLINE) && nextTokenType.equals(PythonTokenType.DEDENT); + private static boolean isNewLineWithIndentationChange(@CheckForNull IElementType prevTokenType, IElementType currentTokenType) { + return prevTokenType != null && prevTokenType == PyTokenTypes.DEDENT && currentTokenType == PyTokenTypes.LINE_BREAK; } - private static boolean isIgnoredType(TokenType type) { - return type.equals(PythonTokenType.NEWLINE) || - type.equals(PythonTokenType.DEDENT) || - type.equals(PythonTokenType.INDENT) || - type.equals(GenericTokenType.EOF); + @CheckForNull + private static Document getDocument(PyFile pyFile) { + PsiElement root = pyFile.getFirstChild(); + if (root == null) { + return null; + } + return root.getContainingFile().getViewProvider().getDocument(); } } diff --git a/sonar-python-plugin/src/test/java/org/sonar/plugins/python/cpd/PythonCpdAnalyzerTest.java b/sonar-python-plugin/src/test/java/org/sonar/plugins/python/cpd/PythonCpdAnalyzerTest.java index 2d7bcf8a6f..4ec0df0492 100644 --- a/sonar-python-plugin/src/test/java/org/sonar/plugins/python/cpd/PythonCpdAnalyzerTest.java +++ b/sonar-python-plugin/src/test/java/org/sonar/plugins/python/cpd/PythonCpdAnalyzerTest.java @@ -19,8 +19,8 @@ */ package org.sonar.plugins.python.cpd; +import com.jetbrains.python.psi.PyFile; import java.io.File; -import java.nio.charset.StandardCharsets; import java.nio.file.Paths; import java.util.List; import java.util.stream.Collectors; @@ -32,8 +32,7 @@ 
import org.sonar.api.batch.sensor.internal.SensorContextTester; import org.sonar.plugins.python.Python; import org.sonar.plugins.python.TestUtils; -import org.sonar.python.PythonVisitorContext; -import org.sonar.python.TestPythonVisitorRunner; +import org.sonar.python.frontend.PythonParser; import static java.nio.charset.StandardCharsets.UTF_8; import static org.assertj.core.api.Assertions.assertThat; @@ -46,9 +45,21 @@ public class PythonCpdAnalyzerTest { @Test public void code_chunks_2() { - DefaultInputFile inputFile = inputFile("code_chunks_2.py"); - PythonVisitorContext visitorContext = TestPythonVisitorRunner.createContext(inputFile.path().toFile()); - cpdAnalyzer.pushCpdTokens(inputFile, visitorContext); + File file = new File(BASE_DIR, "code_chunks_2.py"); + + String content = TestUtils.fileContent(file, UTF_8); + DefaultInputFile inputFile = TestInputFileBuilder.create("moduleKey", file.getName()) + .setModuleBaseDir(Paths.get(BASE_DIR)) + .setCharset(UTF_8) + .setType(InputFile.Type.MAIN) + .setLanguage(Python.KEY) + .initMetadata(content) + .build(); + + context.fileSystem().add(inputFile); + + PyFile pyFile = new PythonParser().parse(content); + cpdAnalyzer.pushCpdTokens(inputFile, pyFile, content); List<TokensLine> lines = context.cpdTokens("moduleKey:code_chunks_2.py"); assertThat(lines).isNotNull().hasSize(25); @@ -89,19 +100,4 @@ public void code_chunks_2() { "[itemforiteminitems]"); } - private DefaultInputFile inputFile(String fileName) { - File file = new File(BASE_DIR, fileName); - - DefaultInputFile inputFile = TestInputFileBuilder.create("moduleKey", file.getName()) - .setModuleBaseDir(Paths.get(BASE_DIR)) - .setCharset(UTF_8) - .setType(InputFile.Type.MAIN) - .setLanguage(Python.KEY) - .initMetadata(TestUtils.fileContent(file, StandardCharsets.UTF_8)) - .build(); - - context.fileSystem().add(inputFile); - - return inputFile; - } }