In [None]:
import import_ipynb
import functions
from functions import gini, create_side_by_side_plot, highlight, highlight_no_rating, show_table_statistics, show_table_statistics_no_rating, group_by_components, group_by_ratings, show
from functions import colors, sizeRatings, complexityRatings

In [None]:
import json
from string import Template

import pandas as pd
import plotly.graph_objs as go
import pygal as pg
from IPython.core.display import display, Javascript, HTML

# Load the `cypher` IPython extension; it provides the %cypher / %%cypher magics used below.
%load_ext cypher
# NOTE(review): the Neo4j user and password are embedded in this URI — consider reading them from the environment.
%config CypherMagic.uri='http://neo4j:neo@localhost:7474/db/data'

# Maven group id and artifact id of the analysed parent project, read from environment variables.
group_id=%env PARENT_GROUP_ID
artifact_id=%env PARENT_ARTIFACT_ID

# HTML page skeleton used to embed a rendered pygal chart in the notebook output.
# The `{rendered_chart}` placeholder is filled via `base_html.format(rendered_chart=...)`.
# The second <script> tag must carry exactly one closing quote after the `src` value;
# a doubled quote makes the attribute list malformed.
base_html = """
<!DOCTYPE html>
<html>
  <head>
  <script type="text/javascript" src="http://kozea.github.com/pygal.js/javascripts/svg.jquery.js"></script>
  <script type="text/javascript" src="https://kozea.github.io/pygal.js/2.0.x/pygal-tooltips.min.js"></script>
  </head>
  <body>
    <figure>
      {rendered_chart}
    </figure>
  </body>
</html>
"""

In [None]:
# Register the locally served lodash and d3 (v4) bundles with RequireJS under the
# module names "lodash" and "d3".
display(Javascript("""require.config({
    paths: {
        lodash: "/notebooks/vis/lib/lodash.min",  
        d3: "/notebooks/vis/lib/d3.v4.min"
    }
});"""))

In [None]:
# Load chord.css.html and chord.js into the notebook page; the `chord` module is
# required by the component dependency diagram in the "System Overview" section.
display(HTML(filename='chord.css.html'))
display(Javascript(filename='chord.js'))

# Proof of Concept - Analysis of the Biojava Software System

## Introduction

This analysis is based on the metrics used by the Software Improvement Group as defined in
* <a href="https://www.softwareimprovementgroup.com/wp-content/uploads/2021-SIG-TUViT-Evaluation-Criteria-Trusted-Product-Maintainability-Guidance-for-producers.pdf">SIG/TÜViT Evaluation Criteria Trusted Product Maintainability: Guidance for producers Version 13.0</a>

Detailed information about the metrics can be found in
* <a href="https://www.softwareimprovementgroup.com/wp-content/uploads/Building_Maintainable_Software_SIG_Java.compressed.pdf">Building Maintainable Software - Ten Guidelines for Future-Proof Code</a>

This analysis is carried out on three different levels of abstraction, i.e.:

1. Component - Top-Level Maven Modules inside the Project Reactor
2. Module - Java classes contained directly or indirectly via components and their child components, respectively
3. Unit - Methods inside Modules

# Preparation

For easier query writing, all Biojava files are labeled as :Biojava using the configured group id.

In [None]:
# Label every main artifact whose group starts with the configured group id, and
# every node those artifacts contain, as :Biojava. Returns the number of distinct
# artifacts and contained elements that were matched.
files = %cypher \
MATCH (a:Main:Artifact) \
WHERE a.group STARTS WITH "$group_id" \
SET   a:Biojava \
WITH  a \
OPTIONAL MATCH (a)-[:CONTAINS]->(any) \
SET   any:Biojava \
RETURN  count(DISTINCT a) AS Artifacts, count(DISTINCT any) AS Elements
        
files        

## PMD

* PMD is used as a Maven Build-Plugin to calculate metrics on class and method-level for this PoC
* Used metrics are:
  * NCSS Line Count (Non Commenting Source Statements)
  * Cyclomatic Complexity

In [None]:
%%cypher
// Link class-level PMD violations (those without a `method` property) to the
// :Biojava type node whose fqn equals the violation's package + "." + className.
MATCH (v:Pmd:Violation),
      (t:Biojava:Type:Java{fqn: v.package + "." + v.className})
WHERE NOT exists(v.method)    
MERGE (t)-[:HAS_VIOLATION]->(v)

In [None]:
%%cypher
// Sanity check: list the class-level violations (no `method` property) that are not
// linked to any type node. An empty result means all of them were mapped.
MATCH  (v:Pmd:Violation)
WHERE  NOT exists(v.method) AND NOT (:Type)-[:HAS_VIOLATION]->(v)
RETURN v.package AS Package, v.className AS Class, v.message AS Violation

In [None]:
%%cypher
// Link method-level PMD violations to method nodes by comparing the method name and
// the parameter type list that is contained in the violation message.
MATCH  (v:Pmd:Violation),
       (t:Biojava:Type{fqn: v.package + "." + v.className}),
       (t)-[:DECLARES]->(m:Method)
// Take the part of the type name after the last "$" and substitute it for "<init>",
// so that constructors can be compared with v.method.
WITH   v, t, m, size(split(t.name, "$")) AS l
WITH   v, t, m, split(t.name, "$")[l - 1] AS TypeName
WITH   v, t, m, replace(m.name, "<init>", TypeName) AS MethodName
WHERE  v.method = MethodName
// Build "(<type>, <type>, ...)'" from the parameter type names (part after the last "$"),
// ordered by parameter index.
OPTIONAL MATCH (m)-[:HAS]->(p:Parameter)-[:OF_TYPE]->(pType:Type)
WITH   v, t, m, p, pType, size(split(pType.name, "$")) AS l
WITH   v, t, m, p, split(pType.name,"$")[l - 1] AS ParamType
ORDER BY t, m, p.index ASC, ParamType
WITH   v, t, m, collect(ParamType) AS params
WITH   v, t, m, REDUCE(mergedParams = "",param IN params | mergedParams + CASE WHEN mergedParams = '' THEN '' ELSE ', ' END + param) AS mergedParams
WITH   v, t, m, "(" + mergedParams + ")'" AS ParamList
// The message is compared with every "..." removed; the link is only created when it
// contains the parameter list built above.
WITH   v, t, m, ParamList, replace(v.message, "...", "") AS Message
WHERE  Message CONTAINS ParamList
MERGE  (m)-[:HAS_VIOLATION]->(v)

In [None]:
%%cypher
// Link method-level PMD violations to method nodes by parameter count: a link is
// created only when exactly one method of the type has the violation's method name
// and as many parameters as the message lists.
MATCH  (v:Pmd:Violation),
       (t:Biojava:Type{fqn: v.package + "." + v.className}),
       (t)-[:DECLARES]->(m:Method)
// Same name normalisation as for the signature-based linking: "<init>" is replaced by
// the part of the type name after the last "$".
WITH   v, t, m, size(split(t.name, "$")) AS l
WITH   v, t, m, split(t.name, "$")[l - 1] AS TypeName
WITH   v, t, m, replace(m.name, "<init>", TypeName) AS MethodName
WHERE  v.method = MethodName
OPTIONAL MATCH (m)-[:HAS]->(p:Parameter)-[:OF_TYPE]->(pType:Type)
WITH   v, t, m, count(p) AS ParameterCount
WITH   v, t, m, ParameterCount
// The text between the first "(" and the following ")'" of the message is taken as the
// parameter list; an empty text counts as zero parameters.
WITH   v, t, m, ParameterCount, split(split(v.message, "(")[1], ")'")[0] AS MessageParameters
WITH   v, t, m, ParameterCount, CASE WHEN size(MessageParameters) = 0 THEN 0 ELSE size(split(MessageParameters, ",")) END AS MessageParameterCount
WHERE  ParameterCount = MessageParameterCount
WITH   v, t, collect(m) AS Candidates
WHERE  size(Candidates) = 1
UNWIND Candidates AS m
MERGE  (m)-[:HAS_VIOLATION]->(v)

In [None]:
%%cypher
// Sanity check: list the method-level violations (with a `method` property) that are
// not linked to any method node. An empty result means all of them were mapped.
MATCH  (v:Pmd:Violation)
WHERE  exists(v.method) AND NOT (:Method)-[:HAS_VIOLATION]->(v)
RETURN v.package AS Package, v.className AS Class, v.method AS Method, v.message AS Violation

In [None]:
%%cypher
// Copy the PMD cyclomatic complexity to the :Java node a CyclomaticComplexity violation
// is linked to (type nodes as well as method nodes). The value is the first
// space-delimited token after " complexity of " in the violation message.
MATCH  (j:Java)-[:HAS_VIOLATION]->(v:Pmd:Violation{rule: "CyclomaticComplexity"})
WITH   split(v.message, " complexity of ") AS parts, j
WITH   split(parts[1], " ")[0] AS complexity, j
SET    j.pmdCyclomaticComplexity = toInteger(complexity)

In [None]:
%%cypher
// Copy the PMD NCSS line count to the :Java node an NcssCount violation is linked to
// (type nodes as well as method nodes). The value is the first space-delimited token
// after " line count of " in the violation message.
MATCH (j:Java)-[:HAS_VIOLATION]->(v:Pmd:Violation{rule: "NcssCount"})
WITH   split(v.message, " line count of ") AS parts, j
WITH   split(parts[1], " ")[0] AS ncss, j
SET    j.ncssLineCount = toInteger(ncss)

## SIG

* The Software Improvement Group defines top-level maven modules as components
  * The same notion will be used in this PoC

In [None]:
%%cypher
// Create (or reuse) one :Biojava:SIG:Component node per direct child module of the
// configured parent Maven project. Components are identified by the module name.
MATCH (m:Maven:Project{groupId: $group_id, artifactId: $artifact_id}),
      (m)-[:HAS_MODULE]->(child:Maven:Project)  
MERGE (c:Biojava:SIG:Component{name: child.name})
RETURN c.name AS ComponentName

In [None]:
%%cypher
// Map all types contained directly in a component to the component: the Maven project
// with the component's name creates a main artifact, and the types of that artifact
// are attached to the component via CONTAINS.
MATCH (c:Biojava:SIG:Component),
      (m:Maven:Project{name: c.name})-[:CREATES]->(:Biojava:Main:Artifact)-[:CONTAINS]->(t:Biojava:Type:Java)
MERGE (c)-[:CONTAINS]->(t)

In [None]:
%%cypher
// Map all types of (transitive) child modules of a component to the component: the
// HAS_MODULE relations are followed over any depth before the created artifact and
// its types are resolved.
MATCH (c:Biojava:SIG:Component),
      (m:Maven:Project{name: c.name})-[:HAS_MODULE*]->()-[:CREATES]->(:Biojava:Main:Artifact)-[:CONTAINS]->(t:Biojava:Type:Java)
MERGE (c)-[:CONTAINS]->(t)

In [None]:
%%cypher
// Aggregate the type-level DEPENDS_ON relations to component level; the component-level
// weight is the sum of the type-level weights.
// The relation is merged on its end points only and the weight is set afterwards, so
// re-running the cell with changed type-level weights updates the existing relation
// instead of adding a second DEPENDS_ON relation with a different weight.
// Dependencies between two types of the same component result in a relation from the
// component to itself (c1 = c2).
MATCH (c1:Biojava:SIG:Component)-[:CONTAINS]->(t1:Biojava:Type:Java),
      (c2:Biojava:SIG:Component)-[:CONTAINS]->(t2:Biojava:Type:Java),
      (t1)-[d:DEPENDS_ON]->(t2)
WITH  c1, c2, sum(d.weight) AS weight
MERGE (c1)-[r:DEPENDS_ON]->(c2)
SET   r.weight = weight

* The following components and the dependencies between them were identified:

In [None]:
%%cypher
// Update the DEPENDS_ON relations with the number of method invocations between components
// and list every component dependency with its weight and invocation count.
MATCH (c1:Biojava:SIG:Component)-[:CONTAINS]->(t1:Biojava:Type:Java)-[:DECLARES]->(m1:Method),
      (c2:Biojava:SIG:Component)-[:CONTAINS]->(t2:Biojava:Type:Java)-[:DECLARES]->(m2:Method),
      (m1)-[i:INVOKES]->(m2)
WITH c1, c2, count(i) AS methodInvocations
MATCH (c1)-[d:DEPENDS_ON]->(c2)
SET d.methodInvocations = methodInvocations
RETURN c1.name AS SourceComponent, c2.name AS TargetComponent, d.weight AS Weight, d.methodInvocations AS MethodInvocations 
ORDER BY SourceComponent, TargetComponent

# System Overview

In [None]:
# Load the component-to-component dependencies with their method invocation counts
# (source, target, count); components labelled :Spring are excluded.
componentDependencies = %cypher \
MATCH (c1:SIG:Component)-[d:DEPENDS_ON]->(c2:SIG:Component) \
WHERE NOT c1:Spring AND NOT c2:Spring \
RETURN c1.name AS Source, c2.name AS Target, d.methodInvocations AS X_Count

In [None]:
# Serialise the dependency table as CSV and embed it into the JavaScript call as a
# string literal. json.dumps() produces a valid JavaScript string literal: besides line
# breaks it also escapes double quotes and backslashes, which can occur as soon as a
# CSV field has to be quoted.
componentDependenciesCsv = json.dumps(
    componentDependencies.get_dataframe().to_csv(index=False).replace("\r\n", "\n")
)

# Container element for the tooltip, followed by the chord diagram itself.
display(HTML("<div id='tooltip'/>"))
display(Javascript("""
(function(element){
    require(['chord'], function(chord) {
        chord(element.get(0), %s)
    });
})(element);
""" % componentDependenciesCsv))

# Metrics

## #1 Volume

### Goal
Keep Your Codebase Small

### Area
System

### Recommendation
Limit the Size of a Java-System to 35 man years (324.000 LoC)

### Calculation
The NCSS Line Count (Non Commenting Source Statements) calculated by PMD is used to calculate the total system size.

### Result

In [None]:
# Total system volume: sum of the NCSS line counts stored on the :Biojava Java types.
volume = %cypher \
MATCH  (t:Biojava:Type:Java) \
RETURN sum(t.ncssLineCount) AS Java_LoC

volume

## #2  Duplication

### Goal
Write Code Once

### Area
Method

### Recommendation
Limit the duplication of code by extracting common functionality and reuse.

### Calculation
Calculated by SonarQube, thus not covered in this PoC.

## #3 Unit Size

### Goal
Write Short Units of Code

### Area
Method

### Recommendation
Limit the length of code units to 15 lines.

In [None]:
# Threshold table for the unit size metric, one row per rating.
# NOTE(review): 58.7 ("at least") + 42.3 ("at most") exceeds 100 — verify both values
# against the referenced SIG/TÜViT table.
data = {
    "Size": ["<= 15 LoC", "> 15 LoC", "> 30 LoC", "> 60 LoC"],
    "Rating": ["Small", "Medium", "Large", "ExtraLarge"],
    "Rule": ["at least", "at most", "at most", "at most"],
    "Threshold": [58.7, 42.3, 18.5, 5.4],
}

unitSizeThreshold = pd.DataFrame.from_dict(data)
unitSizeThreshold

### Calculation
The NCSS Line Count (Non Commenting Source Statements) on method level calculated by PMD is used to rate the single methods.
Statistics are shown on method-level and aggregated and shown per component.

### Result

In [None]:
# One row per method that has an NCSS line count, rated by that count
# (<= 15 Small, <= 30 Medium, <= 60 Large, otherwise ExtraLarge), largest methods first.
unitSizeTable = %cypher \
MATCH (c:SIG:Component)-[:CONTAINS]->(t:Biojava:Type:Java)-[:DECLARES]->(m:Method) \
WHERE exists(m.ncssLineCount) \
RETURN CASE \
  WHEN m.ncssLineCount <= 15 THEN "Small" \
  WHEN m.ncssLineCount > 15 AND m.ncssLineCount <= 30 THEN "Medium" \
  WHEN m.ncssLineCount > 30 AND m.ncssLineCount <= 60 THEN "Large" \
  ELSE "ExtraLarge" \
END AS Rating, c.name AS Component,  m.ncssLineCount AS LineCount, t.fqn AS Type, m.signature AS Method, 1 AS MethodCount \
ORDER BY LineCount DESC

# NOTE(review): show() comes from the local `functions` notebook; its result is reused
# by the summary section — presumably the aggregated distribution, confirm.
unitLengthDistribution = show(unitSizeTable.get_dataframe(), sizeRatings, "MethodCount")    

Visualization of the Unit Size metrics:

* The color visualizes the size of methods. On a package level, it shows where the largest methods are located.
* The size of the elements is determined by the sum of the sizes of the (recursively) contained methods

In [None]:
# Element/parent rows for the unit size treemap. For a type, Size is the sum and Color
# the maximum of its methods' NCSS line counts; for a package, Size is the sum over all
# (recursively) contained types and Color the largest method line count among them.
sizeTree = %cypher \
MATCH (:Main:Artifact)-[:CONTAINS]->(e:Biojava:Java) \
WHERE (e:Type OR e:Package) \
       AND NOT ()-[:DECLARES]->(e) \
       AND e.fqn STARTS WITH "$group_id" \
OPTIONAL MATCH (e)-[:DECLARES]->(m:Method) \
WHERE exists(m.ncssLineCount) \
WITH e, max(m.ncssLineCount) AS complexity, sum(m.ncssLineCount) AS size \
OPTIONAL MATCH (parent:Package)-[:CONTAINS]->(e) \
WITH e, parent, complexity, size \
OPTIONAL MATCH (e)-[:CONTAINS*]->(t:Type)-[:DECLARES]->(m:Method) \
WHERE e:Package AND exists(m.ncssLineCount) \
WITH e, parent, t, size, complexity, max(m.ncssLineCount) AS typeComplexity, sum(m.ncssLineCount) AS typeLength \
WITH e.fqn AS Element, parent.fqn AS Parent, CASE e:Package WHEN true THEN sum(typeLength) ELSE size END AS size, CASE e:Package WHEN true THEN max(typeComplexity) ELSE complexity END AS complexity \
RETURN DISTINCT Element, Parent, size as Size, complexity as Color \
ORDER BY complexity DESC
            
import plotly.express as px

# Treemap of packages and types: the area is taken from the Size column and the colour
# from the Color column of the query result. The result is converted to a DataFrame
# once and that frame is passed to plotly.
df = sizeTree.get_dataframe()
fig = px.treemap(df, names = 'Element', parents = 'Parent', values = 'Size', color= 'Color')
fig.show()

## #4 Unit Complexity

### Goal
Write Simple Units of Code

### Area
Method

### Recommendation
Limit the number of branch points inside a method to 4.

In [None]:
# Threshold table for the unit complexity metric, one row per rating.
# NOTE(review): 81.6 ("at least") + 19.4 ("at most") exceeds 100 — verify both values
# against the referenced SIG/TÜViT table.
data = {
    "Complexity": ["<= 5", "> 5", "> 10", "> 25"],
    "Rating": ["Low", "Medium", "High", "VeryHigh"],
    "Rule": ["at least", "at most", "at most", "at most"],
    "Threshold": [81.6, 19.4, 6.6, 0.8],
}

unitComplexityThreshold = pd.DataFrame.from_dict(data)
unitComplexityThreshold

### Calculation
The McCabe Complexity (1 + Cyclomatic Complexity of Implementation (= Branch Points, ...)) is used for calculation.
For that, the Cyclomatic Complexity calculated by PMD on method-level is used. 

Additionally, the NCSS Line Count (Non Commenting Source Statements) calculated by PMD on method-level is used to compute the line percentage per rating.

### Results

In [None]:
# One row per method that has both a PMD cyclomatic complexity and an NCSS line count,
# rated by complexity (<= 5 Low, <= 10 Medium, <= 25 High, otherwise VeryHigh),
# most complex methods first.
unitComplexityTable = %cypher \
MATCH (c:SIG:Component)-[:CONTAINS]->(t:Biojava:Type:Java)-[:DECLARES]->(m:Method) \
WHERE exists(m.pmdCyclomaticComplexity) AND exists(m.ncssLineCount) \
RETURN CASE \
  WHEN m.pmdCyclomaticComplexity <= 5 THEN "Low" \
  WHEN m.pmdCyclomaticComplexity > 5 AND m.pmdCyclomaticComplexity <= 10 THEN "Medium" \
  WHEN m.pmdCyclomaticComplexity > 10 AND m.pmdCyclomaticComplexity <= 25 THEN "High" \
  ELSE "VeryHigh" \
END AS Rating, c.name AS Component, m.pmdCyclomaticComplexity AS Complexity, t.fqn AS Type, m.signature AS Method, 1 AS MethodCount, m.ncssLineCount AS LineCount \
ORDER BY Complexity DESC

# NOTE(review): show() comes from the local `functions` notebook; its result is reused
# by the summary section — presumably the aggregated distribution, confirm.
unitComplexityDistribution = show(unitComplexityTable.get_dataframe(), complexityRatings, "MethodCount")    

Visualization of the Unit Complexity metrics:

* The color visualizes the complexity of methods. On a package level, it shows where the most-complex methods are located.
* The size of the elements is determined by the sum of the sizes (LoC) of the (recursively) contained methods

In [None]:
# Element/parent rows for the unit complexity treemap. For a type, Size is the sum of
# its methods' NCSS line counts and Color the sum of their cyclomatic complexities; for
# a package, Size is the sum over all (recursively) contained types and Color the
# largest per-type complexity sum among them.
complexityTree = %cypher \
MATCH (:Main:Artifact)-[:CONTAINS]->(e:Biojava:Java) \
WHERE (e:Type OR e:Package) \
       AND NOT ()-[:DECLARES]->(e) \
       AND e.fqn STARTS WITH "$group_id" \
OPTIONAL MATCH (e)-[:DECLARES]->(m:Method) \
WHERE exists(m.pmdCyclomaticComplexity) \
WITH e, sum(m.pmdCyclomaticComplexity) AS complexity, sum(m.ncssLineCount) AS size \
OPTIONAL MATCH (parent:Package)-[:CONTAINS]->(e) \
WITH e, parent, complexity, size \
OPTIONAL MATCH (e)-[:CONTAINS*]->(t:Type)-[:DECLARES]->(m:Method) \
WHERE e:Package AND exists(m.pmdCyclomaticComplexity) \
WITH e, parent, t, size, complexity, sum(m.pmdCyclomaticComplexity) AS typeComplexity, sum(m.ncssLineCount) AS typeLength \
WITH e.fqn AS Element, parent.fqn AS Parent, CASE e:Package WHEN true THEN sum(typeLength) ELSE size END AS size, CASE e:Package WHEN true THEN max(typeComplexity) ELSE complexity END AS complexity \
RETURN DISTINCT Element, Parent, size as Size, complexity as Color \
ORDER BY complexity DESC
            
import plotly.express as px

# Treemap of packages and types: the area is taken from the Size column and the colour
# from the Color column of the query result. The result is converted to a DataFrame
# once and that frame is passed to plotly.
df = complexityTree.get_dataframe()
fig = px.treemap(df, names = 'Element', parents = 'Parent', values = 'Size', color= 'Color')
fig.show()

## #5 Unit Interfacing

### Goal
Keep Unit Interfaces Small

### Area
Method

### Recommendation
Limit the number of parameters to at most 4

In [None]:
# Threshold table for the unit interfacing metric, one row per rating.
data = {
    "Size": ["<= 2 Parameters", "> 2 Parameters", "> 4 Parameters", "> 7 Parameters"],
    "Rating": ["Small", "Medium", "Large", "ExtraLarge"],
    "Rule": ["at least", "at most", "at most", "at most"],
    "Threshold": [85.9, 14.1, 2.8, 0.7],
}

unitInterfacingThreshold = pd.DataFrame.from_dict(data)
unitInterfacingThreshold

### Calculation
The number of parameters per method is summed.

Additionally, the NCSS Line Count (Non Commenting Source Statements) calculated by PMD on method-level is used to compute the line percentage per rating.

### Results

In [None]:
unitInterfacingTable = %cypher \
MATCH (c:SIG:Component)-[:CONTAINS]->(t:Biojava:Type:Java)-[:DECLARES]->(m:Method), \
      (m)-[:HAS]->(p:Parameter) \
WHERE exists(m.ncssLineCount) AND exists(m.pmdCyclomaticComplexity) \
WITH c, t, m, count(p) AS Parameters \
RETURN Parameters, CASE \
  WHEN Parameters <= 2 THEN "Small" \
  WHEN Parameters > 2 AND Parameters <= 4 THEN "Medium" \
  WHEN Parameters > 4 AND Parameters <= 7 THEN "Large" \
  ELSE "ExtraLarge" \
END AS Rating, c.name AS Component, t.fqn AS Type, m.signature AS Method, m.ncssLineCount AS LineCount, 1 AS MethodCount \
ORDER BY Parameters DESC

unitInterfacingDistribution = show(unitInterfacingTable.get_dataframe(), sizeRatings, "MethodCount")

## #6 Module Coupling

### Goal
Separate Concerns in Modules

### Area
Class

### Recommendation
Avoid large modules in order to achieve loose coupling between them.

In [None]:
# Threshold table for the module coupling metric (fan-in), one row per rating.
data = {
    "Fan-In": ["<= 10", "> 10", "> 20", "> 50"],
    "Rating": ["Low", "Medium", "High", "VeryHigh"],
    "Rule": ["at least", "at most", "at most", "at most"],
    "Threshold": [87.5, 12.5, 7.1, 2.4],
}

moduleCouplingThreshold = pd.DataFrame.from_dict(data)
moduleCouplingThreshold

### Results

In [None]:
moduleCouplingTable = %cypher \
MATCH (t1:Biojava:Type:Java)-[:DECLARES]->(m1:Method)-[i:INVOKES]->(m2:Method)<-[:DECLARES]-(t2:Biojava:Type:Java), \
      (c:Component)-[:CONTAINS]->(t2) \
WHERE exists(t2.ncssLineCount) \
WITH c, t2, count(i) AS FanIn \
RETURN CASE \
  WHEN FanIn <= 10 THEN "Low" \
  WHEN FanIn > 10 AND FanIn <= 20 THEN "Medium" \
  WHEN FanIn > 20 AND FanIn <= 50 THEN "High" \
  ELSE "VeryHigh" \
END AS Rating, c.name AS Component, FanIn, t2.fqn AS Type, t2.ncssLineCount AS LineCount, 1 AS TypeCount \
ORDER BY FanIn DESC

moduleCouplingDistribution = show(moduleCouplingTable.get_dataframe(), complexityRatings, "TypeCount")

## #7 Component Balance

### Goal
Keep Architecture Components Balanced

### Area
Components

### Recommendation
Balance the number and relative size of top-level components.
The Gini-coefficient describing the size balance should be <= 0.76.

### Calculation
The NCSS Line Count (Non Commenting Source Statements) of the Java classes is summed per component.
Afterward, the Gini-coefficient is calculated on this basis.

### Results

In [None]:
# Size per component: sum of the NCSS line counts of the Java types it contains,
# largest component first.
sizes = %cypher \
MATCH (c:SIG:Component)-[:CONTAINS]->(t:Biojava:Type:Java) \
WITH c, sum(t.ncssLineCount) AS Size \
RETURN DISTINCT c.name AS Component, Size \
ORDER BY Size DESC
    
sizes    

In [None]:
# Gini coefficient over the per-component sizes; gini() is provided by the local
# `functions` notebook and receives the "Size" column as a NumPy array.
df = sizes.get_dataframe()["Size"]
giniCoefficient = gini(df.to_numpy())

display("Gini Coefficient is {}".format(giniCoefficient))

## #8 Component Independence

### Goal
Couple Architecture Components Loosely

### Area
Components

### Recommendation
Achieve loose coupling between top-level components. The percentage of code residing in modules (classes) with incoming cross-component dependencies should be below 8.7%.

### Calculation
Types are categorized into hidden (component-private) and interface (component-public) code.
For each category, the percentage of code is calculated using the NCSS Line Count (Non Commenting Source Statements) calculated by PMD.

### Result

In [None]:
# Hidden code per component: number and NCSS lines of the types that no type of another
# component depends on (no dependent component at all, or only the own component),
# together with the component's total type and line count.
# NOTE(review): a component without any hidden type is removed by the WHERE clause and
# is then also missing from the left-merged table built from this result — confirm
# that this is intended.
hiddenCodeDistribution = %cypher \
MATCH (c1:SIG:Component)-[:CONTAINS]->(t:Biojava:Type:Java) \
WITH c1, t \
OPTIONAL MATCH \
      (c2:SIG:Component)-[:CONTAINS]->(d:Biojava:Type:Java), \
      (d)-[:DEPENDS_ON]->(t) \
WITH c1, t, collect(DISTINCT c2) AS DependentModules \
WHERE size(DependentModules) = 0 OR (size(DependentModules) = 1 AND c1 IN DependentModules) \
WITH c1, count(t) AS CountHiddenTypes, sum(t.ncssLineCount) AS CountHiddenLines \
MATCH (c1)-[:CONTAINS]->(t1:Biojava:Type:Java) \
RETURN c1.name AS Component, count(DISTINCT t1) AS TypeCount, sum(t1.ncssLineCount) AS LineCount, CountHiddenTypes, CountHiddenLines


# Interface code per component: number and NCSS lines of the types that at least one
# type of another component depends on, together with the component's total type count.
interfaceCodeDistribution = %cypher \
MATCH (c1:SIG:Component)-[:CONTAINS]->(t:Biojava:Type:Java) \
WITH c1, t \
OPTIONAL MATCH \
      (c1)-[:CONTAINS]->(t:Biojava:Type:Java), \
      (c2:SIG:Component)-[:CONTAINS]->(d:Biojava:Type:Java), \
      (d)-[:DEPENDS_ON]->(t) \
WITH c1, t, collect(DISTINCT c2) AS DependentModules \
WHERE size(DependentModules) > 1 OR (size(DependentModules) = 1 AND NOT c1 IN DependentModules) \
WITH c1, count(t) AS CountInterfaceTypes, sum(t.ncssLineCount) AS CountInterfaceLines \
MATCH (c1)-[:CONTAINS]->(t1:Biojava:Type:Java) \
RETURN c1.name AS Component, count(DISTINCT t1) AS TypeCount, CountInterfaceTypes, CountInterfaceLines

# Combine hidden and interface code per component. The merge uses the columns both
# results share; components without interface code get 0 for the interface columns.
interfaceAndHiddenCode = pd.merge(hiddenCodeDistribution.get_dataframe(), interfaceCodeDistribution.get_dataframe(), how='left').fillna(0)

display(interfaceAndHiddenCode)

# Bar traces per component: hidden vs. interface, once counted in types and once in lines.
trace1 = go.Bar(x=interfaceAndHiddenCode["Component"], y=interfaceAndHiddenCode["CountHiddenTypes"], name="Hidden Types", marker_color=colors["Small"])
trace2 = go.Bar(x=interfaceAndHiddenCode["Component"], y=interfaceAndHiddenCode["CountInterfaceTypes"], name="Interface Types", marker_color=colors["Medium"])
trace3 = go.Bar(x=interfaceAndHiddenCode["Component"], y=interfaceAndHiddenCode["CountHiddenLines"], name="Hidden Lines", marker_color=colors["Small"])
trace4 = go.Bar(x=interfaceAndHiddenCode["Component"], y=interfaceAndHiddenCode["CountInterfaceLines"], name="Interface Lines", marker_color=colors["Medium"])

fig1 = go.Figure(data=[trace1, trace2])
fig1.update_layout(title="Hidden and Interface Types per Component", xaxis_title="Component", yaxis_title="Number of Types", barmode="stack")
fig1.show()

# The second figure plots line counts, so its y axis is labelled accordingly.
fig2 = go.Figure(data=[trace3, trace4])
fig2.update_layout(title="Hidden and Interface Lines per Component", xaxis_title="Component", yaxis_title="Number of Lines", barmode="stack")
fig2.show()

#### Stability Metrics

* Calculation of the complexity metrics as defined by Robert C. Martin on the level of components
  * Efferent Coupling (Ce),
    * number of outgoing dependencies (Fan-Out) of a component,
  * Afferent Coupling (Ca),
    * number of incoming dependencies (Fan-In) of a component,
  * Instability (I) = Ce / (Ce + Ca),
    * stability of a component against changes to other components (smaller = more stable),
    * but: the more stable, the more difficult a component is to change due to many dependent components,
  * Abstractness (A) = Na / Nc,
    * percentage of abstract types in the component,
  * Distance (D) = |A + I - 1|,
    * distance to the optimal relation between abstractness and instability (larger = worse)
    
    
* Zone of Pain,
  * stable (small I) and concrete (small A),
    * Changes to these components lead to many changes in dependent components,
  * Zone of Uselessness,
    * unstable (large I) and abstract (large A),
    * Provided components have no usage    

##### Stability Metrics on Component-level

In [None]:
# Instability per component: efferent coupling is the summed methodInvocations of the
# outgoing DEPENDS_ON relations to other components, afferent coupling the same for
# incoming relations. Components without any coupling are filtered out, which also
# avoids a division by zero.
module_instability = %cypher \
MATCH   (c1:SIG:Component)-[:CONTAINS]->(t:Biojava:Type:Java) \
WITH    DISTINCT c1 \
OPTIONAL MATCH    (c1)-[d:DEPENDS_ON]->(c2:SIG:Component) \
WHERE   c1 <> c2 \
WITH    c1, sum(d.methodInvocations) AS EfferentCoupling \
OPTIONAL MATCH    (c1)<-[d:DEPENDS_ON]-(c2:SIG:Component) \
WHERE   c1 <> c2 \
WITH    c1, EfferentCoupling, sum(d.methodInvocations) AS AfferentCoupling \
WHERE   EfferentCoupling + AfferentCoupling > 0 \
WITH    c1, \
        toFloat(EfferentCoupling) / (EfferentCoupling + AfferentCoupling) AS Instability, EfferentCoupling, AfferentCoupling \
RETURN  c1.name AS Component, Instability, EfferentCoupling, AfferentCoupling

# Abstractness per component: share of the contained types that are labelled :Interface
# or carry an `abstract` property.
module_abstractness = %cypher \
MATCH    (c:SIG:Component)-[:CONTAINS]->(t:Biojava:Type:Java) \
WITH     c, \
         count(t) AS Total \
OPTIONAL MATCH (c)-[:CONTAINS]->(t:Biojava:Type:Java) \
WHERE    t:Interface OR exists(t.abstract) \
WITH     c, \
         toFloat(count(t)) / Total AS Abstractness \
RETURN   c.name AS Component, Abstractness \
ORDER BY Abstractness DESC 
    
# Calculation of the module distance
# Components missing from one of the two results get 0 for the missing values.
module_distance = pd.merge(module_instability.get_dataframe(), module_abstractness.get_dataframe(), how='outer', on = ['Component'])
module_distance = module_distance.fillna(0)

# One chart point (abstractness, instability) per component, labelled with its name.
# The rows are iterated directly instead of transposing the whole frame once per row.
module_distance_doc = [
    {'value': (row.Abstractness, row.Instability), 'label': row.Component}
    for row in module_distance.itertuples(index=False)
]

# Scatter chart with reference lines for the optimum and the two problem zones.
xy_module_chart = pg.XY(stroke=False, x_title='Abstractness', y_title='Instability')
xy_module_chart.title = 'Robert C. Martin Distance'
xy_module_chart.add('Abstractness to Instability', module_distance_doc)
xy_module_chart.add('Optimum', [(0, 1), (1, 0)], stroke=True)
xy_module_chart.add('Zone of Pain', [(0, 0.3), (0.3, 0)], stroke=True)
xy_module_chart.add('Zone of Uselesness', [(1, 0.7), (0.7, 1)], stroke=True)
display(HTML(base_html.format(rendered_chart=xy_module_chart.render(is_unicode=True))))

##### Stability Metrics on Module-level

In [None]:
# Calculation of the module instability
# Here a "module" is a Maven main artifact. Efferent coupling counts the type-level
# DEPENDS_ON relations into types of other artifacts, afferent coupling those coming
# from types of other artifacts.
# NOTE(review): both MATCH clauses are mandatory, so an artifact that lacks either
# outgoing or incoming cross-artifact dependencies is not returned — confirm that this
# is intended.
module_instability_artifact = %cypher \
MATCH    (m1:Maven:Main:Artifact) \
MATCH    (m1)-[:CONTAINS]->(t:Biojava:Type:Java)-[:DEPENDS_ON]->(d:Biojava:Type:Java)<-[:CONTAINS]-(m2:Maven:Main:Artifact) \
WHERE    m1 <> m2 \
WITH     m1,\
         count(d) AS EfferentCoupling \
MATCH    (m1)-[:CONTAINS]->(t:Biojava:Type:Java)<-[:DEPENDS_ON]-(d:Biojava:Type:Java)<-[:CONTAINS]-(m2:Maven:Main:Artifact) \
WHERE    m1 <> m2 \
WITH     m1, \
         EfferentCoupling, count(d) AS AfferentCoupling \
WHERE    EfferentCoupling + AfferentCoupling > 0 \
WITH     m1, \
         toFloat(EfferentCoupling) / (EfferentCoupling + AfferentCoupling) AS Instability \
RETURN   m1.name AS Name, Instability \
ORDER BY Instability DESC

# Abstractness per artifact: share of the contained types that are labelled :Interface
# or carry an `abstract` property.
module_abstractness_artifact = %cypher \
MATCH    (m:Maven:Main:Artifact)-[:CONTAINS]->(t:Biojava:Type:Java) \
WITH     m, \
         count(t) AS Total \
OPTIONAL MATCH (m)-[:CONTAINS]->(t:Biojava:Type:Java) \
WHERE    t:Interface OR exists(t.abstract) \
WITH     m, \
         toFloat(count(t)) / Total AS Abstractness \
RETURN   m.name AS Name, Abstractness \
ORDER BY Abstractness DESC 
    
# Calculation of the module distance
# Artifacts missing from one of the two results get 0 for the missing values.
module_distance_artifact = pd.merge(module_instability_artifact.get_dataframe(), module_abstractness_artifact.get_dataframe(), how='outer', on = ['Name'])
module_distance_artifact = module_distance_artifact.fillna(0)

# One chart point (abstractness, instability) per artifact, labelled with its name.
# The rows are iterated directly instead of transposing the whole frame once per row.
module_distance_doc = [
    {'value': (row.Abstractness, row.Instability), 'label': row.Name}
    for row in module_distance_artifact.itertuples(index=False)
]

# Scatter chart with reference lines for the optimum and the two problem zones.
xy_module_chart = pg.XY(stroke=False, x_title='Abstractness', y_title='Instability')
xy_module_chart.title = 'Robert C. Martin Distance'
xy_module_chart.add('Abstractness to Instability', module_distance_doc)
xy_module_chart.add('Optimum', [(0, 1), (1, 0)], stroke=True)
xy_module_chart.add('Zone of Pain', [(0, 0.3), (0.3, 0)], stroke=True)
xy_module_chart.add('Zone of Uselesness', [(1, 0.7), (0.7, 1)], stroke=True)
display(HTML(base_html.format(rendered_chart=xy_module_chart.render(is_unicode=True))))

## #9 Component Entanglement

### Goal
Couple Architecture Components Loosely

### Area
Components

### Recommendation
Achieve loose coupling between top-level components. The component entanglement should be below 0.14.

### Calculation
Component Entanglement is calculated by multiplying the communication density ([0..1]) with the communication violation ratio ([0..1]).

Communication Density is calculated by dividing the number of communication lines between components
by the number of possible communication lines.

Communication Violation Ratio is calculated by dividing the number of communication lines affected by cyclic dependencies, indirect cyclic dependencies, or transitive dependencies by the number of communication lines.

Cyclic dependencies occur when component A has a dependency on component B and component B has dependency on component A.

Indirect cyclic dependencies occur when components do not have direct cyclic dependencies, but indirectly
communicate so that every component is dependent on every other component.

Transitive dependencies occur when a component has both direct and indirect dependencies on another
component.

### Result

In [None]:
numberComponents = %cypher \
MATCH (c:SIG:Component) \
RETURN count(c)

numberComponents = numberComponents[0][0]
numberComponents

communicationLines = %cypher \
MATCH (c1:SIG:Component)-[d:DEPENDS_ON]->(c2:SIG:Component) \
WHERE c1 <> c2 \
RETURN count(d)

communicationLines = communicationLines[0][0]

communicationDensity = communicationLines / (numberComponents * (numberComponents -1))

cyclicDependencies = %cypher \
MATCH (c1:SIG:Component)-[d:DEPENDS_ON]->(c2:SIG:Component), \
      (c2)-[:DEPENDS_ON]->(c1) \
WHERE c1 <> c2 \
RETURN count(DISTINCT d)

cyclicDependencies = cyclicDependencies[0][0]

indirectCyclidDependencies = %cypher \
MATCH (c1:SIG:Component)-[d:DEPENDS_ON]->(c2:SIG:Component), \
      shortestPath((c2)-[:DEPENDS_ON]->(c1)) \
WHERE c1 <> c2 \
RETURN count(DISTINCT d)

indirectCyclidDependencies = indirectCyclidDependencies[0][0]

transitiveDependencies = %cypher \
MATCH (c1:SIG:Component)-[d:DEPENDS_ON]->(c2:SIG:Component), \
      (c1)-[:DEPENDS_ON*2..]->(c2) \
RETURN count(DISTINCT d)

transitiveDependencies = transitiveDependencies[0][0]

violationRatio = (cyclicDependencies + indirectCyclidDependencies + transitiveDependencies) / communicationLines

componentEntanglement = communicationDensity * violationRatio

In [None]:
# Report the intermediate values and the result of the entanglement calculation.
print("Communication Lines: {}".format(communicationLines))
print("Communication Density: {}".format(communicationDensity))
print("Cyclic Dependencies: {}".format(cyclicDependencies))
print("Indirect Cyclic Dependencies: {}".format(indirectCyclidDependencies))
print("Transitive Dependencies: {}".format(transitiveDependencies))
print("Component Entanglement: {}".format(componentEntanglement))

# Summary

## #1 Volume

* overall lines of code <= 324000

### Summary

In [None]:
# The volume criterion is "<= 324000", so a system of exactly 324000 LoC fulfils it.
# The query result is converted once and the value reused for check and output.
javaLoc = volume.get_dataframe()['Java_LoC'][0]
print("Fulfilled: " + str(javaLoc <= 324000) + " (" + str(javaLoc) + ")")

## #2 Duplication

* Write Code Once
* Measured via SonarQube
* <= 4.8%

## #3 Unit size

* Write Short Units of Code

### Summary

In [None]:
# Join the unit size statistics with the threshold table on the rating and apply
# `highlight` (from the local `functions` notebook) row-wise.
df = pd.merge(
    unitSizeThreshold,
    show_table_statistics(unitLengthDistribution),
    how='outer',
    on=['Rating'],
)
df.style.apply(highlight, df=df, thresholdDf=unitSizeThreshold, axis=1, column=["LinePercentage"])

## #4 Unit Complexity

* Write Simple Units of Code

### Summary

In [None]:
# Join the unit complexity statistics with the threshold table on the rating and apply
# `highlight` (from the local `functions` notebook) row-wise.
df = pd.merge(
    unitComplexityThreshold,
    show_table_statistics(unitComplexityDistribution),
    how='outer',
    on=['Rating'],
)
df.style.apply(highlight, df=df, thresholdDf=unitComplexityThreshold, axis=1, column=["LinePercentage"])

## #5 Unit interfacing

* Keep Unit Interfaces Small

### Summary

In [None]:
# Join the unit interfacing statistics with the threshold table on the rating and apply
# `highlight` (from the local `functions` notebook) row-wise.
df = pd.merge(
    unitInterfacingThreshold,
    show_table_statistics(unitInterfacingDistribution),
    how='outer',
    on=['Rating'],
)
df.style.apply(highlight, df=df, thresholdDf=unitInterfacingThreshold, axis=1, column=["LinePercentage"])

## #6 Module coupling

* Separate Concerns in Modules

SIG/TÜViT Evaluation Criteria Trusted Product Maintainability: Guidance for producers (Version 13.0)

4-star threshold:

In [None]:
# Join the module coupling statistics with the threshold table on the rating and apply
# `highlight` (from the local `functions` notebook) row-wise.
df = pd.merge(
    moduleCouplingThreshold,
    show_table_statistics(moduleCouplingDistribution),
    how='outer',
    on=['Rating'],
)
df.style.apply(highlight, df=df, thresholdDf=moduleCouplingThreshold, axis=1, column=["LinePercentage"])

## #7 Component balance

SIG/TÜViT Evaluation Criteria Trusted Product Maintainability: Guidance for producers (Version 13.0)

### Summary

In [None]:
# The recommendation is a Gini coefficient "<= 0.76", so exactly 0.76 fulfils it.
print("Fulfilled: " + str(giniCoefficient <= 0.76) + " (" + str(giniCoefficient) + ")")

## #8 Component independence

* Component Independence
  * Code residing in incoming cross-component dependencies <= 8.7%

In [None]:
# Statistics of hidden vs. interface code, checked against the 8.7 % threshold.
# The threshold column is only added when the statistics table does not provide one.
df = show_table_statistics_no_rating(interfaceAndHiddenCode)
if 'Threshold' not in df.columns:
    df.insert(0, 'Threshold', 8.7)

df.style.apply(highlight_no_rating, df=df, threshold=8.7, axis=1, column=["LinePercentage"])

## #9 Component entanglement

In [None]:
# The component entanglement has to stay below 0.14.
print("Fulfilled: {} ({})".format(componentEntanglement < 0.14, componentEntanglement))