# Case Study: Zeeguu/API
- Backend of a web application that supports [free reading in foreign languages](https://zeeguu.org)
- Open source [repository on GH](https://github.com/zeeguu/API/)

# Everything that we did in Data Gathering & Abstraction


In [None]:
# Installing Required Dependencies
import sys
sys.version
#!{sys.executable} -m pip install gitpython
#!{sys.executable} -m pip install pyvis

In [None]:
# In Colab our notebook runs in a temporary mounted file system
# Let's print the name of the folder where our script runs

import os
cwd = os.getcwd()
print(cwd)

In [None]:
# Let's declare a var for the path where we're going to download a repository
# Warning: this must end in /
CODE_ROOT_FOLDER=cwd+"/zeeguu/api/"
print(CODE_ROOT_FOLDER)

In [None]:
from git import Repo
# GitPython is a library that allows us to work easily with git from Python
# https://gitpython.readthedocs.io/en/stable/tutorial.html


# Clone the repository only once: if the checkout folder already exists we
# assume a previous run has downloaded it.
if os.path.exists(CODE_ROOT_FOLDER):
    print("Seems that the repo already exists :)")
else:
    Repo.clone_from("https://github.com/zeeguu/api", CODE_ROOT_FOLDER)

# Folder in which the generated figures are saved.
# exist_ok=True turns the call into a no-op when the folder is already there,
# so no separate existence check is needed.
OUTPUT_FOLDER = cwd + "/output/"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)


In [None]:
# helper function to get a file path w/o having to always provide the /zeeguu/api/ prefix
def file_path(file_name):
    """Return the full path of `file_name` inside the cloned repository.

    `file_name` is given relative to the repository root. It is appended to
    CODE_ROOT_FOLDER without any separator, which is why CODE_ROOT_FOLDER
    must end in "/".
    """
    return "".join((CODE_ROOT_FOLDER, file_name))

assert (file_path("zeeguu/core/model/user.py") == cwd+"/zeeguu/api/zeeguu/core/model/user.py")


In [None]:
# extracting a module name from a file name
def module_name_from_file_path(full_path):
    """Convert the path of a file inside the repository to a dotted module name.

    e.g. <CODE_ROOT_FOLDER>zeeguu/core/model/user.py -> zeeguu.core.model.user

    The CODE_ROOT_FOLDER prefix is cut off by length, so `full_path` is
    expected to start with it. `__init__.py` files get no special treatment
    and become "<package>.__init__".
    """
    relative_path = full_path[len(CODE_ROOT_FOLDER):]

    # Only a trailing ".py" is the extension. Replacing every ".py" occurrence
    # would also mangle names such as "foo.pyc" or a folder called "my.pypi".
    if relative_path.endswith(".py"):
        relative_path = relative_path[:-len(".py")]

    return relative_path.replace("/", ".")

assert 'zeeguu.core.model.user' == module_name_from_file_path(file_path('zeeguu/core/model/user.py'))

In [None]:
# naïve way of extracting imports using regular expressions
import re


# we assume that imports are always at the beginning of a line
# TODO for you: add full support for imports; this is not complete...
def import_from_line(line):
    """Return the module named by an import statement at the start of `line`.

    Handles `from X import ...` and `import X`; in both cases X is returned.
    For `import a, b` only the first module (`a`) is returned. Returns None
    when `line` is not a string or does not start with an import statement
    (indented imports are therefore not detected).
    """
    if not isinstance(line, str):
        return None

    # regex patterns used
    #   ^      - beginning of line
    #   (?: )  - group that is not captured; | separates the alternatives
    #   [ \t]+ - at least one space or tab
    #   \S     - anything that is not whitespace
    #   +      - at least one occurrence of previous
    #   ( )    - capture group (read more at: https://pynative.com/python-regex-capturing-groups/)
    match = re.search(r"^(?:from|import)[ \t]+(\S+)", line)
    if match is None:
        return None

    # "import os, sys" captures "os," - keep only the first module name
    return match.group(1).split(",")[0]


# extracts all the imported modules from a file
# returns a list of module names of the form zeeguu.core.model.bookmark, e.g.
def imports_from_file(file):
    """Return the modules imported by the Python source file at path `file`.

    Every line of the file is passed to import_from_line; the non-empty
    results are returned in file order (duplicates are kept).
    """
    # The context manager closes the file handle. The file is decoded as UTF-8
    # and undecodable bytes are ignored, like the other file reads in this
    # notebook, so one odd byte does not abort the whole analysis.
    with open(file, encoding="utf8", errors="ignore") as source:
        candidates = (import_from_line(line) for line in source)
        return [imp for imp in candidates if imp]

imports_from_file(file_path('zeeguu/core/model/user.py'))

In [None]:
# test
print(imports_from_file(file_path('zeeguu/core/model/bookmark.py')))
print(imports_from_file(file_path('zeeguu/core/model/unique_code.py')))

## Now we extract the dependencies between all the files

To do that we iterate over all the Python files with the help of the `Path.rglob` function from `pathlib`.

And we create a network with the help of the `networkx` package.

In [None]:
import pathlib
from pathlib import Path
import networkx as nx

def dependencies_graph(code_root_folder):
    """Build an undirected graph linking folders to the names their files import.

    Every *.py file found recursively under `code_root_folder` is represented
    by the name of the folder that directly contains it. For each import found
    in the file an edge is added between that folder name and the last dotted
    component of the imported module.

    Returns:
        The resulting nx.Graph.
    """
    G = nx.Graph()

    for file in Path(code_root_folder).rglob("*.py"):
        # not called file_path, so the file_path() helper is not shadowed
        source_path = str(file)
        path_parts = [part for part in source_path.split('/') if part != '']

        # The containing folder is the second-to-last component. A path that
        # consists of the file name alone (relative root folder) has a single
        # component, so that one is used instead of indexing out of range.
        if len(path_parts) > 1:
            module_name = path_parts[-2]
        else:
            module_name = path_parts[0]

        if module_name not in G.nodes:
            G.add_node(module_name)
            print("new node = ", module_name)

        for each in imports_from_file(source_path):
            target = each.split('.')[-1]
            print("new edge = ", target)
            G.add_edge(module_name, target)

    return G

## Matplotlib also has support for drawing networks

We do a simple drawing of all the files and their dependencies in our system

In [None]:
import matplotlib.pyplot as plt

# a function to draw a graph
def draw_graph(G, size, **args):
    """Draw graph `G` in a new matplotlib figure and show it.

    `size` is passed to plt.figure as `figsize`; all remaining keyword
    arguments are forwarded unchanged to nx.draw.
    """
    plt.figure(figsize=size)
    nx.draw(G, **args)
    plt.show()

In [None]:
G = dependencies_graph(CODE_ROOT_FOLDER)

In [None]:
draw_graph(G, (12,10), with_labels=False, pos=nx.spring_layout(G))

## Abstraction

In [None]:
# Let's define some relevant modules
def relevant_module(module_name):
    """Tell whether `module_name` belongs to the part of the system we study.

    A module is relevant when its name starts with "zeeguu" and does not
    contain "test" anywhere in it.
    """
    is_test_code = "test" in module_name
    return module_name.startswith("zeeguu") and not is_test_code

In [None]:
# However, if we think a bit more about it, we realize that a dependency graph
# is a directed graph (e.g. module A depends on module B, but not necessarily
# the other way around). networkx supports both kinds of graph: directed
# (nx.DiGraph) and non-directed (nx.Graph).

def dependencies_digraph(code_root_folder, depth=-1):
    """Build a weighted, directed import graph of the relevant modules.

    Args:
        code_root_folder: folder that is searched recursively for *.py files.
            Module names are derived with module_name_from_file_path.
        depth: 1-based position of the dotted-name component that is used as
            node label (for zeeguu.core.model.user: 1 -> "zeeguu",
            2 -> "core"). Names with fewer components use their last
            component; the default -1 (and 0) always select the last one.

    Returns:
        An nx.DiGraph in which the 'weight' of an edge counts the import
        statements that lead from the source node to the target node. Only
        modules accepted by relevant_module are included and imports that
        stay within one node are skipped.
    """
    # convert the 1-based depth to a list index; -1 already means "last"
    index = depth if depth == -1 else depth - 1

    def node_name(module_name):
        # component at `index`, falling back to the last one for short names
        parts = module_name.split('.')
        return parts[index] if len(parts) > index else parts[-1]

    G = nx.DiGraph()

    for file in Path(code_root_folder).rglob("*.py"):
        # not called file_path, so the file_path() helper is not shadowed
        source_path = str(file)

        source_module = module_name_from_file_path(source_path)
        if not relevant_module(source_module):
            continue

        src = node_name(source_module)
        # test the label that is actually stored in the graph (not the full
        # module name), so that every node is reported exactly once
        if src not in G.nodes:
            G.add_node(src)
            print("New node ", src)

        for target_module in imports_from_file(source_path):
            if not relevant_module(target_module):
                continue

            dst = node_name(target_module)
            if src == dst:
                continue

            if G.has_edge(src, dst):
                w = G[src][dst]['weight'] # Credit to Claude.AI
                G.add_edge(src, dst, weight=w+1)
                print("w = ", w)
            else:
                G.add_edge(src, dst, weight=1)
            print("New edge ", (src, dst))

    return G


In [None]:
# Looking at the directed graph
DG = dependencies_digraph(CODE_ROOT_FOLDER, 2)

In [None]:
draw_graph(DG, (40,40), with_labels=True, pos=nx.spring_layout(DG))

## Basic Abstraction Using Hierarchical Module Structure & Naming Conventions

- abstracting the imports between the modules along the module hierarchy
- also taking into account naming conventions to filter out external modules

In [None]:
# extracts the parent of depth X
def top_level_package(module_name, depth=1):
    """Return the first `depth` dotted components of `module_name`.

    e.g. ("zeeguu.core.model.util", 2) -> "zeeguu.core"
    """
    leading_components = module_name.split(".")[:depth]
    return ".".join(leading_components)

assert (top_level_package("zeeguu.core.model.util") == "zeeguu")
assert (top_level_package("zeeguu.core.model.util", 2) == "zeeguu.core")

In [None]:
def abstracted_to_top_level(G : nx.DiGraph, depth=1):
    """Collapse `G` onto the packages found at level `depth` of its node names.

    Both endpoints of every edge are replaced by the last component of their
    top_level_package at that depth. Edges that end up inside one package are
    dropped. The 'weight' of an abstracted edge is the number of edges of `G`
    that were mapped onto it; weights stored in `G` are not read.
    """
    aG = nx.DiGraph()

    for source, target in G.edges():
        src = top_level_package(source, depth).split('.')[-1]
        dst = top_level_package(target, depth).split('.')[-1]

        if src == dst:
            continue

        if aG.has_edge(src, dst):
            aG[src][dst]['weight'] += 1 # Credit to Claude.AI
        else:
            aG.add_edge(src, dst, weight=1)

    return aG

In [None]:
ADG = abstracted_to_top_level(DG, 2)

In [None]:
plt.figure(figsize=(10,10))
nx.draw(ADG, with_labels=True, pos=nx.shell_layout(ADG))
plt.show()

In [None]:
# Inspiration = https://stackoverflow.com/a/70245742
pos = nx.shell_layout(ADG)
fig = plt.figure(figsize=(10,10))
nx.draw_networkx_nodes(ADG, pos)
nx.draw_networkx_labels(ADG, pos)
fig.savefig("output/ADG_edgeless.png")

In [None]:
# extracts the parent of depth X
def verbose_top_level_package(module_name : str, depth=1):
    """Return only the last of the first `depth` dotted components.

    e.g. ("zeeguu.core.model.util", 2) -> "core". When the selection is empty
    (depth=0) an empty string is returned.
    """
    selected = module_name.split(".")[:depth]
    return selected[-1] if selected else ""

def bottom_level_package(module_name : str):
    """Return the last dotted component of `module_name` (e.g. "a.b.c" -> "c")."""
    _, _, last_component = module_name.rpartition(".")
    return last_component

def verbose_abstracted_to_top_level(G : nx.DiGraph, depth=1):
    """Collapse `G` onto the packages at level `depth`, summing edge weights.

    Both endpoints of every edge are mapped with verbose_top_level_package.
    Edges that end up inside one package are dropped; the 'weight' of an
    abstracted edge is the sum of the 'weight' attributes of all edges of `G`
    that were mapped onto it. Every edge of `G` must carry a 'weight'.
    """
    aG = nx.DiGraph()

    for each_src, each_dst in G.edges():
        src = verbose_top_level_package(each_src, depth)
        dst = verbose_top_level_package(each_dst, depth)

        # read the weight before the same-package check, so an edge without
        # a weight is reported even when it would be dropped
        each_w = G[each_src][each_dst]['weight'] # Credit to Claude.AI

        if src == dst:
            continue

        if aG.has_edge(src, dst):
            aG[src][dst]['weight'] += each_w # Credit to Claude.AI
        else:
            aG.add_edge(src, dst, weight=each_w)

    return aG

ADG = verbose_abstracted_to_top_level(DG, 2)

In [None]:
# Walk the repository and collect size statistics of the source files,
# aggregated per folder name.
file_types = {}        # file extension -> number of files in the whole repository
dir_file_count = {}    # folder name -> number of measured source files
dir_char_count = {}    # folder name -> total number of characters
dir_word_count = {}    # folder name -> total number of whitespace-separated words
dir_space_count = {}   # folder name -> total number of ' ' characters
dir_line_count = {}    # folder name -> total number of '\n' characters

MEASURED_FILE_TYPES = ('.py', '.sh', '.sql')
SKIPPED_FOLDERS = ('data', 'test_data')


def _add_file_stats(sub_dir, content):
    """Add the statistics of one file's `content` to the totals of `sub_dir`."""
    dir_file_count[sub_dir] = dir_file_count.get(sub_dir, 0) + 1
    dir_char_count[sub_dir] = dir_char_count.get(sub_dir, 0) + len(content)
    dir_word_count[sub_dir] = dir_word_count.get(sub_dir, 0) + len(content.split())
    dir_space_count[sub_dir] = dir_space_count.get(sub_dir, 0) + content.count(' ')
    dir_line_count[sub_dir] = dir_line_count.get(sub_dir, 0) + content.count('\n')


for root, dirs, files in os.walk(CODE_ROOT_FOLDER):
    for file in files:
        filepath = root + '/' + file

        # path components relative to the repository root, file name included
        sub_dirs = [part for part in filepath[len(CODE_ROOT_FOLDER):].split('/') if part != '']

        # every file of the repository is counted per extension
        file_type = '.' + file.split('.')[-1]
        file_types[file_type] = file_types.get(file_type, 0) + 1

        # Only source files inside the zeeguu package are measured; anything
        # on a hidden path (component starting with '.') or in a data folder
        # is skipped.
        if file_type not in MEASURED_FILE_TYPES:
            continue
        if sub_dirs[0] != 'zeeguu':
            continue
        if any(part.startswith('.') for part in sub_dirs):
            continue
        if any(skipped in sub_dirs for skipped in SKIPPED_FOLDERS):
            continue

        # zeeguu/<file> counts for "zeeguu" itself; a deeper file counts for
        # every folder below zeeguu that lies on its path.
        if len(sub_dirs) > 2:
            counted_dirs = sub_dirs[1:-1]
        else:
            counted_dirs = [sub_dirs[0]]

        # read the file once and reuse its content for all folders
        with open(filepath, "r", encoding="utf8", errors='ignore') as f:
            content = f.read()

        for sub_dir in counted_dirs:
            _add_file_stats(sub_dir, content)

print(dir_file_count)
print(dir_char_count)
print(dir_word_count)
print(dir_space_count)
print(dir_line_count)

In [None]:
import pandas as pd
import numpy as np



# Inspiration = https://stackoverflow.com/a/70245742
def plot_adigraph(_ADG : nx.DiGraph, _divider=10, plot_name="ADG", pos=None, top_level=True):
    """Draw an abstracted dependency graph and save it as output/<plot_name>.png.

    The size of a node is 100 times the number of files recorded for it in
    dir_file_count; its color encodes the average number of characters per
    file (dir_char_count / dir_file_count). Nodes without statistics get NaN
    for both. Edge widths grow with the edge weight and every edge is labelled
    with its weight. Edges that exist in both directions are drawn curved so
    that they do not overlap.

    Args:
        _ADG: graph to draw; every edge must carry a 'weight' attribute.
        _divider: an edge is drawn with width weight / _divider, clamped to
            the range [0.25, 2].
        plot_name: file name (without extension) of the saved PNG.
        pos: node positions; defaults to nx.shell_layout(_ADG).
        top_level: when False, all nodes that also occur in the global ADG
            graph are removed from `_ADG` before drawing. This modifies the
            graph passed in by the caller.
    """
    ### Init ###
    if pos is None:
        pos = nx.shell_layout(_ADG)
        #pos = nx.fruchterman_reingold_layout(_ADG)

    fig, ax = plt.subplots(figsize=(15, 10))

    if not top_level:
        # Removing a node also removes all of its edges, so the edges must not
        # be removed separately afterwards: remove_edge raises NetworkXError
        # for an edge that is no longer in the graph.
        shared_nodes = [node for node in ADG.nodes() if node in _ADG.nodes()]
        _ADG.remove_nodes_from(shared_nodes)

    # Edges: an edge is curved when the opposite edge exists as well
    curved_edges = [(u, v) for u, v in _ADG.edges() if _ADG.has_edge(v, u)]
    curved_edge_set = set(curved_edges)
    straight_edges = [edge for edge in _ADG.edges() if edge not in curved_edge_set]
    arc_rad = 0.1

    # Edge weights
    edge_weights = nx.get_edge_attributes(_ADG, 'weight')

    def edge_width(edge):
        # line width proportional to the weight, clamped to [0.25, 2]
        return max(0.25, min(2, edge_weights[edge]/_divider))

    straight_edge_widths = [edge_width(edge) for edge in straight_edges]
    curved_edge_widths = [edge_width(edge) for edge in curved_edges]

    curved_edge_labels = {edge: edge_weights[edge] for edge in curved_edges}
    straight_edge_labels = {edge: edge_weights[edge] for edge in straight_edges}

    ### DRAW ###
    # Nodes
    sizes = []
    for node in _ADG.nodes():
        if node in dir_file_count:
            sizes.append(dir_file_count[node]*100)
        else:
            sizes.append(np.nan)
    print(sizes)
    print(dir_file_count)

    avg_char_count = []
    for node in _ADG.nodes():
        if node in dir_file_count and dir_file_count[node] != 0:
            avg_char_count.append((dir_char_count[node]/dir_file_count[node]))
        else:
            avg_char_count.append(np.nan)
    print(avg_char_count)

    df = (pd.DataFrame(avg_char_count))
    df['color'] = df.transform(lambda c: c)
    print(df)

    # upper end of the color scale: the largest average rounded to hundreds
    vmax = round(df['color'].max(), -2)
    vmin = 0
    cmap = plt.cm.coolwarm

    nx.draw_networkx_nodes(_ADG, pos, ax=ax, node_size=sizes, node_shape='o', alpha=0.7, linewidths=1,
                           node_color=df['color'], cmap=cmap, vmin=vmin, vmax=vmax)

    # Create colorbar
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=vmin, vmax=vmax))
    sm.set_array([])
    cbar = plt.colorbar(sm, ax=ax)
    cbar.set_label('Average Character Count per File')

    nx.draw_networkx_labels(_ADG, pos,
                            verticalalignment='center_baseline', horizontalalignment='center',
                            alpha=1, ax=ax
                            )

    # Edges
    nx.draw_networkx_edges(_ADG, pos, ax=ax, edgelist=straight_edges, width=straight_edge_widths)
    nx.draw_networkx_edges(_ADG, pos, ax=ax, edgelist=curved_edges, connectionstyle=f'arc3, rad = {arc_rad}', width=curved_edge_widths)

    # Edge labels
    nx.draw_networkx_edge_labels(_ADG, pos, edge_labels=curved_edge_labels, ax=ax,
                                rotate=False, connectionstyle=f'arc3, rad = {arc_rad}',
                                label_pos=0.4)
    nx.draw_networkx_edge_labels(_ADG, pos, edge_labels=straight_edge_labels, ax=ax,
                                rotate=False, label_pos=0.4)
    # Save the figure
    fig.savefig(f"output/{plot_name}.png", bbox_inches='tight', pad_inches=0)

In [None]:
plot_adigraph(ADG)

In [None]:
SUB_FOLDER=cwd+"/zeeguu/api/zeeguu/core"
newDG = dependencies_digraph(SUB_FOLDER)
newADG = verbose_abstracted_to_top_level(newDG, 3)
plot_adigraph(newADG, plot_name="newADG", pos=nx.shell_layout(newADG), top_level=False)

# Evolution Analysis

In [None]:
#!{sys.executable} -m pip install pydriller==2.6

In [None]:
from pydriller import Repository
REPO_DIR = 'https://github.com/zeeguu/api'

In [None]:
# for PyDriller to work we need to change directory to our local clone of the repo
%cd zeeguu/api

In [None]:
all_commits = list(Repository(REPO_DIR).traverse_commits())

def print_out_commit_details(commits):
  """Print every commit in `commits` followed by one entry per modified file.

  Each entry shows the name of the commit author, the change type and the
  file name, and on two further lines the old and the new path of the file.
  """
  for commit in commits:
      print(commit)
      for each in commit.modified_files:
          print(f"{commit.author.name} {each.change_type} {each.filename}\n -{each.old_path}\n -{each.new_path}")

#print_out_commit_details(all_commits[0:1])

In [None]:
from collections import defaultdict

# number of commits that touched each path (keyed by the path after the change)
commit_counts = defaultdict(int)

for commit in all_commits:
    for each in commit.modified_files:
        # A defaultdict creates missing keys on the fly, so the increment
        # cannot raise a KeyError - not even for a new_path of None. A
        # catch-all except around it could only hide real errors.
        commit_counts[each.new_path] += 1

# sort by number of commits in decreasing order
sorted(commit_counts.items(), key=lambda x: x[1], reverse=True)[:42]

# discussion: What is ("None", 103) ?

In [None]:
from pydriller import ModificationType

# Number of commits per file. The count of a file follows it across renames
# and is dropped when the file is deleted.
commit_counts = {}

for commit in all_commits:
    for modification in commit.modified_files:

        new_path = modification.new_path
        old_path = modification.old_path

        try:

            if modification.change_type == ModificationType.RENAME:
                # Move the history to the new name. pop() with a default keeps
                # a rename of a file that is not tracked yet from being
                # reported as an error (same leniency as the DELETE branch).
                commit_counts[new_path] = commit_counts.pop(old_path, 0) + 1

            elif modification.change_type == ModificationType.DELETE:
                commit_counts.pop(old_path, '')

            elif modification.change_type == ModificationType.ADD:
                commit_counts[new_path] = 1

            else: # modification to existing file
                commit_counts[old_path] += 1
        except Exception:
            # best effort: report the modification and carry on, e.g. when a
            # file is modified that has never been seen as added
            print("something went wrong with: " + str(modification))

sorted(commit_counts.items(), key=lambda x:x[1], reverse=True)

In [None]:
%cd ../..

In [None]:


def module_name_from_rel_path(full_path):
    """Convert a repository-relative file path to a dotted module name.

    e.g. zeeguu/core/model/user.py -> zeeguu.core.model.user

    `__init__.py` files get no special treatment and become
    "<package>.__init__".
    """
    # Only a trailing ".py" is the extension. Replacing every ".py" occurrence
    # would also mangle names such as "foo.pyx" or a folder called "my.pypi".
    if full_path.endswith(".py"):
        full_path = full_path[:-len(".py")]

    return full_path.replace("/", ".")


assert ("tools.migrations.teacher_dashboard_migration_1.upgrade" == module_name_from_rel_path("tools/migrations/teacher_dashboard_migration_1/upgrade.py"))

#assert ("zeeguu.api") == module_name_from_rel_path("zeeguu/api/__init__.py")

In [None]:
# churn (number of commits) summed up per second-level package, e.g. zeeguu.core
package_activity = defaultdict(int)

for path, count in commit_counts.items():
    # Python sources only: test the extension, because a substring test for
    # ".py" would also accept names such as "x.pyc" or ".pylintrc"
    if str(path).endswith(".py"):
        l2_module = top_level_package(module_name_from_rel_path(path), 2)
        package_activity[l2_module] += count

sorted_sizes = sorted(package_activity.items(), key=lambda x: x[1], reverse=True)

print(sorted_sizes)



In [None]:

# package_activity is keyed by two-level dotted names such as "zeeguu.core",
# while the nodes of ADG are bare package names such as "core". Look a node up
# under its own name first and then below the "zeeguu" package; .get() is used
# so that missing nodes do not get inserted into the defaultdict.
sizes = [
    package_activity.get(n, package_activity.get(f"zeeguu.{n}", 0))
    for n in ADG.nodes()
]

print(sizes)


In [None]:
plt.figure(figsize=(20,20))
# The layout has to be computed for the graph that is drawn: positions taken
# from G (the file-level graph) do not describe the nodes of ADG.
nx.draw_networkx(ADG, with_labels=True, node_size = sizes, node_color='r', pos=nx.kamada_kawai_layout(ADG))
plt.show()

# For Home: Extract Multiple Evolution Hotspots from Zeeguu

- Extract multiple complementary module views from your case study system
- Ensure that your layouts are readable - limit the number of nodes in a view, use a different layout in networkx, or use a different library than networkx
- Augment each of the previously obtained module views by mapping the above-computed churn metric on the color of a given node