In [1]:
import os
from os.path import join as pjoin
import pandas as pd
import zipfile
import shutil
from tqdm import tqdm
from shutil import copytree
import numpy as np
import re

In [2]:
def unzip_bundles(zip_file_path,extraction_directory):
    """Extract a zip archive into ``extraction_directory`` unless that path already exists.

    Args:
        zip_file_path: Path of the zip archive to extract.
        extraction_directory: Destination directory. It is created (together with
            any missing parent directories) only when it does not exist yet.

    Returns:
        None. When ``extraction_directory`` already exists the call is a no-op and
        the archive is not even opened.

    Raises:
        FileNotFoundError: ``zip_file_path`` does not exist.
        zipfile.BadZipFile: ``zip_file_path`` is not a valid zip archive.
    """
    if os.path.exists(extraction_directory):
        return

    # Open the archive *before* creating the destination. If the zip is missing
    # or invalid, no empty directory is left behind, so a later call (which
    # skips whenever the directory exists) still gets the chance to extract.
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        os.makedirs(extraction_directory)
        zip_ref.extractall(extraction_directory)

In [3]:
def getTestMethods(file_path):
    """Count the lines of a text file that contain ``@test`` (case-insensitive).

    The check is a plain substring match on each lower-cased line, so:
    a line is counted once no matter how often ``@test`` occurs on it, and any
    text containing ``@test`` counts (for example ``@TestFactory`` or a comment).

    Args:
        file_path: Path of the file to scan; it is read as UTF-8.

    Returns:
        int: Number of matching lines.

    Raises:
        OSError: The file cannot be opened.
        UnicodeDecodeError: The file is not valid UTF-8.
    """
    # The context manager closes the handle deterministically; this function is
    # called once per file across many bundles, so leaked handles add up.
    with open(file_path,encoding="utf-8") as java_code:
        return sum(1 for line in java_code if "@test" in line.lower())

In [4]:
def getJavaClassMethodsRE(file_path):
    """Estimate the number of type declarations and methods in a Java source file.

    Both figures are obtained by matching regular expressions against the raw
    file text (no parsing), so they are approximations of the real counts.

    Args:
        file_path: Path of the file to scan; it is read as UTF-8.

    Returns:
        tuple: ``(class_count, method_count)`` where ``class_count`` is the number
        of ``class`` / ``interface`` / ``enum`` headers matched and
        ``method_count`` is the number of method-like headers matched.
    """
    # Header of a method-like declaration: optional modifiers, a return type,
    # a name, a parameter list, an optional throws clause and the opening brace.
    method_regex = r'\s*(public|private|protected|static|\s)*[\w<>,\[\]]+\s+(\w+)\s*\([^)]*\)\s*(throws\s+\w+(\s*,\s*\w+)*)?\s*\{'
    # Header of a class / interface / enum declaration up to its opening brace.
    type_regex = r'\s*(public|private|protected|abstract|final|\s)*(\bclass\b|\binterface\b|\benum\b)\s+(\w+)(\s+extends\s+\w+)?(\s+implements\s+\w+(,\s+\w+)*)?\s*\{'

    with open(file_path, 'r', encoding="utf-8") as handle:
        source_text = handle.read()

    # Count non-overlapping matches without materialising the group tuples.
    method_total = sum(1 for _ in re.finditer(method_regex, source_text))
    type_total = sum(1 for _ in re.finditer(type_regex, source_text))

    return type_total, method_total

In [5]:
def sorted_bundles_testMethods(main_dir,outputCSV):
    """Rank the bundle directories under ``main_dir`` by their ``@test`` line count.

    Every non-hidden sub-directory of ``main_dir`` is treated as one bundle.
    Each entry directly inside a bundle is passed to ``getTestMethods`` and the
    per-entry counts are summed for the bundle.

    Args:
        main_dir: Directory that contains one sub-directory per bundle.
        outputCSV: Path of the CSV file the ranked table is written to.

    Returns:
        pandas.DataFrame with columns ``bundle``, ``#java files`` (number of
        entries listed in the bundle directory) and ``#methods``, sorted by
        ``#methods`` in descending order.
    """
    # Keep only visible directories; names starting with "." are skipped.
    bundle_names = []
    for entry in os.listdir(main_dir):
        if entry.startswith(".") or not os.path.isdir(pjoin(main_dir, entry)):
            continue
        bundle_names.append(entry)

    summary = pd.DataFrame([], columns=["bundle","#java files","#methods"])
    for bundle_name in tqdm(bundle_names):
        bundle_path = pjoin(main_dir, bundle_name)
        entries = os.listdir(bundle_path)

        test_total = 0
        for entry in entries:
            test_total += getTestMethods(pjoin(bundle_path, entry))

        summary.loc[len(summary)] = [bundle_name, len(entries), test_total]

    # Rank, persist and hand back the table.
    ranked = summary.sort_values(by='#methods', ascending=[False])
    ranked.to_csv(outputCSV)
    return ranked

In [6]:
def sorted_bundles_method_class(main_dir,outputCSV):
    """Rank the bundle directories under ``main_dir`` by method and class counts.

    Every non-hidden sub-directory of ``main_dir`` is treated as one bundle.
    Each entry directly inside a bundle is passed to ``getJavaClassMethodsRE``
    and the per-entry class and method counts are summed for the bundle.

    Args:
        main_dir: Directory that contains one sub-directory per bundle.
        outputCSV: Path of the CSV file the ranked table is written to.

    Returns:
        pandas.DataFrame with columns ``bundle``, ``#java files`` (number of
        entries listed in the bundle directory), ``#classes`` and ``#methods``,
        sorted by ``#methods`` and then ``#classes``, both descending.
    """
    # Keep only visible directories; names starting with "." are skipped.
    bundle_names = []
    for entry in os.listdir(main_dir):
        if entry.startswith(".") or not os.path.isdir(pjoin(main_dir, entry)):
            continue
        bundle_names.append(entry)

    summary = pd.DataFrame([], columns=["bundle","#java files","#classes","#methods"])
    for bundle_name in tqdm(bundle_names):
        bundle_path = pjoin(main_dir, bundle_name)
        entries = os.listdir(bundle_path)

        class_total = 0
        method_total = 0
        for entry in entries:
            n_classes, n_methods = getJavaClassMethodsRE(pjoin(bundle_path, entry))
            class_total += n_classes
            method_total += n_methods

        summary.loc[len(summary)] = [bundle_name, len(entries), class_total, method_total]

    # Rank, persist and hand back the table.
    ranked = summary.sort_values(by=['#methods',"#classes"], ascending=[False,False])
    ranked.to_csv(outputCSV)
    return ranked

In [None]:
# NOTE(review): unexecuted scratch call. "thing" is listed as a bundle name in the
# intersection output further down, and bundles are directories, while
# getJavaClassMethodsRE open()s its argument as a file — so this presumably raises
# if run as-is. Confirm, then point it at a single .java file or drop the cell.
getJavaClassMethodsRE(pjoin("bundles_java","thing"))

In [7]:
# Archive to unpack and the directory it is extracted into; `directory` is reused
# by the ranking cells below.
zip_file = "bundles_java.zip"
directory = "bundles_java"
# unzip_bundles returns without extracting when `directory` already exists, so
# re-running this cell does not unpack the archive again.
unzip_bundles(zip_file,directory)

In [8]:
# Rank bundles by regex-counted methods, then classes (both descending); the table
# is also written to sorted_bundles_method_class.csv.
bundles_sorted_method_class_df=sorted_bundles_method_class(directory,"sorted_bundles_method_class.csv")

100%|██████████| 181/181 [00:17<00:00, 10.19it/s]


In [9]:
# Rank bundles by the number of lines containing "@test" (descending); the table
# is also written to sorted_bundles_testMethods.csv.
bundles_sorted_testMethods_df=sorted_bundles_testMethods(directory,"sorted_bundles_testMethods.csv")

100%|██████████| 181/181 [00:00<00:00, 194.54it/s]


In [10]:
# Hand-written list of 25 bundle names; it is intersected with the two top-25
# rankings in the next cell.
daniel_bundles = [
    "mielecloud", "rfxcom", "lcn", "loxone", "modbus",
    "livisismarthome", "boschshc", "mqtt", "hue", "dynamodb",
    "enigma2", "nest", "bluetooth", "onewire", "dmx",
    "tplinksmarthome", "hueemulation", "astro", "deutschebahn", "avmfritz",
    "systeminfo", "yamahareceiver", "digitalstrom", "upnpcontrol", "wemo",
]

In [11]:
# First 25 bundle names of each ranking (the frames are already sorted descending).
top25_method_class = list(bundles_sorted_method_class_df["bundle"])[:25]
top25_testMethods = list(bundles_sorted_testMethods_df["bundle"])[:25]

# Pairwise overlaps; np.intersect1d returns the sorted, de-duplicated common names.
daniel_vs_method_class = list(np.intersect1d(daniel_bundles, top25_method_class))
daniel_vs_testMethods = list(np.intersect1d(daniel_bundles, top25_testMethods))
method_class_vs_testMethods = list(np.intersect1d(top25_method_class, top25_testMethods))

print("Intersections")
# One line per comparison: label, overlap size, a tab, then the shared names.
for label, shared in (
    ("Between daniel and method_class: ", daniel_vs_method_class),
    ("Between daniel and testMethods: ", daniel_vs_testMethods),
    ("Between method_class and testMethods: ", method_class_vs_testMethods),
):
    print(label, len(shared), end="\t")
    print(shared)

Intersections
Between daniel and method_class:  13	['bluetooth', 'boschshc', 'dynamodb', 'hue', 'hueemulation', 'lcn', 'livisismarthome', 'loxone', 'mielecloud', 'modbus', 'mqtt', 'nest', 'rfxcom']
Between daniel and testMethods:  11	['bluetooth', 'boschshc', 'enigma2', 'hue', 'lcn', 'livisismarthome', 'loxone', 'mielecloud', 'modbus', 'mqtt', 'rfxcom']
Between method_class and testMethods:  20	['analysis', 'automation', 'bluetooth', 'boschshc', 'config', 'hue', 'internal', 'io', 'items', 'lcn', 'library', 'livisismarthome', 'loxone', 'mielecloud', 'modbus', 'model', 'mqtt', 'protocol', 'rfxcom', 'thing']


In [None]:
# Bundle names kept under the name `prev_list`.
# NOTE(review): written as a list assignment with one name per element; the name
# suggests an earlier selection of bundles — confirm the intended content and use.
prev_list = [
    "systeminfo",
    "webthing",
    "jdbc",
    "nest",
    "feed",
    "ephemeris",
    "lcn",
    "digitalstrom",
    "omnikinverter",
    "mqtt",
    "sensorcommunity",
    "icloud",
    "velux",
    "dbquery",
    "pushover",
    "io",
]