In [1]:
import os
import pandas as pd
import xml.etree.ElementTree as ET


def parse_xml_file(xml_path):
    """Count how often each problem class occurs in one XML report.

    Every ``<problem>`` element that is a direct child of the document root
    contributes one occurrence, keyed by the ``id`` attribute of its
    ``<problem_class>`` child.

    Entries that cannot be classified -- a ``<problem>`` without a
    ``<problem_class>`` child, or a ``<problem_class>`` without an ``id``
    attribute -- are skipped instead of aborting the whole file.

    Args:
        xml_path: Source handed to ``ET.parse`` (typically a file path).

    Returns:
        dict mapping each problem-class id (str) to its occurrence count (int).
        An empty dict when the root has no usable ``<problem>`` children.

    Raises:
        xml.etree.ElementTree.ParseError: If the content is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    root = ET.parse(xml_path).getroot()

    problem_data = {}
    for problem in root.findall("problem"):
        problem_class = problem.find("problem_class")
        if problem_class is None:
            # find() returns None for a missing child; calling .get() on it
            # would raise AttributeError and lose the rest of the file.
            continue

        problem_id = problem_class.get("id")
        if problem_id is None:
            # A None key cannot be sorted together with string column names
            # when the counts are later turned into a table.
            continue

        problem_data[problem_id] = problem_data.get(problem_id, 0) + 1

    return problem_data


def process_assignment(assignment_path):
    """Aggregate problem counts per student for one assignment directory.

    Sub-directories of ``assignment_path`` whose name starts with ``"Student"``
    are treated as one student each; the student id is the integer that follows
    the first space in the directory name (e.g. ``"Student 12"`` -> ``12``).
    Every ``*.xml`` file directly inside a student directory is parsed with
    ``parse_xml_file`` and its counts are summed per problem name.

    Args:
        assignment_path: Path of the assignment directory.

    Returns:
        A 3-tuple ``(assignment_name, student_data, problem_names)`` where
        ``assignment_name`` is the last component of ``assignment_path``,
        ``student_data`` maps student id (int) to a dict of
        ``{problem_name: count}``, and ``problem_names`` is the set of all
        problem names seen in this assignment.

    Raises:
        IndexError: If a student directory name contains no space.
        ValueError: If the token after the first space is not an integer.
        OSError: If ``assignment_path`` cannot be listed.
    """
    # normpath drops a trailing separator, for which basename would return "".
    assignment_name = os.path.basename(os.path.normpath(assignment_path))

    problem_names = set()
    student_data = {}
    for student_folder in os.listdir(assignment_path):
        student_dir = os.path.join(assignment_path, student_folder)
        # A plain file such as "Students.csv" also matches the prefix, but it
        # cannot be listed and carries no student id, so only keep directories.
        if not student_folder.startswith("Student") or not os.path.isdir(student_dir):
            continue

        student_id = int(student_folder.split(" ")[1])
        problem_counts = {}
        for file in os.listdir(student_dir):
            if not file.endswith(".xml"):
                continue
            problem_data = parse_xml_file(os.path.join(student_dir, file))
            for problem_name, count in problem_data.items():
                problem_counts[problem_name] = problem_counts.get(problem_name, 0) + count
                problem_names.add(problem_name)
        student_data[student_id] = problem_counts

    return assignment_name, student_data, problem_names


def create_table(assignment_paths):
    """Build a student-by-problem count table across several assignments.

    Each path is processed with ``process_assignment`` and the per-student
    counts are summed over all assignments.

    Args:
        assignment_paths: Iterable of assignment directory paths.

    Returns:
        pandas.DataFrame with one row per student id (index named
        ``"student"``, sorted ascending) and one integer column per problem
        name (columns sorted). A problem a student never triggered is 0.
        An empty frame is returned for an empty ``assignment_paths``.
    """
    table_data = {}
    for assignment_path in assignment_paths:
        # Only the per-student counts are needed; the columns of the frame are
        # derived from the keys of those counts.
        _, student_data, _ = process_assignment(assignment_path)
        for student_id, problem_counts in student_data.items():
            totals = table_data.setdefault(student_id, {})
            for problem_name, count in problem_counts.items():
                totals[problem_name] = totals.get(problem_name, 0) + count

    table_df = pd.DataFrame.from_dict(table_data, orient="index")
    table_df.index.name = "student"
    # Problems missing for a student come out as NaN; they mean "zero hits".
    table_df = table_df.fillna(0).astype(int)
    table_df = table_df.reindex(sorted(table_df.columns), axis=1)

    # Row order would otherwise follow directory-listing order, which is not
    # guaranteed to be stable between runs or machines.
    return table_df.sort_index()


if __name__ == "__main__":
    # Every directory in the working directory whose name starts with
    # "Homework" is one assignment. Plain files with a matching name (for
    # example an archive) are skipped, because process_assignment lists the
    # path as a directory and would fail on them.
    assignment_paths = [
        entry
        for entry in os.listdir(".")
        if entry.startswith("Homework") and os.path.isdir(entry)
    ]
    table = create_table(assignment_paths)
    print(table)

            AccessStaticViaInstance  AnonymousClassComplexity  \
student                                                         
2        2                        0                         0   
4        1                        0                         2   
6        1                        0                         0   
8        1                        0                         0   
9        1                        0                         0   
10       1                        0                         1   
11       2                        0                         1   
12       1                        0                         1   
13       3                        0                         0   
14       4                        0                         2   
18       2                        0                         3   
19       1                        0                         1   
20       1                        0                         0   
22       1               

In [2]:
# Show the aggregated per-student table as the cell's output.
table

Unnamed: 0_level_0,Unnamed: 1_level_0,AccessStaticViaInstance,AnonymousClassComplexity,AnonymousClassMethodCount,BooleanMethodIsAlwaysInverted,CStyleArrayDeclaration,CanBeFinal,CatchMayIgnoreException,ChainedMethodCall,ClassCanBeRecord,...,UnnecessaryContinue,UnnecessaryLocalVariable,UnnecessaryModifier,UnnecessaryReturn,UnnecessarySemicolon,UnusedAssignment,UnusedReturnValue,UtilityClass,WrongPackageStatement,unused
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,2,0,0,0,0,0,16,0,0,0,...,0,0,0,0,0,1,0,1,11,71
4,1,0,2,0,0,0,14,0,4,0,...,0,0,0,0,0,1,1,0,10,37
6,1,0,0,0,0,0,12,0,3,1,...,0,0,0,0,0,1,1,1,8,15
8,1,0,0,0,0,0,3,0,3,2,...,0,0,0,0,0,0,0,0,4,15
9,1,0,0,0,0,0,8,0,2,1,...,0,1,0,0,0,0,0,1,7,37
10,1,0,1,0,0,0,10,0,3,0,...,0,0,0,0,1,2,0,1,8,24
11,2,0,1,1,0,0,4,0,4,1,...,0,0,0,0,1,0,0,1,8,17
12,1,0,1,0,0,0,20,0,4,0,...,0,2,9,0,0,0,0,1,9,8
13,3,0,0,0,0,0,12,0,2,0,...,0,0,0,0,0,0,0,2,12,2
14,4,0,2,0,0,0,31,17,0,0,...,0,1,0,0,0,2,0,2,11,28


In [6]:
# Number of highest-total columns to keep; used for both the selection and
# the printed label so the two cannot drift apart.
TOP_N = 10

# Compute the sum of each column (total occurrences over all students)
sums = table.sum()

# Labels of the TOP_N columns with the largest totals, largest first
max_columns = sums.nlargest(TOP_N).index.tolist()

# Print the results
print('Sum of each column:')
print(sums)
print(f'Top {TOP_N} columns with the highest sum:', max_columns)

Sum of each column:
                                  33
AccessStaticViaInstance            3
AnonymousClassComplexity          15
AnonymousClassMethodCount          5
BooleanMethodIsAlwaysInverted      2
                                ... 
UnusedAssignment                  17
UnusedReturnValue                  6
UtilityClass                      20
WrongPackageStatement            187
unused                           542
Length: 79, dtype: int64
Top 10 columns with the highest sum: ['MissingJavadoc', 'unused', 'SingleClassImport', 'CanBeFinal', 'WrongPackageStatement', 'MultipleReturnPointsPerMethod', 'MethodWithMultipleLoops', 'FieldMayBeFinal', 'ChainedMethodCall', 'FeatureEnvy']


In [5]:
# Restrict the table to the columns listed in max_columns, then display it.
codeSmells = table.loc[:, max_columns]
codeSmells

Unnamed: 0_level_0,MissingJavadoc,unused,SingleClassImport,CanBeFinal,WrongPackageStatement,MultipleReturnPointsPerMethod,MethodWithMultipleLoops,FieldMayBeFinal,ChainedMethodCall,FeatureEnvy
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2,102,71,14,16,11,7,9,10,0,3
4,34,37,40,14,10,5,7,12,4,3
6,51,15,12,12,8,8,9,10,3,9
8,32,15,2,3,4,5,3,0,3,1
9,38,37,4,8,7,5,5,2,2,0
10,23,24,36,10,8,5,3,4,3,0
11,66,17,4,4,8,9,3,0,4,6
12,18,8,7,20,9,8,4,1,4,5
13,33,2,7,12,12,9,7,6,2,7
14,47,28,18,31,11,10,6,0,0,1


In [15]:
# Write the selected columns to hw4.csv in the current working directory.
# NOTE(review): index=False leaves the DataFrame index out of the file. The
# table's index is named "student" by create_table, so the CSV will carry no
# student identifier -- confirm this is intended.
codeSmells.to_csv('hw4.csv', index=False)