In [1]:
import importlib.util
import time
import os
import subprocess
import tracemalloc
import pandas as pd
from radon.complexity import cc_visit
from radon.metrics import mi_visit

class FunctionTester:
    def __init__(self, test_cases):
        self.test_cases = test_cases
        self.results_df = pd.DataFrame(columns=[
            "File", "Function", "Accuracy", "Time_s", "Memory_KB", "Complexity", "Maintainability", "Pylint_Score"
        ])
    
    def load_function(self, file_path):
        """Load the main function from a Python file (auto-detect function name)."""
        spec = importlib.util.spec_from_file_location("module", file_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        # Find candidate functions
        candidates = [attr for attr in dir(module) if callable(getattr(module, attr)) and not attr.startswith("_")]
        for func_name in candidates:
            if any(keyword in func_name.lower() for keyword in ['task','func','main','solution']):
                return getattr(module, func_name), func_name
        if candidates:
            return getattr(module, candidates[0]), candidates[0]
        raise AttributeError(f"No callable function found in {file_path}")
    
    def run_test(self, func, test_cases, repeat: int = 3):
        """Run all test cases, measure accuracy, peak memory, and time."""
        correct = 0
        peak_memory_kb = 0
        start_total = time.perf_counter()
        
        for idx, test_case in enumerate(test_cases):
            try:
                args, expected = test_case if isinstance(test_case, tuple) else (test_case, test_case[1])
                if not isinstance(args, tuple):
                    args = (args,)
                
                # Repeat function execution for smoother timing
                mem_peak_case = 0
                result = None
                for _ in range(repeat):
                    tracemalloc.start()
                    start = time.perf_counter()
                    result = func(*args)
                    end = time.perf_counter()
                    current, peak = tracemalloc.get_traced_memory()
                    tracemalloc.stop()
                    mem_peak_case = max(mem_peak_case, peak / 1024.0)
                
                peak_memory_kb = max(peak_memory_kb, mem_peak_case)
                if self.compare_results(result, expected):
                    correct += 1
                else:
                    print(f"‚ùå Input {args} ‚Üí Expected {expected}, Got {result}")
                
            except Exception as e:
                print(f"‚ùå Error with input {args}: {e}")
        
        end_total = time.perf_counter()
        execution_time = end_total - start_total
        accuracy = correct / len(test_cases)
        return accuracy, execution_time, peak_memory_kb
    
    def compare_results(self, result, expected):
        if result == expected:
            return True
        if isinstance(result, (list,set)) and isinstance(expected, (list,set)):
            return sorted(result) == sorted(expected)
        if isinstance(result, float) and isinstance(expected, float):
            return abs(result-expected)<1e-9
        return False
    
    def analyze_code_quality(self, file_path):
        """Complexity, maintainability (radon), pylint."""
        try:
            with open(file_path,'r',encoding='utf-8') as f:
                code = f.read()
            complexity = sum(block.complexity for block in cc_visit(code))
            maintainability = mi_visit(code, True)
            pylint_score = self.get_pylint_score(file_path)
            return complexity, maintainability, pylint_score
        except Exception as e:
            print(f"‚ö†Ô∏è Code analysis failed for {file_path}: {e}")
            return 0,0,0
    
    def get_pylint_score(self, file_path):
        try:
            result = subprocess.run(
                ["pylint", file_path, "--score=y", "--disable=R,C,W"],
                capture_output=True, text=True, timeout=30
            )
            for line in result.stdout.splitlines():
                if "rated at" in line:
                    return float(line.split("rated at")[1].split("/")[0].strip())
        except Exception:
            pass
        return 0.0
    
    def record_results(self, file_path, function_name, accuracy, time_s, memory_kb, complexity, maintainability, pylint_score):
        self.results_df = pd.concat([self.results_df, pd.DataFrame([{
            "File": os.path.basename(file_path),
            "Function": function_name,
            "Accuracy": accuracy,
            "Time_s": time_s,
            "Memory_KB": memory_kb,
            "Complexity": complexity,
            "Maintainability": maintainability,
            "Pylint_Score": pylint_score
        }])], ignore_index=True)
    
    def test_file(self, file_path):
        if not os.path.exists(file_path):
            print(f"‚ùå {file_path} not found")
            return
        func, func_name = self.load_function(file_path)
        test_cases = self.test_cases.get(os.path.basename(file_path).replace(".py",""), [])
        accuracy, time_s, memory_kb = self.run_test(func, test_cases)
        complexity, maintainability, pylint_score = self.analyze_code_quality(file_path)
        self.record_results(file_path, func_name, accuracy, time_s, memory_kb, complexity, maintainability, pylint_score)
        print(f"‚úÖ Tested {file_path} ‚Üí Accuracy {accuracy:.1%}, Time {time_s:.4f}s, Memory {memory_kb:.2f}KB")
    
    def run_all(self, participant_dir="."):
        for filename in self.test_cases.keys():
            file_path = os.path.join(participant_dir, f"{filename}.py")
            self.test_file(file_path)
        print("\nüìä SUMMARY RESULTS")
        print(self.results_df)
        return self.results_df

# Example main
def main():
    TEST_CASES = {
        "task0": [
            ([1,2,3,4,5], 2),
            ([], 0),
            ([2,4,6,8], 4),
            ([1,3,5], 0),
        ],
        "task0ai": [
            ([1,2,3,4,5],[1,3,5]),
            ([], []),
            ([2,4,6,8], []),
            ([1,3,5],[1,3,5]),
        ],
    }
    tester = FunctionTester(TEST_CASES)
    results_df = tester.run_all()
    results_df.to_csv("task0_results.csv", index=False)

if __name__ == "__main__":
    main()


  self.results_df = pd.concat([self.results_df, pd.DataFrame([{


‚úÖ Tested .\task0.py ‚Üí Accuracy 100.0%, Time 0.0001s, Memory 0.07KB
‚úÖ Tested .\task0ai.py ‚Üí Accuracy 100.0%, Time 0.0000s, Memory 0.08KB

üìä SUMMARY RESULTS
         File Function  Accuracy    Time_s  Memory_KB Complexity  \
0    task0.py    task0       1.0  0.000054   0.070312          3   
1  task0ai.py  task0ai       1.0  0.000045   0.078125          3   

   Maintainability  Pylint_Score  
0        72.431958          10.0  
1        74.285292          10.0  


In [2]:
import importlib.util
import time
import os
import subprocess
import tracemalloc
import pandas as pd
from typing import List, Dict, Any, Tuple
from radon.complexity import cc_visit
from radon.metrics import mi_visit

class Task1Tester:
    def __init__(self, test_cases):
        self.test_cases = test_cases
        self.results_df = pd.DataFrame(columns=[
            "File", "Function", "Accuracy", "Time_s", "Memory_KB", "Complexity", "Maintainability", "Pylint_Score"
        ])
    
    def load_function(self, file_path):
        """Load the main function from a Python file (auto-detect function name)."""
        spec = importlib.util.spec_from_file_location("module", file_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        # Find candidate functions
        candidates = [attr for attr in dir(module) if callable(getattr(module, attr)) and not attr.startswith("_")]
        for func_name in candidates:
            if any(keyword in func_name.lower() for keyword in ['task','func','main','solution']):
                return getattr(module, func_name), func_name
        if candidates:
            return getattr(module, candidates[0]), candidates[0]
        raise AttributeError(f"No callable function found in {file_path}")
    
    def run_test(self, func, test_cases, repeat: int = 3):
        """Run all test cases, measure accuracy, peak memory, and time."""
        correct = 0
        peak_memory_kb = 0
        start_total = time.perf_counter()
        
        for idx, test_case in enumerate(test_cases):
            try:
                (input_args, valid_answers) = test_case
                nums, target = input_args
                
                # Repeat function execution for smoother timing
                mem_peak_case = 0
                result = None
                for _ in range(repeat):
                    tracemalloc.start()
                    start = time.perf_counter()
                    result = func(nums, target)
                    end = time.perf_counter()
                    current, peak = tracemalloc.get_traced_memory()
                    tracemalloc.stop()
                    mem_peak_case = max(mem_peak_case, peak / 1024.0)
                
                peak_memory_kb = max(peak_memory_kb, mem_peak_case)
                
                # Handle case where function returns None instead of empty list
                if result is None and valid_answers == [[]]:
                    result = []
                
                if self.is_valid_answer(result, nums, target, valid_answers):
                    correct += 1
                else:
                    print(f"‚ùå Input nums={nums}, target={target} ‚Üí Expected one of {valid_answers}, Got {result}")
                
            except Exception as e:
                print(f"‚ùå Error with input nums={nums}, target={target}: {e}")
        
        end_total = time.perf_counter()
        execution_time = end_total - start_total
        accuracy = correct / len(test_cases)
        return accuracy, execution_time, peak_memory_kb
    
    def is_valid_answer(self, result, nums, target, valid_answers):
        """Check if the result is any of the valid answers."""
        if result == [] and valid_answers == [[]]:
            return True
        
        if not isinstance(result, list) or len(result) != 2:
            return False
        
        i, j = result
        
        # Check indices are valid
        if not (0 <= i < len(nums) and 0 <= j < len(nums) and i != j):
            return False
        
        # Check if this matches any valid answer (order doesn't matter for indices)
        for valid in valid_answers:
            if valid == []:
                continue
            if set(result) == set(valid):
                return True
        
        return False
    
    def analyze_code_quality(self, file_path):
        """Complexity, maintainability (radon), pylint."""
        try:
            with open(file_path,'r',encoding='utf-8') as f:
                code = f.read()
            complexity = sum(block.complexity for block in cc_visit(code))
            maintainability = mi_visit(code, True)
            pylint_score = self.get_pylint_score(file_path)
            return complexity, maintainability, pylint_score
        except Exception as e:
            print(f"‚ö†Ô∏è Code analysis failed for {file_path}: {e}")
            return 0,0,0
    
    def get_pylint_score(self, file_path):
        try:
            result = subprocess.run(
                ["pylint", file_path, "--score=y", "--disable=R,C,W"],
                capture_output=True, text=True, timeout=30
            )
            for line in result.stdout.splitlines():
                if "rated at" in line:
                    return float(line.split("rated at")[1].split("/")[0].strip())
        except Exception:
            pass
        return 0.0
    
    def record_results(self, file_path, function_name, accuracy, time_s, memory_kb, complexity, maintainability, pylint_score):
        self.results_df = pd.concat([self.results_df, pd.DataFrame([{
            "File": os.path.basename(file_path),
            "Function": function_name,
            "Accuracy": accuracy,
            "Time_s": time_s,
            "Memory_KB": memory_kb,
            "Complexity": complexity,
            "Maintainability": maintainability,
            "Pylint_Score": pylint_score
        }])], ignore_index=True)
    
    def test_file(self, file_path):
        if not os.path.exists(file_path):
            print(f"‚ùå {file_path} not found")
            return
        func, func_name = self.load_function(file_path)
        test_cases = self.test_cases.get(os.path.basename(file_path).replace(".py",""), [])
        accuracy, time_s, memory_kb = self.run_test(func, test_cases)
        complexity, maintainability, pylint_score = self.analyze_code_quality(file_path)
        self.record_results(file_path, func_name, accuracy, time_s, memory_kb, complexity, maintainability, pylint_score)
        print(f"‚úÖ Tested {file_path} ‚Üí Accuracy {accuracy:.1%}, Time {time_s:.4f}s, Memory {memory_kb:.2f}KB")
    
    def run_all(self, participant_dir="."):
        for filename in self.test_cases.keys():
            file_path = os.path.join(participant_dir, f"{filename}.py")
            self.test_file(file_path)
        print("\nüìä SUMMARY RESULTS")
        print(self.results_df)
        return self.results_df

# Example main
def main():
    TEST_CASES = {
        "task1": [
            # (input, [all possible valid answers])
            (([2, 7, 11, 15], 9), [[0, 1],[1, 0]]),  # Only 2+7=9
            (([3, 2, 4], 6), [[1, 2]]),  # Only 2+4=6  
            (([3, 3], 6), [[0, 1]]),  # Only 3+3=6
            (([1, 2, 3, 4], 5), [[0, 3], [1, 2]]),  # 1+4=5 OR 2+3=5
            (([1, 2, 3, 4], 10), [[]]),  # No solution
            (([], 5), [[]]),  # Empty list
        ],
        "task1ai": [
            (([5, 3], 2), [[0, 1], [1, 0]]),  # 5-3=2 OR 3-5=-2? Wait, let's check the logic
            (([10, 8, 2], 2), [[0, 1], [1, 0]]),  # 10-8=2 OR 8-10=-2?
            (([7, 1, 5, 3], 4), [[0, 3], [2, 1]]),  # 7-3=4 OR 5-1=4
            (([15, 10, 5], 5), [[1, 2], [0, 1]]),  # 10-5=5 OR 15-10=5
            (([8, 4], 4), [[0, 1]]),  # 8-4=4
            (([1, 2, 3], 5), [[]]),  # No pairs with difference of 5
            (([], 5), [[]]),  # Empty list
        ],
    }
    tester = Task1Tester(TEST_CASES)
    results_df = tester.run_all()
    results_df.to_csv("task1_results.csv", index=False)

if __name__ == "__main__":
    main()

  self.results_df = pd.concat([self.results_df, pd.DataFrame([{


‚úÖ Tested .\task1.py ‚Üí Accuracy 100.0%, Time 0.0001s, Memory 0.11KB
‚úÖ Tested .\task1ai.py ‚Üí Accuracy 100.0%, Time 0.0001s, Memory 0.11KB

üìä SUMMARY RESULTS
         File Function  Accuracy    Time_s  Memory_KB Complexity  \
0    task1.py    task1       1.0  0.000106   0.109375          4   
1  task1ai.py  task1ai       1.0  0.000108   0.109375          5   

   Maintainability  Pylint_Score  
0        72.464997          10.0  
1        70.913972          10.0  


In [3]:
import importlib.util
import time
import os
import subprocess
import tracemalloc
import pandas as pd
from radon.complexity import cc_visit
from radon.metrics import mi_visit

class Task2Tester:
    def __init__(self, test_cases):
        self.test_cases = test_cases
        self.results_df = pd.DataFrame(columns=[
            "File", "Function", "Accuracy", "Time_s", "Memory_KB", "Complexity", "Maintainability", "Pylint_Score"
        ])
    
    def load_function(self, file_path):
        """Load the main function from a Python file (auto-detect function name)."""
        spec = importlib.util.spec_from_file_location("module", file_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        # Find candidate functions
        candidates = [attr for attr in dir(module) if callable(getattr(module, attr)) and not attr.startswith("_")]
        for func_name in candidates:
            if any(keyword in func_name.lower() for keyword in ['task','func','main','solution']):
                return getattr(module, func_name), func_name
        if candidates:
            return getattr(module, candidates[0]), candidates[0]
        raise AttributeError(f"No callable function found in {file_path}")
    
    def run_test(self, func, test_cases, repeat: int = 3):
        """Run all test cases, measure accuracy, peak memory, and time."""
        correct = 0
        peak_memory_kb = 0
        start_total = time.perf_counter()
        
        for idx, test_case in enumerate(test_cases):
            try:
                input_arr, expected = test_case
                
                # Repeat function execution for smoother timing
                mem_peak_case = 0
                result = None
                for _ in range(repeat):
                    arr_copy = input_arr.copy()  # Avoid modifying original test data
                    tracemalloc.start()
                    start = time.perf_counter()
                    result = func(arr_copy)
                    end = time.perf_counter()
                    current, peak = tracemalloc.get_traced_memory()
                    tracemalloc.stop()
                    mem_peak_case = max(mem_peak_case, peak / 1024.0)
                
                peak_memory_kb = max(peak_memory_kb, mem_peak_case)
                
                if self.compare_results(result, expected):
                    correct += 1
                else:
                    print(f"‚ùå Input {input_arr} ‚Üí Expected {expected}, Got {result}")
                
            except Exception as e:
                print(f"‚ùå Error with input {input_arr}: {e}")
        
        end_total = time.perf_counter()
        execution_time = end_total - start_total
        accuracy = correct / len(test_cases)
        return accuracy, execution_time, peak_memory_kb
    
    def compare_results(self, result, expected):
        """Compare sorting results - exact list comparison."""
        if result == expected:
            return True
        # For sorting, we need exact match including order
        return False
    
    def analyze_code_quality(self, file_path):
        """Complexity, maintainability (radon), pylint."""
        try:
            with open(file_path,'r',encoding='utf-8') as f:
                code = f.read()
            complexity = sum(block.complexity for block in cc_visit(code))
            maintainability = mi_visit(code, True)
            pylint_score = self.get_pylint_score(file_path)
            return complexity, maintainability, pylint_score
        except Exception as e:
            print(f"‚ö†Ô∏è Code analysis failed for {file_path}: {e}")
            return 0,0,0
    
    def get_pylint_score(self, file_path):
        try:
            result = subprocess.run(
                ["pylint", file_path, "--score=y", "--disable=R,C,W"],
                capture_output=True, text=True, timeout=30
            )
            for line in result.stdout.splitlines():
                if "rated at" in line:
                    return float(line.split("rated at")[1].split("/")[0].strip())
        except Exception:
            pass
        return 0.0
    
    def record_results(self, file_path, function_name, accuracy, time_s, memory_kb, complexity, maintainability, pylint_score):
        self.results_df = pd.concat([self.results_df, pd.DataFrame([{
            "File": os.path.basename(file_path),
            "Function": function_name,
            "Accuracy": accuracy,
            "Time_s": time_s,
            "Memory_KB": memory_kb,
            "Complexity": complexity,
            "Maintainability": maintainability,
            "Pylint_Score": pylint_score
        }])], ignore_index=True)
    
    def test_file(self, file_path):
        if not os.path.exists(file_path):
            print(f"‚ùå {file_path} not found")
            return
        func, func_name = self.load_function(file_path)
        test_cases = self.test_cases.get(os.path.basename(file_path).replace(".py",""), [])
        accuracy, time_s, memory_kb = self.run_test(func, test_cases)
        complexity, maintainability, pylint_score = self.analyze_code_quality(file_path)
        self.record_results(file_path, func_name, accuracy, time_s, memory_kb, complexity, maintainability, pylint_score)
        print(f"‚úÖ Tested {file_path} ‚Üí Accuracy {accuracy:.1%}, Time {time_s:.4f}s, Memory {memory_kb:.2f}KB")
    
    def run_all(self, participant_dir="."):
        for filename in self.test_cases.keys():
            file_path = os.path.join(participant_dir, f"{filename}.py")
            self.test_file(file_path)
        print("\nüìä SUMMARY RESULTS")
        print(self.results_df)
        return self.results_df

# Example main
def main():
    TEST_CASES = {
        "task2": [
            ([10, 7, 8, 9, 1, 5], [1, 5, 7, 8, 9, 10]),
            ([64, 34, 25, 12, 22, 11, 90], [11, 12, 22, 25, 34, 64, 90]),
            ([5, 2, 8, 1, 9], [1, 2, 5, 8, 9]),
            ([1], [1]),
            ([], []),
            ([3, 3, 3, 3], [3, 3, 3, 3]),
            ([2, 1], [1, 2]),
            ([5, 4, 3, 2, 1], [1, 2, 3, 4, 5]),
        ],
        "task2ai": [
            ([10, 7, 8, 9, 1, 5], [10, 9, 8, 7, 5, 1]),
            ([64, 34, 25, 12, 22, 11, 90], [90, 64, 34, 25, 22, 12, 11]),
            ([5, 2, 8, 1, 9], [9, 8, 5, 2, 1]),
            ([1], [1]),
            ([], []),
            ([3, 3, 3, 3], [3, 3, 3, 3]),
            ([2, 1], [2, 1]),
            ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
        ],
    }
    tester = Task2Tester(TEST_CASES)
    results_df = tester.run_all()
    results_df.to_csv("task2_results.csv", index=False)

if __name__ == "__main__":
    main()

  self.results_df = pd.concat([self.results_df, pd.DataFrame([{


‚úÖ Tested .\task2.py ‚Üí Accuracy 100.0%, Time 0.0002s, Memory 0.16KB
‚úÖ Tested .\task2ai.py ‚Üí Accuracy 100.0%, Time 0.0002s, Memory 0.16KB

üìä SUMMARY RESULTS
         File Function  Accuracy    Time_s  Memory_KB Complexity  \
0    task2.py    task2       1.0  0.000246   0.164062          4   
1  task2ai.py  task2ai       1.0  0.000229   0.164062          4   

   Maintainability  Pylint_Score  
0          67.3218          10.0  
1          67.3218          10.0  


In [4]:
import importlib.util
import time
import os
import pandas as pd
import subprocess
import tracemalloc
from typing import List, Any, Tuple

# Added for better complexity/maintainability
from radon.complexity import cc_visit
from radon.metrics import mi_visit

class Task3Tester:
    def __init__(self):
        self.csv_path = "Salary_Dataset.csv"
        if not os.path.exists(self.csv_path):
            raise FileNotFoundError(f"Dataset file {self.csv_path} not found")
        
        self.task3_expected = self.calculate_task3_expected()
        self.task3ai_expected = self.calculate_task3ai_expected()
        # Prepare DataFrame to store results
        self.results_df = pd.DataFrame(columns=[
            "Task", "Accuracy", "Time_s", "Memory_KB", "Complexity", 
            "Maintainability", "Pylint_Score"
        ])
    
    def calculate_task3_expected(self):
        df = pd.read_csv(self.csv_path)
        android_avg = round(df[df['Job Title'] == 'Android Developer']['Salary'].mean(), 2)
        top_companies = df.groupby('Company Name')['Salary'].mean().sort_values(ascending=False).head(5)
        return android_avg, top_companies
    
    def calculate_task3ai_expected(self):
        df = pd.read_csv(self.csv_path)
        company_name = df.loc[df['Salary'].idxmax()]['Company Name']
        top_cities = df[df['Employment Status'].str.contains('Full Time', na=False)]['Location'].value_counts().head(10)
        return company_name, top_cities
    
    def load_function(self, file_path: str, function_name: str) -> Any:
        spec = importlib.util.spec_from_file_location("module", file_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        return getattr(module, function_name)
    
    def measure_peak_memory(self, func, *args, **kwargs) -> Tuple[float, Any]:
        """Measure peak memory usage across the function call in KB."""
        tracemalloc.start()
        start_time = time.time()
        result = func(*args, **kwargs)
        current, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()
        execution_time = time.time() - start_time
        peak_kb = peak / 1024
        return peak_kb, execution_time, result
    
    def compare_values(self, result, expected) -> bool:
        try:
            return abs(result - expected) < 0.01 if isinstance(result, (int, float)) else result == expected
        except:
            return False
    
    def compare_lists_to_series(self, result_list, expected_series) -> bool:
        try:
            expected_list = expected_series.index.tolist()
            if set(result_list) != set(expected_list) or len(result_list) != len(expected_list):
                return False
            # Allow ordering differences for ties
            result_salaries = [expected_series[company] for company in result_list]
            expected_salaries = expected_series.values.tolist()
            return all(result_salaries[i] >= result_salaries[i+1] for i in range(len(result_salaries)-1))
        except:
            return False
    
    def run_tests(self, func, expected_results, task_name: str) -> Tuple[float, float, float]:
        correct = 0
        total_tests = 2
        memory_kb, exec_time, result = self.measure_peak_memory(func, self.csv_path)
        
        try:
            if task_name == "task3":
                android_avg, top_companies = result
                expected_avg, expected_companies = expected_results
                if self.compare_values(android_avg, expected_avg):
                    correct += 1
                if (isinstance(top_companies, list) and self.compare_lists_to_series(top_companies, expected_companies)) or \
                   (isinstance(top_companies, pd.Series) and top_companies.equals(expected_companies)):
                    correct += 1
            
            elif task_name == "task3ai":
                company_name, top_cities = result
                expected_company, expected_cities = expected_results
                if isinstance(company_name, pd.Series):
                    company_name = company_name.iloc[0] if len(company_name) > 0 else ""
                if company_name == expected_company:
                    correct += 1
                if (isinstance(top_cities, list) and self.compare_lists_to_series(top_cities, expected_cities)) or \
                   (isinstance(top_cities, pd.Series) and top_cities.equals(expected_cities)):
                    correct += 1
        except Exception:
            correct = 0
        
        accuracy = correct / total_tests
        return accuracy, exec_time, memory_kb
    
    def analyze_code_quality(self, file_path: str) -> Tuple[float, float, float]:
        try:
            with open(file_path, 'r') as f:
                code = f.read()
            # Cyclomatic complexity
            functions_cc = cc_visit(code)
            complexity = max([func.complexity for func in functions_cc], default=0)
            # Maintainability index
            maintainability = mi_visit(code, True)
            # Pylint score
            pylint_score = self.get_pylint_score(file_path)
            return complexity, maintainability, pylint_score
        except Exception:
            return 0, 0, 0
    
    def get_pylint_score(self, file_path: str) -> float:
        try:
            result = subprocess.run(['pylint', file_path, '--score=y', '--disable=R,C,W'],
                                    capture_output=True, text=True, timeout=30)
            for line in result.stdout.splitlines():
                if 'rated at' in line:
                    return float(line.split('rated at')[1].split('/')[0].strip())
        except:
            return 0.0
        return 0.0
    
    def test_task(self, file_path: str, func_name: str, expected_results, task_name: str):
        if not os.path.exists(file_path):
            print(f"‚ùå {file_path} not found")
            return
        func = self.load_function(file_path, func_name)
        accuracy, exec_time, memory_kb = self.run_tests(func, expected_results, task_name)
        complexity, maintainability, pylint_score = self.analyze_code_quality(file_path)
        
        # Store results
        self.results_df = pd.concat([self.results_df, pd.DataFrame([{
            "Task": task_name,
            "Accuracy": accuracy,
            "Time_s": exec_time,
            "Memory_KB": memory_kb,
            "Complexity": complexity,
            "Maintainability": maintainability,
            "Pylint_Score": pylint_score
        }])], ignore_index=True)
    
    def run_all_tests(self):
        self.test_task("task3.py", "task3", self.task3_expected, "task3")
        self.test_task("task3ai.py", "task3ai", self.task3ai_expected, "task3ai")
        print("\n‚úÖ All results stored in DataFrame 'results_df' for export.")
        return self.results_df

def main():
    tester = Task3Tester()
    results_df = tester.run_all_tests()
    print(results_df)
    # Example export
    results_df.to_csv("task3_results.csv", index=False)

if __name__ == "__main__":
    main()


  self.results_df = pd.concat([self.results_df, pd.DataFrame([{



‚úÖ All results stored in DataFrame 'results_df' for export.
      Task  Accuracy    Time_s    Memory_KB Complexity  Maintainability  \
0    task3       1.0  0.044962  5343.778320          1         75.42418   
1  task3ai       1.0  0.040859  5343.432617          1         75.42418   

   Pylint_Score  
0          10.0  
1          10.0  


In [5]:
import importlib.util
import time
import os
import tracemalloc
import subprocess
import pandas as pd
from typing import List, Tuple
from radon.complexity import cc_visit
from radon.metrics import mi_visit

class Task4Tester:
    def __init__(self):
        # Test cases for task4 (Roman to Integer)
        self.task4_test_cases = [
            ("III", 3), ("IV", 4), ("IX", 9), ("LVIII", 58), ("MCMXC", 1990),
            ("MMXXIV", 2024), ("XIV", 14), ("XCIX", 99), ("CDXLIV", 444),
            ("MCMXCIX", 1999), ("I", 1), ("V", 5), ("X", 10), ("L", 50),
            ("C", 100), ("D", 500), ("M", 1000),
        ]
        # Test cases for task4ai (Integer to Roman)
        self.task4ai_test_cases = [
            (3, "III"), (4, "IV"), (9, "IX"), (58, "LVIII"), (1990, "MCMXC"),
            (2024, "MMXXIV"), (14, "XIV"), (99, "XCIX"), (444, "CDXLIV"),
            (1999, "MCMXCIX"), (1, "I"), (5, "V"), (10, "X"), (50, "L"),
            (100, "C"), (500, "D"), (1000, "M"), (3999, "MMMCMXCIX")
        ]
        # Results DataFrame
        self.results_df = pd.DataFrame(columns=[
            "Task", "Accuracy", "Time_s", "Memory_KB", "Complexity", "Maintainability", "Pylint_Score"
        ])
    
    def load_function(self, file_path: str, function_name: str):
        """Load a specific function from a Python file."""
        spec = importlib.util.spec_from_file_location("module", file_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        return getattr(module, function_name)
    
    def measure_memory_and_time(self, func, *args, repeat: int = 1) -> Tuple[float, float, any]:
        """Measure execution time and peak memory (KB) across repeat runs."""
        peak_memory_kb = 0
        start_total = time.time()
        result = None
        for i in range(repeat):
            tracemalloc.start()
            start = time.time()
            result = func(*args)
            end = time.time()
            current, peak = tracemalloc.get_traced_memory()
            tracemalloc.stop()
            peak_memory_kb = max(peak_memory_kb, peak / 1024)
        end_total = time.time()
        total_time = end_total - start_total
        return total_time, peak_memory_kb, result
    
    def run_tests(self, func, test_cases: List[Tuple], repeat: int = 1) -> Tuple[float, float, float]:
        """Run all test cases and return accuracy, time, and memory usage."""
        correct = 0
        memory_kb = 0
        start_time = time.time()
        for idx, (inp, expected) in enumerate(test_cases):
            try:
                # Measure memory only for the first case
                if idx == 0:
                    exec_time, mem_kb, result = self.measure_memory_and_time(func, inp, repeat=repeat)
                    memory_kb = mem_kb
                else:
                    result = func(inp)
                if result == expected:
                    correct += 1
                else:
                    print(f"‚ùå Input: {inp}, Expected: {expected}, Got: {result}")
            except Exception as e:
                print(f"‚ùå Error with input {inp}: {e}")
        accuracy = correct / len(test_cases)
        end_time = time.time()
        total_time = end_time - start_time
        return accuracy, total_time, memory_kb
    
    def analyze_code_quality(self, file_path: str) -> Tuple[float, float, float]:
        """Analyze code complexity, maintainability, and pylint score."""
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                code = f.read()
            # Cyclomatic complexity using radon
            complexity = sum([block.complexity for block in cc_visit(code)])
            # Maintainability Index
            maintainability = mi_visit(code, True)
            # Pylint score
            pylint_score = self.get_pylint_score(file_path)
            return complexity, maintainability, pylint_score
        except Exception as e:
            print(f"‚ö†Ô∏è Code analysis failed: {e}")
            return 0, 0, 0
    
    def get_pylint_score(self, file_path: str) -> float:
        try:
            result = subprocess.run(
                ["pylint", file_path, "--score=y", "--disable=R,C,W"],
                capture_output=True, text=True, timeout=30
            )
            for line in result.stdout.splitlines():
                if "rated at" in line:
                    score = float(line.split("rated at")[1].split("/")[0].strip())
                    return score
        except Exception:
            pass
        return 0.0
    
    def record_results(self, task_name, accuracy, time_s, memory_kb, complexity, maintainability, pylint_score):
        self.results_df = pd.concat([self.results_df, pd.DataFrame([{
            "Task": task_name,
            "Accuracy": accuracy,
            "Time_s": time_s,
            "Memory_KB": memory_kb,
            "Complexity": complexity,
            "Maintainability": maintainability,
            "Pylint_Score": pylint_score
        }])], ignore_index=True)
    
    def test_task(self, file_path: str, function_name: str, test_cases: List[Tuple], task_name: str):
        """General testing routine for Roman/Integer conversion tasks."""
        if not os.path.exists(file_path):
            print(f"‚ùå {file_path} not found")
            return
        func = self.load_function(file_path, function_name)
        accuracy, time_s, memory_kb = self.run_tests(func, test_cases, repeat=3)
        complexity, maintainability, pylint_score = self.analyze_code_quality(file_path)
        self.record_results(task_name, accuracy, time_s, memory_kb, complexity, maintainability, pylint_score)
        print(f"\nüìä {task_name.upper()} RESULTS:")
        print(self.results_df[self.results_df["Task"]==task_name])
    
    def test_round_trip(self):
        """Test round-trip Roman <-> Integer conversions."""
        print("\n" + "="*60)
        print("ROUND-TRIP TESTING")
        print("="*60)
        if not os.path.exists("task4.py") or not os.path.exists("task4ai.py"):
            print("‚ùå Both task4.py and task4ai.py required")
            return
        roman_to_int = self.load_function("task4.py", "task4")
        int_to_roman = self.load_function("task4ai.py", "task4ai")
        passed = 0
        round_trip_cases = [1, 4, 9, 49, 99, 499, 999, 1499, 1999, 2499, 2999, 3499, 3999]
        for num in round_trip_cases:
            roman = int_to_roman(num)
            back_to_num = roman_to_int(roman)
            if num == back_to_num:
                passed += 1
                print(f"‚úÖ {num} -> {roman} -> {back_to_num}")
            else:
                print(f"‚ùå {num} -> {roman} -> {back_to_num} (expected {num})")
        accuracy = passed / len(round_trip_cases)
        self.record_results("round_trip", accuracy, 0, 0, 0, 0, 0)
        print(f"\nüìä ROUND-TRIP ACCURACY: {accuracy:.1%}")
    
    def run_all(self):
        self.test_task("task4.py", "task4", self.task4_test_cases, "task4")
        self.test_task("task4ai.py", "task4ai", self.task4ai_test_cases, "task4ai")
        self.test_round_trip()
        print("\n‚úÖ ALL TEST RESULTS:")
        print(self.results_df)

def main():
    tester = Task4Tester()
    tester.run_all()
    # Export results if desired
    tester.results_df.to_csv("task4_results.csv", index=False)

if __name__ == "__main__":
    main()


  self.results_df = pd.concat([self.results_df, pd.DataFrame([{



üìä TASK4 RESULTS:
    Task  Accuracy    Time_s  Memory_KB Complexity  Maintainability  \
0  task4       1.0  0.000057    0.28125          4        63.421356   

   Pylint_Score  
0          10.0  

üìä TASK4AI RESULTS:
      Task  Accuracy    Time_s  Memory_KB Complexity  Maintainability  \
1  task4ai       1.0  0.000076   0.296875          3        65.665012   

   Pylint_Score  
1          10.0  

ROUND-TRIP TESTING
‚úÖ 1 -> I -> 1
‚úÖ 4 -> IV -> 4
‚úÖ 9 -> IX -> 9
‚úÖ 49 -> XLIX -> 49
‚úÖ 99 -> XCIX -> 99
‚úÖ 499 -> CDXCIX -> 499
‚úÖ 999 -> CMXCIX -> 999
‚úÖ 1499 -> MCDXCIX -> 1499
‚úÖ 1999 -> MCMXCIX -> 1999
‚úÖ 2499 -> MMCDXCIX -> 2499
‚úÖ 2999 -> MMCMXCIX -> 2999
‚úÖ 3499 -> MMMCDXCIX -> 3499
‚úÖ 3999 -> MMMCMXCIX -> 3999

üìä ROUND-TRIP ACCURACY: 100.0%

‚úÖ ALL TEST RESULTS:
         Task  Accuracy    Time_s  Memory_KB Complexity  Maintainability  \
0       task4       1.0  0.000057   0.281250          4        63.421356   
1     task4ai       1.0  0.000076   0.296875     

In [6]:
df_task0=pd.read_csv('task0_results.csv')
df_task0.pop('File')
df_task1=pd.read_csv('task1_results.csv')
df_task1.pop('File')
df_task2=pd.read_csv('task2_results.csv')
df_task2.pop('File')
df_task3=pd.read_csv('task3_results.csv')
df_task3=df_task3.rename(columns={"Task": "Function"})
df_task4=pd.read_csv('task4_results.csv')
df_task4=df_task4.rename(columns={"Task": "Function"})
df_task4 = df_task4[df_task4['Function']!= 'round_trip']
dfs = [df_task0, df_task1, df_task2, df_task3, df_task4]
df_all = pd.concat(dfs, ignore_index=True)

In [7]:
df_all.to_csv("df_all.csv", index=False)