In [2]:
"""Statistical Debugging in action.

This provided, working code calculates phi coefficients for each code line."""

import sys
import math
import linecache

# The buggy program (subject of the analysis below; its defect is left in place)
def remove_html_markup(s):  # NOTE: print_tables selects these lines by number; keep the line count
    tag   = False  # True while the scan is between '<' and '>'
    quote = False  # flipped by quote characters; while True, '<' and '>' are not treated as tags
    out   = ""     # characters copied to the result so far

    for c in s:

        if c == '<' and not quote:
            tag = True
        elif c == '>' and not quote:
            tag = False
        elif c == '"' or c == "'" and tag:  # parsed as: c == '"' or (c == "'" and tag)
            quote = not quote
        elif not tag:
            out = out + c  # only characters seen outside a tag are kept

    return out


# Global variable to keep the coverage data in.
# Maps filename -> {line number: True} for every line event seen by traceit.
coverage = {}


def traceit(frame, event, arg):
    """Tracing function that saves the coverage data.

    Has the (frame, event, arg) signature expected by ``sys.settrace``.
    For every "line" event the executed line is recorded in the
    module-level ``coverage`` dict under the frame's file name; all other
    events are ignored.

    The ``coverage`` name is looked up on each call, so the function
    always writes into whatever dict is currently bound to it.

    Returns:
        ``traceit`` itself, so tracing continues inside the traced frame.
    """
    if event == "line":
        filename = frame.f_code.co_filename
        # setdefault replaces the has_key() check, which does not exist on
        # Python 3 dicts.
        coverage.setdefault(filename, {})[frame.f_lineno] = True

    return traceit

            
def phi(n11, n10, n01, n00):
    """Return the phi coefficient of a 2x2 table of run counts.

    Args:
        n11: runs that covered the line and failed.
        n10: runs that covered the line and passed.
        n01: runs that did not cover the line and failed.
        n00: runs that did not cover the line and passed.

    Raises:
        ZeroDivisionError: if any row or column total of the table is zero,
            because the denominator is then zero.
    """
    numerator = n11 * n00 - n10 * n01
    marginal_product = (n10 + n11) * (n01 + n00) * (n10 + n00) * (n01 + n11)
    return numerator / math.sqrt(marginal_product)


def print_tables(tables, first_line=10, last_line=26):
    """Print out values of phi, and result of runs for each covered line.

    One row is written to standard output per source line: the phi
    coefficient and the four counts, followed by the source text fetched
    with ``linecache``. Lines with no table entry get a blank prefix, and
    lines whose phi is undefined show only the counts.

    Args:
        tables: mapping filename -> {line number: (n11, n10, n01, n00)}.
        first_line: first line number to report (inclusive). The default
            matches the position of remove_html_markup in this file.
        last_line: last line number to report (inclusive).
    """
    blank_prefix = "               "
    for filename in tables:
        counts_by_line = tables[filename]
        for i in range(first_line, last_line + 1):
            if i in counts_by_line:
                (n11, n10, n01, n00) = counts_by_line[i]
                try:
                    factor = phi(n11, n10, n01, n00)
                except ZeroDivisionError:
                    # phi is undefined when a row or column total is zero
                    # (e.g. the line is covered by every run).
                    prefix = "       %2d%2d%2d%2d" % (n11, n10, n01, n00)
                else:
                    prefix = "%+.4f%2d%2d%2d%2d" % (factor, n11, n10, n01, n00)
            else:
                prefix = blank_prefix

            source_line = linecache.getline(filename, i)
            # getline returns '' for a missing line, and the last line of a
            # file may lack a newline; terminate the row ourselves so the
            # next row does not run into this one.
            if not source_line.endswith("\n"):
                source_line += "\n"
            # sys.stdout.write works on both Python 2 and Python 3, unlike
            # the print statement.
            sys.stdout.write("%s %s" % (prefix, source_line))
                            

def run_tests(inputs):
    """Run the program with each test case.

    Each input is passed to remove_html_markup while traceit is installed
    as the trace function. A run is judged "FAIL" when the result still
    contains a '<' character and "PASS" otherwise.

    Args:
        inputs: iterable of strings to feed to remove_html_markup.

    Returns:
        A list of (input, outcome, coverage) tuples, one per input, where
        coverage is the dict filled by traceit during that run.
    """
    global coverage
    runs = []
    # Remember whatever tracer was active (debugger, coverage tool, or
    # None) so it can be put back instead of being discarded.
    previous_tracer = sys.gettrace()
    for test_input in inputs:
        # A fresh dict per run: traceit writes into the current binding.
        coverage = {}
        sys.settrace(traceit)
        try:
            result = remove_html_markup(test_input)
        finally:
            # Stop collecting even if the program under test raises.
            sys.settrace(previous_tracer)
        outcome = "FAIL" if "<" in result else "PASS"
        runs.append((test_input, outcome, coverage))
    return runs


def init_tables(runs):
    """Create empty tuples for each covered line.

    Args:
        runs: iterable of (input, outcome, coverage) tuples, where coverage
            maps filename -> {line number: anything}.

    Returns:
        A dict filename -> {line number: (0, 0, 0, 0)} with one entry for
        every line that appears in at least one run's coverage. A file with
        no covered lines gets no entry.
    """
    tables = {}
    for _test_input, _outcome, run_coverage in runs:
        for filename, lines in run_coverage.items():
            for line in lines:
                # setdefault keeps an existing entry untouched; the file
                # key is created only once a covered line is seen.
                tables.setdefault(filename, {}).setdefault(line, (0, 0, 0, 0))
    return tables


def compute_n(tables, test_runs=None):
    """Compute n11, n10, etc. for each line.

    For every (filename, line) entry in ``tables`` the four counters are
    increased by one per run, according to whether the run covered the
    line and whether its outcome was "FAIL" (any other outcome counts as
    a pass).

    Args:
        tables: mapping filename -> {line number: (n11, n10, n01, n00)};
            updated in place.
        test_runs: iterable of (input, outcome, coverage) tuples to count.
            When omitted, the module-level ``runs`` is used, which is what
            this function always read before the parameter existed.

    Returns:
        The same ``tables`` object, with updated counts.
    """
    if test_runs is None:
        test_runs = runs  # backward-compatible fallback to the global

    for filename, lines in tables.items():
        # list() so the entries can be reassigned while looping.
        for line in list(lines):
            (n11, n10, n01, n00) = lines[line]
            for _test_input, outcome, run_coverage in test_runs:
                covered = line in run_coverage.get(filename, ())
                failed = outcome == "FAIL"
                if covered and failed:
                    n11 += 1  # covered and fails
                elif covered:
                    n10 += 1  # covered and passes
                elif failed:
                    n01 += 1  # uncovered and fails
                else:
                    n00 += 1  # uncovered and passes
            lines[line] = (n11, n10, n01, n00)
    return tables


# Test cases: each string is fed to remove_html_markup as one run.
inputs_line = [
    'foo',
    '<b>foo</b>',
    '"<b>foo</b>"',
    '"foo"',
    "'foo'",
    '<em>foo</em>',
    '<a href="foo">foo</a>',
    '""',
    "<p>",
]


# Trace every test case, tally per-line coverage against the outcomes and
# report phi for each line. The higher the value, the more likely the line
# is the cause of the failures.
runs = run_tests(inputs_line)
tables = compute_n(init_tables(runs))

print_tables(tables)

                def remove_html_markup(s):
        1 8 0 0     tag   = False
        1 8 0 0     quote = False
        1 8 0 0     out   = ""
                
        1 8 0 0     for c in s:
                
        1 8 0 0         if c == '<' and not quote:
-0.3162 0 4 1 4             tag = True
        1 8 0 0         elif c == '>' and not quote:
-0.3162 0 4 1 4             tag = False
        1 8 0 0         elif c == '"' or c == "'" and tag:
+0.3953 1 3 0 5             quote = not quote
+0.1250 1 7 0 1         elif not tag:
+0.1890 1 6 0 2             out = out + c
                
        1 8 0 0     return out
