In [10]:
# Load the range-only data set. The helpers defined later in this cell clean the
# answer/response ranges and score how well they overlap.
# NOTE(review): `pd` is not imported in this cell — presumably an earlier cell runs
# `import pandas as pd`; confirm the notebook survives Restart & Run All.
df = pd.read_csv('noChoices_range_only.csv')
def clean_range_adjusted(s):
    """Strip units and symbols from a range string, leaving ``start-end`` text.

    Only decimal digits, periods and ASCII hyphens survive. Typographic
    dashes (en dash, em dash, minus sign, ...) are first converted to an
    ASCII hyphen so that a range written as ``5–10`` is not collapsed into
    ``510``.

    Args:
        s: The raw range text (a ``str``).

    Returns:
        A string containing only decimal digits, ``.`` and ``-``.
    """
    # Dash look-alikes that commonly appear in copied or typeset text.
    dash_variants = '\u2010\u2011\u2012\u2013\u2014\u2015\u2212'
    normalized = ''.join('-' if ch in dash_variants else ch for ch in s)
    # isdecimal() rather than isdigit(): isdigit() also accepts superscripts
    # such as the "²" in "m²", which float() cannot parse downstream.
    return ''.join(ch for ch in normalized if ch.isdecimal() or ch in ('.', '-'))

# Adjust the calculation functions to handle cases where the data may not form a valid range
def percentage_overlap_adjusted(answer, response):
    """Return the percentage of the answer range covered by the response range.

    Both arguments are ``"start-end"`` strings. The result is
    ``overlap / answer_length * 100``.

    Returns:
        ``0`` when the ranges do not overlap (including when they merely
        touch), the percentage otherwise, or ``None`` when either string
        does not split into exactly two float-parsable parts.
    """
    try:
        answer_lo, answer_hi = (float(part) for part in answer.split('-'))
        response_lo, response_hi = (float(part) for part in response.split('-'))
    except ValueError:
        # Wrong number of parts, or a part that is not a number.
        return None

    shared_lo = max(answer_lo, response_lo)
    shared_hi = min(answer_hi, response_hi)
    if shared_hi <= shared_lo:
        return 0
    return (shared_hi - shared_lo) / (answer_hi - answer_lo) * 100

def jaccard_index_adjusted(answer, response):
    """Return the Jaccard index (overlap length / union span) of two ranges.

    Both arguments are ``"start-end"`` strings. The union is measured as the
    span from the smallest start to the largest end.

    Returns:
        The overlap-to-union ratio, or ``None`` when either string does not
        split into exactly two float-parsable parts, or when the union span
        is zero (both ranges are the same single point), in which case the
        ratio is undefined.
    """
    try:
        a_start, a_end = map(float, answer.split('-'))
        r_start, r_end = map(float, response.split('-'))
    except ValueError:  # Handles cases with invalid ranges
        return None

    overlap_length = max(0, min(a_end, r_end) - max(a_start, r_start))
    union_length = max(a_end, r_end) - min(a_start, r_start)
    # Previously this divided unconditionally, raising ZeroDivisionError for
    # two identical single-point ranges and aborting the whole df.apply.
    if union_length == 0:
        return None
    return overlap_length / union_length

# Clean the ranges with the adjusted function
# (astype(str) converts every cell to text first, so the cleaner always receives a string)
df['answer_cleaned'] = df['answer'].astype(str).apply(clean_range_adjusted)
df['response_cleaned'] = df['response'].astype(str).apply(clean_range_adjusted)


# Apply the adjusted overlap and Jaccard index functions
# row by row (axis=1), passing each row's cleaned answer/response pair
df['percentage_overlap'] = df.apply(lambda x: percentage_overlap_adjusted(x['answer_cleaned'], x['response_cleaned']), axis=1)
df['jaccard_index'] = df.apply(lambda x: jaccard_index_adjusted(x['answer_cleaned'], x['response_cleaned']), axis=1)






In [11]:

def sorensen_dice_coefficient(answer, response):
    """Return the Sørensen-Dice coefficient of two ``"start-end"`` ranges.

    Computed as ``2 * overlap / (answer_length + response_length)``.

    Returns:
        The coefficient, or ``None`` when either string does not split into
        exactly two float-parsable parts, or when the two lengths sum to
        zero.
    """
    try:
        answer_lo, answer_hi = [float(tok) for tok in answer.split('-')]
        response_lo, response_hi = [float(tok) for tok in response.split('-')]
    except ValueError:
        # Wrong number of parts, or a part that is not a number.
        return None

    combined_length = (answer_hi - answer_lo) + (response_hi - response_lo)
    if combined_length == 0:
        # The ratio is undefined; report "no value" rather than dividing by zero.
        return None

    shared_length = max(0, min(answer_hi, response_hi) - max(answer_lo, response_lo))
    return (2 * shared_length) / combined_length

def overlap_coefficient(answer, response):
    """Return the overlap coefficient of two ``"start-end"`` ranges.

    Computed as the overlap length divided by the length of the shorter of
    the two ranges.

    Returns:
        The coefficient, or ``None`` when either string does not split into
        exactly two float-parsable parts, or when the shorter length is zero.
    """
    try:
        answer_lo, answer_hi = [float(tok) for tok in answer.split('-')]
        response_lo, response_hi = [float(tok) for tok in response.split('-')]
    except ValueError:
        # Wrong number of parts, or a part that is not a number.
        return None

    shorter_length = min(answer_hi - answer_lo, response_hi - response_lo)
    if shorter_length == 0:
        # The ratio is undefined; report "no value" rather than dividing by zero.
        return None

    shared_length = max(0, min(answer_hi, response_hi) - max(answer_lo, response_lo))
    return shared_length / shorter_length

# Apply the new calculations to the dataframe
# row by row (axis=1), using the cleaned answer/response columns created in an earlier cell
df['sorensen_dice_coefficient'] = df.apply(lambda x: sorensen_dice_coefficient(x['answer_cleaned'], x['response_cleaned']), axis=1)
df['overlap_coefficient'] = df.apply(lambda x: overlap_coefficient(x['answer_cleaned'], x['response_cleaned']), axis=1)


In [12]:
# Save the corrected dataframe to a new CSV file
corrected_csv_path = 'range_only_analysis_corrected.csv'
# index=False keeps the dataframe's row index out of the written file
df.to_csv(corrected_csv_path, index=False)

# Bare last expression: the notebook echoes the output path as the cell result
corrected_csv_path

'range_only_analysis_corrected.csv'