# In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

def _normalize_labels(labels):
    """Return *labels* as strings with surrounding whitespace removed, lowercased."""
    return labels.astype(str).str.strip().str.lower()


def evaluate_predictions(raw_file, output_file):
    """Score predicted labels against gold labels for a binary yes/no task.

    Both CSV files are read with pandas, indexed by ``tweet_id`` and
    inner-joined, so only tweets present in both files are scored. Both files
    are expected to carry a ``class_label`` column; the join suffixes turn
    these into ``class_label_raw`` (gold) and ``class_label_output``
    (prediction).

    Rows missing either label are skipped. Both label columns are then
    normalized identically (whitespace stripped, lowercased) so that
    differences such as ``"Yes"`` vs ``"yes"`` are not counted as errors.
    Trailing punctuation (e.g. ``"yes."``) is deliberately left untouched.

    Args:
        raw_file: Path to the CSV holding the gold labels.
        output_file: Path to the CSV holding the predicted labels.

    Returns:
        A tuple ``(accuracy, precision, f1, recall)``. Precision, F1 and
        recall are computed for the positive class ``'yes'`` and evaluate
        to 0 when they would otherwise be undefined.
    """
    # Read both files and key them by tweet so they can be aligned.
    raw_df = pd.read_csv(raw_file).set_index('tweet_id')
    output_df = pd.read_csv(output_file).set_index('tweet_id')

    # Keep only tweets that appear in both files.
    merged_df = raw_df.join(output_df, how='inner', lsuffix='_raw', rsuffix='_output')

    # A row can only be scored when both the gold label and the prediction exist.
    scored_df = merged_df.dropna(subset=['class_label_raw', 'class_label_output'])

    # Normalize both sides the same way; normalizing just one side would turn
    # pure formatting differences into spurious misclassifications.
    y_true = _normalize_labels(scored_df['class_label_raw'])
    y_pred = _normalize_labels(scored_df['class_label_output'])

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label='yes', average='binary', zero_division=0)
    f1 = f1_score(y_true, y_pred, pos_label='yes', average='binary', zero_division=0)
    recall = recall_score(y_true, y_pred, pos_label='yes', average='binary', zero_division=0)

    return accuracy, precision, f1, recall

# Example usage: score the predictions in output.csv against the
# preprocessed test set and print each metric on its own line.
raw_file = 'test_preprocessed.csv'
output_file = 'output.csv'
accuracy, precision, f1, recall = evaluate_predictions(raw_file, output_file)

for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("F1 Score:", f1),
    ("Recall Score:", recall),
):
    print(metric_label, metric_value)
