**Last Step**
---
Improve accuracy with a majority vote: for each sample, keep the most frequent value among the five predictions produced by five different models.





In [10]:

import pandas as pd
from collections import Counter

def merge_and_vote_for_difficulty(file_paths):
    """
    Merge per-model prediction CSVs on 'id' and majority-vote the 'difficulty'.

    Every file must provide an 'id' and a 'difficulty' column; any other
    columns are ignored. The files are inner-joined on 'id', so only ids
    present in every file appear in the result.

    Parameters:
    file_paths (list): List of file paths to the CSV files. Must not be empty.

    Returns:
    pd.DataFrame: A dataframe with exactly two columns, 'id' and 'difficulty',
                  where 'difficulty' is the most common prediction for that id.
                  On a tie, the value seen first in file_paths order wins.

    Raises:
    IndexError: If file_paths is empty.
    """
    # Reading the CSV files into dataframes
    frames = [pd.read_csv(path) for path in file_paths]

    # Give each file's prediction column a unique name *before* merging.
    # Re-using one fixed suffix for every merge produces a duplicate
    # 'difficulty_drop' column from the third file onwards, which recent
    # pandas versions reject with a MergeError.
    vote_frames = [
        frame[['id', 'difficulty']].rename(
            columns={'difficulty': f'difficulty_{position}'}
        )
        for position, frame in enumerate(frames)
    ]

    merged_df = vote_frames[0]
    for votes in vote_frames[1:]:
        merged_df = merged_df.merge(votes, on='id', how='inner')

    # Vote columns in file order, so ties resolve towards the earliest file
    vote_columns = [
        f'difficulty_{position}' for position in range(len(vote_frames))
    ]

    # Counter.most_common(1) returns the first-encountered value among the
    # most frequent ones; iterating plain tuples keeps 'id' out of the vote.
    merged_df['difficulty'] = [
        Counter(row_votes).most_common(1)[0][0]
        for row_votes in merged_df[vote_columns].itertuples(index=False, name=None)
    ]

    # Returning only 'id' and the voted 'difficulty'
    return merged_df[['id', 'difficulty']]


if __name__ == "__main__":
    # One predictions CSV per model; the vote is taken across these files
    file_paths = [
        '/content/flaubert_base_predictions.csv',
        '/content/flaubert_large_predictions.csv',
        '/content/camembert_base_predictions.csv',
        '/content/camembert_large_predictions.csv',
        '/content/base-french-europeana-cased_predictions.csv'
    ]

    # Performing the merge and vote process
    result_df = merge_and_vote_for_difficulty(file_paths)

    # Single source of truth for the output name, so the file that is written
    # and the file that is reported cannot drift apart (the write previously
    # used the misspelled name 'finalprediciton.csv').
    output_path = 'finalprediction.csv'

    # Saving the result to a new CSV file
    result_df.to_csv(output_path, index=False)

    print(f"Output saved to '{output_path}'")


Output saved to 'finalprediction.csv'
