# Modules & Functions

In [1]:
import random
import numpy as np
import pandas as pd
from itertools import combinations
from scipy.stats import entropy
from numpy import linalg as LA
from itertools import combinations
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy import stats

pd.options.mode.chained_assignment = None

# Main function to select the aggregation method
def compute_aggregated_answer(df, method):
    """
    Aggregates individual answers into one answer per (Chemist Group, Question).

    Parameters:
    - df: DataFrame with the columns 'Chemist Group', 'Question', 'Answer' and
      'Correct_Answer', plus 'Certitude' and/or 'Chemist Level' for the methods
      that read them
    - method: Name of the aggregation method; one of "most_frequent",
      "weighted_by_certitude", "weighted_by_certitude_and_expertise",
      "weighted_by_expertise", "borda_count", "log_odds", "entropy_weighted",
      "wisdom_of_crowd", "monte_carlo_simulation", "eigenvalue_aggregation",
      "fuzzy_logic_aggregation"

    Returns:
    - group: DataFrame with one row per (Chemist Group, Question) holding the
      aggregated answer ('Most_Frequent_Answer'), the 'Correct_Answer' and a
      0/1 flag 'Most_Frequent_Correct'

    Raises:
    - ValueError: if `method` is not one of the names listed above
    """

    # Shared 1..5 -> (0, 1] weighting used for both certitude and expertise.
    level_weights = {1: 1/5, 2: 2/5, 3: 3/5, 4: 4/5, 5: 1}
    answer_mapping = {'A': 1, 'B': 3, 'C': 5}
    reverse_mapping = {1: 'A', 3: 'B', 5: 'C'}

    def _highest_scoring(group, score, last_on_tie=False):
        # Returns the answer whose subset of rows gets the highest score.
        # Ties keep the first answer seen unless last_on_tie is set.
        best_answer = None
        best_score = -np.inf
        for answer in group['Answer'].unique():
            current = score(group[group['Answer'] == answer])
            is_better = current >= best_score if last_on_tie else current > best_score
            if is_better:
                best_score = current
                best_answer = answer
        return best_answer

    def _scores_by_answer(group, score):
        # Maps every distinct answer to the score of its subset of rows.
        return {answer: score(group[group['Answer'] == answer])
                for answer in group['Answer'].unique()}

    def _closest_letter(mean_value):
        # Snaps a mean on the A=1 / B=3 / C=5 scale back to the nearest letter.
        closest = min(answer_mapping.values(), key=lambda x: abs(x - mean_value))
        return reverse_mapping[closest]

    # Aggregation weighted by Certitude
    def weighted_by_certitude(group):
        return _highest_scoring(
            group, lambda subset: subset['Certitude'].map(level_weights).sum())

    # Aggregation weighted by Certitude and Expertise
    def weighted_by_certitude_and_expertise(group):
        return _highest_scoring(
            group,
            lambda subset: ((subset['Certitude'] + subset['Chemist Level']) / 10).sum())

    # Aggregation weighted by Expertise
    def weighted_by_expertise(group):
        return _highest_scoring(
            group, lambda subset: subset['Chemist Level'].map(level_weights).sum())

    # Plain vote count; unlike the weighted variants, ties go to the LAST answer.
    def weighted_most_frequent(group):
        return _highest_scoring(group, len, last_on_tie=True)

    def borda_count(group):
        scores = _scores_by_answer(group, lambda subset: subset['Certitude'].sum())
        return max(scores, key=scores.get)

    def log_odds(group):
        scores = _scores_by_answer(group, lambda subset: np.log(subset['Certitude']).sum())
        return max(scores, key=scores.get)

    def fuzzy_logic_aggregation(group):
        # Mean certitude per answer, normalized to [0, 1].
        scores = _scores_by_answer(group, lambda subset: np.mean(subset['Certitude']) / 5.0)
        return max(scores, key=scores.get)

    def entropy_weighted(group):
        # Picks the answer whose certitude distribution has the lowest entropy.
        best_answer = None
        min_entropy = np.inf
        for answer in group['Answer'].unique():
            subset = group[group['Answer'] == answer]
            e = entropy(subset['Certitude'].value_counts(normalize=True))
            if e < min_entropy:
                min_entropy = e
                best_answer = answer
        return best_answer

    # Wisdom of the Crowd (WoC)
    def wisdom_of_crowd(group):
        return _closest_letter(group['Answer'].map(answer_mapping).mean())

    # Monte Carlo Simulation
    def monte_carlo_simulation(group):
        numerical_answers = group['Answer'].map(answer_mapping)
        samples = np.random.choice(numerical_answers, size=1000, replace=True)
        return _closest_letter(np.mean(samples))

    def eigenvalue_aggregation(group):
        # Diagonal matrix of vote counts; off-diagonal terms are left at zero.
        answers = group['Answer'].unique()
        matrix = np.zeros((len(answers), len(answers)))
        for idx, answer in enumerate(answers):
            matrix[idx, idx] = len(group[group['Answer'] == answer])
        eigenvalues, eigenvectors = LA.eig(matrix)
        principal_eigenvector = eigenvectors[:, np.argmax(eigenvalues)]
        return answers[np.argmax(np.abs(principal_eigenvector))]

    aggregators = {
        "most_frequent": weighted_most_frequent,
        "weighted_by_certitude": weighted_by_certitude,
        "weighted_by_certitude_and_expertise": weighted_by_certitude_and_expertise,
        "weighted_by_expertise": weighted_by_expertise,
        "borda_count": borda_count,
        "log_odds": log_odds,
        "entropy_weighted": entropy_weighted,
        "wisdom_of_crowd": wisdom_of_crowd,
        "monte_carlo_simulation": monte_carlo_simulation,
        "eigenvalue_aggregation": eigenvalue_aggregation,
        "fuzzy_logic_aggregation": fuzzy_logic_aggregation,
    }
    if method not in aggregators:
        # Previously an unknown name surfaced as an UnboundLocalError on `group`.
        raise ValueError(
            f"Unknown aggregation method {method!r}; expected one of {sorted(aggregators)}")

    group = df.groupby(['Chemist Group', 'Question']).apply(aggregators[method]).reset_index(name='Most_Frequent_Answer')
    group = group.merge(df[['Question', 'Correct_Answer']].drop_duplicates(), on='Question', how='left')
    group['Most_Frequent_Correct'] = (group['Most_Frequent_Answer'] == group['Correct_Answer']).astype(int)

    return group


def _merge_data_CI(CI_s1_path, CI_s2_path, CI_structures_path):
    """
    Builds the long-format answer tables for both sessions and writes them to disk.

    Parameters:
    - CI_s1_path: Path of the ';'-separated answers file for the first session part
    - CI_s2_path: Path of the ';'-separated answers file for the second session part
    - CI_structures_path: Path of the ','-separated structures file, joined on 'Slide_ID'

    Returns:
    - combined_df: Concatenation of both cleaned answer tables with a 0/1 'Result'
      column, restricted to rows whose 'Certitude' is not missing

    Side effects:
    - Writes '../data/CI_Answer_A.csv' and '../data/CI_Answer_B.csv'
    - Writes './model_mmpdb/CI_Answer_v3-Response_Most_Frequent.csv'
    """

    def _to_long_format(df_session):
        # One row per (chemist, question) carrying the answer, the certitude and
        # the matching structure record.
        id_columns = ['Chemist Level', 'Chemist']
        answers = df_session.melt(id_vars=id_columns, value_vars=[c for c in df_session.columns if 'Answer Q' in c], var_name='Question', value_name='Answer')
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        answers['Slide_ID'] = answers['Question'].str.extract(r'(\d+)').astype(float)
        merged = answers.merge(df_structures, on='Slide_ID', how='left')
        certitudes = df_session.melt(id_vars=id_columns, value_vars=[c for c in df_session.columns if 'Certitude Q' in c], var_name='Question', value_name='Certitude')
        certitudes['Slide_ID'] = certitudes['Question'].str.extract(r'(\d+)').astype(float)
        merged = merged.merge(certitudes[['Chemist', 'Chemist Level', 'Slide_ID', 'Certitude']], on=['Chemist', 'Chemist Level', 'Slide_ID'], how='left')
        # NOTE(review): the two columns are deliberately swapped here, presumably
        # because the transformed input labels them the other way round - confirm.
        return merged.dropna().rename(columns={"Certitude": "Answer", "Answer": "Certitude"})

    df_structures = pd.read_csv(CI_structures_path, sep=',')

    # Session part 1
    df_s1 = transform_dataset_B_v2(pd.read_csv(CI_s1_path, sep=";").copy())
    _to_long_format(df_s1).to_csv('../data/CI_Answer_A.csv', index=None)

    # Session part 2: question numbers are shifted by 37 so they do not collide
    # with the numbering of part 1.
    df_s2 = transform_dataset_B_v2(pd.read_csv(CI_s2_path, sep=";").copy())
    question_offset = 37
    shifted_columns = []
    for name in df_s2.columns.tolist():
        if "Q" in name:
            if "Answer" in name:
                shifted_columns.append("Answer Q" + str(int(name.replace("Answer Q", "")) + question_offset))
            if "Certitude" in name:
                shifted_columns.append("Certitude Q" + str(int(name.replace("Certitude Q", "")) + question_offset))
        else:
            shifted_columns.append(name)
    df_s2.columns = shifted_columns
    _to_long_format(df_s2).to_csv('../data/CI_Answer_B.csv', index=None)

    # Reload from disk, filter chemists and overwrite the files with the
    # filtered tables.
    df_A = pd.read_csv('../data/CI_Answer_A.csv', sep=',')
    df_B = pd.read_csv('../data/CI_Answer_B.csv', sep=',')
    df_B = remove_consistent_chemists(df_B)
    df_A = remove_consistent_chemists(df_A)
    df_A.to_csv('../data/CI_Answer_A.csv', index=None)
    df_B.to_csv('../data/CI_Answer_B.csv', index=None)

    # Creates new column
    df_A['Result'] = np.where(df_A['Correct_Answer'] == df_A['Answer'], 1, 0)
    df_B['Result'] = np.where(df_B['Correct_Answer'] == df_B['Answer'], 1, 0)
    combined_df = pd.concat([df_A, df_B])
    combined_df = combined_df[combined_df['Certitude'].isna() != True]

    # The filtered tables are read back from disk before the remaining steps.
    df_A = pd.read_csv('../data/CI_Answer_A.csv', sep=',')
    df_B = pd.read_csv('../data/CI_Answer_B.csv', sep=',')
    df_A['Result'] = np.where(df_A['Correct_Answer'] == df_A['Answer'], 1, 0)
    df_B['Result'] = np.where(df_B['Correct_Answer'] == df_B['Answer'], 1, 0)

    # NOTE(review): combined_scores is neither returned nor written; kept so the
    # function still performs the same calls as before - confirm it is needed.
    combined_scores = pd.concat([compute_scores(df_A), compute_scores(df_B)], ignore_index=True)
    combined_scores_all = combined_scores.copy()
    combined_scores_all["Chemist Level"] = 6
    combined_scores = pd.concat([combined_scores_all, combined_scores])

    # Level 6 is an artificial level holding a copy of every row, which
    # assign_chemist_group maps to group 3 ("all").
    df_A_all = df_A.copy()
    df_A_all["Chemist Level"] = 6
    df_A = pd.concat([df_A_all, df_A])
    df_B_all = df_B.copy()
    df_B_all["Chemist Level"] = 6
    df_B = pd.concat([df_B_all, df_B])
    df_A["Chemist Group"] = df_A["Chemist Level"].apply(assign_chemist_group)
    df_B["Chemist Group"] = df_B["Chemist Level"].apply(assign_chemist_group)
    combined_scores["Chemist Group"] = combined_scores["Chemist Level"].apply(assign_chemist_group)

    most_frequent_combined = pd.concat([compute_most_frequent(df_A, False), compute_most_frequent(df_B, False)])
    most_frequent_combined = most_frequent_combined[most_frequent_combined["Chemist Group"] == 3]
    most_frequent_combined = most_frequent_combined[["Question", "Most_Frequent_Answer"]]
    most_frequent_combined.columns = ["Slide_ID", "Most_Frequent_Answer"]
    most_frequent_combined["Slide_ID"] = [i.split("Q")[-1] for i in most_frequent_combined["Slide_ID"].tolist()]
    most_frequent_combined.to_csv("./model_mmpdb/CI_Answer_v3-Response_Most_Frequent.csv", index=False)
    return combined_df


def compute_scores(df):
    """
    Computes the mean 'Result' of every chemist and attaches the chemist's metadata.

    Parameters:
    - df: DataFrame with the columns 'Chemist', 'Result', 'Chemist Group' and 'Certitude'

    Returns:
    - DataFrame with the columns 'Chemist', 'Score', 'Chemist Group' and 'Certitude';
      a chemist appears once per distinct ('Chemist Group', 'Certitude') pair
    """
    per_chemist = df.groupby('Chemist')['Result'].mean().reset_index(name='Score')
    chemist_info = df[['Chemist', 'Chemist Group', 'Certitude']].drop_duplicates()
    return per_chemist.merge(chemist_info, on='Chemist')


def assign_chemist_group(level):
    """
    Maps a chemist level to its group id.

    Parameters:
    - level: Numeric chemist level

    Returns:
    - 1 (non-expert) for levels below 3, 2 (expert) for levels from 3 to 5
      inclusive, 3 (all) for anything else
    """
    if level < 3:
        return 1  # non-expert
    if 3 <= level <= 5:
        return 2  # expert
    return 3  # all
    
    
def chemist_group_analysis(df_A, df_B, assign_chemist_group, compute_most_frequent, method_agg):
    """
    Simulates how the success rate of the aggregated answer evolves with panel size.

    For every panel size i from 1 to 93 and every chemist group (1, 2, 3), chemists
    are drawn at random from each session, their answers are aggregated with
    `compute_aggregated_answer`, and the share of correctly answered questions (SR)
    is recorded. The summary is also written to
    ./data/CollectiveIntelligence/Evolutive_CI_{method_agg}.csv, preceded by three
    hard-coded baseline rows with Key 1.

    Parameters:
    - df_A: DataFrame with the answers of session A
    - df_B: DataFrame with the answers of session B
    - assign_chemist_group: Function mapping a chemist level to a chemist group
    - compute_most_frequent: Unused; kept so existing callers keep working
    - method_agg: Aggregation method name passed to `compute_aggregated_answer`

    Returns:
    - subgroups_dict: Dictionary mapping str(i) to the per-group statistics for size i
    - combined_df: DataFrame stacking all per-size statistics with a 'Key' column.
      '25th_Percentile_SR' and '75th_Percentile_SR' actually hold mean - std and
      mean + std
    """

    def _with_pooled_level(df):
        # Adds a copy of every row under the artificial level 6, then labels each
        # row with its chemist group.
        pooled = df.copy()
        pooled["Chemist Level"] = 6
        combined = pd.concat([pooled, df])
        combined["Chemist Group"] = combined["Chemist Level"].apply(assign_chemist_group)
        return combined

    def _candidate_chemists(df, level):
        # Distinct chemists that can be sampled for this group.
        if level < 3:
            pool = df[df["Chemist Level"] < 6]
        else:
            pool = df[df["Chemist Level"] == 6]
        pool = pool.drop_duplicates('Chemist')
        return list(set(list(pool[pool['Chemist Group'] == level]['Chemist'])))

    df_A = _with_pooled_level(df_A)
    df_B = _with_pooled_level(df_B)

    # Everything below depends only on the group, not on the panel size, so it is
    # computed once instead of once per size.
    unique_levels = [1, 2, 3]
    rows_A = {level: df_A[df_A["Chemist Group"] == level] for level in unique_levels}
    rows_B = {level: df_B[df_B["Chemist Group"] == level] for level in unique_levels}
    chemists_A = {level: _candidate_chemists(df_A, level) for level in unique_levels}
    chemists_B = {level: _candidate_chemists(df_B, level) for level in unique_levels}

    subgroups_dict = {}
    for i in tqdm(range(1, 94)):
        kok = []
        for level in unique_levels:
            # The panel size is capped by the number of available chemists.
            max_k_A = min(i, len(chemists_A[level]))
            max_k_B = min(i, len(chemists_B[level]))
            for _ in range(min(max_k_A, max_k_B)):
                chemist_comb_A = random.sample(chemists_A[level], max_k_A)
                chemist_comb_B = random.sample(chemists_B[level], max_k_B)
                df_A_sb = rows_A[level][rows_A[level]['Chemist'].isin(chemist_comb_A)]
                df_B_sb = rows_B[level][rows_B[level]['Chemist'].isin(chemist_comb_B)]
                most_frequent_combined = pd.concat([compute_aggregated_answer(df_A_sb, method_agg), compute_aggregated_answer(df_B_sb, method_agg)]).groupby('Chemist Group')['Most_Frequent_Correct'].mean().reset_index(name='SR')
                kok.append(most_frequent_combined)

        result_k = pd.concat(kok).groupby('Chemist Group')['SR'].agg(
            mean='mean',
            std='std'
        ).reset_index()
        # The "percentile" columns are mean - std and mean + std.
        result_k['25th_Percentile_SR'] = result_k['mean'] - result_k['std']
        result_k['75th_Percentile_SR'] = result_k['mean'] + result_k['std']
        result_k.drop(columns=['std'], inplace=True)
        result_k.columns = ['Chemist Group', 'Mean_SR', '25th_Percentile_SR', '75th_Percentile_SR']
        subgroups_dict[str(i)] = result_k

    frames = []
    for key, stats_df in subgroups_dict.items():
        keyed = stats_df.copy()
        keyed['Key'] = key
        frames.append(keyed)
    combined_df = pd.concat(frames, ignore_index=True)

    # Hard-coded baseline rows prepended to the exported file only.
    df_n = pd.DataFrame()  # Modification
    df_n["Chemist Group"] = [1, 2, 3]
    df_n["Mean_SR"] = [0.378, 0.486, 0.432]
    df_n["Key"] = [1, 1, 1]

    df_t = pd.concat([df_n, combined_df])
    df_t.to_csv(f"./data/CollectiveIntelligence/Evolutive_CI_{method_agg}.csv", index=None)

    return subgroups_dict, combined_df


def chemist_group_analysis_admet(df_A, df_B, assign_chemist_group, compute_most_frequent, method_agg, endpoint_admet):
    """
    Simulates, for one ADMET endpoint, how the aggregated success rate evolves with panel size.

    Only rows whose 'Endpoint' equals `endpoint_admet` are used. For every panel
    size i from 3 to 93 and every chemist group (1, 2, 3), chemists are drawn at
    random from each session, their answers are aggregated with
    `compute_aggregated_answer`, and the share of correctly answered questions (SR)
    is recorded. The summary is also written to
    ./data/CollectiveIntelligence/Evolutive_CI_ADMET_{endpoint_admet}_{method_agg}___STD.csv,
    preceded by three hard-coded baseline rows with Key 1.

    Parameters:
    - df_A: DataFrame with the answers of session A (needs an 'Endpoint' column)
    - df_B: DataFrame with the answers of session B (needs an 'Endpoint' column)
    - assign_chemist_group: Function mapping a chemist level to a chemist group
    - compute_most_frequent: Unused; kept so existing callers keep working
    - method_agg: Aggregation method name passed to `compute_aggregated_answer`
    - endpoint_admet: One of "Solubility", "Permeability", "LogP", "LogD", "hERG"

    Returns:
    - subgroups_dict: Dictionary mapping str(i) to the per-group statistics for size i
    - combined_df: DataFrame stacking all per-size statistics with a 'Key' column.
      '25th_Percentile_SR' and '75th_Percentile_SR' actually hold mean - std and
      mean + std

    Raises:
    - KeyError: if `endpoint_admet` has no baseline entry
    """

    # Hard-coded baseline success rates per endpoint.
    dict_med = {  # MODIFICATION
        "Solubility": {"Non-Expert": 0.34, "Expert": 0.5, "All": 0.38},
        "Permeability": {"Non-Expert": 0.42, "Expert": 0.5, "All": 0.41},
        "LogP": {"Non-Expert": 0.41, "Expert": 0.84, "All": 0.5},
        "LogD": {"Non-Expert": 0.33, "Expert": 0.45, "All": 0.33},
        "hERG": {"Non-Expert": 0.325, "Expert": 0.43, "All": 0.33},
    }
    if endpoint_admet not in dict_med:
        # Fail before the simulation instead of after it.
        raise KeyError(f"No baseline for endpoint {endpoint_admet!r}; expected one of {sorted(dict_med)}")
    baseline = dict_med[endpoint_admet]

    def _prepare(df):
        # Adds a copy of every row under the artificial level 6, keeps the
        # requested endpoint and labels each row with its chemist group.
        pooled = df.copy()
        pooled["Chemist Level"] = 6
        combined = pd.concat([pooled, df])
        combined = combined[combined["Endpoint"] == endpoint_admet].copy()
        combined["Chemist Group"] = combined["Chemist Level"].apply(assign_chemist_group)
        return combined

    def _candidate_chemists(df, level):
        # Distinct chemists that can be sampled for this group.
        if level < 3:
            pool = df[df["Chemist Level"] < 6]
        else:
            pool = df[df["Chemist Level"] == 6]
        pool = pool.drop_duplicates('Chemist')
        return list(set(list(pool[pool['Chemist Group'] == level]['Chemist'])))

    df_A = _prepare(df_A)
    df_B = _prepare(df_B)

    # Everything below depends only on the group, not on the panel size, so it is
    # computed once instead of once per size.
    unique_levels = [1, 2, 3]
    rows_A = {level: df_A[df_A["Chemist Group"] == level] for level in unique_levels}
    rows_B = {level: df_B[df_B["Chemist Group"] == level] for level in unique_levels}
    chemists_A = {level: _candidate_chemists(df_A, level) for level in unique_levels}
    chemists_B = {level: _candidate_chemists(df_B, level) for level in unique_levels}

    subgroups_dict = {}
    for i in tqdm(range(3, 94)):
        kok = []
        for level in unique_levels:
            # The panel size is capped by the number of available chemists.
            max_k_A = min(i, len(chemists_A[level]))
            max_k_B = min(i, len(chemists_B[level]))
            for _ in range(min(max_k_A, max_k_B)):
                chemist_comb_A = random.sample(chemists_A[level], max_k_A)
                chemist_comb_B = random.sample(chemists_B[level], max_k_B)
                df_A_sb = rows_A[level][rows_A[level]['Chemist'].isin(chemist_comb_A)]
                df_B_sb = rows_B[level][rows_B[level]['Chemist'].isin(chemist_comb_B)]
                most_frequent_combined = pd.concat([compute_aggregated_answer(df_A_sb, method_agg), compute_aggregated_answer(df_B_sb, method_agg)]).groupby('Chemist Group')['Most_Frequent_Correct'].mean().reset_index(name='SR')
                kok.append(most_frequent_combined)

        result_k = pd.concat(kok).groupby('Chemist Group')['SR'].agg(
            mean='mean',
            std='std'
        ).reset_index()
        # The "percentile" columns are mean - std and mean + std.
        result_k['25th_Percentile_SR'] = result_k['mean'] - result_k['std']
        result_k['75th_Percentile_SR'] = result_k['mean'] + result_k['std']
        result_k.drop(columns=['std'], inplace=True)
        result_k.columns = ['Chemist Group', 'Mean_SR', '25th_Percentile_SR', '75th_Percentile_SR']
        subgroups_dict[str(i)] = result_k

    frames = []
    for key, stats_df in subgroups_dict.items():
        keyed = stats_df.copy()
        keyed['Key'] = key
        frames.append(keyed)

    combined_df = pd.concat(frames, ignore_index=True)

    # Baseline rows prepended to the exported file only.
    df_n = pd.DataFrame()
    df_n["Chemist Group"] = [1, 2, 3]
    df_n["Mean_SR"] = [baseline["Non-Expert"], baseline["Expert"], baseline["All"]]
    df_n["Key"] = [1, 1, 1]
    df_t = pd.concat([df_n, combined_df])
    df_t.to_csv(f"./data/CollectiveIntelligence/Evolutive_CI_ADMET_{endpoint_admet}_{method_agg}___STD.csv", index=None)

    return subgroups_dict, combined_df


def confidence_interval(x):
    """Return the 95% Student-t confidence interval of the mean of x as (low, high)."""
    degrees_of_freedom = len(x) - 1
    center = np.mean(x)
    standard_error = stats.sem(x)
    return stats.t.interval(0.95, degrees_of_freedom, loc=center, scale=standard_error)


def chemist_group_analysis_group(df_A, df_B, assign_chemist_group, compute_most_frequent, method_agg):
    """
    Simulates how the success rate of the aggregated answer evolves with panel size.

    For every panel size i from 1 to 93 and every chemist group (1, 2, 3), chemists
    are drawn at random from each session, their answers are aggregated with
    `compute_aggregated_answer`, and the share of correctly answered questions (SR)
    is recorded. The summary is also written to
    ./data/CollectiveIntelligence/Evolutive_CI_AM_{method_agg}.csv, preceded by
    three hard-coded baseline rows with Key 1.

    Parameters:
    - df_A: DataFrame with the answers of session A
    - df_B: DataFrame with the answers of session B
    - assign_chemist_group: Function mapping a chemist level to a chemist group
    - compute_most_frequent: Unused; kept so existing callers keep working
    - method_agg: Aggregation method name passed to `compute_aggregated_answer`

    Returns:
    - subgroups_dict: Dictionary mapping str(i) to the per-group statistics for size i
    - combined_df: DataFrame stacking all per-size statistics with a 'Key' column.
      '25th_Percentile_SR' and '75th_Percentile_SR' actually hold mean - std and
      mean + std
    """

    def _with_pooled_level(df):
        # Adds a copy of every row under the artificial level 6, then labels each
        # row with its chemist group.
        pooled = df.copy()
        pooled["Chemist Level"] = 6
        combined = pd.concat([pooled, df])
        combined["Chemist Group"] = combined["Chemist Level"].apply(assign_chemist_group)
        return combined

    def _candidate_chemists(df, level):
        # Distinct chemists that can be sampled for this group.
        if level < 3:
            pool = df[df["Chemist Level"] < 6]
        else:
            pool = df[df["Chemist Level"] == 6]
        pool = pool.drop_duplicates('Chemist')
        return list(set(list(pool[pool['Chemist Group'] == level]['Chemist'])))

    df_A = _with_pooled_level(df_A)
    df_B = _with_pooled_level(df_B)

    # Everything below depends only on the group, not on the panel size, so it is
    # computed once instead of once per size.
    unique_levels = [1, 2, 3]
    rows_A = {level: df_A[df_A["Chemist Group"] == level] for level in unique_levels}
    rows_B = {level: df_B[df_B["Chemist Group"] == level] for level in unique_levels}
    chemists_A = {level: _candidate_chemists(df_A, level) for level in unique_levels}
    chemists_B = {level: _candidate_chemists(df_B, level) for level in unique_levels}

    subgroups_dict = {}
    for i in tqdm(range(1, 94)):
        kok = []
        for level in unique_levels:
            # The panel size is capped by the number of available chemists.
            max_k_A = min(i, len(chemists_A[level]))
            max_k_B = min(i, len(chemists_B[level]))
            for _ in range(min(max_k_A, max_k_B)):
                chemist_comb_A = random.sample(chemists_A[level], max_k_A)
                chemist_comb_B = random.sample(chemists_B[level], max_k_B)
                df_A_sb = rows_A[level][rows_A[level]['Chemist'].isin(chemist_comb_A)]
                df_B_sb = rows_B[level][rows_B[level]['Chemist'].isin(chemist_comb_B)]
                most_frequent_combined = pd.concat([compute_aggregated_answer(df_A_sb, method_agg), compute_aggregated_answer(df_B_sb, method_agg)]).groupby('Chemist Group')['Most_Frequent_Correct'].mean().reset_index(name='SR')
                kok.append(most_frequent_combined)

        result_k = pd.concat(kok).groupby('Chemist Group')['SR'].agg(
            mean='mean',
            std='std'
        ).reset_index()
        # The "percentile" columns are mean - std and mean + std.
        result_k['25th_Percentile_SR'] = result_k['mean'] - result_k['std']
        result_k['75th_Percentile_SR'] = result_k['mean'] + result_k['std']
        result_k.drop(columns=['std'], inplace=True)
        result_k.columns = ['Chemist Group', 'Mean_SR', '25th_Percentile_SR', '75th_Percentile_SR']
        subgroups_dict[str(i)] = result_k

    frames = []
    for key, stats_df in subgroups_dict.items():
        keyed = stats_df.copy()
        keyed['Key'] = key
        frames.append(keyed)
    combined_df = pd.concat(frames, ignore_index=True)

    # Hard-coded baseline rows prepended to the exported file only.
    df_n = pd.DataFrame()
    df_n["Chemist Group"] = [1, 2, 3]
    df_n["Mean_SR"] = [0.378, 0.486, 0.432]
    df_n["Key"] = [1, 1, 1]

    df_t = pd.concat([df_n, combined_df])
    df_t.to_csv(f"./data/CollectiveIntelligence/Evolutive_CI_AM_{method_agg}.csv", index=None)

    return subgroups_dict, combined_df

  from .autonotebook import tqdm as notebook_tqdm


# Data Preparation

In [None]:
# Aggregation methods to simulate (names understood by compute_aggregated_answer).
all_agg = [
    "most_frequent",
    "weighted_by_certitude",
    "weighted_by_certitude_and_expertise",
    "weighted_by_expertise",
    "log_odds",
    "fuzzy_logic_aggregation",
]
# ADMET endpoints analysed one at a time.
all_admet = [
    "Solubility",
    "Permeability",
    "hERG",
    "LogD",
    "LogP",
]

# Input files: the two session parts and the structures table.
CI_s1_path = './data/CollectiveIntelligence/CI_Session_2_p1.csv'
CI_s2_path = './data/CollectiveIntelligence/CI_Session_2_p2.csv'
CI_structures_path = './data/CollectiveIntelligence/CI_Answer_v2-Structures.csv'

In [3]:
# Build the merged answer tables, then load the per-session answer files.
# NOTE(review): _merge_data_CI writes CI_Answer_A.csv / CI_Answer_B.csv to
# '../data/', while the reads below use '../data/CollectiveIntelligence/' -
# confirm both locations refer to the same files.
combined_df = _merge_data_CI(CI_s1_path, CI_s2_path, CI_structures_path)
df_A = pd.read_csv('../data/CollectiveIntelligence/CI_Answer_A.csv', sep=',')
df_B = pd.read_csv('../data/CollectiveIntelligence/CI_Answer_B.csv', sep=',')

# Run CI simulation

- Per Aggregation method

In [6]:
# Run the panel-size simulation once per aggregation method. Each call writes its
# own Evolutive_CI_{method}.csv; subgroups_dict and combined_df are rebound on
# every iteration, so only the results of the last method stay in memory.
for weigth in all_agg:
    subgroups_dict, combined_df = chemist_group_analysis(df_A, df_B, assign_chemist_group, compute_most_frequent, weigth)

most_frequent


100%|██████████| 93/93 [12:30<00:00,  8.07s/it]


weighted_by_certitude


100%|██████████| 93/93 [29:55<00:00, 19.31s/it]


weighted_by_certitude_and_expertise


100%|██████████| 93/93 [21:49<00:00, 14.08s/it]


weighted_by_expertise


100%|██████████| 93/93 [30:04<00:00, 19.40s/it]


log_odds


100%|██████████| 93/93 [18:24<00:00, 11.88s/it]


fuzzy_logic_aggregation


100%|██████████| 93/93 [16:03<00:00, 10.36s/it]


- Per ADMET

In [7]:
# Run the panel-size simulation for every (aggregation method, ADMET endpoint)
# pair. Each call writes its own CSV; subgroups_dict and combined_df are rebound
# on every iteration, so only the results of the last pair stay in memory.
for weigth in tqdm(all_agg):
    for endpoint_admet in tqdm(all_admet):
        # Printed once per endpoint to show which method is being processed.
        print(weigth)
        subgroups_dict, combined_df = chemist_group_analysis_admet(df_A, df_B, assign_chemist_group, compute_most_frequent, weigth, endpoint_admet)

  0%|          | 0/6 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A

most_frequent




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:19,  4.67it/s][A[A

  2%|▏         | 2/91 [00:00<00:22,  3.89it/s][A[A

  3%|▎         | 3/91 [00:00<00:31,  2.78it/s][A[A

  4%|▍         | 4/91 [00:01<00:36,  2.39it/s][A[A

  5%|▌         | 5/91 [00:02<00:39,  2.15it/s][A[A

  7%|▋         | 6/91 [00:02<00:44,  1.92it/s][A[A

  8%|▊         | 7/91 [00:03<00:48,  1.72it/s][A[A

  9%|▉         | 8/91 [00:04<00:54,  1.54it/s][A[A

 10%|▉         | 9/91 [00:05<01:03,  1.30it/s][A[A

 11%|█         | 10/91 [00:06<01:07,  1.19it/s][A[A

 12%|█▏        | 11/91 [00:07<01:13,  1.09it/s][A[A

 13%|█▎        | 12/91 [00:08<01:17,  1.01it/s][A[A

 14%|█▍        | 13/91 [00:09<01:24,  1.08s/it][A[A

 15%|█▌        | 14/91 [00:11<01:28,  1.15s/it][A[A

 16%|█▋        | 15/91 [00:12<01:36,  1.28s/it][A[A

 18%|█▊        | 16/91 [00:14<01:41,  1.36s/it][A[A

 19%|█▊        | 17/91 [00:15<01:48,  1.47s/it][A[A

 20%|█▉        | 18/91 [00

most_frequent




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:14,  6.40it/s][A[A

  2%|▏         | 2/91 [00:00<00:16,  5.41it/s][A[A

  3%|▎         | 3/91 [00:00<00:19,  4.58it/s][A[A

  4%|▍         | 4/91 [00:00<00:22,  3.93it/s][A[A

  5%|▌         | 5/91 [00:01<00:27,  3.17it/s][A[A

  7%|▋         | 6/91 [00:01<00:32,  2.60it/s][A[A

  8%|▊         | 7/91 [00:02<00:34,  2.41it/s][A[A

  9%|▉         | 8/91 [00:02<00:37,  2.23it/s][A[A

 10%|▉         | 9/91 [00:03<00:43,  1.87it/s][A[A

 11%|█         | 10/91 [00:04<00:48,  1.68it/s][A[A

 12%|█▏        | 11/91 [00:05<00:52,  1.52it/s][A[A

 13%|█▎        | 12/91 [00:05<00:55,  1.42it/s][A[A

 14%|█▍        | 13/91 [00:06<01:02,  1.25it/s][A[A

 15%|█▌        | 14/91 [00:07<01:02,  1.23it/s][A[A

 16%|█▋        | 15/91 [00:08<01:03,  1.19it/s][A[A

 18%|█▊        | 16/91 [00:09<01:05,  1.14it/s][A[A

 19%|█▊        | 17/91 [00:10<01:08,  1.09it/s][A[A

 20%|█▉        | 18/91 [00

most_frequent




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:15,  5.78it/s][A[A

  2%|▏         | 2/91 [00:00<00:18,  4.86it/s][A[A

  3%|▎         | 3/91 [00:00<00:21,  4.10it/s][A[A

  4%|▍         | 4/91 [00:01<00:24,  3.56it/s][A[A

  5%|▌         | 5/91 [00:01<00:27,  3.09it/s][A[A

  7%|▋         | 6/91 [00:01<00:31,  2.71it/s][A[A

  8%|▊         | 7/91 [00:02<00:35,  2.40it/s][A[A

  9%|▉         | 8/91 [00:02<00:39,  2.13it/s][A[A

 10%|▉         | 9/91 [00:03<00:43,  1.90it/s][A[A

 11%|█         | 10/91 [00:04<00:47,  1.72it/s][A[A

 12%|█▏        | 11/91 [00:05<00:54,  1.47it/s][A[A

 13%|█▎        | 12/91 [00:06<00:57,  1.37it/s][A[A

 14%|█▍        | 13/91 [00:06<01:00,  1.29it/s][A[A

 15%|█▌        | 14/91 [00:08<01:06,  1.15it/s][A[A

 16%|█▋        | 15/91 [00:09<01:12,  1.05it/s][A[A

 18%|█▊        | 16/91 [00:10<01:14,  1.00it/s][A[A

 19%|█▊        | 17/91 [00:11<01:16,  1.04s/it][A[A

 20%|█▉        | 18/91 [00

most_frequent




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:13,  6.45it/s][A[A

  2%|▏         | 2/91 [00:00<00:18,  4.71it/s][A[A

  3%|▎         | 3/91 [00:00<00:22,  3.92it/s][A[A

  4%|▍         | 4/91 [00:01<00:26,  3.33it/s][A[A

  5%|▌         | 5/91 [00:01<00:29,  2.88it/s][A[A

  7%|▋         | 6/91 [00:01<00:31,  2.69it/s][A[A

  8%|▊         | 7/91 [00:02<00:32,  2.57it/s][A[A

  9%|▉         | 8/91 [00:02<00:34,  2.42it/s][A[A

 10%|▉         | 9/91 [00:03<00:36,  2.25it/s][A[A

 11%|█         | 10/91 [00:03<00:40,  2.01it/s][A[A

 12%|█▏        | 11/91 [00:04<00:46,  1.73it/s][A[A

 13%|█▎        | 12/91 [00:05<00:47,  1.66it/s][A[A

 14%|█▍        | 13/91 [00:06<00:49,  1.58it/s][A[A

 15%|█▌        | 14/91 [00:06<00:51,  1.49it/s][A[A

 16%|█▋        | 15/91 [00:07<00:53,  1.41it/s][A[A

 18%|█▊        | 16/91 [00:08<00:56,  1.32it/s][A[A

 19%|█▊        | 17/91 [00:09<00:59,  1.24it/s][A[A

 20%|█▉        | 18/91 [00

most_frequent




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:13,  6.72it/s][A[A

  2%|▏         | 2/91 [00:00<00:16,  5.55it/s][A[A

  3%|▎         | 3/91 [00:00<00:18,  4.65it/s][A[A

  4%|▍         | 4/91 [00:00<00:22,  3.91it/s][A[A

  5%|▌         | 5/91 [00:01<00:25,  3.41it/s][A[A

  7%|▋         | 6/91 [00:01<00:30,  2.80it/s][A[A

  8%|▊         | 7/91 [00:02<00:34,  2.47it/s][A[A

  9%|▉         | 8/91 [00:02<00:36,  2.25it/s][A[A

 10%|▉         | 9/91 [00:03<00:39,  2.05it/s][A[A

 11%|█         | 10/91 [00:04<00:42,  1.89it/s][A[A

 12%|█▏        | 11/91 [00:04<00:46,  1.73it/s][A[A

 13%|█▎        | 12/91 [00:05<00:52,  1.52it/s][A[A

 14%|█▍        | 13/91 [00:06<00:54,  1.42it/s][A[A

 15%|█▌        | 14/91 [00:07<01:00,  1.28it/s][A[A

 16%|█▋        | 15/91 [00:08<01:02,  1.22it/s][A[A

 18%|█▊        | 16/91 [00:09<01:04,  1.15it/s][A[A

 19%|█▊        | 17/91 [00:10<01:10,  1.04it/s][A[A

 20%|█▉        | 18/91 [00

weighted_by_certitude




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:41,  2.16it/s][A[A

  2%|▏         | 2/91 [00:01<00:46,  1.91it/s][A[A

  3%|▎         | 3/91 [00:01<00:55,  1.59it/s][A[A

  4%|▍         | 4/91 [00:02<01:04,  1.34it/s][A[A

  5%|▌         | 5/91 [00:04<01:22,  1.04it/s][A[A

  7%|▋         | 6/91 [00:05<01:31,  1.07s/it][A[A

  8%|▊         | 7/91 [00:06<01:43,  1.24s/it][A[A

  9%|▉         | 8/91 [00:08<01:55,  1.40s/it][A[A

 10%|▉         | 9/91 [00:10<02:09,  1.58s/it][A[A

 11%|█         | 10/91 [00:12<02:20,  1.73s/it][A[A

 12%|█▏        | 11/91 [00:14<02:30,  1.88s/it][A[A

 13%|█▎        | 12/91 [00:17<02:48,  2.13s/it][A[A

 14%|█▍        | 13/91 [00:20<02:59,  2.30s/it][A[A

 15%|█▌        | 14/91 [00:23<03:13,  2.51s/it][A[A

 16%|█▋        | 15/91 [00:26<03:21,  2.65s/it][A[A

 18%|█▊        | 16/91 [00:29<03:31,  2.82s/it][A[A

 19%|█▊        | 17/91 [00:33<03:45,  3.04s/it][A[A

 20%|█▉        | 18/91 [00

weighted_by_certitude




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:23,  3.83it/s][A[A

  2%|▏         | 2/91 [00:00<00:28,  3.14it/s][A[A

  3%|▎         | 3/91 [00:01<00:36,  2.40it/s][A[A

  4%|▍         | 4/91 [00:01<00:42,  2.05it/s][A[A

  5%|▌         | 5/91 [00:02<00:47,  1.80it/s][A[A

  7%|▋         | 6/91 [00:03<00:53,  1.59it/s][A[A

  8%|▊         | 7/91 [00:04<01:01,  1.37it/s][A[A

  9%|▉         | 8/91 [00:05<01:14,  1.11it/s][A[A

 10%|▉         | 9/91 [00:06<01:19,  1.03it/s][A[A

 11%|█         | 10/91 [00:07<01:25,  1.06s/it][A[A

 12%|█▏        | 11/91 [00:09<01:30,  1.13s/it][A[A

 13%|█▎        | 12/91 [00:10<01:35,  1.21s/it][A[A

 14%|█▍        | 13/91 [00:12<01:42,  1.32s/it][A[A

 15%|█▌        | 14/91 [00:13<01:49,  1.42s/it][A[A

 16%|█▋        | 15/91 [00:15<01:55,  1.52s/it][A[A

 18%|█▊        | 16/91 [00:17<02:04,  1.66s/it][A[A

 19%|█▊        | 17/91 [00:19<02:14,  1.82s/it][A[A

 20%|█▉        | 18/91 [00

weighted_by_certitude




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:27,  3.24it/s][A[A

  2%|▏         | 2/91 [00:00<00:31,  2.79it/s][A[A

  3%|▎         | 3/91 [00:01<00:42,  2.06it/s][A[A

  4%|▍         | 4/91 [00:02<00:48,  1.78it/s][A[A

  5%|▌         | 5/91 [00:02<00:56,  1.52it/s][A[A

  7%|▋         | 6/91 [00:04<01:10,  1.21it/s][A[A

  8%|▊         | 7/91 [00:05<01:14,  1.13it/s][A[A

  9%|▉         | 8/91 [00:06<01:19,  1.05it/s][A[A

 10%|▉         | 9/91 [00:07<01:25,  1.04s/it][A[A

 11%|█         | 10/91 [00:08<01:32,  1.15s/it][A[A

 12%|█▏        | 11/91 [00:10<01:41,  1.27s/it][A[A

 13%|█▎        | 12/91 [00:11<01:49,  1.38s/it][A[A

 14%|█▍        | 13/91 [00:13<01:55,  1.48s/it][A[A

 15%|█▌        | 14/91 [00:15<02:02,  1.59s/it][A[A

 16%|█▋        | 15/91 [00:17<02:12,  1.74s/it][A[A

 18%|█▊        | 16/91 [00:19<02:17,  1.83s/it][A[A

 19%|█▊        | 17/91 [00:21<02:23,  1.94s/it][A[A

 20%|█▉        | 18/91 [00

weighted_by_certitude




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:23,  3.89it/s][A[A

  2%|▏         | 2/91 [00:00<00:25,  3.48it/s][A[A

  3%|▎         | 3/91 [00:00<00:29,  3.02it/s][A[A

  4%|▍         | 4/91 [00:01<00:33,  2.63it/s][A[A

  5%|▌         | 5/91 [00:01<00:38,  2.25it/s][A[A

  7%|▋         | 6/91 [00:02<00:43,  1.96it/s][A[A

  8%|▊         | 7/91 [00:03<00:48,  1.73it/s][A[A

  9%|▉         | 8/91 [00:04<00:56,  1.46it/s][A[A

 10%|▉         | 9/91 [00:05<01:02,  1.32it/s][A[A

 11%|█         | 10/91 [00:06<01:06,  1.21it/s][A[A

 12%|█▏        | 11/91 [00:07<01:11,  1.12it/s][A[A

 13%|█▎        | 12/91 [00:08<01:16,  1.03it/s][A[A

 14%|█▍        | 13/91 [00:09<01:23,  1.07s/it][A[A

 15%|█▌        | 14/91 [00:11<01:30,  1.17s/it][A[A

 16%|█▋        | 15/91 [00:12<01:42,  1.34s/it][A[A

 18%|█▊        | 16/91 [00:14<01:51,  1.49s/it][A[A

 19%|█▊        | 17/91 [00:16<01:55,  1.57s/it][A[A

 20%|█▉        | 18/91 [00

weighted_by_certitude




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:42,  2.10it/s][A[A

  2%|▏         | 2/91 [00:00<00:36,  2.45it/s][A[A

  3%|▎         | 3/91 [00:01<00:36,  2.39it/s][A[A

  4%|▍         | 4/91 [00:01<00:44,  1.96it/s][A[A

  5%|▌         | 5/91 [00:02<00:54,  1.59it/s][A[A

  7%|▋         | 6/91 [00:03<00:56,  1.49it/s][A[A

  8%|▊         | 7/91 [00:04<01:01,  1.37it/s][A[A

  9%|▉         | 8/91 [00:05<01:10,  1.17it/s][A[A

 10%|▉         | 9/91 [00:06<01:15,  1.09it/s][A[A

 11%|█         | 10/91 [00:07<01:20,  1.00it/s][A[A

 12%|█▏        | 11/91 [00:08<01:26,  1.08s/it][A[A

 13%|█▎        | 12/91 [00:10<01:33,  1.18s/it][A[A

 14%|█▍        | 13/91 [00:11<01:41,  1.31s/it][A[A

 15%|█▌        | 14/91 [00:13<01:49,  1.42s/it][A[A

 16%|█▋        | 15/91 [00:15<01:54,  1.50s/it][A[A

 18%|█▊        | 16/91 [00:17<01:59,  1.60s/it][A[A

 19%|█▊        | 17/91 [00:19<02:06,  1.71s/it][A[A

 20%|█▉        | 18/91 [00

weighted_by_certitude_and_expertise




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:28,  3.20it/s][A[A

  2%|▏         | 2/91 [00:00<00:33,  2.66it/s][A[A

  3%|▎         | 3/91 [00:01<00:49,  1.77it/s][A[A

  4%|▍         | 4/91 [00:02<00:54,  1.60it/s][A[A

  5%|▌         | 5/91 [00:03<01:00,  1.43it/s][A[A

  7%|▋         | 6/91 [00:04<01:07,  1.26it/s][A[A

  8%|▊         | 7/91 [00:05<01:17,  1.09it/s][A[A

  9%|▉         | 8/91 [00:06<01:26,  1.05s/it][A[A

 10%|▉         | 9/91 [00:08<01:40,  1.22s/it][A[A

 11%|█         | 10/91 [00:09<01:49,  1.35s/it][A[A

 12%|█▏        | 11/91 [00:11<01:55,  1.44s/it][A[A

 13%|█▎        | 12/91 [00:13<02:03,  1.56s/it][A[A

 14%|█▍        | 13/91 [00:15<02:13,  1.71s/it][A[A

 15%|█▌        | 14/91 [00:17<02:23,  1.86s/it][A[A

 16%|█▋        | 15/91 [00:19<02:33,  2.02s/it][A[A

 18%|█▊        | 16/91 [00:22<02:44,  2.19s/it][A[A

 19%|█▊        | 17/91 [00:25<02:49,  2.30s/it][A[A

 20%|█▉        | 18/91 [00

weighted_by_certitude_and_expertise




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:26,  3.45it/s][A[A

  2%|▏         | 2/91 [00:00<00:25,  3.54it/s][A[A

  3%|▎         | 3/91 [00:00<00:28,  3.10it/s][A[A

  4%|▍         | 4/91 [00:01<00:32,  2.67it/s][A[A

  5%|▌         | 5/91 [00:01<00:36,  2.34it/s][A[A

  7%|▋         | 6/91 [00:02<00:42,  2.02it/s][A[A

  8%|▊         | 7/91 [00:03<00:47,  1.77it/s][A[A

  9%|▉         | 8/91 [00:04<00:53,  1.56it/s][A[A

 10%|▉         | 9/91 [00:04<00:58,  1.40it/s][A[A

 11%|█         | 10/91 [00:05<01:04,  1.25it/s][A[A

 12%|█▏        | 11/91 [00:06<01:10,  1.14it/s][A[A

 13%|█▎        | 12/91 [00:08<01:19,  1.01s/it][A[A

 14%|█▍        | 13/91 [00:09<01:25,  1.09s/it][A[A

 15%|█▌        | 14/91 [00:11<01:33,  1.22s/it][A[A

 16%|█▋        | 15/91 [00:12<01:40,  1.32s/it][A[A

 18%|█▊        | 16/91 [00:14<01:42,  1.37s/it][A[A

 19%|█▊        | 17/91 [00:15<01:47,  1.46s/it][A[A

 20%|█▉        | 18/91 [00

weighted_by_certitude_and_expertise




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:20,  4.39it/s][A[A

  2%|▏         | 2/91 [00:00<00:25,  3.48it/s][A[A

  3%|▎         | 3/91 [00:01<00:35,  2.50it/s][A[A

  4%|▍         | 4/91 [00:01<00:40,  2.15it/s][A[A

  5%|▌         | 5/91 [00:02<00:44,  1.95it/s][A[A

  7%|▋         | 6/91 [00:02<00:48,  1.74it/s][A[A

  8%|▊         | 7/91 [00:03<00:54,  1.55it/s][A[A

  9%|▉         | 8/91 [00:04<01:00,  1.38it/s][A[A

 10%|▉         | 9/91 [00:05<01:10,  1.17it/s][A[A

 11%|█         | 10/91 [00:06<01:14,  1.09it/s][A[A

 12%|█▏        | 11/91 [00:08<01:20,  1.00s/it][A[A

 13%|█▎        | 12/91 [00:09<01:25,  1.09s/it][A[A

 14%|█▍        | 13/91 [00:10<01:35,  1.22s/it][A[A

 15%|█▌        | 14/91 [00:12<01:41,  1.32s/it][A[A

 16%|█▋        | 15/91 [00:14<01:50,  1.46s/it][A[A

 18%|█▊        | 16/91 [00:15<01:57,  1.57s/it][A[A

 19%|█▊        | 17/91 [00:17<02:04,  1.68s/it][A[A

 20%|█▉        | 18/91 [00

weighted_by_certitude_and_expertise




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:15,  5.76it/s][A[A

  2%|▏         | 2/91 [00:00<00:19,  4.65it/s][A[A

  3%|▎         | 3/91 [00:00<00:23,  3.68it/s][A[A

  4%|▍         | 4/91 [00:01<00:27,  3.22it/s][A[A

  5%|▌         | 5/91 [00:01<00:31,  2.72it/s][A[A

  7%|▋         | 6/91 [00:02<00:38,  2.21it/s][A[A

  8%|▊         | 7/91 [00:02<00:41,  2.01it/s][A[A

  9%|▉         | 8/91 [00:03<00:50,  1.63it/s][A[A

 10%|▉         | 9/91 [00:04<00:58,  1.41it/s][A[A

 11%|█         | 10/91 [00:05<00:59,  1.36it/s][A[A

 12%|█▏        | 11/91 [00:06<01:01,  1.30it/s][A[A

 13%|█▎        | 12/91 [00:07<01:04,  1.23it/s][A[A

 14%|█▍        | 13/91 [00:08<01:07,  1.15it/s][A[A

 15%|█▌        | 14/91 [00:09<01:11,  1.07it/s][A[A

 16%|█▋        | 15/91 [00:10<01:20,  1.06s/it][A[A

 18%|█▊        | 16/91 [00:11<01:22,  1.11s/it][A[A

 19%|█▊        | 17/91 [00:13<01:31,  1.23s/it][A[A

 20%|█▉        | 18/91 [00

weighted_by_certitude_and_expertise




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:18,  4.90it/s][A[A

  2%|▏         | 2/91 [00:00<00:21,  4.10it/s][A[A

  3%|▎         | 3/91 [00:00<00:25,  3.39it/s][A[A

  4%|▍         | 4/91 [00:01<00:30,  2.81it/s][A[A

  5%|▌         | 5/91 [00:01<00:40,  2.15it/s][A[A

  7%|▋         | 6/91 [00:02<00:43,  1.95it/s][A[A

  8%|▊         | 7/91 [00:03<00:52,  1.59it/s][A[A

  9%|▉         | 8/91 [00:04<00:56,  1.46it/s][A[A

 10%|▉         | 9/91 [00:05<01:00,  1.35it/s][A[A

 11%|█         | 10/91 [00:06<01:05,  1.24it/s][A[A

 12%|█▏        | 11/91 [00:07<01:09,  1.14it/s][A[A

 13%|█▎        | 12/91 [00:08<01:13,  1.07it/s][A[A

 14%|█▍        | 13/91 [00:09<01:18,  1.01s/it][A[A

 15%|█▌        | 14/91 [00:10<01:25,  1.11s/it][A[A

 16%|█▋        | 15/91 [00:12<01:32,  1.22s/it][A[A

 18%|█▊        | 16/91 [00:13<01:36,  1.28s/it][A[A

 19%|█▊        | 17/91 [00:15<01:44,  1.41s/it][A[A

 20%|█▉        | 18/91 [00

weighted_by_expertise




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:35,  2.51it/s][A[A

  2%|▏         | 2/91 [00:00<00:44,  2.02it/s][A[A

  3%|▎         | 3/91 [00:01<00:53,  1.66it/s][A[A

  4%|▍         | 4/91 [00:02<01:02,  1.39it/s][A[A

  5%|▌         | 5/91 [00:03<01:16,  1.12it/s][A[A

  7%|▋         | 6/91 [00:05<01:34,  1.12s/it][A[A

  8%|▊         | 7/91 [00:06<01:46,  1.27s/it][A[A

  9%|▉         | 8/91 [00:08<02:04,  1.50s/it][A[A

 10%|▉         | 9/91 [00:10<02:14,  1.64s/it][A[A

 11%|█         | 10/91 [00:12<02:21,  1.75s/it][A[A

 12%|█▏        | 11/91 [00:15<02:33,  1.92s/it][A[A

 13%|█▎        | 12/91 [00:17<02:42,  2.06s/it][A[A

 14%|█▍        | 13/91 [00:20<02:53,  2.22s/it][A[A

 15%|█▌        | 14/91 [00:22<03:03,  2.39s/it][A[A

 16%|█▋        | 15/91 [00:26<03:23,  2.68s/it][A[A

 18%|█▊        | 16/91 [00:29<03:35,  2.87s/it][A[A

 19%|█▊        | 17/91 [00:33<03:47,  3.08s/it][A[A

 20%|█▉        | 18/91 [00

weighted_by_expertise




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:21,  4.15it/s][A[A

  2%|▏         | 2/91 [00:00<00:26,  3.36it/s][A[A

  3%|▎         | 3/91 [00:01<00:31,  2.79it/s][A[A

  4%|▍         | 4/91 [00:01<00:37,  2.33it/s][A[A

  5%|▌         | 5/91 [00:02<00:43,  1.98it/s][A[A

  7%|▋         | 6/91 [00:02<00:49,  1.71it/s][A[A

  8%|▊         | 7/91 [00:03<01:00,  1.39it/s][A[A

  9%|▉         | 8/91 [00:04<01:06,  1.24it/s][A[A

 10%|▉         | 9/91 [00:05<01:12,  1.14it/s][A[A

 11%|█         | 10/91 [00:07<01:19,  1.02it/s][A[A

 12%|█▏        | 11/91 [00:08<01:28,  1.11s/it][A[A

 13%|█▎        | 12/91 [00:09<01:35,  1.20s/it][A[A

 14%|█▍        | 13/91 [00:11<01:44,  1.34s/it][A[A

 15%|█▌        | 14/91 [00:13<01:48,  1.41s/it][A[A

 16%|█▋        | 15/91 [00:15<01:56,  1.54s/it][A[A

 18%|█▊        | 16/91 [00:16<02:04,  1.66s/it][A[A

 19%|█▊        | 17/91 [00:19<02:11,  1.77s/it][A[A

 20%|█▉        | 18/91 [00

weighted_by_expertise




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:35,  2.51it/s][A[A

  2%|▏         | 2/91 [00:00<00:39,  2.27it/s][A[A

  3%|▎         | 3/91 [00:01<00:43,  2.01it/s][A[A

  4%|▍         | 4/91 [00:02<00:52,  1.64it/s][A[A

  5%|▌         | 5/91 [00:02<00:57,  1.50it/s][A[A

  7%|▋         | 6/91 [00:03<01:04,  1.31it/s][A[A

  8%|▊         | 7/91 [00:04<01:11,  1.18it/s][A[A

  9%|▉         | 8/91 [00:06<01:17,  1.07it/s][A[A

 10%|▉         | 9/91 [00:07<01:24,  1.04s/it][A[A

 11%|█         | 10/91 [00:08<01:32,  1.14s/it][A[A

 12%|█▏        | 11/91 [00:10<01:46,  1.33s/it][A[A

 13%|█▎        | 12/91 [00:12<01:52,  1.43s/it][A[A

 14%|█▍        | 13/91 [00:14<02:03,  1.58s/it][A[A

 15%|█▌        | 14/91 [00:15<02:09,  1.68s/it][A[A

 16%|█▋        | 15/91 [00:18<02:15,  1.79s/it][A[A

 18%|█▊        | 16/91 [00:20<02:22,  1.90s/it][A[A

 19%|█▊        | 17/91 [00:22<02:32,  2.06s/it][A[A

 20%|█▉        | 18/91 [00

weighted_by_expertise




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:27,  3.29it/s][A[A

  2%|▏         | 2/91 [00:00<00:33,  2.67it/s][A[A

  3%|▎         | 3/91 [00:01<00:32,  2.68it/s][A[A

  4%|▍         | 4/91 [00:01<00:35,  2.46it/s][A[A

  5%|▌         | 5/91 [00:02<00:39,  2.18it/s][A[A

  7%|▋         | 6/91 [00:02<00:44,  1.90it/s][A[A

  8%|▊         | 7/91 [00:03<00:49,  1.70it/s][A[A

  9%|▉         | 8/91 [00:04<00:56,  1.46it/s][A[A

 10%|▉         | 9/91 [00:05<01:03,  1.29it/s][A[A

 11%|█         | 10/91 [00:06<01:08,  1.18it/s][A[A

 12%|█▏        | 11/91 [00:07<01:15,  1.05it/s][A[A

 13%|█▎        | 12/91 [00:08<01:20,  1.02s/it][A[A

 14%|█▍        | 13/91 [00:09<01:24,  1.08s/it][A[A

 15%|█▌        | 14/91 [00:11<01:28,  1.15s/it][A[A

 16%|█▋        | 15/91 [00:12<01:32,  1.22s/it][A[A

 18%|█▊        | 16/91 [00:14<01:39,  1.33s/it][A[A

 19%|█▊        | 17/91 [00:15<01:43,  1.40s/it][A[A

 20%|█▉        | 18/91 [00

weighted_by_expertise




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:22,  4.00it/s][A[A

  2%|▏         | 2/91 [00:00<00:31,  2.84it/s][A[A

  3%|▎         | 3/91 [00:01<00:36,  2.44it/s][A[A

  4%|▍         | 4/91 [00:01<00:41,  2.11it/s][A[A

  5%|▌         | 5/91 [00:02<00:46,  1.83it/s][A[A

  7%|▋         | 6/91 [00:03<00:53,  1.60it/s][A[A

  8%|▊         | 7/91 [00:04<01:01,  1.36it/s][A[A

  9%|▉         | 8/91 [00:05<01:15,  1.09it/s][A[A

 10%|▉         | 9/91 [00:06<01:20,  1.02it/s][A[A

 11%|█         | 10/91 [00:07<01:25,  1.06s/it][A[A

 12%|█▏        | 11/91 [00:09<01:31,  1.14s/it][A[A

 13%|█▎        | 12/91 [00:10<01:43,  1.31s/it][A[A

 14%|█▍        | 13/91 [00:12<01:49,  1.41s/it][A[A

 15%|█▌        | 14/91 [00:14<01:55,  1.50s/it][A[A

 16%|█▋        | 15/91 [00:15<01:58,  1.56s/it][A[A

 18%|█▊        | 16/91 [00:17<02:02,  1.63s/it][A[A

 19%|█▊        | 17/91 [00:19<02:16,  1.84s/it][A[A

 20%|█▉        | 18/91 [00

log_odds




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:25,  3.49it/s][A[A

  2%|▏         | 2/91 [00:00<00:31,  2.87it/s][A[A

  3%|▎         | 3/91 [00:01<00:42,  2.06it/s][A[A

  4%|▍         | 4/91 [00:02<00:50,  1.73it/s][A[A

  5%|▌         | 5/91 [00:02<00:54,  1.57it/s][A[A

  7%|▋         | 6/91 [00:03<01:00,  1.40it/s][A[A

  8%|▊         | 7/91 [00:04<01:08,  1.23it/s][A[A

  9%|▉         | 8/91 [00:05<01:15,  1.10it/s][A[A

 10%|▉         | 9/91 [00:07<01:26,  1.05s/it][A[A

 11%|█         | 10/91 [00:08<01:32,  1.15s/it][A[A

 12%|█▏        | 11/91 [00:10<01:41,  1.27s/it][A[A

 13%|█▎        | 12/91 [00:11<01:48,  1.37s/it][A[A

 14%|█▍        | 13/91 [00:13<01:56,  1.49s/it][A[A

 15%|█▌        | 14/91 [00:15<02:05,  1.64s/it][A[A

 16%|█▋        | 15/91 [00:17<02:17,  1.81s/it][A[A

 18%|█▊        | 16/91 [00:19<02:21,  1.88s/it][A[A

 19%|█▊        | 17/91 [00:21<02:26,  1.98s/it][A[A

 20%|█▉        | 18/91 [00

log_odds




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:16,  5.44it/s][A[A

  2%|▏         | 2/91 [00:00<00:20,  4.45it/s][A[A

  3%|▎         | 3/91 [00:00<00:24,  3.64it/s][A[A

  4%|▍         | 4/91 [00:01<00:28,  3.05it/s][A[A

  5%|▌         | 5/91 [00:01<00:33,  2.55it/s][A[A

  7%|▋         | 6/91 [00:02<00:37,  2.24it/s][A[A

  8%|▊         | 7/91 [00:02<00:42,  1.99it/s][A[A

  9%|▉         | 8/91 [00:03<00:47,  1.73it/s][A[A

 10%|▉         | 9/91 [00:04<00:55,  1.48it/s][A[A

 11%|█         | 10/91 [00:05<00:58,  1.38it/s][A[A

 12%|█▏        | 11/91 [00:06<01:06,  1.19it/s][A[A

 13%|█▎        | 12/91 [00:07<01:11,  1.11it/s][A[A

 14%|█▍        | 13/91 [00:08<01:15,  1.03it/s][A[A

 15%|█▌        | 14/91 [00:09<01:18,  1.02s/it][A[A

 16%|█▋        | 15/91 [00:11<01:26,  1.14s/it][A[A

 18%|█▊        | 16/91 [00:12<01:29,  1.19s/it][A[A

 19%|█▊        | 17/91 [00:13<01:34,  1.28s/it][A[A

 20%|█▉        | 18/91 [00

log_odds




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:19,  4.70it/s][A[A

  2%|▏         | 2/91 [00:00<00:22,  3.92it/s][A[A

  3%|▎         | 3/91 [00:00<00:26,  3.31it/s][A[A

  4%|▍         | 4/91 [00:01<00:31,  2.77it/s][A[A

  5%|▌         | 5/91 [00:01<00:36,  2.36it/s][A[A

  7%|▋         | 6/91 [00:02<00:41,  2.03it/s][A[A

  8%|▊         | 7/91 [00:03<00:46,  1.80it/s][A[A

  9%|▉         | 8/91 [00:03<00:51,  1.61it/s][A[A

 10%|▉         | 9/91 [00:04<00:56,  1.44it/s][A[A

 11%|█         | 10/91 [00:05<01:03,  1.28it/s][A[A

 12%|█▏        | 11/91 [00:06<01:08,  1.17it/s][A[A

 13%|█▎        | 12/91 [00:07<01:13,  1.07it/s][A[A

 14%|█▍        | 13/91 [00:09<01:22,  1.05s/it][A[A

 15%|█▌        | 14/91 [00:10<01:31,  1.18s/it][A[A

 16%|█▋        | 15/91 [00:12<01:33,  1.23s/it][A[A

 18%|█▊        | 16/91 [00:13<01:36,  1.29s/it][A[A

 19%|█▊        | 17/91 [00:15<01:40,  1.36s/it][A[A

 20%|█▉        | 18/91 [00

log_odds




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:15,  5.90it/s][A[A

  2%|▏         | 2/91 [00:00<00:18,  4.94it/s][A[A

  3%|▎         | 3/91 [00:00<00:20,  4.20it/s][A[A

  4%|▍         | 4/91 [00:01<00:24,  3.50it/s][A[A

  5%|▌         | 5/91 [00:01<00:28,  3.04it/s][A[A

  7%|▋         | 6/91 [00:01<00:32,  2.62it/s][A[A

  8%|▊         | 7/91 [00:02<00:38,  2.18it/s][A[A

  9%|▉         | 8/91 [00:03<00:42,  1.96it/s][A[A

 10%|▉         | 9/91 [00:03<00:45,  1.78it/s][A[A

 11%|█         | 10/91 [00:04<00:49,  1.63it/s][A[A

 12%|█▏        | 11/91 [00:05<00:53,  1.50it/s][A[A

 13%|█▎        | 12/91 [00:06<00:56,  1.40it/s][A[A

 14%|█▍        | 13/91 [00:07<01:03,  1.22it/s][A[A

 15%|█▌        | 14/91 [00:08<01:07,  1.14it/s][A[A

 16%|█▋        | 15/91 [00:09<01:10,  1.08it/s][A[A

 18%|█▊        | 16/91 [00:10<01:14,  1.01it/s][A[A

 19%|█▊        | 17/91 [00:11<01:17,  1.05s/it][A[A

 20%|█▉        | 18/91 [00

log_odds




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:31,  2.88it/s][A[A

  2%|▏         | 2/91 [00:00<00:32,  2.75it/s][A[A

  3%|▎         | 3/91 [00:01<00:35,  2.50it/s][A[A

  4%|▍         | 4/91 [00:01<00:38,  2.28it/s][A[A

  5%|▌         | 5/91 [00:02<00:38,  2.23it/s][A[A

  7%|▋         | 6/91 [00:02<00:44,  1.90it/s][A[A

  8%|▊         | 7/91 [00:03<00:46,  1.81it/s][A[A

  9%|▉         | 8/91 [00:04<00:49,  1.69it/s][A[A

 10%|▉         | 9/91 [00:04<00:55,  1.47it/s][A[A

 11%|█         | 10/91 [00:05<00:58,  1.37it/s][A[A

 12%|█▏        | 11/91 [00:06<01:02,  1.29it/s][A[A

 13%|█▎        | 12/91 [00:07<01:05,  1.20it/s][A[A

 14%|█▍        | 13/91 [00:08<01:11,  1.09it/s][A[A

 15%|█▌        | 14/91 [00:09<01:14,  1.03it/s][A[A

 16%|█▋        | 15/91 [00:11<01:20,  1.05s/it][A[A

 18%|█▊        | 16/91 [00:12<01:26,  1.16s/it][A[A

 19%|█▊        | 17/91 [00:13<01:29,  1.21s/it][A[A

 20%|█▉        | 18/91 [00

fuzzy_logic_aggregation




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:23,  3.86it/s][A[A

  2%|▏         | 2/91 [00:00<00:28,  3.17it/s][A[A

  3%|▎         | 3/91 [00:01<00:32,  2.68it/s][A[A

  4%|▍         | 4/91 [00:01<00:39,  2.22it/s][A[A

  5%|▌         | 5/91 [00:02<00:44,  1.93it/s][A[A

  7%|▋         | 6/91 [00:02<00:50,  1.69it/s][A[A

  8%|▊         | 7/91 [00:03<00:57,  1.46it/s][A[A

  9%|▉         | 8/91 [00:04<01:07,  1.23it/s][A[A

 10%|▉         | 9/91 [00:06<01:12,  1.13it/s][A[A

 11%|█         | 10/91 [00:07<01:23,  1.04s/it][A[A

 12%|█▏        | 11/91 [00:08<01:28,  1.10s/it][A[A

 13%|█▎        | 12/91 [00:10<01:33,  1.18s/it][A[A

 14%|█▍        | 13/91 [00:11<01:40,  1.29s/it][A[A

 15%|█▌        | 14/91 [00:13<01:46,  1.38s/it][A[A

 16%|█▋        | 15/91 [00:14<01:51,  1.47s/it][A[A

 18%|█▊        | 16/91 [00:16<02:00,  1.61s/it][A[A

 19%|█▊        | 17/91 [00:18<02:07,  1.72s/it][A[A

 20%|█▉        | 18/91 [00

fuzzy_logic_aggregation




  0%|          | 0/91 [00:00<?, ?it/s][A[A

  1%|          | 1/91 [00:00<00:31,  2.87it/s][A[A

  2%|▏         | 2/91 [00:00<00:26,  3.33it/s][A[A

  3%|▎         | 3/91 [00:00<00:25,  3.39it/s][A[A

  4%|▍         | 4/91 [00:01<00:27,  3.14it/s][A[A

  5%|▌         | 5/91 [00:01<00:30,  2.83it/s][A[A

  7%|▋         | 6/91 [00:02<00:33,  2.51it/s][A[A

  8%|▊         | 7/91 [00:02<00:37,  2.23it/s][A[A

  9%|▉         | 8/91 [00:03<00:41,  1.99it/s][A[A

 10%|▉         | 9/91 [00:04<00:45,  1.79it/s][A[A

 11%|█         | 10/91 [00:04<00:49,  1.62it/s][A[A

 12%|█▏        | 11/91 [00:05<00:54,  1.48it/s][A[A

 13%|█▎        | 12/91 [00:06<00:58,  1.36it/s][A[A

 14%|█▍        | 13/91 [00:07<01:02,  1.25it/s][A[A

 15%|█▌        | 14/91 [00:08<01:06,  1.16it/s][A[A

 16%|█▋        | 15/91 [00:09<01:10,  1.08it/s][A[A

 18%|█▊        | 16/91 [00:10<01:14,  1.01it/s][A[A

 19%|█▊        | 17/91 [00:11<01:17,  1.05s/it][A[A

 20%|█▉        | 18/91 [00

- Per group

In [8]:
# Run the per-group chemist analysis once for every entry of `all_agg`, passing
# the current entry as the last argument of `chemist_group_analysis_group`.
# Judging by the names echoed in this cell's output (most_frequent,
# weighted_by_certitude, ..., fuzzy_logic_aggregation), `all_agg` presumably
# holds the aggregation-method names -- TODO confirm against its definition.
# NOTE(review): `subgroups_dict` and `combined_df` are rebound on each pass, so
# once the loop ends they contain only the result for the last entry of
# `all_agg`; verify that earlier results are persisted inside the callee.
# NOTE(review): in the recorded run each entry took roughly 12-30 minutes.
# NOTE(review): `weigth` is a misspelling of "weight"; it is left unchanged here
# because the loop variable is a notebook global that other cells may read.
for weigth in all_agg:
    subgroups_dict, combined_df = chemist_group_analysis_group(df_A, df_B, assign_chemist_group, compute_most_frequent, weigth)

most_frequent


100%|██████████| 93/93 [12:22<00:00,  7.98s/it]


weighted_by_certitude


100%|██████████| 93/93 [29:37<00:00, 19.11s/it]


weighted_by_certitude_and_expertise


100%|██████████| 93/93 [21:50<00:00, 14.09s/it]


weighted_by_expertise


100%|██████████| 93/93 [29:36<00:00, 19.10s/it]


log_odds


100%|██████████| 93/93 [18:12<00:00, 11.75s/it]


fuzzy_logic_aggregation


100%|██████████| 93/93 [15:42<00:00, 10.14s/it]
