# Data conversion

First, read the JSON data into a dataframe; see `convert_json.py` for details.

In [None]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import numpy as np

from utils import COUNTRIES

# Switch off the right and top axes spines for every figure drawn below.
custom_params = dict.fromkeys(("axes.spines.right", "axes.spines.top"), False)
sn.set_theme(rc=custom_params, style="ticks")

In [None]:
import importlib
import convert_json
import utils
import plotting
import model_utils
from pathlib import Path


# Re-import the local helper modules so that edits made on disk are picked up
# without restarting the kernel.
for _module in (convert_json, model_utils, utils, plotting):
    importlib.reload(_module)

ROOT = Path("data")

# Analyze Ratings

In [None]:
# Load the three rating frames and stack them into a single table.
audio, image, text = model_utils.load_long_data(filter_type="interval")
merged = pd.concat([audio, image, text])

# Distinct "id" values per country, displayed together with their overall total.
users_per_country = merged.groupby("country")["id"].nunique()
users_per_country, users_per_country.sum()

In [None]:
# Display the column labels of the stacked ratings frame.
merged.columns

In [None]:
def category_perc(data, category):
    """Tabulate unique participants per category level and country.

    Args:
        data: DataFrame with an ``"id"`` column, a ``"country"`` column and the
            column named by ``category``.
        category: Name of the column whose levels become the table rows.

    Returns:
        DataFrame indexed by the levels of ``category`` with two-level columns
        ``(Country, stat)`` ordered USA, Germany, China. For each country,
        ``"n"`` is the number of distinct ids and ``"perc"`` is that count as a
        percentage of the country's column total, rounded to two decimals.

    Raises:
        KeyError: If one of the three countries is absent from ``data``.
    """
    # Distinct ids per (category, country); the countries become columns.
    count_df = data.groupby([category, "country"])["id"].nunique().unstack()
    # Share of each category level within its country, in percent.
    percentage_df = (count_df / count_df.sum(axis=0) * 100).round(2)

    # Build every country's (n, perc) pair by label instead of relabelling
    # sorted columns by position, so a value can never end up under the wrong
    # heading.
    countries = ["USA", "Germany", "China"]
    per_country = [
        pd.DataFrame({"n": count_df[country], "perc": percentage_df[country]})
        for country in countries
    ]
    table = pd.concat(per_country, axis=1, keys=countries)
    table.columns.names = ["Country", "stat"]
    return table


# Participant counts and percentages by gender and by education, as LaTeX.
gender = category_perc(merged, "gender_mapped")
print(gender.to_latex())

edu = category_perc(merged, "edu_enc")
print(edu.to_latex())

# Map every five-year bin ("15-19", "20-24", ..., "95-99") onto a coarser
# bracket; each range() lists the starting ages of the bins in that bracket.
AGE_MAP = {
    f"{start}-{start + 4}": bracket
    for bracket, starts in (
        ("18-34", range(15, 35, 5)),
        ("35-49", range(35, 50, 5)),
        ("50-64", range(50, 65, 5)),
        ("65+", range(65, 100, 5)),
    )
    for start in starts
}

# Bins that are not in AGE_MAP come back as None from dict.get.
merged["age_tab"] = merged["age_bin"].apply(AGE_MAP.get)
age = category_perc(merged, "age_tab")
print(age.to_latex())


def category_mean(data, category, stat, perc=False):
    """Tabulate mean and standard deviation of ``stat`` per category and country.

    Args:
        data: DataFrame with a ``"country"`` column plus the columns named by
            ``category`` and ``stat``.
        category: Name of the column whose levels become the table rows.
        stat: Name of the numeric column to summarise.
        perc: If true, multiply mean and standard deviation by 100 before
            rounding.

    Returns:
        DataFrame indexed by the levels of ``category`` with two-level columns
        ``(Country, stat)`` ordered USA, Germany, China, each country carrying
        ``"mean"`` and ``"sd"`` rounded to two decimals.

    Raises:
        KeyError: If one of the three countries is absent from ``data``.
    """
    values_df = data.groupby([category, "country"])[stat].agg(["mean", "std"])
    if perc:
        values_df = values_df * 100.0

    # After unstacking, the columns are (stat, country). Pick each country's
    # columns by label: relabelling the sorted columns by position would pair
    # the values with the wrong country/stat heading.
    wide = values_df.round(2).rename(columns={"std": "sd"}).unstack("country")
    countries = ["USA", "Germany", "China"]
    per_country = [wide.xs(country, axis=1, level="country") for country in countries]
    table = pd.concat(per_country, axis=1, keys=countries)
    table.columns.names = ["Country", "stat"]
    return table


# Mean / SD of the ratings per media type, first for real and then for fake items.
is_real = merged["type"] == "real"
ratings_real = category_mean(merged[is_real], "media_type_x", "rating")
print(ratings_real.to_latex())

is_fake = merged["type"] == "fake"
ratings_fake = category_mean(merged[is_fake], "media_type_x", "rating")
print(ratings_fake.to_latex())


# Mean / SD of the "Acc" column (scaled by 100) per country and media type,
# printed with the country sub-tables in the order USA, Germany, China.
acc_per_user = pd.concat(model_utils.load_acc_data(filter_type="interval"))
acc_by_group = acc_per_user.groupby(["country", "media_type"])["Acc"]
stats = (acc_by_group.agg(["mean", "std"]) * 100.).round(2)
ordered_df = stats.unstack("country").swaplevel(axis=1).sort_index(axis=1)
country_tables = [ordered_df[country] for country in ("USA", "Germany", "China")]
print(pd.concat(country_tables, axis=1).to_latex())


In [None]:
from sklearn.metrics import roc_curve, auc


# Pair each rating frame with the title of its ROC panel.
dat = list(zip((audio, image, text), ("Audio", "Image", "Text")))
def _encode(row):
    if row["correct"]:
        return row["y_true"]
    return (~row["y_true"].astype(bool)).astype(np.uint)

# One ROC panel per entry of `dat`, with one curve per country in each panel.
scale = 1.3
fig, axis = plt.subplots(figsize=(18 * scale, 5 * scale), ncols=3)
points = np.linspace(0, 1, 100)

for ax, (data, title) in zip(axis, dat):
    roc_data = data[["type", "correct", "country"]].copy()
    # Integer codes for the "type" labels, numbered in sorted label order.
    type_codes, _ = pd.factorize(data["type"], sort=True)
    roc_data["y_true"] = type_codes

    # Reconstruct each answer from the label and the "correct" flag.
    roc_data["y_pred"] = roc_data[["y_true", "correct"]].apply(_encode, axis=1)

    for country in utils.COUNTRIES:
        in_country = roc_data["country"] == country
        y_true = roc_data.loc[in_country, "y_true"]
        y_pred = roc_data.loc[in_country, "y_pred"]

        # NOTE(review): the dashed diagonal is redrawn for every country, so
        # the semi-transparent lines stack up - confirm this is intended.
        ax.plot(points, points, linestyle="--", alpha=.6, color="grey")

        fpr, tpr, _ = roc_curve(y_true, y_pred)
        roc_auc = auc(fpr, tpr)
        curve_label = f"{country} (AUC: {roc_auc:.2f})"
        ax.plot(fpr, tpr, color=plotting.COUNTRY_COLORS[country], label=curve_label)

    ax.set_title(title, fontsize="x-large")
    ax.set_xlabel("False Positive Rate", fontsize="x-large")
    ax.set_ylabel("True Positive Rate", fontsize="x-large")
    ax.tick_params(axis='both', which='major', labelsize="medium")
    ax.legend(loc="lower right", fontsize="large")

fig.tight_layout()
fig.savefig("plot/roc.pdf")

In [None]:
# compute statistics on positive and negative rating
def compute_positive_negative(d: pd.DataFrame) -> pd.DataFrame:
    data = d.copy()
    data["Positive"] = data.rating > 0
    data["Negative"] = data.rating < 0
    data["Undecided"] = data.rating == 0
    data["type"] = data["type"].str.title()

    def calculate_stat(x):
        means = x.mean()
        stds = x.sem()
        
        data = []
        for (index, mean), std in zip(means.iteritems(), stds):
            data.append(f"${mean: 2.2f}\pm{std:.2f}$")
            
        return pd.Series(data, index=means.index)

    stats = data.groupby(["id", "type", "country"])[["Positive", "Undecided", "Negative"]].sum().groupby(["country", "type"]).apply(calculate_stat)
    unstacked = stats.unstack().round(2)
    columns = unstacked.columns
    index = pd.MultiIndex.from_tuples([columns[i+1] if i % 2 == 0 else columns[i-1] for i in range(0, len(columns))])
    unstacked = unstacked.reindex(columns=index)

    print(unstacked.style.to_latex())
    return unstacked

# Print and keep the sign-count table for each of the three rating frames.
calc_audio, calc_image, calc_text = [
    compute_positive_negative(frame) for frame in (audio, image, text)
]

In [None]:
# Load the ratings again, this time with drop_zero=True, and restack them.
# NOTE(review): unlike the other load_long_data calls in this notebook, no
# filter_type is passed here - confirm that this is intended.
audio, image, text = model_utils.load_long_data(drop_zero=True)
merged = pd.concat([audio, image, text])

In [None]:
from pandas import MultiIndex


# Flag the sign of every rating and fix the display order of countries / types.
merged["Positive"] = merged["rating"].gt(0)
merged["Negative"] = merged["rating"].lt(0)
merged["country"] = pd.Categorical(merged["country"], categories=utils.COUNTRIES)
merged["type"] = pd.Categorical(merged["type"], categories=["real", "fake"])

# Share of positive vs. negative ratings per country, type and media type.
group_keys = ["country", "type", "media_type_x"]
aggregated = merged.groupby(group_keys)[["Positive", "Negative"]].sum()
aggregated["count"] = aggregated["Positive"] + aggregated["Negative"]
aggregated["% Positive"] = aggregated["Positive"] / aggregated["count"] * 100
aggregated["% Negative"] = 100 - aggregated["% Positive"]

# format table
shares = aggregated[["% Positive", "% Negative"]]
formated = shares.unstack(0).unstack(0).round(2)
formated.index.names = [None]
formated.columns.names = [None, None, None]
formated.index = formated.index.str.title()

# Title-case the innermost column level and leave the outer levels untouched.
formated.columns = MultiIndex.from_tuples(
    [(*column[:-1], column[-1].title()) for column in formated.columns]
)

print(formated.style.to_latex())
formated

# Analyze Accuracy

First, we analyze the overall accuracy, and then the per-step accuracy.

In [None]:
import utils
import importlib
# Pick up on-disk edits to the helper modules before loading the data.
for _module in (utils, model_utils):
    importlib.reload(_module)

# Stack the frames returned by load_acc_data into one table.
acc_per_user = pd.concat(model_utils.load_acc_data(filter_type="interval"))

# Rows per country, shown together with the total number of rows.
acc_per_user["country"].value_counts(), len(acc_per_user)

In [None]:
importlib.reload(plotting)

# Gap between the accuracy on real items and the accuracy on fake items.
acc_per_user["Acc_real_fake"] = acc_per_user["Acc_real"] - acc_per_user["Acc_fake"]

# Arguments shared by all four strip plots; only "plot_kwargs" and "title"
# change between the calls below.
# NOTE(review): COLORS[-3] selects a single entry; if three colours (one per
# media type) were meant, this should be COLORS[-3:] - confirm in `plotting`.
kwargs = {
    "data": acc_per_user,
    "x": "country",
    "title": "",
    "order": ["audio", "image", "text"],
    "colors": plotting.COLORS[-3],
    "kind": "stripplot",
    "plot_kwargs": {
        "y": "Acc",
        "order": plotting.ORDER,
        "jitter": .4,
    },
    "hue": "media_type",
    "ylim": (-.1, 1.1),
    "hline": [.90, .5, .10],
    "figsize": (8, 5),
    "y_label": False,
}

# Overall accuracy (the only call that sets "jitter").
plotting.plot_category(**kwargs)

# Accuracy on real items.
kwargs["plot_kwargs"] = {
    "y": "Acc_real",
    "order": plotting.ORDER,
}
kwargs["title"] = "Accuracy (Real)"
plotting.plot_category(**kwargs)

# Accuracy on fake items.
kwargs["plot_kwargs"] = {
    "y": "Acc_fake",
    "order": plotting.ORDER,
}
kwargs["title"] = "Accuracy (Fake)"
plotting.plot_category(**kwargs)

# Difference real - fake.
# NOTE(review): the shared ylim of (-.1, 1.1) is reused although a difference
# of two accuracies can be negative - confirm that the limits fit this plot.
kwargs["plot_kwargs"] = {
    "y": "Acc_real_fake",
    "order": plotting.ORDER,
}
kwargs["title"] = "Accuracy (Real - Fake)"
plotting.plot_category(**kwargs)

In [None]:
from matplotlib.pyplot import text
from scipy.stats import kruskal, bootstrap, mannwhitneyu
from itertools import combinations
from utils import COUNTRIES
import numpy as np

def format_p(val):
    """Render a p-value as a LaTeX math snippet.

    Values below 0.001 are reported as ``$p < 0.001$``; everything else is
    printed with three decimals.
    """
    return "$p < 0.001$" if val < 0.001 else f"$p = {val:.3f}$"

def compute_statistics(data, category="Acc"):
    """Compare ``category`` between USA, Germany and China and print a summary.

    Prints a Kruskal-Wallis test across the three countries (under the
    existing "ANOVA" / "Z" labels), the per-country mean, standard deviation
    and skewness, and a Mann-Whitney U test plus the mean difference for every
    pair of countries in ``COUNTRIES``.

    Means and standard deviations are multiplied by 100 only when
    ``category == "Acc"``; every country is scaled the same way. Skewness does
    not depend on the scale of the data, so it is reported unscaled.

    Args:
        data: DataFrame with a ``"country"`` column and the column named by
            ``category``.
        category: Name of the numeric column to analyse.

    Returns:
        DataFrame with one row per country pair and the columns ``A``, ``B``,
        ``Z`` (the statistic returned by ``mannwhitneyu``), ``pval``,
        ``mean_diff``, ``std_diff`` and ``formatted`` (``mean\\,±\\,std`` text).
    """
    scale = 100 if category == "Acc" else 1

    usa = data.loc[data.country == "USA", category]
    ger = data.loc[data.country == "Germany", category]
    ch = data.loc[data.country == "China", category]

    # Kruskal-Wallis H-test (rank-based), not a parametric one-way ANOVA.
    z_stat, pval = kruskal(usa, ger, ch)

    # mean + std, one shared scale factor for all three countries
    usa_mean, usa_std, usa_skew = usa.mean() * scale, usa.std() * scale, usa.skew()
    ger_mean, ger_std, ger_skew = ger.mean() * scale, ger.std() * scale, ger.skew()
    ch_mean, ch_std, ch_skew = ch.mean() * scale, ch.std() * scale, ch.skew()

    print(f"ANOVA: $Z={z_stat:.3f}$; {format_p(pval)}")
    print(f"Means: USA ${usa_mean:.2f}\\pm{usa_std:.2f}$; Germany ${ger_mean:.2f}\\pm{ger_std:.2f}$; China ${ch_mean:.2f}\\pm{ch_std:.2f}$.")
    print(f"Skew: USA ${usa_skew:.2f}$; Germany ${ger_skew:.2f}$; China ${ch_skew:.2f}$.")

    rows = []

    for A, B in combinations(COUNTRIES, 2):
        a_values = data.loc[data.country == A, category]
        b_values = data.loc[data.country == B, category]

        # NOTE(review): both samples are cut to the shorter length and paired
        # by position. mannwhitneyu accepts unequal sample sizes, so the
        # truncation discards data - confirm that the pairing is intended.
        n = min(len(a_values), len(b_values))
        a_head = a_values.iloc[:n].reset_index(drop=True)
        b_head = b_values.iloc[:n].reset_index(drop=True)

        diff = a_head - b_head
        mean_diff = diff.mean() * scale
        std_diff = diff.std() * scale

        stat_non_parametric, pval_non_parametric = mannwhitneyu(a_head, b_head)
        print(f"{A}/{B} $z={stat_non_parametric:.3f}$; {format_p(pval_non_parametric)}")
        print(f"Difference in means: ${mean_diff:.3f} \\pm {std_diff:.3f}$")

        rows.append((A, B, stat_non_parametric, pval_non_parametric, mean_diff, std_diff, f"{mean_diff:.2f}\\,±\\,{std_diff:.2f}"))

    return pd.DataFrame(
        rows,
        columns=["A", "B", "Z", "pval", "mean_diff", "std_diff", "formatted"],
    )

# Statistics over all media types first, then separately per media type.
compute_statistics(acc_per_user, "Acc")

banner = "========================="
stats_by_media = {}
for media in ("audio", "image", "text"):
    print(banner)
    print(media.title())
    print(banner)
    media_rows = acc_per_user[acc_per_user.media_type == media]
    stats_by_media[media] = compute_statistics(media_rows, "Acc")

audio_stats = stats_by_media["audio"]
image_stats = stats_by_media["image"]
text_stats = stats_by_media["text"]

def format_table(data, values_to_print="Z"):
    """Pivot pairwise test results into a table with significance stars.

    Args:
        data: DataFrame with the columns ``A``, ``B``, ``pval`` and the column
            named by ``values_to_print``, one row per (A, B) pair.
        values_to_print: Column whose values fill the cells. ``"Z"`` values are
            formatted as numbers (two decimals when ``pval <= 0.001``, one
            decimal otherwise); any other column is inserted as-is.

    Returns:
        DataFrame of strings with ``A`` as rows (descending order) and ``B`` as
        columns (reversed order). Each cell ends in ``***`` / ``**`` / ``*``
        for ``pval <= 0.001`` / ``0.01`` / ``0.05``, padded with LaTeX
        ``\\phantom`` stars to equal width. Pairs missing from ``data`` read
        ``N/A``.
    """
    pivot = data.pivot(index="A", columns="B", values=values_to_print)
    pivot = pivot.sort_index(ascending=False)
    pivot = pivot.reindex(columns=pivot.columns[::-1])
    pivot.index.names = [""]
    pivot.columns.names = [""]
    # The cells are replaced by strings below; switch to object dtype first so
    # that no string is written into a float column.
    pivot = pivot.astype(object)

    pvals = dict(zip(zip(data["A"], data["B"]), data["pval"]))

    # Walk over every cell, whatever the size of the pivot.
    for i, row_label in enumerate(pivot.index):
        for j, col_label in enumerate(pivot.columns):
            if (row_label, col_label) not in pvals:
                pivot.iloc[i, j] = "N/A\\phantom{***}"
                continue

            pval = pvals[(row_label, col_label)]
            value = pivot.iloc[i, j]

            if values_to_print == "Z":
                shown = f"{value:.2f}" if pval <= 0.001 else f"{value:.1f}"
            else:
                shown = f"{value}"

            if pval <= 0.001:
                suffix = "***"
            elif pval <= 0.01:
                suffix = "**\\phantom{*}"
            elif pval <= 0.05:
                suffix = "*\\phantom{**}"
            else:
                suffix = "\\phantom{***}"

            pivot.iloc[i, j] = shown + suffix

    return pivot

# First the test statistics, then the formatted mean differences, each for
# audio, image and text in that order.
for _values in ("Z", "formatted"):
    for _pair_stats in (audio_stats, image_stats, text_stats):
        print(format_table(_pair_stats, values_to_print=_values).style.to_latex())

In [None]:
ratings = pd.concat(model_utils.load_long_data(drop_zero=False, filter_type="interval"))

# Statistics over all ratings first, then per item type and media type.
compute_statistics(ratings, "rating")

banner = "========================="
for typ in ["real", "fake"]:
    print(banner)
    print(typ)
    print(banner)

    of_type = ratings["type"] == typ
    stats_by_media = {}
    for media in ("audio", "image", "text"):
        print(banner)
        print(media.title())
        print(banner)
        subset = ratings[(ratings.media_type_x == media) & of_type]
        stats_by_media[media] = compute_statistics(subset, "rating")

    audio_stats = stats_by_media["audio"]
    image_stats = stats_by_media["image"]
    text_stats = stats_by_media["text"]

    # The tables are printed only after all three media types were analysed.
    for _pair_stats in (audio_stats, image_stats, text_stats):
        print(format_table(_pair_stats, values_to_print="formatted").style.to_latex())
