In [1]:
import pandas as pd
import numpy as np
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [91]:
df_results = pd.read_csv("results.csv", sep="\t",index_col=0)
df_results

Unnamed: 0,id,url,type,supported,score,detected,real,is_equal
0,0x0,https://bn.wikipedia.org/wiki/%E0%A6%9F%E0%A6%...,MediapipeDetector,True,0.999432,bn,bn,True
1,0x0,https://bn.wikipedia.org/wiki/%E0%A6%9F%E0%A6%...,FasttextDetector,True,0.999443,bn,bn,True
2,0x0,https://bn.wikipedia.org/wiki/%E0%A6%9F%E0%A6%...,LangdetectDetector,True,1.000000,bn,bn,True
3,0x1,https://bn.wikipedia.org/wiki/%E0%A6%9F%E0%A6%...,MediapipeDetector,True,0.999901,bn,bn,True
4,0x1,https://bn.wikipedia.org/wiki/%E0%A6%9F%E0%A6%...,FasttextDetector,True,0.999965,bn,bn,True
...,...,...,...,...,...,...,...,...
8222,997x4,https://bar.wikipedia.org/wiki/Manx,FasttextDetector,True,0.933420,de,bar,False
8223,997x4,https://bar.wikipedia.org/wiki/Manx,LangdetectDetector,False,0.999995,de,bar,False
8224,999x0,https://cv.wikipedia.org/wiki/.bt,MediapipeDetector,False,0.439281,tg,cv,False
8225,999x0,https://cv.wikipedia.org/wiki/.bt,FasttextDetector,True,0.994385,cv,cv,True


### Benchmark

In [8]:
def accuracy_metric(df):
    return sum(df['is_equal']) / len(df)

In [9]:
for name, group in df_results.groupby("type"):
    accuracy = accuracy_metric(group)
    print(f"Model: {name} | Accuracy: {accuracy:0.2f}")

Model: FasttextDetector | Accuracy: 0.70
Model: LangdetectDetector | Accuracy: 0.29
Model: MediapipeDetector | Accuracy: 0.50


In [33]:
# Best case
df = df_results.copy()
# Pick max 'is_equal' for each group. If group returns 0, no model guess correct.
equal = df.loc[df.groupby('id')['is_equal'].idxmax()]

accuracy = accuracy_metric(equal)
print(f"Accuracy: {accuracy:0.4f}")
equal[equal["is_equal"] == False]

Accuracy: 0.7620


Unnamed: 0,id,url,type,supported,score,detected,real,is_equal,weight,weighted_score
834,103x1,https://myv.wikipedia.org/wiki/%D0%91%D1%80%D0...,MediapipeDetector,False,0.866960,ru,myv,False,0.437168,0.379007
837,103x2,https://myv.wikipedia.org/wiki/%D0%91%D1%80%D0...,MediapipeDetector,False,0.992254,ru,myv,False,0.437168,0.433781
840,103x3,https://myv.wikipedia.org/wiki/%D0%91%D1%80%D0...,MediapipeDetector,False,0.915001,ru,myv,False,0.437168,0.400009
896,110x0,https://rm.wikipedia.org/wiki/Novazzano,MediapipeDetector,False,0.872412,it,rm,False,0.437168,0.381390
899,111x0,https://sm.wikipedia.org/wiki/2021_VK27,MediapipeDetector,True,0.000000,,sm,False,0.437168,0.000000
...,...,...,...,...,...,...,...,...,...,...
8107,986x0,https://dty.wikipedia.org/wiki/%E0%A4%85%E0%A4...,MediapipeDetector,False,0.707914,mr,dty,False,0.437168,0.309477
8209,997x0,https://bar.wikipedia.org/wiki/Manx,MediapipeDetector,False,0.675102,de,bar,False,0.437168,0.295133
8215,997x2,https://bar.wikipedia.org/wiki/Manx,MediapipeDetector,False,0.851408,de,bar,False,0.437168,0.372208
8218,997x3,https://bar.wikipedia.org/wiki/Manx,MediapipeDetector,False,0.923521,de,bar,False,0.437168,0.403734


### Heuristic 1: Weight x Score-based

In [29]:
def heuristic_one(df, weights):
    df["weight"] = df["type"].map(weights)
    df["weighted_score"] = df["weight"] * df["score"]

    best = df.loc[df.groupby('id')['weighted_score'].idxmax()]

    return best

weights = {
    "MediapipeDetector" : 3,
    "FasttextDetector": 5,
    "LangdetectDetector": 1
}
df = df_results.copy()
result = heuristic_one(df, weights)
accuracy = accuracy_metric(result)
print(f"Accuracy: {accuracy:0.2f}")
# best.groupby("type")["id"].count()

Accuracy: 0.73


In [41]:
def objective(trial):

    weights = {
        "MediapipeDetector" : trial.suggest_float('MediapipeWeight', 0, 1),
        "FasttextDetector": trial.suggest_float('FasttextWeight', 0, 1),
        "LangdetectDetector": trial.suggest_float('LangdetectWeight', 0, 1)
    }
    df = df_results.copy()
    result = heuristic_one(df, weights)

    return accuracy_metric(result)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

study.best_params, study.best_value


({'MediapipeWeight': 0.4160608420734587,
  'FasttextWeight': 0.9988209841995017,
  'LangdetectWeight': 0.056489110887366384},
 0.7360057782592994)

### Heuristic 2: Fasttext with fallbacks

In [19]:
def heuristic_two(df, weights, threshold=0.6):

    df["weight"] = df["type"].map(weights)
    df["weighted_score"] = df["weight"] * df["score"]

    # Prioritize Fasttext when sure
    fasttext_idx = (df["type"] == "FasttextDetector") & (df["score"] >= threshold)
    df.loc[fasttext_idx, "weighted_score"] = np.inf

    best = df.loc[df.groupby('id')['weighted_score'].idxmax()]

    return best

weights = {
    "MediapipeDetector" : 3,
    "FasttextDetector": 5,
    "LangdetectDetector": 1
}
threshold = 0.6
df = df_results.copy()
result = heuristic_two(df, weights, threshold)
accuracy = accuracy_metric(result)
print(f"Accuracy: {accuracy:0.2f}")

Accuracy: 0.73


In [45]:
def objective(trial):

    weights = {
        "MediapipeDetector" : trial.suggest_float('MediapipeWeight', 0, 1),
        "FasttextDetector": trial.suggest_float('FasttextWeight', 0, 1),
        "LangdetectDetector": trial.suggest_float('LangdetectWeight', 0, 1)
    }
    threshold = trial.suggest_float('threshold', 0, 1)
    df = df_results.copy()
    result = heuristic_two(df, weights, threshold)

    return accuracy_metric(result)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

study.best_params, study.best_value


({'MediapipeWeight': 0.28344009422047733,
  'FasttextWeight': 0.6814834844148171,
  'LangdetectWeight': 0.049635015936802446,
  'threshold': 0.7838963129684198},
 0.7360057782592994)

### Heuristic 3: Quorum

In [138]:
df = df_results.copy()

weights = {
    "MediapipeDetector" : 3,
    "FasttextDetector": 5,
    "LangdetectDetector": 1
}
df["weight"] = df["type"].map(weights)
df

def quorum(df):
    votes = {}
    for detector, weight in weights.items():
        row = df[df["type"] == detector].iloc[0]
        lang = row["detected"]
        if lang not in votes:
            votes[lang] = {"points" : 0, "row": row}
        votes[lang]["points"] += weight

    # print(votes)
    max_points = 0
    max_row = None
    for lang, data in votes.items():
        if data["points"] > max_points:
            max_row = data["row"]

    return max_row
    

# df.groupby("id").apply(lambda x: quorum(x))
quorum(df.loc[201:203])

df.loc[df.groupby(["id", "detected"])["weight"].sum().reset_index().groupby(["id"])["weight"].idxmax()]

# df.groupby(["id", "detected"])["weight"].sum().reset_index().groupby(["id"])["weight"].idxmax()

Unnamed: 0,id,url,type,supported,score,detected,real,is_equal,weight
0,0x0,https://bn.wikipedia.org/wiki/%E0%A6%9F%E0%A6%...,MediapipeDetector,True,0.999432,bn,bn,True,3
1,0x0,https://bn.wikipedia.org/wiki/%E0%A6%9F%E0%A6%...,FasttextDetector,True,0.999443,bn,bn,True,5
2,0x0,https://bn.wikipedia.org/wiki/%E0%A6%9F%E0%A6%...,LangdetectDetector,True,1.000000,bn,bn,True,1
3,0x1,https://bn.wikipedia.org/wiki/%E0%A6%9F%E0%A6%...,MediapipeDetector,True,0.999901,bn,bn,True,3
4,0x1,https://bn.wikipedia.org/wiki/%E0%A6%9F%E0%A6%...,FasttextDetector,True,0.999965,bn,bn,True,5
...,...,...,...,...,...,...,...,...,...
4921,607x0,https://xmf.wikipedia.org/wiki/%E1%83%91%E1%83...,LangdetectDetector,False,0.999994,et,xmf,False,1
4924,607x1,https://xmf.wikipedia.org/wiki/%E1%83%91%E1%83...,LangdetectDetector,False,0.999995,et,xmf,False,1
4925,608x0,https://or.wikipedia.org/wiki/%E0%AC%B2%E0%AC%...,MediapipeDetector,False,0.000000,,or,False,3
4928,608x1,https://or.wikipedia.org/wiki/%E0%AC%B2%E0%AC%...,MediapipeDetector,False,0.000000,,or,False,3


In [129]:
df.loc[4925:4927]

Unnamed: 0,id,url,type,supported,score,detected,real,is_equal,weight
4925,608x0,https://or.wikipedia.org/wiki/%E0%AC%B2%E0%AC%...,MediapipeDetector,False,0.0,,or,False,3
4926,608x0,https://or.wikipedia.org/wiki/%E0%AC%B2%E0%AC%...,FasttextDetector,True,0.996257,or,or,True,5
4927,608x0,https://or.wikipedia.org/wiki/%E0%AC%B2%E0%AC%...,LangdetectDetector,False,0.999996,ne,or,False,1
