In [1]:
import random
import re

import pandas as pd
import pytorch_lightning as pl

from sklearn.metrics import roc_auc_score

import seaborn as sns
import matplotlib.pyplot as plt

from freestyl.dataset.dataframe_wrapper import DataframeWrapper
from freestyl.supervised.siamese import train_dataframewrappers
from freestyl.supervised.siamese import get_df_prediction
from freestyl.supervised.siamese.utils import score_from_preds
from freestyl.utils import plot_aucroc_curve
#>>> x = [1, 2, 3, 4, 5, 6]
#>>> .shuffle(x)

NEW_DATASET = False

seed = 42
IGNORE_KEYS = [
    "file", "author", "textgroup", "title", "tokens", "length", "modified_text"
]
REMOVED = ["Euclides"]


  warn(f"Failed to load image Python extension: {e}")
  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)


## Import Data

In [2]:
def get_train_dev_test(filtre, seed=42, ratio=10):
    test = len(filtre) * ratio // 100
    dev_and_test = test * 2
    print(f"Train categorical samples: {len(filtre)-dev_and_test}")
    print(f"Dev   categorical samples: {dev_and_test-test}")
    print(f"Test  categorical samples: {test}")
    #r = random.Random(seed)
    #r.shuffle(filtre)
    return filtre[:-dev_and_test], filtre[-dev_and_test:-test], filtre[-test:]


if NEW_DATASET:
    df = pd.read_csv("tlg-features.csv").sample(frac=1)
    #df = df[df.tokens >= 5000]
    #print(">= 5000", df.shape)
    #df = df[~df.file.isin(POETRY)]
    #print(">= Poetry", df.shape)
    # df = df[~df.title.str.contains("Dub\.|Sp\.|Fragm|Excerpt|(e cod\.)|Suda|recensio|fragm|sp\.|dub\.|(fort\. auctore)|Scholia")]
    print("Title filter", df.shape)
    #df = df[~df["full-pos-text"].isna()]
    #print(">= POS missing", df.shape)
    
    # Filter based on authors, to generalize better. Everything should be out of domain\
    train, dev, test = [], [], []
    if SPLIT_ON_AUTHORS:
        authors = df.author.value_counts()
        a, b, c = get_train_dev_test(authors[authors > 1].index.tolist())
    else:
        a, b, c = get_train_dev_test(df.index.tolist())
        
    train.extend(a)
    dev.extend(b)
    test.extend(c)
    
#     a, b, c = get_train_dev_test(authors[authors == 1].index.tolist())
#     train.extend(a)
#     dev.extend(b)
#     test.extend(c)

    if SPLIT_ON_AUTHORS:
        train = df[df.author.isin(train)].copy(deep=True)
        dev = df[df.author.isin(dev)].copy(deep=True)#.author.value_counts()
        test = df[df.author.isin(test)].copy(deep=True)#.author.value_counts()
    else:
        train = df[df.index.isin(train)].copy(deep=True)
        dev = df[df.index.isin(dev)].copy(deep=True)#.author.value_counts()
        test = df[df.index.isin(test)].copy(deep=True)#.author.value_counts()
        
    train.to_csv("tlg-train.csv", index=False)
    dev.to_csv("tlg-dev.csv", index=False)
    test.to_csv("tlg-test.csv", index=False)
else:
    train = pd.read_csv("tlg-train.csv")
    dev = pd.read_csv("tlg-dev.csv")
    test = pd.read_csv("tlg-test.csv")
    
train = train[~train.author.isin(REMOVED)]
dev = dev[~dev.author.isin(REMOVED)]
test = test[~test.author.isin(REMOVED)]

print(f"Train Shape : {train.shape}")
print(f"Dev Shape : {dev.shape}")
print(f"Test Shape : {test.shape}")

Train Shape : (1867, 1107)
Dev Shape : (207, 1107)
Test Shape : (248, 1107)


## Util functions

In [3]:
def assign_normalization(dfw):
    dfw.normalized._dataframe = dfw.dataframe.fillna(0)
    
def get_scores(scores, distance: float) -> None:
    scores["Attribution"] = scores.Distance < distance
    tp = scores[scores.IsAPair].Attribution.sum() 
    fn = scores[scores.IsAPair].Attribution.shape[0] - tp
    fp = scores[~scores.IsAPair].Attribution.sum()
    print(f"True positives: {tp}\nFalse Negative {fn}")
    print(f"False positives: {fp}\nAccuracy: {tp/(fn+tp):.2f}")

## Automatically retrieve some constant parameters

In [4]:
POS_COLS = [
    col
    for col in train.columns
    if col.startswith("$POS$")
]
FW_COLS = [
    col
    for col in train.columns
    if col.startswith("$MFW$")
]
IGNORE = set(IGNORE_KEYS + POS_COLS + FW_COLS)

## Get DataFrameWrapper

In [5]:
data = DataframeWrapper(train, target="author", label=["author", "title"], x_ignore=IGNORE)
assign_normalization(data)
data_dev = DataframeWrapper(dev, target="author", label=["author", "title"], x_ignore=IGNORE)
assign_normalization(data_dev)
data_test = DataframeWrapper(test, target="author", label=["author", "title"], x_ignore=IGNORE)
assign_normalization(data_test)

## CHecking some details

In [6]:
data.xs.head()

0
1
2
3
4


## Training

In [14]:
models, trainer = train_dataframewrappers(
    train=data,
    dev=data_dev,
    test=data_test,
    learning_rate=1e-3,
    margin=1,
    mode="sequential",
    loss="linearManhattan",
    sample=None,
    batch_size=128,
    gpus=1,
    accelerator="gpu",
    min_epochs=100,
    patience=10,
    sequential_min_token_freq=5,
    document_hidden_size = 128,
    embedding_size = 100,
    dropout=.30,
    miner_for_dev=True,
    sequential_model="AttentionalGRU",
    sequential_text_key="modified_text"
)


TypeError: SiameseSequentialModule.__init__() got an unexpected keyword argument 'miner_for_dev'

In [8]:
models

(SiameseSequentialModule(
   (aucroc): AUROC()
   (distance): LpDistance()
   (linear): Sequential(
     (0): Linear(in_features=64, out_features=1, bias=True)
   )
   (_subloss): BCEWithLogitsLoss()
   (_linear_miner): BatchEasyHardMiner(
     (distance): LpDistance()
   )
   (miner): BatchEasyHardMiner(
     (distance): LpDistance()
   )
   (document_encoder): DocumentEncoder(
     (_vocab): Vocab()
   )
   (net): TextGRU(
     (embed): Embedding(1059, 10, padding_idx=1)
     (gru): GRU(10, 32, batch_first=True, bidirectional=True)
     (dropout): Dropout(p=0.15, inplace=False)
   )
 ),
 <pytorch_lightning.trainer.trainer.Trainer at 0x7fbdfec3ab30>)

## Evaluating Dev for Test Threshold

In [9]:
trainer = pl.Trainer(
    gpus=1,
    accelerator="gpu"
)

scores = get_df_prediction(trainer, model=models, compared=data_dev)
#scores
print(f"ROC: {roc_auc_score(scores.IsAPair, scores.Probability)}")

  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


AttributeError: 'tuple' object has no attribute 'hparams'

In [None]:
scores[scores.Distance < 1].Distance.plot.box()
scores.head()

## Study classes

In [None]:
for cls in scores.ComparedClass.unique():
    plt.figure()
    sns.boxplot(data=scores[(scores.ComparedClass==cls) & (scores.Distance<1)], x="IsAPair", y="Distance")
    plt.gca().set_title(cls)
    #scores.groupby("ComparedClass").plot.box(y="Distance", x="IsAPair")

In [None]:
DEV_PROBA = 0

In [None]:
MAX_DISTANCE = .75
get_scores(scores, MAX_DISTANCE)
print("\n===\nWithout sampling\n===\n")
get_scores(scores[(scores.ComparedLabel != scores.ComparatorLabel)], MAX_DISTANCE)

In [None]:
plot_aucroc_curve(scores.IsAPair, scores.Probability, nth=1000)
ax = plt.gca()
ax.legend(bbox_to_anchor=(1.1, 1.05))

## Evaluating Test with Dev Threshold

In [None]:
models.eval()
MAX_DISTANCE = .75

pairs = get_df_prediction(trainer, model=models, compared=data_test, threshold=MAX_DISTANCE)

In [None]:
print(f"ROC: {roc_auc_score(pairs.IsAPair, pairs.Probability)}")
MAX_DISTANCE = .75
get_scores(scores, MAX_DISTANCE)
print("\n===\nWithout sampling\n===\n")
get_scores(scores[(scores.ComparedLabel != scores.ComparatorLabel)], MAX_DISTANCE)

In [None]:
plot_aucroc_curve(pairs.IsAPair, pairs.Probability, nth=1000)
ax = plt.gca()
ax.legend(bbox_to_anchor=(1.1, 1.05))

In [None]:
pairs.to_csv("test-results.csv")

## On Voicu !

In [None]:
import regex as re
import json
import unicodedata

df = pd.read_csv("pc-features.csv")
DFW = DataframeWrapper(df, label=("author", "title"), target="title", 
                       x_ignore=IGNORE.union({'$MFW$μηδ', '$MFW$εὖ', '$MFW$ἐς', '$MFW$ταὐτὸν', '$MFW$ἧττον', '$MFW$μάλα', '$MFW$οὔτ', '$MFW$τ', '$MFW$γ', '$MFW$ἤτοι', '$MFW$ἥ', '$MFW$εἴπερ', '$MFW$η', '$MFW$κ', '$MFW$α', '$MFW$β', '$MFW$αʹ', '$MFW$[', '$MFW$p', '$MFW$̣', '$MFW$𐅻'}))
#
print(len(DFW.features))
DFW.update_features(data.features)
assign_normalization(DFW)
#DFW._features = data.features
print(len(data.features))
print(len(DFW.features))

In [None]:
models.eval()
pairs = get_df_prediction(trainer, model=models, compared=DFW, threshold=1)
pairs = pairs[pairs.ComparedLabel != pairs.ComparatorLabel]

In [None]:
#pairs.to_csv("pairs-last-experiment.csv")

In [None]:
pairs.Attribution = pairs.Distance < .85
print(f"Pairing: {pairs.Attribution.sum()/len(pairs)}")
pairs[pairs.Distance < .85]

## Add FP and TP for each distance based on test