In [None]:
from datasets import load_dataset, list_metrics, load_metric
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [None]:
# Training data: the "full" configuration, converted to pandas, with every row
# whose modality is 'ht' removed.
df_train = (
    load_dataset("GroNLP/ik-nlp-22_pestyle", "full", data_dir="../IK_NLP_22_PESTYLE")['train']
    .to_pandas()
    .loc[lambda frame: frame.modality != 'ht']
)
# Test data: the "mask_subject" configuration, filtered in the same way.
df_test = (
    load_dataset("GroNLP/ik-nlp-22_pestyle", "mask_subject", data_dir="../IK_NLP_22_PESTYLE")['test']
    .to_pandas()
    .loc[lambda frame: frame.modality != 'ht']
)

In [5]:
# Preview the first three rows of the filtered training frame.
df_train.head(3)

Unnamed: 0,item_id,subject_id,modality,src_text,mt_text,tgt_text,edit_time,k_total,k_letter,k_digit,...,len_pause_geq_1000,num_annotations,n_insert,n_delete,n_substitute,n_shift,bleu,chrf,ter,aligned_edit
1,11,t2,pe2,"UN peacekeepers, whom arrived in Haiti after t...","I soldati della pace dell'ONU, che sono arriva...","Le forze di pace delle Nazioni Unite, arrivate...",128.078995,179,102,0,...,87014,2,0.0,2.0,15.0,1.0,29.290001,56.939999,58.064999,"REF: i soldati della pace dell'onu, che ..."
2,11,t3,pe1,"UN peacekeepers, whom arrived in Haiti after t...","Le forze di pace delle Nazioni Unite, arrivate...","Le forze di pace dell'ONU, arrivate ad Haiti d...",141.5,57,27,0,...,98938,1,1.0,2.0,2.0,0.0,74.660004,84.959999,18.518999,"REF: le forze di pace delle nazioni unite, ..."
4,12,t2,pe2,"According to the lawsuit, waste from the UN ca...","Secondo la causa, i rifiuti del campo delle Na...","Secondo l'accusa, i rifiuti del campo delle Na...",66.817001,67,52,0,...,45450,2,2.0,1.0,4.0,0.0,65.480003,83.419998,25.0,"REF: secondo la causa, i rifiuti del campo..."


In [6]:
# Target labels: the subject id of every training row, held in a NumPy array so
# it can be indexed with the integer fold indices during K-fold CV.
y = np.array(df_train["subject_id"])

# One-hot encoder for the target labels, used by the regression models and the
# random forest in the K-fold CV further down.
label_encoder = LabelBinarizer()
label_encoder.fit(y)

In [7]:
# Sanity check: show the one-hot encoding of the first five labels.
label_encoder.transform(y[[0,1,2,3,4]])

array([[0, 1, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1],
       [0, 1, 0]])

## Selecting features to train the model on

### 1) Selecting only numerical features and scaling them

In [8]:
def select_columns(dataframe: pd.DataFrame, dtype_include: list, name_exclude: list) -> pd.DataFrame:
    """Return the columns of `dataframe` with an accepted dtype and a non-excluded name.

    A column is kept when its dtype is contained in `dtype_include` and its
    name is not contained in `name_exclude`. Kept columns retain their
    original order.
    """
    kept_names = []
    for name in dataframe.columns:
        has_wanted_dtype = dataframe[name].dtype in dtype_include
        if has_wanted_dtype and name not in name_exclude:
            kept_names.append(name)
    return dataframe[kept_names]
 
# Apply one and the same selection to the training and the test frame: keep the
# float32 / int32 columns and leave out the bleu / chrf / ter scores and item_id.
X_train_numeric, X_test_numeric = (
    select_columns(frame, dtype_include=['float32', 'int32'], name_exclude=['bleu', 'chrf', 'ter', 'item_id'])
    for frame in (df_train, df_test)
)
# Global scaling of the test features is left commented out:
# scaler = StandardScaler().fit(X_train_numeric)
# X_test_numeric = scaler.transform(X_test_numeric)

In [9]:
# Display the selected numeric training features.
X_train_numeric

Unnamed: 0,edit_time,k_total,k_letter,k_digit,k_white,k_symbol,k_nav,k_erase,k_copy,k_cut,k_paste,n_pause_geq_300,len_pause_geq_300,n_pause_geq_1000,len_pause_geq_1000,num_annotations,n_insert,n_delete,n_substitute,n_shift
1,128.078995,179,102,0,14,2,0,60,0,0,0,36,97577,13,87014,2,0.0,2.0,15.0,1.0
2,141.500000,57,27,0,1,1,19,9,0,0,0,9,102782,2,98938,1,1.0,2.0,2.0,0.0
4,66.817001,67,52,0,4,3,0,8,0,0,0,19,52384,7,45450,2,2.0,1.0,4.0,0.0
5,190.141006,235,102,0,12,4,50,53,0,0,0,45,161226,16,146571,1,4.0,3.0,7.0,1.0
7,97.675003,61,41,0,8,0,0,11,0,0,1,26,69669,12,62854,4,1.0,0.0,5.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,45.687000,51,36,0,2,3,7,3,0,0,0,9,40032,5,38392,1,0.0,1.0,3.0,0.0
1164,376.105988,63,51,0,7,1,0,4,0,0,0,17,350997,8,346590,1,4.0,1.0,5.0,0.0
1166,143.360001,69,37,0,5,0,22,5,0,0,0,14,134002,7,129330,1,1.0,0.0,8.0,0.0
1167,154.690002,36,22,0,4,2,1,7,0,0,0,14,103191,7,99549,1,2.0,0.0,3.0,0.0


### 1/A) Train and validate [LinearRegression, RandomForestClassifier] on [[TOP100%, TOP75%, TOP50%, TOP25%], [only_keystroke], [only_postedit]] numeric _training_ data using 10-fold CV

In [None]:
def threshold_regression_prediction(predictions):
    """Turn every vector of regression outputs into a one-hot vector.

    For each vector in `predictions`, the position of its largest value (the
    first such position on ties, as returned by `np.argmax`) becomes 1 and all
    other positions become 0. The resulting rows are returned as a NumPy array.
    """
    one_hot_rows = []
    for scores in predictions:
        winner = np.argmax(scores)
        one_hot_rows.append(
            [1 if position == winner else 0 for position, _ in enumerate(scores)]
        )
    return np.array(one_hot_rows)


def do_kfold_scoring(model, X, y, selector=None, scaling=True):
    """Run a stratified 10-fold CV of `model` and print the mean train/validation accuracy.

    Parameters
    ----------
    model : estimator
        Object with `fit` and `predict` (and `score` for non-regressors). It is
        re-fitted on the training part of every fold.
    X : DataFrame or array-like
        Feature table with one row per sample.
    y : np.ndarray
        Class label of every row of `X`. The raw labels drive the stratified
        split; the module-level `label_encoder` turns them into the one-hot
        targets the models are trained on.
    selector : fitted transformer, optional
        When given, `X` is passed through `selector.transform` first.
    scaling : bool
        When True, the features are standardized with a `StandardScaler`.

    Returns
    -------
    None
        The averaged scores are printed, rounded to three decimals.
    """
    # Local import: public replacement for reading the private `_estimator_type`.
    from sklearn.base import is_regressor

    # NOTE(review): a `selector` that was fitted on the full data set before
    # being passed in has already seen the validation rows of every fold.
    X = np.asarray(selector.transform(X) if selector is not None else X)

    scores_train = []
    scores_valid = []
    skf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X[train_index], X[valid_index]
        if scaling:
            # Fit the scaler on the training fold only, so that no statistics of
            # the validation rows leak into the features the model is trained on.
            scaler = StandardScaler().fit(X_train)
            X_train, X_valid = scaler.transform(X_train), scaler.transform(X_valid)
        y_train = label_encoder.transform(y[train_index])
        y_valid = label_encoder.transform(y[valid_index])
        model_fit = model.fit(X_train, y_train)
        if is_regressor(model_fit):
            # Regressors output real-valued vectors; threshold them to one-hot
            # rows before comparing them with the one-hot targets.
            scores_train.append(accuracy_score(y_train, threshold_regression_prediction(model_fit.predict(X_train))))
            scores_valid.append(accuracy_score(y_valid, threshold_regression_prediction(model_fit.predict(X_valid))))
        else:
            scores_train.append(model_fit.score(X_train, y_train))
            scores_valid.append(model_fit.score(X_valid, y_valid))
    print("Average train score:", round(np.mean(scores_train), 3))
    print("Average validation score:", round(np.mean(scores_valid), 3))
    
def get_features_sorted(selector):
    """Return `(feature_name, score)` tuples of a fitted KBest selector, best first.

    Only the features the selector keeps are listed. `get_feature_names_out()`
    yields the names of the kept features while `scores_` holds one score per
    *input* feature, so the scores are masked with `get_support()` before the
    two are paired; otherwise names and scores would be misaligned whenever
    the selector keeps fewer features than it was fitted on.

    Parameters
    ----------
    selector : fitted selector exposing `get_support()`,
        `get_feature_names_out()` and `scores_` (e.g. `SelectKBest`).

    Returns
    -------
    list of (name, score) tuples sorted by descending score.
    """
    kept_mask = np.asarray(selector.get_support(), dtype=bool)
    names = list(selector.get_feature_names_out())
    scores = list(np.asarray(selector.scores_)[kept_mask])
    return sorted(zip(names, scores), key=lambda pair: pair[1], reverse=True)

In [None]:
# Fit a chi2-based SelectKBest that keeps every numeric feature, then rank the
# features by the scores it estimated.
kbest = SelectKBest(chi2, k=X_train_numeric.shape[1])
kbest.fit(X_train_numeric, y)
kbest_sorted = get_features_sorted(kbest)
kbest_sorted

In [None]:
# LinearRegression
lr_model = LinearRegression(n_jobs=-1)
# RandomForest - seeded so that repeated runs of the notebook give the same scores
rf_model = RandomForestClassifier(n_jobs=-1, random_state=42)
# Ridge Regression - perform a GridSearchCV on all training data to estimate optimal 'alpha' before applying K-fold CV
# NOTE(review): the grid search runs on unscaled features, while the K-fold CV
# scales them; the selected 'alpha' may not be optimal for the scaled data.
parameters = {'alpha':[0.1, 0.25, 0.5, 1, 2, 5, 10]}
ridge_model = Ridge()
Ridge_reg = GridSearchCV(ridge_model, parameters, scoring='neg_mean_squared_error', cv=5)
# `label_encoder` has already been fitted on `y`; only transform here so the
# shared encoder is not re-fitted as a side effect of this cell.
Ridge_reg.fit(X_train_numeric.to_numpy(), label_encoder.transform(y))
ridge_model = Ridge_reg.best_estimator_

# Models evaluated in every experiment below.
models = [lr_model, rf_model, ridge_model]

def run_numeric_data_experiments(model): 
    """Run the 10-fold CV experiments for `model` on the numeric training features.

    Three groups of experiments are executed, each with feature scaling enabled
    and each printing its scores through `do_kfold_scoring`:

    1. the top 100%, 75%, 50% and 25% of the features according to a chi2
       `SelectKBest` fitted on `X_train_numeric`,
    2. only the columns whose name starts with 'k_' (keystroke features),
    3. only the columns whose name starts with 'n_' (postedit features).

    Reads the module-level `X_train_numeric` and `y`; returns None.
    """
    n_features = len(X_train_numeric.columns)

    print(f"{'-'*50}\n\n\nPERFORMING EXPERIMENTS WITH [{model}]...\n{'-'*50}")
    # Train and validate on TOP 100%, 75%, 50% and 25% features of the data
    for ratio in (1, 0.75, 0.5, 0.25):
        kbest_ = SelectKBest(chi2, k=int(n_features*ratio)).fit(X_train_numeric, y)
        print(f"Performing 10-Fold CV on top {int(ratio*100)}% features of the data...")
        do_kfold_scoring(model, X_train_numeric, y, selector=kbest_, scaling=True)
        print("*"*40, "\n")

    # Train and validate on keystroke features data
    print(f"\n{'*'*40}\n[KEYSTROKE FEATURES]")
    keystroke_columns = [col for col in X_train_numeric.columns if col.startswith('k_')]
    do_kfold_scoring(model, X_train_numeric[keystroke_columns], y, scaling=True)

    # Train and validate on postedit features data
    # NOTE(review): the 'n_' prefix also matches the pause-count columns
    # (n_pause_geq_300, n_pause_geq_1000) - confirm they belong in this group.
    print(f"\n{'*'*40}\n[POSTEDIT FEATURES]")
    postedit_columns = [col for col in X_train_numeric.columns if col.startswith('n_')]
    do_kfold_scoring(model, X_train_numeric[postedit_columns], y, scaling=True)

In [None]:
# Run the numeric-feature experiments once per model in `models`.
for model in models:
    run_numeric_data_experiments(model)

### 2) Experimenting with linguistic features

In [None]:
# Linguistic-feature tables (tab-separated files) of the machine-translated
# ("mt") and target ("tgt") sentences for the train and test splits; the
# 'Filename' column is dropped from each of them.
lingfeat_train_mt, lingfeat_test_mt, lingfeat_train_tgt, lingfeat_test_tgt = (
    pd.read_csv(csv_path, sep="\t").drop(columns=['Filename'])
    for csv_path in (
        'Linguistic_features/train_mt.csv',
        'Linguistic_features/test_mt.csv',
        'Linguistic_features/train_tgt.csv',
        'Linguistic_features/test_tgt.csv',
    )
)

In [None]:
# Preview the first two rows of the "mt" training features.
lingfeat_train_mt.head(2)

In [None]:
# Preview the first two rows of the "tgt" training features.
lingfeat_train_tgt.head(2)

### Remove linguistic features that are not present in all dataframes

In [None]:
## First remove features that are not present in all dataframes
def intersection(lst1, lst2):
    """Return a list of the distinct elements that occur in both `lst1` and `lst2`.

    The result is built from a set, so its order is not defined.
    """
    first, second = set(lst1), set(lst2)
    shared = first & second
    return list(shared)

# Column names of each of the four linguistic-feature frames.
all_columns = [
    list(frame.columns)
    for frame in (lingfeat_train_mt, lingfeat_train_tgt, lingfeat_test_mt, lingfeat_test_tgt)
]
# Narrow the names down, frame by frame, to those shared by all four.
merged_columns = list(lingfeat_train_mt.columns)
for cols in all_columns:
    merged_columns = intersection(merged_columns, cols)

# Restrict every frame to the shared columns, keeping its own column order.
lingfeat_train_mt = lingfeat_train_mt[[name for name in lingfeat_train_mt.columns if name in merged_columns]]
lingfeat_train_tgt = lingfeat_train_tgt[[name for name in lingfeat_train_tgt.columns if name in merged_columns]]
lingfeat_test_mt = lingfeat_test_mt[[name for name in lingfeat_test_mt.columns if name in merged_columns]]
lingfeat_test_tgt = lingfeat_test_tgt[[name for name in lingfeat_test_tgt.columns if name in merged_columns]]

# After filtering, all four frames must have the same number of columns.
assert len({
    len(frame.columns)
    for frame in (lingfeat_train_mt, lingfeat_train_tgt, lingfeat_test_mt, lingfeat_test_tgt)
}) == 1

### Run experiments

In [None]:
# Linguistic features used for training: the absolute difference between the
# "tgt" and the "mt" feature values.
# NOTE(review): assumes the rows of both tables line up with each other and
# with the rows of `y` - TODO confirm.
X_train_ling = (lingfeat_train_tgt - lingfeat_train_mt).abs()

def run_linguistic_data_experiments(model, features): 
    """Print a banner naming `model`, then score it with K-fold CV on `features`.

    The evaluation uses the module-level labels `y`, no feature selector and
    feature scaling enabled. Nothing is returned; all results are printed.
    """
    print(f"{'-'*50}\n\n\nPERFORMING EXPERIMENTS WITH [{model}]...\n{'-'*50}")
    do_kfold_scoring(model=model, X=features, y=y, selector=None, scaling=True)
    print("*"*40, "\n")
        
# Run the linguistic-feature experiment once per model in `models`.
for model in models:
    run_linguistic_data_experiments(model, X_train_ling)

### 3) Experimenting with combination of behavioral numeric and linguistic features

In [None]:
# Place the numeric and the linguistic features side by side. Both indices are
# reset first so that the rows are matched by position rather than by label.
X_train_combined = pd.concat(
    [frame.reset_index(drop=True) for frame in (X_train_numeric, X_train_ling)],
    axis=1,
)
X_train_combined

### Run experiments 

In [None]:
def run_combined_data_experiments(model, features): 
    """Print a banner naming `model`, then score it with K-fold CV on `features`.

    The evaluation uses the module-level labels `y`, no feature selector and
    feature scaling disabled. Nothing is returned; all results are printed.

    NOTE(review): scaling is off here while the other experiments enable it -
    confirm this is intended.
    """
    print(f"{'-'*50}\n\n\nPERFORMING EXPERIMENTS WITH [{model}]...\n{'-'*50}")
    do_kfold_scoring(model=model, X=features, y=y, selector=None, scaling=False)
    print("*"*40, "\n")
        
# Run the combined-feature experiment once per model in `models`.
for model in models:
    run_combined_data_experiments(model, X_train_combined)